func wrapText(nodes []*html.Node) []*html.Node { wrapped := make([]*html.Node, 0, len(nodes)) var wrapper *html.Node appendWrapper := func() { if wrapper != nil { // render and re-parse so p-inline-p expands wrapped = append(wrapped, ParseDepth(Render(wrapper), 0)...) wrapper = nil } } for _, n := range nodes { if n.Type == html.ElementNode && isBlockElement[n.DataAtom] { appendWrapper() wrapped = append(wrapped, n) continue } if wrapper == nil && n.Type == html.TextNode && strings.TrimSpace(n.Data) == "" { wrapped = append(wrapped, n) continue } if wrapper == nil { wrapper = &html.Node{ Type: html.ElementNode, Data: "p", DataAtom: atom.P, } } wrapper.AppendChild(n) } appendWrapper() return wrapped }
func img2Link(img *html.Node) { if img.Data == "img" { img.Data = "a" for i := 0; i < len(img.Attr); i++ { if img.Attr[i].Key == "src" { img.Attr[i].Key = "href" } } double := closureTextNodeExists(img) imgContent := "" title := attrX(img.Attr, "title") if double { imgContent = fmt.Sprintf("[img] %v %v | ", "[ctdr]", // content title double removed urlBeautify(attrX(img.Attr, "href"))) } else { imgContent = fmt.Sprintf("[img] %v %v | ", title, urlBeautify(attrX(img.Attr, "href"))) } img.Attr = attrSet(img.Attr, "cfrom", "img") nd := dom.Nd("text", imgContent) img.AppendChild(nd) } }
func runMergeNodes(parent, prev, next *html.Node, addSeparator bool) *html.Node { var u parserUtils if prev != nil { parent.AppendChild(prev) } if next != nil { parent.AppendChild(next) } return u.mergeNodes(parent, prev, next, addSeparator) }
// Replace the given node's children with the given string. func setNodeText(node *html.Node, s string) { // remove all existing children for node.FirstChild != nil { node.RemoveChild(node.FirstChild) } // add the text node.AppendChild(&html.Node{ Type: html.TextNode, Data: s, }) }
// CloneNode makes a copy of a Node with all descendants. func CloneNode(n *exphtml.Node) *exphtml.Node { clone := new(exphtml.Node) clone.Type = n.Type clone.DataAtom = n.DataAtom clone.Data = n.Data clone.Attr = make([]exphtml.Attribute, len(n.Attr)) copy(clone.Attr, n.Attr) for c := n.FirstChild; c != nil; c = c.NextSibling { nc := CloneNode(c) clone.AppendChild(nc) } return clone }
func (u *parserUtils) addChildTextNodeToBegining(node *html.Node, text string) { if node.FirstChild != nil && node.FirstChild.Type == html.TextNode { node.FirstChild.Data = text + node.FirstChild.Data } else { newNode := &html.Node{ Type: html.TextNode, Data: text} if node.FirstChild == nil { node.AppendChild(newNode) } else { node.InsertBefore(newNode, node.FirstChild) } } }
// append1 actually appends to the merged HTML node tree. func (ap *appendContext) append1(action rune, text string, proto *html.Node, pos posT) { if proto == nil { return } appendPoint, protoAncestor := ap.lastMatchingLeaf(proto, action, pos) if appendPoint == nil || protoAncestor == nil { return } if appendPoint.DataAtom != protoAncestor.DataAtom { return } newLeaf := new(html.Node) copyNode(newLeaf, proto) if proto.Type == html.TextNode { newLeaf.Data = text } if action != '=' { insertNode := &html.Node{ Type: html.ElementNode, DataAtom: atom.Span, Data: "span", } switch action { case '+': insertNode.Attr = convertAttributes(ap.c.InsertedSpan) case '-': insertNode.Attr = convertAttributes(ap.c.DeletedSpan) case '~': insertNode.Attr = convertAttributes(ap.c.ReplacedSpan) } insertNode.AppendChild(newLeaf) newLeaf = insertNode } for proto = proto.Parent; proto != nil && proto != protoAncestor; proto = proto.Parent { above := new(html.Node) copyNode(above, proto) above.AppendChild(newLeaf) newLeaf = above } appendPoint.AppendChild(newLeaf) }
func mergeHtml(folder VirtualFolder, names []string) []byte { var result *html.Node = nil var body *html.Node = nil for _, name := range names { f, e := folder.OpenFile(name) if e != nil { logger.Fatalf("error reading '%s'.\n", name) } doc, e := html.Parse(f) f.Close() if e != nil { logger.Fatalf("error parsing '%s'.\n", name) } b := findFirstChild(doc, atom.Body) if b == nil { logger.Fatalf("'%s' has no 'body' element.\n", name) } if body == nil { result = doc body = b continue } for n := b.FirstChild; n != nil; n = b.FirstChild { b.RemoveChild(n) body.AppendChild(n) } } buf := new(bytes.Buffer) if e := html.Render(buf, result); e != nil { logger.Fatalf("failed render result for '%s'.\n", folder.Name()) } return buf.Bytes() }
func cleanChildren(c *Config, parent *html.Node) { var children []*html.Node for parent.FirstChild != nil { child := parent.FirstChild parent.RemoveChild(child) children = append(children, filterNode(c, child)) } if c.WrapText { _, ok := c.wrap[parent.DataAtom] if !ok && parent.DataAtom == 0 { _, ok = c.wrapCustom[parent.Data] } if ok { children = wrapText(children) } } for _, child := range children { parent.AppendChild(child) } }
func textifyNodeSubtree(n *html.Node) { if n.Type == html.ElementNode { nd := dom.Nd("text") nd.Data = textifySubtreeBruteForce(n, 0) nd.Data = stringspb.NormalizeInnerWhitespace(nd.Data) cc := []*html.Node{} for c := n.FirstChild; c != nil; c = c.NextSibling { cc = append(cc, c) } for _, c := range cc { n.RemoveChild(c) } n.AppendChild(nd) nd2 := dom.Nd("br") dom.InsertAfter(n, nd2) } }
func helperRemoveNode(parent, prev, next *html.Node, isSeparator bool) { removed := &html.Node{} parent.AppendChild(prev) parent.AppendChild(removed) parent.AppendChild(next) var u parserUtils result, err := u.removeNode(removed, isSeparator) So(err, ShouldBeNil) So(result, ShouldEqual, nil) }
// CreationDate returns the time an HTML document was created. // // It also returns a FileInfo for the document, with the time added in the // header if it was missing. The bool returned is true the meta creation // element has been added to the header. func CreationDate(path string) (*FileInfo, bool, error) { title := "" f, err := os.Open(path) if err != nil { return nil, false, err } defer f.Close() stat, err := f.Stat() if err != nil { return nil, false, err } doc, err := html.Parse(f) if err != nil { return nil, false, err } hasMeta := false var head *html.Node var found func(*html.Node) var created time.Time found = func(n *html.Node) { if n.Type == html.ElementNode && n.Data == "head" { head = n } if n.Type == html.ElementNode && n.Data == "title" { for c := n.FirstChild; c != nil; c = c.NextSibling { if c.Type == html.TextNode { title = title + c.Data } } } if n.Type == html.ElementNode && n.Data == "meta" { name, err := getAttrByName(n, "name") if err == nil { value, err := getAttrByName(n, "value") if err == nil && name == "created" { created, err = time.Parse(format, value) if err != nil { created, err = time.Parse(format_no_tz, value) if err == nil { hasMeta = true } } else { hasMeta = true } } } } for c := n.FirstChild; c != nil; c = c.NextSibling { found(c) } } found(doc) if !hasMeta { now := time.Now() meta := &html.Node{ Type: html.ElementNode, Data: "meta", Attr: []html.Attribute{ {Key: "value", Val: now.Format(format)}, {Key: "name", Val: "created"}, }} head.AppendChild(meta) created = now } fi := &FileInfo{ Path: path, Node: doc, Title: title, Created: created, Updated: stat.ModTime(), } return fi, !hasMeta, nil }
func breakoutImagesFromAnchorTrees(n *html.Node) { for c := n.FirstChild; c != nil; c = c.NextSibling { breakoutImagesFromAnchorTrees(c) } if n.Type == html.ElementNode && n.Data == "a" { img, lvl := searchImg(n, nil, 0) if img != nil { only1Child := n.FirstChild != nil && n.FirstChild == n.LastChild if lvl == 1 && only1Child { // log.Printf("only child image lvl %v a\n", lvl) n.RemoveChild(img) n.Parent.InsertBefore(img, n.NextSibling) // "insert after; if n.NextSibling==nil => insert at the end" contnt := urlBeautify(attrX(n.Attr, "href")) if len(contnt) < 6 { contnt = "[was img] " + contnt } n.AppendChild(dom.Nd("text", contnt)) } else { if debugBreakOut { b0 := dom.PrintSubtree(n) log.Printf("\n%s\n", b0) } // log.Printf(" got it %v\n", img.Data) a1 := dom.CloneNodeWithSubtree(n) fc1 := closureDeleter(true) fc1(n, 0, false) if debugBreakOut { b1 := dom.PrintSubtree(n) log.Printf("\n%s\n", b1) } fc2 := closureDeleter(false) fc2(a1, 0, false) if debugBreakOut { b2 := dom.PrintSubtree(a1) log.Printf("\n%s\n", b2) log.Printf("--------------------\n") } if true { n.Parent.InsertBefore(img, n.NextSibling) // "insert after; if n.NextSibling==nil => insert at the end" n.Parent.InsertBefore(a1, img.NextSibling) } else { // old way ; sequence corrpution if n had rightwise siblings. n.Parent.AppendChild(img) n.Parent.AppendChild(a1) } } // changing image to link later } else { // log.Printf("no img in a\n") } } }
func condenseBottomUpV2(n *html.Node, lvl, lvlDo int, types map[string]bool) { if lvl < lvlDo { cs := []*html.Node{} for c := n.FirstChild; c != nil; c = c.NextSibling { cs = append(cs, c) } for _, c := range cs { condenseBottomUpV2(c, lvl+1, lvlDo, types) } } else { // log.Printf("action on %v %v\n", lvl, lvlDo) switch { case n.Type == html.ElementNode && types[n.Data]: oldPar := n.Parent if oldPar == nil { return } b, newPar := flattenSubtreeV2(n, nil, 0, nil) // placeholder := dom.Nd("div") // par := n.Parent // par.InsertBefore(placeholder, n.NextSibling) // par.RemoveChild(n) // par.InsertBefore(n2, placeholder) for c := oldPar.FirstChild; c != nil; c = c.NextSibling { oldPar.RemoveChild(c) } for c := newPar.FirstChild; c != nil; c = c.NextSibling { newPar.RemoveChild(c) oldPar.AppendChild(c) } if lvlDo > 4 { bx := dom.PrintSubtree(newPar) fmt.Printf("%s", bx) } // n = n2 nodeRepl := dom.Nd("text", b.String()) if false { // Remove all existing children. // Direct loop impossible, since "NextSibling" is set to nil by Remove(). children := []*html.Node{} for c := n.FirstChild; c != nil; c = c.NextSibling { children = append(children, c) // assembling separately, before removing. } for _, c := range children { log.Printf("c %4v rem from %4v ", c.Data, n.Data) n.RemoveChild(c) } // we can't put our replacement "under" an image, since img cannot have children if n.Type == html.ElementNode && n.Data == "img" { n.Parent.InsertBefore(nodeRepl, n.NextSibling) // if n.NextSibling==nil => insert at the end n.Parent.RemoveChild(n) } else { n.AppendChild(nodeRepl) } // Insert a || and a newline before every <a...> // if n.Data == "a" { // n.Parent.InsertBefore(dom.Nd("text", " || "), n) // } } default: } } }
func makeHtml(title_n *html.Node, body_n *html.Node) *html.Node { if title_n == nil { // make manually title node title_n = new(html.Node) *title_n = html.Node{ Parent: nil, FirstChild: nil, LastChild: nil, PrevSibling: nil, NextSibling: nil, Type: html.ElementNode, DataAtom: atom.Title, Data: "title", Attr: []html.Attribute{}, } title_text := new(html.Node) *title_text = html.Node{ Parent: nil, FirstChild: nil, LastChild: nil, PrevSibling: nil, NextSibling: nil, Type: html.TextNode, DataAtom: 0, Data: "Empty title", Attr: []html.Attribute{}, } title_n.AppendChild(title_text) } else { // clear tag from parametrs title_n.Attr = []html.Attribute{} // remove parents for correct work title_n.Parent.RemoveChild(title_n) } if body_n == nil { // make manually body node body_n = new(html.Node) *body_n = html.Node{ Parent: nil, FirstChild: nil, LastChild: nil, PrevSibling: nil, NextSibling: nil, Type: html.ElementNode, DataAtom: atom.Body, Data: "body", Attr: []html.Attribute{}, } body_text := new(html.Node) *body_text = html.Node{ Parent: nil, FirstChild: nil, LastChild: nil, PrevSibling: nil, NextSibling: nil, Type: html.TextNode, DataAtom: 0, Data: "Empty body", Attr: []html.Attribute{}, } body_n.AppendChild(body_text) } else { body_n.Attr = []html.Attribute{} body_n.Parent.RemoveChild(body_n) } model := "<html><head><meta charset=\"utf-8\"></head></html>" output, _ := html.Parse(strings.NewReader(model)) htmlnode := output.FirstChild headnode := htmlnode.FirstChild defbodynode := headnode.NextSibling output.FirstChild.RemoveChild(defbodynode) // delete empty <body> tag headnode.AppendChild(title_n) htmlnode.AppendChild(body_n) return output }
// AppendChildNodes appends each node in children, in slice order, as a child
// of parent.
func AppendChildNodes(parent *html.Node, children []*html.Node) {
	for i := range children {
		parent.AppendChild(children[i])
	}
}
func flattenSubtreeV2(n *html.Node, b *bytes.Buffer, depth int, tpar *html.Node) (*bytes.Buffer, *html.Node) { if b == nil { b = new(bytes.Buffer) } if tpar == nil { tpar = &html.Node{ Type: n.Type, DataAtom: n.DataAtom, Data: n.Data, Attr: make([]html.Attribute, len(n.Attr)), } copy(tpar.Attr, n.Attr) } switch { case n.Type == html.ElementNode && n.Data == "a": n.Parent.RemoveChild(n) tpar.AppendChild(n) // wpf(b, "[a] ") case n.Type == html.ElementNode && n.Data == "img": // img2Link(n) n.Parent.RemoveChild(n) tpar.AppendChild(n) case n.Data == "em" || n.Data == "strong": wpf(b, "[%v l%v] ", n.Data, depth) n.Parent.RemoveChild(n) tpar.AppendChild(n) case n.Data == "label" || n.Data == "input" || n.Data == "textarea": n.Parent.RemoveChild(n) tpar.AppendChild(n) case n.Data == "p" || n.Data == "div" || n.Data == "li" || n.Data == "ol" || n.Data == "h1" || n.Data == "h2" || n.Data == "ul": n.Parent.RemoveChild(n) tpar.AppendChild(n) case n.Data == "span": for c := n.FirstChild; c != nil; c = c.NextSibling { n.RemoveChild(c) tpar.AppendChild(c) } n.Parent.RemoveChild(n) case n.Type == html.TextNode && n.Data != "": n.Data = strings.TrimSpace(n.Data) n.Data += " " wpf(b, n.Data) n.Parent.RemoveChild(n) tpar.AppendChild(n) default: log.Printf("unhandled %s %s\n", dom.NodeTypeStr(n.Type), n.Data) n.Parent.RemoveChild(n) } // // children := []*html.Node{} for c := n.FirstChild; c != nil; c = c.NextSibling { // fmt.Printf("still has children %v\n", c.Data) children = append(children, c) // assembling separately, before removing. } for _, c := range children { flattenSubtreeV2(c, b, depth+1, tpar) } return b, tpar }
func flattenSubtreeV3Inner(n, nClone *html.Node, lvl int) { // log.Printf("fsbi\n") for ch := n.FirstChild; ch != nil; ch = ch.NextSibling { chClone := dom.CloneNode(ch) switch { case ch.Type == html.ElementNode && standard[ch.Data]: nClone.AppendChild(chClone) flattenSubtreeV3Inner(ch, chClone, lvl+1) case ch.Type == html.ElementNode && ch.Data == "a": nClone.AppendChild(chClone) flattenSubtreeV3Inner(ch, chClone, lvl+1) case ch.Type == html.ElementNode && ch.Data == "img": nClone.AppendChild(chClone) case ch.Data == "span": // log.Printf(strings.Repeat(" ", lvl) + "span \n") for cch := ch.FirstChild; cch != nil; cch = cch.NextSibling { // log.Printf(strings.Repeat(" ", lvl)+"span child %v", cch.Data) cchClone := dom.CloneNode(cch) nClone.AppendChild(cchClone) nClone.AppendChild(dom.Nd("text", " ")) flattenSubtreeV3Inner(cch, cchClone, lvl+1) } case ch.Type == html.TextNode && ch.Data != "": chClone.Data = strings.TrimSpace(chClone.Data) chClone.Data += " " nClone.AppendChild(chClone) default: // nClone.AppendChild(chClone) log.Printf("unhandled %s %s\n", dom.NodeTypeStr(ch.Type), ch.Data) } } }