func CompactNode(n *html.Node) { var appendNodes []*html.Node for c := n.FirstChild; c != nil; { CompactNode(c) if _mergeTextElements[c.Data] { appendNodes = append(appendNodes, GetChildNodes(c)...) log.Info("delete", c.Data) c = RemoveNode(c) } else if c.Type == html.ElementNode && c.FirstChild == nil && !_voidElements[c.Data] { log.Info("delete", c.Data) c = RemoveNode(c) } else { c = c.NextSibling } } DetachNodes(appendNodes) AppendChildNodes(n, appendNodes) if n.FirstChild != nil && n.FirstChild.NextSibling == nil { if n.FirstChild.Data == n.Data || (n.FirstChild.Data == "br" && (n.Data == "p" || n.Data == "div")) { childNodes := GetChildNodes(n.FirstChild) log.Info("delete", n.FirstChild.Data) n.RemoveChild(n.FirstChild) DetachNodes(childNodes) AppendChildNodes(n, childNodes) } else if n.FirstChild.Data == "img" && n.Data == "a" { *n = *n.FirstChild } } }
func (u *parserUtils) mergeNodes(parent, prev, next *html.Node, addSeparator bool) *html.Node { prevText := prev != nil && prev.Type == html.TextNode nextText := next != nil && next.Type == html.TextNode delim := "" if addSeparator { delim = " " } if prevText && nextText { prev.Data = prev.Data + delim + next.Data parent.RemoveChild(next) return prev.NextSibling } if prevText { prev.Data = prev.Data + delim } else if nextText { next.Data = delim + next.Data } else if addSeparator { newNode := &html.Node{ Type: html.TextNode, Data: delim} parent.InsertBefore(newNode, next) } return next }
/* div div div p p TO img img p p Operates from the *middle* div. Saves all children in inverted slice. Removes each child and reattaches it one level higher. Finally the intermediary, now childless div is removed. \ / \ /\ / \_____/ \_____/ \ / \_____/\_____/ \__________/ => Breaks are gone \p1___p2___/ => Wrapping preserves breaks */ func topDownV1(n *html.Node, couple []string, parentType string) { if noParent(n) { return } p := n.Parent parDiv := p.Type == html.ElementNode && p.Data == couple[0] // Parent is a div iAmDiv := n.Type == html.ElementNode && n.Data == couple[1] // I am a div noSiblings := n.PrevSibling == nil && n.NextSibling == nil only1Child := n.FirstChild != nil && n.FirstChild == n.LastChild svrlChildn := n.FirstChild != nil && n.FirstChild != n.LastChild noChildren := n.FirstChild == nil _, _ = noSiblings, noChildren if parDiv && iAmDiv { if only1Child || svrlChildn { var children []*html.Node for c := n.FirstChild; c != nil; c = c.NextSibling { children = append([]*html.Node{c}, children...) // order inversion } insertionPoint := n.NextSibling for _, c1 := range children { n.RemoveChild(c1) if c1.Type == html.TextNode || c1.Data == "a" { // pf("wrapping %v\n", NodeTypeStr(c1.Type)) wrap := html.Node{Type: html.ElementNode, Data: "p", Attr: []html.Attribute{html.Attribute{Key: "cfrm", Val: "div"}}} wrap.FirstChild = c1 p.InsertBefore(&wrap, insertionPoint) c1.Parent = &wrap insertionPoint = &wrap } else { p.InsertBefore(c1, insertionPoint) insertionPoint = c1 } } p.RemoveChild(n) if p.Data != parentType { p.Data = parentType } } } }
// We want to remove some children. // A direct loop is impossible, // since "NextSibling" is set to nil during Remove(). // Therefore: // First assemble children separately. // Then remove them. func removeUnwanted(n *html.Node) { cc := []*html.Node{} for c := n.FirstChild; c != nil; c = c.NextSibling { cc = append(cc, c) } for _, c := range cc { if unwanteds[c.Data] { n.RemoveChild(c) } } }
func removeUnwanted(n *html.Node) { cc := []*html.Node{} for c := n.FirstChild; c != nil; c = c.NextSibling { cc = append(cc, c) } for _, c := range cc { if n.Type == html.ElementNode && n.Data == "script" || n.Type == html.CommentNode { n.RemoveChild(c) } } }
// Replace the given node's children with the given string. func setNodeText(node *html.Node, s string) { // remove all existing children for node.FirstChild != nil { node.RemoveChild(node.FirstChild) } // add the text node.AppendChild(&html.Node{ Type: html.TextNode, Data: s, }) }
func replaceNodeWithChildren(n *html.Node) { var next *html.Node parent := n.Parent for c := n.FirstChild; c != nil; c = next { next = c.NextSibling n.RemoveChild(c) parent.InsertBefore(c, n) } parent.RemoveChild(n) }
func removeEmptyNodes(n *html.Node, lvl int) { // children cc := []*html.Node{} for c := n.FirstChild; c != nil; c = c.NextSibling { cc = append(cc, c) } for _, c := range cc { removeEmptyNodes(c, lvl+1) } // processing // empty element nodes if n.Type == html.ElementNode && n.Data == "img" { src := attrX(n.Attr, "src") if src == "" { n.Parent.RemoveChild(n) } } if n.Type == html.ElementNode && n.FirstChild == nil && n.Data == "a" { href := attrX(n.Attr, "href") if href == "#" || href == "" { n.Parent.RemoveChild(n) } } if n.Type == html.ElementNode && n.FirstChild == nil && (n.Data == "em" || n.Data == "strong") { n.Parent.RemoveChild(n) } if n.Type == html.ElementNode && n.FirstChild == nil && (n.Data == "div" || n.Data == "span" || n.Data == "li" || n.Data == "p") { n.Parent.RemoveChild(n) } // spans with less than 2 characters inside => flatten to text only1Child := n.FirstChild != nil && n.FirstChild == n.LastChild if n.Type == html.ElementNode && n.Data == "span" && only1Child && n.FirstChild.Type == html.TextNode && len(strings.TrimSpace(n.FirstChild.Data)) < 3 { n.Type = html.TextNode n.Data = n.FirstChild.Data n.RemoveChild(n.FirstChild) } }
// clean normalises styles/colspan and removes any CleanTags specified, along with newlines; // but also makes all the character handling (for example " " as utf-8) the same. // It returns the estimated number of treeRunes that will be used. // TODO more cleaning of the input HTML, as required. func (c *Config) clean(n *html.Node) int { size := 1 switch n.Type { case html.ElementNode: for ai := 0; ai < len(n.Attr); ai++ { a := n.Attr[ai] switch { case strings.ToLower(a.Key) == "style": if strings.TrimSpace(a.Val) == "" { // delete empty styles n.Attr = delAttr(n.Attr, ai) ai-- } else { // tidy non-empty styles // TODO there could be more here to make sure the style entries are in the same order etc. n.Attr[ai].Val = strings.Replace(a.Val, " ", "", -1) if !strings.HasSuffix(n.Attr[ai].Val, ";") { n.Attr[ai].Val += ";" } } case n.DataAtom == atom.Td && strings.ToLower(a.Key) == "colspan" && strings.TrimSpace(a.Val) == "1": n.Attr = delAttr(n.Attr, ai) ai-- } } case html.TextNode: n.Data = htm.UnescapeString(n.Data) size += utf8.RuneCountInString(n.Data) - 1 // len(n.Data) would be faster, but use more memory } searchChildren: for ch := n.FirstChild; ch != nil; ch = ch.NextSibling { switch ch.Type { case html.ElementNode: for _, rr := range c.CleanTags { if rr == ch.Data { n.RemoveChild(ch) goto searchChildren } } } size += c.clean(ch) } return size }
// sliceNode returns the two halves of the HTML tree starting at node after // splitting it at the given textual offset. func sliceNode(offsets *nodeOffsets, node *html.Node, offset int) (*html.Node, *html.Node) { origStart, origEnd := offsets.Bounds(node) if origStart > offset || origEnd < offset { log.Fatalf("sliceNode: offset %d out of node's span (%d → %d)", offset, origStart, origEnd) } n, m := copyNode(node), copyNode(node) parent := node.Parent if parent != nil { parent.InsertBefore(n, node) parent.InsertBefore(m, node) parent.RemoveChild(node) } switch node.Type { default: log.Fatalf("Unhandled node kind: %d", node.Type) case html.ElementNode: child := node.FirstChild for child != nil { next := child.NextSibling if _, end := offsets.Bounds(child); end <= offset { node.RemoveChild(child) n.AppendChild(child) } else if start, _ := offsets.Bounds(child); start > offset { node.RemoveChild(child) m.AppendChild(child) } else { left, right := sliceNode(offsets, child, offset) node.RemoveChild(left) node.RemoveChild(right) n.AppendChild(left) m.AppendChild(right) } child = next } case html.TextNode: mark := offset - origStart n.Data = node.Data[:mark] m.Data = node.Data[mark:] } if split := offsets.update(n, origStart); split != offset { log.Fatalf("split %d ≠ %d", split, offset) } if newEnd := offsets.update(m, offset); newEnd != origEnd { log.Fatalf("end %d ≠ %d", newEnd, origEnd) } return n, m }
// Minifies node and returns a minification Result. func doMinify(node *html.Node, ctx *context) result { prevWasWhitespace := false var next *html.Node rv := result{} for child := node.FirstChild; child != nil; child = next { next = child.NextSibling script := getHTMLNodeAttr(child, "script", "src") if rv.IndexHTMLBase == "" { rv.IndexHTMLBase = getHTMLNodeAttr(child, "base", "href") } switch { case strings.Contains(script, "libs/") && strings.HasSuffix(script, ".js"): minFile := script[:len(script)-3] + ".min.js" if _, err := os.Stat(filepath.Join(ctx.BaseDir, minFile)); err == nil { replaceAttrValue(child, "src", minFile) } prevWasWhitespace = false case strings.HasSuffix(script, ".js"): if !ctx.FoundFirstAppScript { ctx.FoundFirstAppScript = true node.InsertBefore(makeAppMinJsNode(), child) node.InsertBefore(makeNewLine(), child) } rv.AppScripts = append(rv.AppScripts, script) node.RemoveChild(child) case isWhitespaceText(child) && node.Type == html.ElementNode && node.Data == "head": if !prevWasWhitespace { node.InsertBefore(makeNewLine(), child) } node.RemoveChild(child) prevWasWhitespace = true default: if isPluggableUIInjectionComment(child) { rv.PluggableInjectionCount++ } else { childResult := doMinify(child, ctx) rv.merge(childResult) } prevWasWhitespace = false } } return rv }
// cleanBody removes unwanted HTML elements from the HTML body. func (doc *Document) cleanBody(n *html.Node, level int) { // removeNode returns true if a node should be removed from HTML document. removeNode := func(c *html.Node, level int) bool { return removeElements[c.DataAtom] } var curr *html.Node = n.FirstChild var next *html.Node = nil for ; curr != nil; curr = next { // We have to remember the next sibling here because calling RemoveChild // sets curr's NextSibling pointer to nil and we would quit the loop // prematurely. next = curr.NextSibling if curr.Type == html.ElementNode { if removeNode(curr, level) { n.RemoveChild(curr) } else { doc.cleanBody(curr, level+1) } } } }
func cleanChildren(c *Config, parent *html.Node) { var children []*html.Node for parent.FirstChild != nil { child := parent.FirstChild parent.RemoveChild(child) children = append(children, filterNode(c, child)) } if c.WrapText { _, ok := c.wrap[parent.DataAtom] if !ok && parent.DataAtom == 0 { _, ok = c.wrapCustom[parent.Data] } if ok { children = wrapText(children) } } for _, child := range children { parent.AppendChild(child) } }
//return true if need to delete node, false another way func deleteValuelessNodes(innode *html.Node) bool { if innode.Type == html.CommentNode { //fmt.Println("comment:" + innode.Data) return true } if innode.Type == html.ElementNode { //innode.Attr = []html.Attribute{} if innode.Data == "script" || innode.Data == "meta" || innode.Data == "style" || innode.Data == "head" || innode.Data == "form" || innode.Data == "noscript" || innode.Data == "img" || innode.Data == "noindex" || innode.Data == "span" { //fmt.Println("script: " + innode.Data) return true } } for node := innode.FirstChild; node != nil; { if deleteValuelessNodes(node) { tnode := node.NextSibling innode.RemoveChild(node) node = tnode continue } node = node.NextSibling } return false }
func textifyNodeSubtree(n *html.Node) { if n.Type == html.ElementNode { nd := dom.Nd("text") nd.Data = textifySubtreeBruteForce(n, 0) nd.Data = stringspb.NormalizeInnerWhitespace(nd.Data) cc := []*html.Node{} for c := n.FirstChild; c != nil; c = c.NextSibling { cc = append(cc, c) } for _, c := range cc { n.RemoveChild(c) } n.AppendChild(nd) nd2 := dom.Nd("br") dom.InsertAfter(n, nd2) } }
// flattenSubtreeV2 detaches n from its parent and, depending on its kind,
// re-attaches it directly under tpar; it then does the same, recursively,
// for every child n still has. It returns the buffer b and tpar.
//
// Parameters:
//   - n:     node to process; n.Parent is dereferenced in every branch, so n
//     must be attached.
//   - b:     buffer written to via wpf; allocated here when nil.
//   - depth: recursion depth; used only in the text written for em/strong.
//   - tpar:  target parent; when nil, a new node copying n's Type, DataAtom,
//     Data and Attr is created.
//
// Nodes that match no case (including text nodes with empty Data) are
// logged as unhandled and detached without being re-attached.
func flattenSubtreeV2(n *html.Node, b *bytes.Buffer, depth int, tpar *html.Node) (*bytes.Buffer, *html.Node) {
	if b == nil {
		b = new(bytes.Buffer)
	}
	if tpar == nil {
		// Same type/name/attributes as n, but without any tree links.
		tpar = &html.Node{
			Type:     n.Type,
			DataAtom: n.DataAtom,
			Data:     n.Data,
			Attr:     make([]html.Attribute, len(n.Attr)),
		}
		copy(tpar.Attr, n.Attr)
	}
	switch {
	case n.Type == html.ElementNode && n.Data == "a":
		n.Parent.RemoveChild(n)
		tpar.AppendChild(n)
		// wpf(b, "[a] ")
	case n.Type == html.ElementNode && n.Data == "img":
		// img2Link(n)
		n.Parent.RemoveChild(n)
		tpar.AppendChild(n)
	case n.Data == "em" || n.Data == "strong":
		wpf(b, "[%v l%v] ", n.Data, depth)
		n.Parent.RemoveChild(n)
		tpar.AppendChild(n)
	case n.Data == "label" || n.Data == "input" || n.Data == "textarea":
		n.Parent.RemoveChild(n)
		tpar.AppendChild(n)
	case n.Data == "p" || n.Data == "div" || n.Data == "li" || n.Data == "ol" || n.Data == "h1" || n.Data == "h2" || n.Data == "ul":
		n.Parent.RemoveChild(n)
		tpar.AppendChild(n)
	case n.Data == "span":
		// The span itself is dropped; its children are meant to go to tpar.
		// NOTE(review): html.Node.RemoveChild clears c.NextSibling and
		// AppendChild does not set it, so this loop ends after the first
		// child. The remaining children are only picked up by the recursion
		// at the bottom of this function — confirm that this is intended.
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			n.RemoveChild(c)
			tpar.AppendChild(c)
		}
		n.Parent.RemoveChild(n)
	case n.Type == html.TextNode && n.Data != "":
		// Normalise to trimmed text followed by exactly one space.
		n.Data = strings.TrimSpace(n.Data)
		n.Data += " "
		// NOTE(review): n.Data is passed in the format position of wpf; if
		// wpf is printf-like, a '%' in the text would be interpreted — confirm.
		wpf(b, n.Data)
		n.Parent.RemoveChild(n)
		tpar.AppendChild(n)
	default:
		log.Printf("unhandled %s %s\n", dom.NodeTypeStr(n.Type), n.Data)
		n.Parent.RemoveChild(n)
	}
	//
	// Recurse into whatever children n still has. They are collected first,
	// because the recursive call detaches each child from n.
	children := []*html.Node{}
	for c := n.FirstChild; c != nil; c = c.NextSibling {
		// fmt.Printf("still has children %v\n", c.Data)
		children = append(children, c) // assembling separately, before removing.
	}
	for _, c := range children {
		flattenSubtreeV2(c, b, depth+1, tpar)
	}
	return b, tpar
}
// removeChildren detaches all children of n.
//
// Children are drained from the front. RemoveChild clears the sibling links
// of the detached node, so following c.NextSibling after a removal is not
// possible; re-reading n.FirstChild avoids that without queueing one
// deferred call per child.
func removeChildren(n *html.Node) {
	for n.FirstChild != nil {
		n.RemoveChild(n.FirstChild)
	}
}
// Now this third implementation finally condenses *selectively*. // Not all boats from each pond are lifted equally. // We achieve tremendous structural simplification. // It also starts from top, pulling lower levels up. // Unlike implementation #1, that started from the middle. func topDownV3(l1 *html.Node, l2Types map[string]bool, l3Types map[string]bool) { if l1.Type != html.ElementNode && l1.Type != html.DocumentNode { return // cannot assign to - do not unable to have children } if l1.Data == "span" || l1.Data == "a" { return // want not condense into } // dig two levels deep // isolate l2,l3 l2s := []*html.Node{} l3s := map[*html.Node][]*html.Node{} for l2 := l1.FirstChild; l2 != nil; l2 = l2.NextSibling { l2s = append(l2s, l2) // l2s = append([]*html.Node{l2}, l2s...) // order inversion for l3 := l2.FirstChild; l3 != nil; l3 = l3.NextSibling { l3s[l2] = append(l3s[l2], l3) // l3s[l2] = append(map[*html.Node][]*html.Node{l2: []*html.Node{l3}}, l3s[l2]...) // order inversion } } postponedRemoval := map[*html.Node]bool{} // // // check types for each l2 subtree distinctively for _, l2 := range l2s { l2Match := l2.Type == html.ElementNode && l2Types[l2.Data] // l2 is a div l3Match := true for _, l3 := range l3s[l2] { l3Match = l3Match && (l3.Type == html.ElementNode && l3Types[l3.Data]) } // act if l2Match && l3Match { // detach l3 from l2 for _, l3 := range l3s[l2] { // if ml3[l3] > 0 { // fmt.Printf("rmd_%v_%v ", ml3[l3], l3.Data) // } l2.RemoveChild(l3) // ml3[l3]++ } // Since we still need l2 below // We have to postpone detaching l2 from l1 // to the bottom // NOT HERE: l1.RemoveChild(l2) postponedRemoval[l2] = true for _, l3 := range l3s[l2] { // attach l3 to l1 if l3.Data != "a" && l3.Data != "span" { l1.InsertBefore(l3, l2) } else { wrap := dom.Nd("p") wrap.Attr = []html.Attribute{html.Attribute{Key: "cfrm", Val: "noth"}} wrap.AppendChild(l3) // NOT wrap.FirstChild = l3 l1.InsertBefore(wrap, l2) } } } } for k, _ := range postponedRemoval { 
l1.RemoveChild(k) // detach l2 from l1 } }
// condenseBottomUpV2 descends the tree until depth lvlDo and, at that depth,
// flattens every element whose name is flagged in types.
//
// Parameters:
//   - n:      current node.
//   - lvl:    depth of n; while lvl < lvlDo the function only recurses into
//     n's children.
//   - lvlDo:  depth at which flattening takes place.
//   - types:  element names (node Data) that are to be flattened.
//
// At depth lvlDo a matching element with a parent is passed to
// flattenSubtreeV2; the nodes of the returned parent are then moved into n's
// former parent. For lvlDo > 4 the flattened subtree is printed.
func condenseBottomUpV2(n *html.Node, lvl, lvlDo int, types map[string]bool) {
	if lvl < lvlDo {
		// Snapshot the children first, since the recursive call may detach them.
		cs := []*html.Node{}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			cs = append(cs, c)
		}
		for _, c := range cs {
			condenseBottomUpV2(c, lvl+1, lvlDo, types)
		}
	} else {
		// log.Printf("action on %v %v\n", lvl, lvlDo)
		switch {
		case n.Type == html.ElementNode && types[n.Data]:
			// Remember the parent now: n.Parent is read again only before
			// the flatten call, never after it.
			oldPar := n.Parent
			if oldPar == nil {
				return
			}
			b, newPar := flattenSubtreeV2(n, nil, 0, nil)
			// placeholder := dom.Nd("div")
			// par := n.Parent
			// par.InsertBefore(placeholder, n.NextSibling)
			// par.RemoveChild(n)
			// par.InsertBefore(n2, placeholder)

			// NOTE(review): html.Node.RemoveChild clears c.NextSibling (see
			// also the remark in the disabled block below), and AppendChild
			// does not set it. As written, each of the two loops therefore
			// handles only the FIRST child: one child of oldPar is removed
			// and one child of newPar is moved over. Confirm the intent
			// before changing this — emptying oldPar completely would also
			// drop the siblings of n.
			for c := oldPar.FirstChild; c != nil; c = c.NextSibling {
				oldPar.RemoveChild(c)
			}
			for c := newPar.FirstChild; c != nil; c = c.NextSibling {
				newPar.RemoveChild(c)
				oldPar.AppendChild(c)
			}
			if lvlDo > 4 {
				bx := dom.PrintSubtree(newPar)
				fmt.Printf("%s", bx)
			}
			// n = n2
			// nodeRepl is only referenced by the disabled block below.
			nodeRepl := dom.Nd("text", b.String())
			if false {
				// Disabled: replace n's children with one text node.
				// Remove all existing children.
				// Direct loop impossible, since "NextSibling" is set to nil by Remove().
				children := []*html.Node{}
				for c := n.FirstChild; c != nil; c = c.NextSibling {
					children = append(children, c) // assembling separately, before removing.
				}
				for _, c := range children {
					log.Printf("c %4v rem from %4v ", c.Data, n.Data)
					n.RemoveChild(c)
				}
				// we can't put our replacement "under" an image, since img cannot have children
				if n.Type == html.ElementNode && n.Data == "img" {
					n.Parent.InsertBefore(nodeRepl, n.NextSibling) // if n.NextSibling==nil => insert at the end
					n.Parent.RemoveChild(n)
				} else {
					n.AppendChild(nodeRepl)
				}
				// Insert a || and a newline before every <a...>
				// if n.Data == "a" {
				// 	n.Parent.InsertBefore(dom.Nd("text", " || "), n)
				// }
			}
		default:
		}
	}
}
// Condense upwards builds a three-levels subtree // starting from param node l1 // l2 and l3 nodes need to comply by type // // Then l3 is moved under l1; l2 is eliminated // // For <a> or "text" l3 nodes, we could introduce wrappers // // l2Types so far always is "div". // Multiple l2Types are possible, but difficult to imagine. // // l1 type could be changed - from div to ul for instance, but I found no use for that // // Implementation yields similar result as condenseTopDown1 // but the "all-or-nothing" logic is clearer func topDownV2(l1 *html.Node, l2Types map[string]bool, l3Types map[string]bool) { if l1.Type != html.ElementNode && l1.Type != html.DocumentNode { return // cannot assign to - do not unable to have children } if l1.Data == "span" || l1.Data == "a" { return // want not condense into } // dig two levels deeper // isolate l2 var l2s []*html.Node for l2 := l1.FirstChild; l2 != nil; l2 = l2.NextSibling { l2s = append(l2s, l2) // l2s = append([]*html.Node{l2}, l2s...) // order inversion } // measure types l2Div := true // note that *all* l3 must have l3Type, not just those those of one l2 element // otherwise we get only partial restructuring - and therefore sequence errors l3Div := true for _, l2 := range l2s { l2Div = l2Div && l2.Type == html.ElementNode && l2Types[l2.Data] // l2 is a div for l3 := l2.FirstChild; l3 != nil; l3 = l3.NextSibling { l3Div = l3Div && (l3.Type == html.ElementNode && l3Types[l3.Data]) // l3 is a div or ul or form } } // act if l2Div && l3Div { for _, l2 := range l2s { // isolate l3 var l3s []*html.Node for l3 := l2.FirstChild; l3 != nil; l3 = l3.NextSibling { l3s = append(l3s, l3) // l3s = append([]*html.Node{l3}, l3s...) 
// order inversion } // detach l3 from l2 for _, l3 := range l3s { l2.RemoveChild(l3) } l1.RemoveChild(l2) // detach l2 from l1 for _, l3 := range l3s { // attach l3 to l1, possible wrapper of <a> or <span> l1.InsertBefore(l3, nil) // insert at end // wrap := html.Node{Type: html.ElementNode, Data: "p", Attr: []html.Attribute{html.Attribute{Key: "cfrm", Val: "div"}}} // wrap.FirstChild = c1 // l1.InsertBefore(&wrap, nil) } } } }
// breakoutImagesFromAnchorTrees walks the subtree below n depth-first and,
// for every <a> element in which searchImg finds an image, moves that image
// out of the anchor so that it becomes the anchor's next sibling.
//
// Two cases are distinguished:
//   - the image is the anchor's only child (lvl == 1): the image is moved
//     behind the anchor and the anchor receives a text child derived from
//     its href;
//   - otherwise the anchor is cloned, closureDeleter(true) is applied to the
//     original and closureDeleter(false) to the clone, and image and clone
//     are inserted behind the original anchor.
//
// NOTE(review): n.Parent is dereferenced without a nil check, so an <a>
// containing an image is assumed to always have a parent — confirm.
func breakoutImagesFromAnchorTrees(n *html.Node) {
	// Children first. Nodes inserted behind a child by the recursive call
	// become its next siblings and are visited by this loop as well.
	for c := n.FirstChild; c != nil; c = c.NextSibling {
		breakoutImagesFromAnchorTrees(c)
	}
	if n.Type == html.ElementNode && n.Data == "a" {
		img, lvl := searchImg(n, nil, 0)
		if img != nil {
			only1Child := n.FirstChild != nil && n.FirstChild == n.LastChild
			if lvl == 1 && only1Child {
				// log.Printf("only child image lvl %v a\n", lvl)
				n.RemoveChild(img)
				n.Parent.InsertBefore(img, n.NextSibling) // "insert after; if n.NextSibling==nil => insert at the end"
				// The anchor is empty now; give it a text derived from its href.
				contnt := urlBeautify(attrX(n.Attr, "href"))
				if len(contnt) < 6 {
					contnt = "[was img] " + contnt
				}
				n.AppendChild(dom.Nd("text", contnt))
			} else {
				if debugBreakOut {
					b0 := dom.PrintSubtree(n)
					log.Printf("\n%s\n", b0)
				}
				// log.Printf(" got it %v\n", img.Data)
				// NOTE(review): InsertBefore below requires img to be
				// detached, so fc1 presumably detaches it from n — confirm
				// against closureDeleter.
				a1 := dom.CloneNodeWithSubtree(n)
				fc1 := closureDeleter(true)
				fc1(n, 0, false)
				if debugBreakOut {
					b1 := dom.PrintSubtree(n)
					log.Printf("\n%s\n", b1)
				}
				fc2 := closureDeleter(false)
				fc2(a1, 0, false)
				if debugBreakOut {
					b2 := dom.PrintSubtree(a1)
					log.Printf("\n%s\n", b2)
					log.Printf("--------------------\n")
				}
				if true {
					// Resulting order: n, img, a1, former next sibling of n.
					n.Parent.InsertBefore(img, n.NextSibling) // "insert after; if n.NextSibling==nil => insert at the end"
					n.Parent.InsertBefore(a1, img.NextSibling)
				} else {
					// Old way; caused sequence corruption if n had siblings to its right.
					n.Parent.AppendChild(img)
					n.Parent.AppendChild(a1)
				}
			}
			// changing image to link later
		} else {
			// log.Printf("no img in a\n")
		}
	}
}