// cleansDom performs brute reduction and simplification // func cleanseDom(n *html.Node, lvl int) { n.Attr = removeAttr(n.Attr, unwantedAttrs) // Children for c := n.FirstChild; c != nil; c = c.NextSibling { cleanseDom(c, lvl+1) } if directlyRemoveUnwanted { removeUnwanted(n) } else { convertUnwanted(n) } // --- convertExotic(n) // one time text normalization if n.Type == html.TextNode { n.Data = stringspb.NormalizeInnerWhitespace(n.Data) } }
func closureTextNodeExists(img *html.Node) (found bool) { txt := attrX(img.Attr, "title") if len(txt) < 5 { return false } txt = stringspb.NormalizeInnerWhitespace(txt) txt = strings.TrimSpace(txt) // We dont search entire document, but three levels above image subtree grandParent := img for i := 0; i < 4; i++ { if grandParent.Parent != nil { grandParent = grandParent.Parent } else { // log.Printf("LevelsUp %v for %q", i, txt) break } } var recurseTextNodes func(n *html.Node) recurseTextNodes = func(n *html.Node) { if found { return } cc := []*html.Node{} for c := n.FirstChild; c != nil; c = c.NextSibling { cc = append(cc, c) } for _, c := range cc { recurseTextNodes(c) } if n.Type == html.TextNode { n.Data = stringspb.NormalizeInnerWhitespace(n.Data) if len(n.Data) >= len(txt) { // if strings.Contains(txt, "FDP") { // log.Printf("%25v %v", stringspb.Ellipsoider(txt, 10), stringspb.Ellipsoider(n.Data, 10)) // } fnd := strings.Contains(n.Data, txt) if fnd { found = true return } } } } recurseTextNodes(grandParent) return }
func removeAttr(attributes []html.Attribute, removeKeys map[string]bool) []html.Attribute { ret := []html.Attribute{} var alt, title string for _, a := range attributes { a.Key = strings.TrimSpace(a.Key) a.Val = strings.TrimSpace(a.Val) a.Val = stringspb.NormalizeInnerWhitespace(a.Val) // having encountered title or alt values with newlines if removeKeys[a.Key] || strings.HasPrefix(a.Key, "data") { // } else { if a.Key == "alt" { alt = a.Val } if a.Key == "title" { title = a.Val } attrDistinct[a.Key]++ ret = append(ret, a) } } // normalize on title if alt != "" && alt == title { ret1 := []html.Attribute{} for i := 0; i < len(ret); i++ { if ret[i].Key != "alt" { ret1 = append(ret1, ret[i]) } } ret = ret1 } // remove both if alt == "" && alt == title { ret1 := []html.Attribute{} for i := 0; i < len(ret); i++ { if ret[i].Key != "alt" && ret[i].Key != "title" { ret1 = append(ret1, ret[i]) } } ret = ret1 } return ret }
func textifyNodeSubtree(n *html.Node) { if n.Type == html.ElementNode { nd := dom.Nd("text") nd.Data = textifySubtreeBruteForce(n, 0) nd.Data = stringspb.NormalizeInnerWhitespace(nd.Data) cc := []*html.Node{} for c := n.FirstChild; c != nil; c = c.NextSibling { cc = append(cc, c) } for _, c := range cc { n.RemoveChild(c) } n.AppendChild(nd) nd2 := dom.Nd("br") dom.InsertAfter(n, nd2) } }