// cleansDom performs brute reduction and simplification // func cleanseDom(n *html.Node, lvl int) { n.Attr = removeAttr(n.Attr, unwantedAttrs) // Children for c := n.FirstChild; c != nil; c = c.NextSibling { cleanseDom(c, lvl+1) } if directlyRemoveUnwanted { removeUnwanted(n) } else { convertUnwanted(n) } // --- convertExotic(n) // one time text normalization if n.Type == html.TextNode { n.Data = stringspb.NormalizeInnerWhitespace(n.Data) } }
func (m *minificationHTML) toDiv(node *html.Node) (*html.Node, error) { node.DataAtom = atom.Div node.Data = "div" node.Attr = nil return m.parseChildren(node) }
// Attache attaches these attributes to an html.Node. func (g HTML) Attach(node *html.Node) { attrs := []html.Attribute{} if g.ContentEditable > 0 { if g.ContentEditable == OTrue { attrs = attr(attrs, "contenteditable", "true") } else { attrs = attr(attrs, "contenteditable", "false") } } if g.Hidden > 0 { if g.Hidden == OTrue { attrs = attr(attrs, "hidden", "true") } else { attrs = attr(attrs, "hidden", "false") } } if len(g.Data) > 0 { for k, v := range g.Data { attrs = attr(attrs, k, v) } } if len(g.Class) > 0 { v := strings.Join(g.Class, " ") attrs = attr(attrs, "class", v) } s := []string{"AccessKey", "Id", "Dir", "Lang", "Style", "TabIndex", "Title", "Translate"} attrs = append(attrs, structToAttrs(g, s...)...) node.Attr = append(node.Attr, attrs...) }
// convert nodes to /x/net/html.Node siblings. // Document node children are integrated as siblings. // Nils are skipped. func (s Siblings) convert(parent *html.Node) (first, last *html.Node) { var prev *html.Node for _, n := range s { if n == nil { continue } if n.Type == html.DocumentNode { start, end := n.Children.convert(parent) if prev != nil { prev.NextSibling = start } else { first = start } prev = end continue } h := n.convert() h.Parent = parent h.PrevSibling = prev if prev != nil { prev.NextSibling = h } else { first = h } prev = h } return first, prev }
// finds article's title and body in ria.ru html style // works cleary on 15.12.2015 func FindTitleAndBody_Ria(node *html.Node) (*html.Node, *html.Node) { var title, fulltext *html.Node if node.Type == html.ElementNode { for _, tag := range node.Attr { if tag.Key == "itemprop" { if tag.Val == "articleBody" { node.Data = "body" fulltext = node break } if tag.Val == "name" { node.Data = "title" title = node break } } } } for c := node.FirstChild; c != nil; c = c.NextSibling { ptitle, pfulltext := FindTitleAndBody_Ria(c) if ptitle != nil { title = ptitle } if pfulltext != nil { fulltext = pfulltext } if title != nil && fulltext != nil { break } } return title, fulltext }
func FindTitleAndBody_MK(node *html.Node) (*html.Node, *html.Node) { var title, fulltext *html.Node if node.Type == html.ElementNode { for _, tag := range node.Attr { if tag.Key == "class" { if tag.Val == "content" { title = FindTitleMK(node) node.Data = "body" fulltext = node break } } } } for c := node.FirstChild; c != nil; c = c.NextSibling { ptitle, pfulltext := FindTitleAndBody_MK(c) if ptitle != nil { title = ptitle title.Data = "title" } if pfulltext != nil { fulltext = pfulltext } if title != nil && fulltext != nil { break } } return title, fulltext }
func (this *contentExtractor) getSiblingsContent(currentSibling *goquery.Selection, baselinescoreSiblingsPara float64) []*goquery.Selection { ps := make([]*goquery.Selection, 0) if currentSibling.Get(0).DataAtom.String() == "p" && len(currentSibling.Text()) > 0 { ps = append(ps, currentSibling) return ps } else { potentialParagraphs := currentSibling.Find("p") potentialParagraphs.Each(func(i int, s *goquery.Selection) { text := s.Text() if len(text) > 0 { ws := this.config.stopWords.stopWordsCount(this.config.targetLanguage, text) paragraphScore := ws.stopWordCount siblingBaselineScore := 0.30 highLinkDensity := this.isHighLinkDensity(s) score := siblingBaselineScore * baselinescoreSiblingsPara if score < float64(paragraphScore) && !highLinkDensity { node := new(html.Node) node.Type = html.TextNode node.Data = text node.DataAtom = atom.P nodes := make([]*html.Node, 1) nodes[0] = node newSelection := new(goquery.Selection) newSelection.Nodes = nodes ps = append(ps, newSelection) } } }) } return ps }
func wrapText(nodes []*html.Node) []*html.Node { wrapped := make([]*html.Node, 0, len(nodes)) var wrapper *html.Node appendWrapper := func() { if wrapper != nil { // render and re-parse so p-inline-p expands wrapped = append(wrapped, ParseDepth(Render(wrapper), 0)...) wrapper = nil } } for _, n := range nodes { if n.Type == html.ElementNode && isBlockElement[n.DataAtom] { appendWrapper() wrapped = append(wrapped, n) continue } if wrapper == nil && n.Type == html.TextNode && strings.TrimSpace(n.Data) == "" { wrapped = append(wrapped, n) continue } if wrapper == nil { wrapper = &html.Node{ Type: html.ElementNode, Data: "p", DataAtom: atom.P, } } wrapper.AppendChild(n) } appendWrapper() return wrapped }
func CompactNode(n *html.Node) { var appendNodes []*html.Node for c := n.FirstChild; c != nil; { CompactNode(c) if _mergeTextElements[c.Data] { appendNodes = append(appendNodes, GetChildNodes(c)...) log.Info("delete", c.Data) c = RemoveNode(c) } else if c.Type == html.ElementNode && c.FirstChild == nil && !_voidElements[c.Data] { log.Info("delete", c.Data) c = RemoveNode(c) } else { c = c.NextSibling } } DetachNodes(appendNodes) AppendChildNodes(n, appendNodes) if n.FirstChild != nil && n.FirstChild.NextSibling == nil { if n.FirstChild.Data == n.Data || (n.FirstChild.Data == "br" && (n.Data == "p" || n.Data == "div")) { childNodes := GetChildNodes(n.FirstChild) log.Info("delete", n.FirstChild.Data) n.RemoveChild(n.FirstChild) DetachNodes(childNodes) AppendChildNodes(n, childNodes) } else if n.FirstChild.Data == "img" && n.Data == "a" { *n = *n.FirstChild } } }
func img2Link(img *html.Node) { if img.Data == "img" { img.Data = "a" for i := 0; i < len(img.Attr); i++ { if img.Attr[i].Key == "src" { img.Attr[i].Key = "href" } } double := closureTextNodeExists(img) imgContent := "" title := attrX(img.Attr, "title") if double { imgContent = fmt.Sprintf("[img] %v %v | ", "[ctdr]", // content title double removed urlBeautify(attrX(img.Attr, "href"))) } else { imgContent = fmt.Sprintf("[img] %v %v | ", title, urlBeautify(attrX(img.Attr, "href"))) } img.Attr = attrSet(img.Attr, "cfrom", "img") nd := dom.Nd("text", imgContent) img.AppendChild(nd) } }
func copyNode(to, from *html.Node) { to.Attr = from.Attr to.Data = from.Data to.DataAtom = from.DataAtom to.Namespace = from.Namespace to.Type = from.Type }
func removeNegativeAttributeMatches(n *html.Node) *html.Node { for c := n.FirstChild; c != nil; c = c.NextSibling { if c.Type != html.TextNode && containerregrex.MatchString(c.Data) { for _, attr := range c.Attr { key := strings.ToLower(attr.Key) if key == "id" || key == "class" { val := strings.ToLower(attr.Val) values := nonwordregex.Split(val, -1) penalty := 0 for _, value := range values { if negativeregex.MatchString(value) { penalty = penalty + 4 } } if penalty > 0 { if c.PrevSibling != nil { c.PrevSibling.NextSibling = c.NextSibling } else { n.FirstChild = c.NextSibling } } else { d := removeNegativeAttributeMatches(c) if c.PrevSibling != nil { c.PrevSibling.NextSibling = d } else { n.FirstChild = c.NextSibling } } } } } } return n }
// reIndent walks the subtree rooted at n and inserts whitespace text nodes so
// that the tree is indented when serialized. lvl is the depth of n; the
// indentation of an element is lvl-2 tab characters.
//
// Nodes deeper than cScaffoldLvls that have no parent cannot be handled; they
// are logged and skipped together with their subtree.
func reIndent(n *html.Node, lvl int) {
	if lvl > cScaffoldLvls && n.Parent == nil {
		bb := dom.PrintSubtree(n)
		_ = bb
		// log.Printf("%s", bb.Bytes())
		hint := ""
		if ml3[n] > 0 {
			hint = " from ml3"
		}
		log.Print("reIndent: no parent ", hint)
		return
	}

	// Before children processing
	switch n.Type {
	case html.ElementNode:
		// n.Parent is non-nil here: the guard above returned otherwise.
		if lvl > cScaffoldLvls && n.Parent.Type == html.ElementNode {
			// Newline plus indentation in front of the element.
			ind := strings.Repeat("\t", lvl-2)
			dom.InsertBefore(n, &html.Node{Type: html.TextNode, Data: "\n" + ind})
		}
	case html.CommentNode:
		// Comments get a newline in front, without indentation.
		dom.InsertBefore(n, &html.Node{Type: html.TextNode, Data: "\n"})
	case html.TextNode:
		// Normalize to exactly one trailing space, and one leading space
		// unless the text starts with a comma or a full stop.
		n.Data = strings.TrimSpace(n.Data) + " "
		if !strings.HasPrefix(n.Data, ",") && !strings.HasPrefix(n.Data, ".") {
			n.Data = " " + n.Data
		}
		// link texts without trailing space
		// (TrimSpace strips the leading space added above as well)
		if n.Parent != nil && n.Parent.Data == "a" {
			n.Data = strings.TrimSpace(n.Data)
		}
	}

	// Children
	// NOTE(review): the recursive call inserts text nodes next to c while this
	// loop is running; this presumably relies on dom.InsertBefore placing them
	// in front of c, so they are not visited — confirm against the dom package.
	for c := n.FirstChild; c != nil; c = c.NextSibling {
		reIndent(c, lvl+1)
	}

	// After children processing
	switch n.Type {
	case html.ElementNode:
		// I don't know why,
		// but this needs to happen AFTER the children
		if lvl > cScaffoldLvls && n.Parent.Type == html.ElementNode {
			// Newline plus indentation after the last child, i.e. in front
			// of the closing tag.
			ind := strings.Repeat("\t", lvl-2)
			ind = "\n" + ind
			// link texts without new line
			if n.Data == "a" {
				ind = ""
			}
			if n.LastChild != nil {
				dom.InsertAfter(n.LastChild, &html.Node{Type: html.TextNode, Data: ind})
			}
		}
	}
}
/* div div div p p TO img img p p Operates from the *middle* div. Saves all children in inverted slice. Removes each child and reattaches it one level higher. Finally the intermediary, now childless div is removed. \ / \ /\ / \_____/ \_____/ \ / \_____/\_____/ \__________/ => Breaks are gone \p1___p2___/ => Wrapping preserves breaks */ func topDownV1(n *html.Node, couple []string, parentType string) { if noParent(n) { return } p := n.Parent parDiv := p.Type == html.ElementNode && p.Data == couple[0] // Parent is a div iAmDiv := n.Type == html.ElementNode && n.Data == couple[1] // I am a div noSiblings := n.PrevSibling == nil && n.NextSibling == nil only1Child := n.FirstChild != nil && n.FirstChild == n.LastChild svrlChildn := n.FirstChild != nil && n.FirstChild != n.LastChild noChildren := n.FirstChild == nil _, _ = noSiblings, noChildren if parDiv && iAmDiv { if only1Child || svrlChildn { var children []*html.Node for c := n.FirstChild; c != nil; c = c.NextSibling { children = append([]*html.Node{c}, children...) // order inversion } insertionPoint := n.NextSibling for _, c1 := range children { n.RemoveChild(c1) if c1.Type == html.TextNode || c1.Data == "a" { // pf("wrapping %v\n", NodeTypeStr(c1.Type)) wrap := html.Node{Type: html.ElementNode, Data: "p", Attr: []html.Attribute{html.Attribute{Key: "cfrm", Val: "div"}}} wrap.FirstChild = c1 p.InsertBefore(&wrap, insertionPoint) c1.Parent = &wrap insertionPoint = &wrap } else { p.InsertBefore(c1, insertionPoint) insertionPoint = c1 } } p.RemoveChild(n) if p.Data != parentType { p.Data = parentType } } } }
func runMergeNodes(parent, prev, next *html.Node, addSeparator bool) *html.Node { var u parserUtils if prev != nil { parent.AppendChild(prev) } if next != nil { parent.AppendChild(next) } return u.mergeNodes(parent, prev, next, addSeparator) }
// We want to remove some children. // A direct loop is impossible, // since "NextSibling" is set to nil during Remove(). // Therefore: // First assemble children separately. // Then remove them. func removeUnwanted(n *html.Node) { cc := []*html.Node{} for c := n.FirstChild; c != nil; c = c.NextSibling { cc = append(cc, c) } for _, c := range cc { if unwanteds[c.Data] { n.RemoveChild(c) } } }
// Replace the given node's children with the given string. func setNodeText(node *html.Node, s string) { // remove all existing children for node.FirstChild != nil { node.RemoveChild(node.FirstChild) } // add the text node.AppendChild(&html.Node{ Type: html.TextNode, Data: s, }) }
func (m *minificationText) openTag(node *html.Node) { parent := node.Parent for it := node.FirstChild; it != nil; it = it.NextSibling { it.Parent = parent } parent.FirstChild = node.FirstChild parent.LastChild = node.LastChild node.FirstChild = nil node.LastChild = nil node.Parent = nil }
func removeUnwanted(n *html.Node) { cc := []*html.Node{} for c := n.FirstChild; c != nil; c = c.NextSibling { cc = append(cc, c) } for _, c := range cc { if n.Type == html.ElementNode && n.Data == "script" || n.Type == html.CommentNode { n.RemoveChild(c) } } }
func TestIsIcoNode(t *testing.T) { n := html.Node{Data: "link"} if isIcoNode(&n) { t.Errorf("Exepected node to not be a ico node") } n.Attr = []html.Attribute{ html.Attribute{Key: "rel", Val: "icon"}, } if !isIcoNode(&n) { t.Errorf("Exepected node to be a ico node") } }
func TestIsStylesheetNode(t *testing.T) { n := html.Node{Data: "link"} if isStylesheetNode(&n) { t.Errorf("Exepected node to not be a stylesheet node") } n.Attr = []html.Attribute{ html.Attribute{Key: "rel", Val: "stylesheet"}, } if !isStylesheetNode(&n) { t.Errorf("Exepected node to be a stylesheet node") } }
func replaceNodeWithChildren(n *html.Node) { var next *html.Node parent := n.Parent for c := n.FirstChild; c != nil; c = next { next = c.NextSibling n.RemoveChild(c) parent.InsertBefore(c, n) } parent.RemoveChild(n) }
func TestParseScriptTagNoSrc(t *testing.T) { node := new(html.Node) node.Data = "script" page := newWebPage(startUrl) page.parseScriptTag(node) expected1 := 0 val1 := page.scriptFiles.Len() if val1 != expected1 { t.Error("Expected:", expected1, " Got:", val1) } }
func TestParseATagNoHref(t *testing.T) { node := new(html.Node) node.Data = "a" page := newWebPage(startUrl) page.parseATag(node) expected1 := 0 val1 := page.links.Len() if val1 != expected1 { t.Error("Expected:", expected1, " Got:", val1) } }
func (u *parserUtils) addChildTextNodeToBegining(node *html.Node, text string) { if node.FirstChild != nil && node.FirstChild.Type == html.TextNode { node.FirstChild.Data = text + node.FirstChild.Data } else { newNode := &html.Node{ Type: html.TextNode, Data: text} if node.FirstChild == nil { node.AppendChild(newNode) } else { node.InsertBefore(newNode, node.FirstChild) } } }
func removeEmptyNodes(n *html.Node, lvl int) { // children cc := []*html.Node{} for c := n.FirstChild; c != nil; c = c.NextSibling { cc = append(cc, c) } for _, c := range cc { removeEmptyNodes(c, lvl+1) } // processing // empty element nodes if n.Type == html.ElementNode && n.Data == "img" { src := attrX(n.Attr, "src") if src == "" { n.Parent.RemoveChild(n) } } if n.Type == html.ElementNode && n.FirstChild == nil && n.Data == "a" { href := attrX(n.Attr, "href") if href == "#" || href == "" { n.Parent.RemoveChild(n) } } if n.Type == html.ElementNode && n.FirstChild == nil && (n.Data == "em" || n.Data == "strong") { n.Parent.RemoveChild(n) } if n.Type == html.ElementNode && n.FirstChild == nil && (n.Data == "div" || n.Data == "span" || n.Data == "li" || n.Data == "p") { n.Parent.RemoveChild(n) } // spans with less than 2 characters inside => flatten to text only1Child := n.FirstChild != nil && n.FirstChild == n.LastChild if n.Type == html.ElementNode && n.Data == "span" && only1Child && n.FirstChild.Type == html.TextNode && len(strings.TrimSpace(n.FirstChild.Data)) < 3 { n.Type = html.TextNode n.Data = n.FirstChild.Data n.RemoveChild(n.FirstChild) } }
func TestParseLinkTagNoRel(t *testing.T) { node := new(html.Node) node.Data = "link" attr1 := html.Attribute{"", "href", "1.css"} node.Attr = []html.Attribute{attr1} page := newWebPage(startUrl) page.parseLinkTag(node) expected1 := 0 val1 := page.styleSheets.Len() if val1 != expected1 { t.Error("Expected:", expected1, " Got:", val1) } }
func TestParseATagAbsoluteDiffHost(t *testing.T) { node := new(html.Node) node.Data = "a" attr := html.Attribute{"", "href", "http://www.google.com"} node.Attr = []html.Attribute{attr} page := newWebPage(startUrl) page.parseATag(node) expected := 0 val := page.links.Len() if val != expected { t.Error("Expected:", expected, " Got:", val) } }
func TestParseATagInvalidUrl(t *testing.T) { node := new(html.Node) node.Data = "a" attr := html.Attribute{"", "href", "%gh&%ij"} node.Attr = []html.Attribute{attr} page := newWebPage(startUrl) page.parseATag(node) expected1 := 0 val1 := page.links.Len() if val1 != expected1 { t.Error("Expected:", expected1, " Got:", val1) } }
func RemoveAttributes(n *html.Node, keepAttrs []string) { attrs := make([]html.Attribute, 0, len(n.Attr)) dataAttrs := make(map[string]html.Attribute) originalAttrs := make(map[string]html.Attribute) for _, a := range n.Attr { if indexOfString(keepAttrs, a.Key) >= 0 { attrs = append(attrs, a) originalAttrs[a.Key] = a } else if i := strings.Index(a.Key, "data-"); i == 0 { a.Key = a.Key[5:] if len(a.Key) > 0 && indexOfString(keepAttrs, a.Key) >= 0 { dataAttrs[a.Key] = a } } } for k, v := range dataAttrs { if _, found := originalAttrs[k]; !found { attrs = append(attrs, v) } } n.Attr = attrs for c := n.FirstChild; c != nil; c = c.NextSibling { RemoveAttributes(c, keepAttrs) } }