// cleansDom performs brute reduction and simplification // func cleanseDom(n *html.Node, lvl int) { n.Attr = removeAttr(n.Attr, unwantedAttrs) // Children for c := n.FirstChild; c != nil; c = c.NextSibling { cleanseDom(c, lvl+1) } if directlyRemoveUnwanted { removeUnwanted(n) } else { convertUnwanted(n) } // --- convertExotic(n) // one time text normalization if n.Type == html.TextNode { n.Data = stringspb.NormalizeInnerWhitespace(n.Data) } }
func (m *minificationHTML) toDiv(node *html.Node) (*html.Node, error) { node.DataAtom = atom.Div node.Data = "div" node.Attr = nil return m.parseChildren(node) }
func FindTitleAndBody_MK(node *html.Node) (*html.Node, *html.Node) { var title, fulltext *html.Node if node.Type == html.ElementNode { for _, tag := range node.Attr { if tag.Key == "class" { if tag.Val == "content" { title = FindTitleMK(node) node.Data = "body" fulltext = node break } } } } for c := node.FirstChild; c != nil; c = c.NextSibling { ptitle, pfulltext := FindTitleAndBody_MK(c) if ptitle != nil { title = ptitle title.Data = "title" } if pfulltext != nil { fulltext = pfulltext } if title != nil && fulltext != nil { break } } return title, fulltext }
func (u *parserUtils) mergeNodes(parent, prev, next *html.Node, addSeparator bool) *html.Node { prevText := prev != nil && prev.Type == html.TextNode nextText := next != nil && next.Type == html.TextNode delim := "" if addSeparator { delim = " " } if prevText && nextText { prev.Data = prev.Data + delim + next.Data parent.RemoveChild(next) return prev.NextSibling } if prevText { prev.Data = prev.Data + delim } else if nextText { next.Data = delim + next.Data } else if addSeparator { newNode := &html.Node{ Type: html.TextNode, Data: delim} parent.InsertBefore(newNode, next) } return next }
// finds article's title and body in ria.ru html style // works cleary on 15.12.2015 func FindTitleAndBody_Ria(node *html.Node) (*html.Node, *html.Node) { var title, fulltext *html.Node if node.Type == html.ElementNode { for _, tag := range node.Attr { if tag.Key == "itemprop" { if tag.Val == "articleBody" { node.Data = "body" fulltext = node break } if tag.Val == "name" { node.Data = "title" title = node break } } } } for c := node.FirstChild; c != nil; c = c.NextSibling { ptitle, pfulltext := FindTitleAndBody_Ria(c) if ptitle != nil { title = ptitle } if pfulltext != nil { fulltext = pfulltext } if title != nil && fulltext != nil { break } } return title, fulltext }
// reIndent pretty-prints a DOM subtree in place by inserting newline/tab
// text nodes around element nodes deeper than cScaffoldLvls, and by
// normalizing the surrounding whitespace of text nodes. Nodes inside
// <a> elements get no extra whitespace so link text stays tight.
func reIndent(n *html.Node, lvl int) {
	// Orphan guard: a deep node without a parent cannot be indented
	// relative to anything; log and bail out.
	if lvl > cScaffoldLvls && n.Parent == nil {
		bb := dom.PrintSubtree(n)
		_ = bb // log.Printf("%s", bb.Bytes())
		hint := ""
		if ml3[n] > 0 {
			hint = " from ml3"
		}
		log.Print("reIndent: no parent ", hint)
		return
	}

	// Before children processing
	switch n.Type {
	case html.ElementNode:
		// Indent with lvl-2 tabs (scaffold levels don't count).
		if lvl > cScaffoldLvls && n.Parent.Type == html.ElementNode {
			ind := strings.Repeat("\t", lvl-2)
			dom.InsertBefore(n, &html.Node{Type: html.TextNode, Data: "\n" + ind})
		}
	case html.CommentNode:
		dom.InsertBefore(n, &html.Node{Type: html.TextNode, Data: "\n"})
	case html.TextNode:
		// Normalize to single leading/trailing spaces, except no
		// leading space before punctuation.
		n.Data = strings.TrimSpace(n.Data) + " "
		if !strings.HasPrefix(n.Data, ",") && !strings.HasPrefix(n.Data, ".") {
			n.Data = " " + n.Data
		}
		// link texts without trailing space
		if n.Parent != nil && n.Parent.Data == "a" {
			n.Data = strings.TrimSpace(n.Data)
		}
	}

	// Children
	for c := n.FirstChild; c != nil; c = c.NextSibling {
		reIndent(c, lvl+1)
	}

	// After children processing
	switch n.Type {
	case html.ElementNode:
		// I don't know why,
		// but this needs to happen AFTER the children
		if lvl > cScaffoldLvls && n.Parent.Type == html.ElementNode {
			ind := strings.Repeat("\t", lvl-2)
			ind = "\n" + ind
			// link texts without new line
			if n.Data == "a" {
				ind = ""
			}
			if n.LastChild != nil {
				dom.InsertAfter(n.LastChild, &html.Node{Type: html.TextNode, Data: ind})
			}
		}
	}
}
func (this *contentExtractor) getSiblingsContent(currentSibling *goquery.Selection, baselinescoreSiblingsPara float64) []*goquery.Selection { ps := make([]*goquery.Selection, 0) if currentSibling.Get(0).DataAtom.String() == "p" && len(currentSibling.Text()) > 0 { ps = append(ps, currentSibling) return ps } else { potentialParagraphs := currentSibling.Find("p") potentialParagraphs.Each(func(i int, s *goquery.Selection) { text := s.Text() if len(text) > 0 { ws := this.config.stopWords.stopWordsCount(this.config.targetLanguage, text) paragraphScore := ws.stopWordCount siblingBaselineScore := 0.30 highLinkDensity := this.isHighLinkDensity(s) score := siblingBaselineScore * baselinescoreSiblingsPara if score < float64(paragraphScore) && !highLinkDensity { node := new(html.Node) node.Type = html.TextNode node.Data = text node.DataAtom = atom.P nodes := make([]*html.Node, 1) nodes[0] = node newSelection := new(goquery.Selection) newSelection.Nodes = nodes ps = append(ps, newSelection) } } }) } return ps }
func img2Link(img *html.Node) { if img.Data == "img" { img.Data = "a" for i := 0; i < len(img.Attr); i++ { if img.Attr[i].Key == "src" { img.Attr[i].Key = "href" } } double := closureTextNodeExists(img) imgContent := "" title := attrX(img.Attr, "title") if double { imgContent = fmt.Sprintf("[img] %v %v | ", "[ctdr]", // content title double removed urlBeautify(attrX(img.Attr, "href"))) } else { imgContent = fmt.Sprintf("[img] %v %v | ", title, urlBeautify(attrX(img.Attr, "href"))) } img.Attr = attrSet(img.Attr, "cfrom", "img") nd := dom.Nd("text", imgContent) img.AppendChild(nd) } }
func copyNode(to, from *html.Node) { to.Attr = from.Attr to.Data = from.Data to.DataAtom = from.DataAtom to.Namespace = from.Namespace to.Type = from.Type }
func (m *minificationText) parseText(node *html.Node) (*html.Node, error) { next := node.NextSibling text := m.processText(node.Data) if len(text) != 0 { node.Data = text } else { node.Parent.RemoveChild(node) } return next, nil }
func Nd(ntype string, content ...string) *html.Node { nd0 := new(html.Node) if ntype == "text" { nd0.Type = html.TextNode if len(content) > 0 { nd0.Data = content[0] } } else { nd0.Type = html.ElementNode nd0.Data = ntype if len(content) > 0 { runtimepb.StackTrace(4) log.Printf("Element nodes can't have content") } } return nd0 }
// walkPrint renders node n and all its following siblings (and,
// recursively, their children) as an indented HTML debug dump to w,
// highlighting "most child" nodes and chosen nodes with colored divs.
// NOTE(review): it truncates n.Data to 40 runes in place, so the tree
// is mutated as a side effect of printing.
func walkPrint(w io.Writer, i int, n *html.Node) {
	for ; n != nil; n = n.NextSibling {
		// Skip whitespace-only text nodes.
		if n.Type == html.TextNode && strings.TrimSpace(n.Data) == "" {
			continue
		}
		d := getData(n)
		isMostChild := getData(n.Parent).Child == n
		if isMostChild {
			w.Write([]byte(`<div style="background: rgba(0, 0, 100, 0.1)">`))
		}
		if d.Chosen || d.ChosenBy {
			// ChosenBy gets a distinct (brownish) highlight color.
			color := "rgb(40, 79, 40)"
			if d.ChosenBy {
				color = "rgba(90, 60, 30, 0.8)"
			}
			w.Write([]byte(`<div id="chosen" style="background: ` + color + `;color: #fff">`))
		}
		// Percentage of this node's count contributed by its largest child.
		factor := 0
		if d.Count > 0 {
			factor = d.MaxChild * 100 / d.Count
		}
		// Truncate long data for display (mutates the node).
		if len([]rune(n.Data)) > 40 {
			n.Data = string([]rune(n.Data)[:40])
		}
		if n.Type == html.ElementNode {
			fmt.Fprintf(w, "%v<%v>", strings.Repeat("\t", i), n.Data)
			fmt.Fprintf(w, " (%v/%v = <b>%v%%</b>) - %v\n",
				d.MaxChild, d.Count, factor, n.Attr,
			)
		} else {
			fmt.Fprintf(w, "%v%v\n", strings.Repeat("\t", i), strconv.Quote(ghtml.EscapeString(n.Data)))
		}
		if n.FirstChild != nil {
			walkPrint(w, i+1, n.FirstChild)
		}
		// Close the highlight divs in reverse order of opening.
		if isMostChild {
			w.Write([]byte(`</div>`))
		}
		if d.Chosen || d.ChosenBy {
			w.Write([]byte("</div>"))
		}
	}
}
func TestParseATagNoHref(t *testing.T) { node := new(html.Node) node.Data = "a" page := newWebPage(startUrl) page.parseATag(node) expected1 := 0 val1 := page.links.Len() if val1 != expected1 { t.Error("Expected:", expected1, " Got:", val1) } }
// CloneNode makes a copy of a Node with all descendants. func CloneNode(n *exphtml.Node) *exphtml.Node { clone := new(exphtml.Node) clone.Type = n.Type clone.DataAtom = n.DataAtom clone.Data = n.Data clone.Attr = make([]exphtml.Attribute, len(n.Attr)) copy(clone.Attr, n.Attr) for c := n.FirstChild; c != nil; c = c.NextSibling { nc := CloneNode(c) clone.AppendChild(nc) } return clone }
func TestParseScriptTagNoSrc(t *testing.T) { node := new(html.Node) node.Data = "script" page := newWebPage(startUrl) page.parseScriptTag(node) expected1 := 0 val1 := page.scriptFiles.Len() if val1 != expected1 { t.Error("Expected:", expected1, " Got:", val1) } }
// removeEmptyNodes prunes a DOM subtree bottom-up: images without src,
// empty links (no children, no/placeholder href), and childless
// em/strong/div/span/li/p elements are removed; spans wrapping a single
// short text node are flattened into plain text.
// NOTE(review): removal calls assume n.Parent is non-nil for matching
// nodes — confirm the root passed in can never itself match.
func removeEmptyNodes(n *html.Node, lvl int) {
	// children
	// Snapshot the child list first: removing nodes while walking
	// NextSibling links directly would skip siblings.
	cc := []*html.Node{}
	for c := n.FirstChild; c != nil; c = c.NextSibling {
		cc = append(cc, c)
	}
	for _, c := range cc {
		removeEmptyNodes(c, lvl+1)
	}

	// processing
	// empty element nodes
	if n.Type == html.ElementNode && n.Data == "img" {
		src := attrX(n.Attr, "src")
		if src == "" {
			n.Parent.RemoveChild(n)
		}
	}
	if n.Type == html.ElementNode && n.FirstChild == nil && n.Data == "a" {
		href := attrX(n.Attr, "href")
		if href == "#" || href == "" {
			n.Parent.RemoveChild(n)
		}
	}
	if n.Type == html.ElementNode && n.FirstChild == nil &&
		(n.Data == "em" || n.Data == "strong") {
		n.Parent.RemoveChild(n)
	}
	if n.Type == html.ElementNode && n.FirstChild == nil &&
		(n.Data == "div" || n.Data == "span" || n.Data == "li" || n.Data == "p") {
		n.Parent.RemoveChild(n)
	}

	// spans with less than 2 characters inside => flatten to text
	only1Child := n.FirstChild != nil && n.FirstChild == n.LastChild
	if n.Type == html.ElementNode && n.Data == "span" && only1Child &&
		n.FirstChild.Type == html.TextNode && len(strings.TrimSpace(n.FirstChild.Data)) < 3 {
		// Order matters: copy the child's text before detaching it.
		n.Type = html.TextNode
		n.Data = n.FirstChild.Data
		n.RemoveChild(n.FirstChild)
	}
}
func TestParseATagAbsoluteDiffHost(t *testing.T) { node := new(html.Node) node.Data = "a" attr := html.Attribute{"", "href", "http://www.google.com"} node.Attr = []html.Attribute{attr} page := newWebPage(startUrl) page.parseATag(node) expected := 0 val := page.links.Len() if val != expected { t.Error("Expected:", expected, " Got:", val) } }
func TestParseLinkTagNoRel(t *testing.T) { node := new(html.Node) node.Data = "link" attr1 := html.Attribute{"", "href", "1.css"} node.Attr = []html.Attribute{attr1} page := newWebPage(startUrl) page.parseLinkTag(node) expected1 := 0 val1 := page.styleSheets.Len() if val1 != expected1 { t.Error("Expected:", expected1, " Got:", val1) } }
func TestParseATagInvalidUrl(t *testing.T) { node := new(html.Node) node.Data = "a" attr := html.Attribute{"", "href", "%gh&%ij"} node.Attr = []html.Attribute{attr} page := newWebPage(startUrl) page.parseATag(node) expected1 := 0 val1 := page.links.Len() if val1 != expected1 { t.Error("Expected:", expected1, " Got:", val1) } }
func TestParseLinkTagInvalidUrl(t *testing.T) { node := new(html.Node) node.Data = "link" attr1 := html.Attribute{"", "href", "%gh&%ij"} attr2 := html.Attribute{"", "rel", "stylesheet"} node.Attr = []html.Attribute{attr1, attr2} page := newWebPage(startUrl) page.parseLinkTag(node) expected1 := 0 val1 := page.styleSheets.Len() if val1 != expected1 { t.Error("Expected:", expected1, " Got:", val1) } }
func dedupApply(n *html.Node, dedups map[string]bool) { // Children for c := n.FirstChild; c != nil; c = c.NextSibling { dedupApply(c, dedups) } if n.Type == html.ElementNode { outline := attrX(n.Attr, "ol") + "." if dedups[outline] { n.Type = html.CommentNode n.Data = n.Data + " replaced" } } }
// clean normalises styles/colspan and removes any CleanTags specified, along with newlines;
// but also makes all the character handling (for example " " as utf-8) the same.
// It returns the estimated number of treeRunes that will be used.
// TODO more cleaning of the input HTML, as required.
func (c *Config) clean(n *html.Node) int {
	size := 1
	switch n.Type {
	case html.ElementNode:
		// Index loop (not range): deletions via delAttr shift the
		// slice, so ai is decremented after each removal.
		for ai := 0; ai < len(n.Attr); ai++ {
			a := n.Attr[ai]
			switch {
			case strings.ToLower(a.Key) == "style":
				if strings.TrimSpace(a.Val) == "" {
					// delete empty styles
					n.Attr = delAttr(n.Attr, ai)
					ai--
				} else {
					// tidy non-empty styles
					// TODO there could be more here to make sure the style entries are in the same order etc.
					n.Attr[ai].Val = strings.Replace(a.Val, " ", "", -1)
					if !strings.HasSuffix(n.Attr[ai].Val, ";") {
						n.Attr[ai].Val += ";"
					}
				}
			case n.DataAtom == atom.Td && strings.ToLower(a.Key) == "colspan" && strings.TrimSpace(a.Val) == "1":
				// colspan="1" is the default; drop it.
				n.Attr = delAttr(n.Attr, ai)
				ai--
			}
		}
	case html.TextNode:
		n.Data = htm.UnescapeString(n.Data)
		size += utf8.RuneCountInString(n.Data) - 1 // len(n.Data) would be faster, but use more memory
	}
	// Restart the sibling scan from the first child whenever a child is
	// removed, since removal invalidates ch's sibling links.
searchChildren:
	for ch := n.FirstChild; ch != nil; ch = ch.NextSibling {
		switch ch.Type {
		case html.ElementNode:
			for _, rr := range c.CleanTags {
				if rr == ch.Data {
					n.RemoveChild(ch)
					goto searchChildren
				}
			}
		}
		size += c.clean(ch)
	}
	return size
}
func TestParseATagRelative(t *testing.T) { node := new(html.Node) node.Data = "a" attr := html.Attribute{"", "href", "1.html"} node.Attr = []html.Attribute{attr} page := newWebPage(startUrl) page.parseATag(node) expected1 := 1 val1 := page.links.Len() if val1 != expected1 { t.Error("Expected:", expected1, " Got:", val1) } expected2 := startUrl + "1.html" val2 := page.links.Front().Value if val2 != expected2 { t.Error("Expected:", expected2, " Got:", val2) } }
func TestParseScriptTagAbsolute(t *testing.T) { node := new(html.Node) node.Data = "script" attr := html.Attribute{"", "src", startUrl + "1.js"} node.Attr = []html.Attribute{attr} page := newWebPage(startUrl) page.parseScriptTag(node) expected1 := 1 val1 := page.scriptFiles.Len() if val1 != expected1 { t.Error("Expected:", expected1, " Got:", val1) } expected2 := startUrl + "1.js" val2 := page.scriptFiles.Front().Value if val2 != expected2 { t.Error("Expected:", expected2, " Got:", val2) } }
func forceMaxDepth(n *html.Node, depth int) { if depth == 0 { n.Type = html.TextNode n.FirstChild, n.LastChild = nil, nil n.Attr = nil n.Data = "[omitted]" for n.NextSibling != nil { n.Parent.RemoveChild(n.NextSibling) } return } if n.Type != html.ElementNode { return } for c := n.FirstChild; c != nil; c = c.NextSibling { forceMaxDepth(c, depth-1) } }
// append1 actually appends to the merged HTML node tree.
// It clones the proto leaf (using text for text nodes), optionally
// wraps it in a <span> styled by the diff action ('+' inserted,
// '-' deleted, '~' replaced; '=' unchanged gets no wrapper), rebuilds
// proto's ancestor chain up to (but excluding) protoAncestor around it,
// and attaches the result at appendPoint.
func (ap *appendContext) append1(action rune, text string, proto *html.Node, pos posT) {
	if proto == nil {
		return
	}
	appendPoint, protoAncestor := ap.lastMatchingLeaf(proto, action, pos)
	if appendPoint == nil || protoAncestor == nil {
		return
	}
	// Guard: only append where the attach point matches the ancestor's
	// element type.
	if appendPoint.DataAtom != protoAncestor.DataAtom {
		return
	}
	newLeaf := new(html.Node)
	copyNode(newLeaf, proto)
	if proto.Type == html.TextNode {
		newLeaf.Data = text
	}
	if action != '=' {
		// Wrap the leaf in a span marking the kind of change.
		insertNode := &html.Node{
			Type:     html.ElementNode,
			DataAtom: atom.Span,
			Data:     "span",
		}
		switch action {
		case '+':
			insertNode.Attr = convertAttributes(ap.c.InsertedSpan)
		case '-':
			insertNode.Attr = convertAttributes(ap.c.DeletedSpan)
		case '~':
			insertNode.Attr = convertAttributes(ap.c.ReplacedSpan)
		}
		insertNode.AppendChild(newLeaf)
		newLeaf = insertNode
	}
	// Recreate each ancestor between proto and protoAncestor, nesting
	// the leaf deeper each step (protoAncestor itself is not copied).
	for proto = proto.Parent; proto != nil && proto != protoAncestor; proto = proto.Parent {
		above := new(html.Node)
		copyNode(above, proto)
		above.AppendChild(newLeaf)
		newLeaf = above
	}
	appendPoint.AppendChild(newLeaf)
}
func TestParseLinkTagRelative(t *testing.T) { node := new(html.Node) node.Data = "link" attr1 := html.Attribute{"", "href", "1.css"} attr2 := html.Attribute{"", "rel", "stylesheet"} node.Attr = []html.Attribute{attr1, attr2} page := newWebPage(startUrl) page.parseLinkTag(node) expected1 := 1 val1 := page.styleSheets.Len() if val1 != expected1 { t.Error("Expected:", expected1, " Got:", val1) } expected2 := startUrl + "1.css" val2 := page.styleSheets.Front().Value if val2 != expected2 { t.Error("Expected:", expected2, " Got:", val2) } }
// convertExotic standardizes <section> or <header> nodes // towards <div> nodes. func convertExotic(n *html.Node) { if repl, ok := exotics[n.Data]; ok { n.Attr = append(n.Attr, html.Attribute{"", "cfrm", n.Data}) n.Data = repl } }
// convertDivsToParagraphs rewrites "bad" div-style containers into
// paragraphs. Divs whose inner HTML matches divToPElementsPattern are
// replaced outright via replaceWithPara; otherwise the div's loose text
// children (plus any immediately preceding <a> sibling HTML) are
// gathered into a new <p> node prepended to the div, and the consumed
// text nodes are removed. Returns the (mutated) document.
func (c *Cleaner) convertDivsToParagraphs(doc *goquery.Document, domType string) *goquery.Document {
	if c.config.debug {
		log.Println("Starting to replace bad divs...")
	}
	badDivs := 0
	convertedTextNodes := 0
	divs := doc.Find(domType)

	divs.Each(func(i int, div *goquery.Selection) {
		divHTML, _ := div.Html()
		if divToPElementsPattern.Match([]byte(divHTML)) {
			// Whole div converts directly to a paragraph.
			c.replaceWithPara(div)
			badDivs++
		} else {
			var replacementText []string
			nodesToRemove := list.New()
			children := div.Contents()
			if c.config.debug {
				log.Printf("Found %d children of div\n", children.Size())
			}
			children.EachWithBreak(func(i int, kid *goquery.Selection) bool {
				text := kid.Text()
				kidNode := kid.Get(0)
				tag := kidNode.Data
				// For text nodes, Data IS the text; normalize the tag
				// name so both cases are handled as "#text".
				if tag == text {
					tag = "#text"
				}
				if tag == "#text" {
					text = strings.Replace(text, "\n", "", -1)
					text = tabsRegEx.ReplaceAllString(text, "")
					if text == "" {
						return true
					}
					if len(text) > 1 {
						prev := kidNode.PrevSibling
						if c.config.debug {
							log.Printf("PARENT CLASS: %s NODENAME: %s\n", c.config.parser.name("class", div), tag)
							log.Printf("TEXTREPLACE: %s\n", strings.Replace(text, "\n", "", -1))
						}
						// Pull a directly preceding link along with the text.
						if prev != nil && prev.DataAtom == atom.A {
							nodeSelection := kid.HasNodes(prev)
							html, _ := nodeSelection.Html()
							replacementText = append(replacementText, html)
							if c.config.debug {
								log.Printf("SIBLING NODENAME ADDITION: %s TEXT: %s\n", prev.Data, html)
							}
						}
						replacementText = append(replacementText, text)
						nodesToRemove.PushBack(kidNode)
						convertedTextNodes++
					}
				}
				return true
			})

			// Build the replacement <p> carrying the gathered text.
			newNode := new(html.Node)
			newNode.Type = html.ElementNode
			newNode.Data = strings.Join(replacementText, "")
			newNode.DataAtom = atom.P
			div.First().AddNodes(newNode)

			// Detach the text nodes that were folded into the paragraph.
			for s := nodesToRemove.Front(); s != nil; s = s.Next() {
				node := s.Value.(*html.Node)
				if node != nil && node.Parent != nil {
					node.Parent.RemoveChild(node)
				}
			}
		}
	})

	if c.config.debug {
		log.Printf("Found %d total divs with %d bad divs replaced and %d textnodes converted inside divs", divs.Size(), badDivs, convertedTextNodes)
	}
	return doc
}
// findNodeformNodesbyIndexOrPro locates the target node among nodes by
// index or by attribute, preferring the index. (Translated from the
// original Chinese comment.) For OPTION/RADIO inputs it marks the node
// whose value attribute matches m[VALUE] as selected/checked; if none
// matches and visible is set, a new checked/selected node carrying that
// value is appended next to the first candidate. For all other types it
// copies the attributes in m onto nodes[*index] and advances the index.
func findNodeformNodesbyIndexOrPro(nodes []*goquery.Selection, index *int, m map[string]string, Type string, visible bool) {
	switch {
	case Type == OPTION || Type == RADIO:
		// Look for an existing candidate with the wanted value.
		for _, v := range nodes {
			for _, vv := range v.Get(0).Attr {
				if vv.Key == VALUE {
					if vv.Val == m[VALUE] {
						if Type == RADIO {
							v.SetAttr("checked", "checked")
						} else {
							v.SetAttr("selected", "selected")
						}
						return
					}
				}
			}
		}
		// No match: optionally synthesize a node with the value,
		// the checked/selected marker and the input type.
		if visible {
			var node html.Node
			node.Data = nodes[0].Get(0).Data
			node.Type = nodes[0].Get(0).Type
			attr := make([]html.Attribute, 0, 2)
			var tr html.Attribute
			tr.Key = VALUE
			tr.Val = m[VALUE]
			attr = append(attr, tr)
			if Type == RADIO {
				tr.Key = "checked"
				tr.Val = "checked"
			} else {
				tr.Key = "selected"
				tr.Val = "selected"
			}
			attr = append(attr, tr)
			tr.Key = TYPE
			tr.Val = Type
			attr = append(attr, tr)
			node.Attr = attr
			nodes[0].Parent().AppendNodes(&node)
		}
		return
	default:
	}
	// Index-addressed case: bail out if the index is out of range.
	if len(nodes) <= *index {
		return
	}
	for k, v := range m {
		nodes[*index].SetAttr(k, v)
	}
	*index++
}