func (this *contentExtractor) getSiblingsContent(currentSibling *goquery.Selection, baselinescoreSiblingsPara float64) []*goquery.Selection { ps := make([]*goquery.Selection, 0) if currentSibling.Get(0).DataAtom.String() == "p" && len(currentSibling.Text()) > 0 { ps = append(ps, currentSibling) return ps } else { potentialParagraphs := currentSibling.Find("p") potentialParagraphs.Each(func(i int, s *goquery.Selection) { text := s.Text() if len(text) > 0 { ws := this.config.stopWords.stopWordsCount(this.config.targetLanguage, text) paragraphScore := ws.stopWordCount siblingBaselineScore := 0.30 highLinkDensity := this.isHighLinkDensity(s) score := siblingBaselineScore * baselinescoreSiblingsPara if score < float64(paragraphScore) && !highLinkDensity { node := new(html.Node) node.Type = html.TextNode node.Data = text node.DataAtom = atom.P nodes := make([]*html.Node, 1) nodes[0] = node newSelection := new(goquery.Selection) newSelection.Nodes = nodes ps = append(ps, newSelection) } } }) } return ps }
// CloneNode makes a copy of a Node with all descendants. func CloneNode(n *exphtml.Node) *exphtml.Node { clone := new(exphtml.Node) clone.Type = n.Type clone.DataAtom = n.DataAtom clone.Data = n.Data clone.Attr = make([]exphtml.Attribute, len(n.Attr)) copy(clone.Attr, n.Attr) for c := n.FirstChild; c != nil; c = c.NextSibling { nc := CloneNode(c) clone.AppendChild(nc) } return clone }
func (this *cleaner) convertDivsToParagraphs(doc *goquery.Document, domType string) *goquery.Document { if this.config.debug { log.Println("Starting to replace bad divs...") } badDivs := 0 convertedTextNodes := 0 divs := doc.Find(domType) tags := []string{"a", "blockquote", "dl", "div", "img", "ol", "p", "pre", "table", "ul"} divs.Each(func(i int, div *goquery.Selection) { if this.config.parser.getElementsByTags(div, tags).Size() == 0 { this.replaceWithPara(div) badDivs++ } else { replacementText := make([]string, 0) nodesToRemove := list.New() children := div.Contents() if this.config.debug { log.Printf("Found %d children of div\n", children.Size()) } children.EachWithBreak(func(i int, kid *goquery.Selection) bool { text := kid.Text() kidNode := kid.Get(0) tag := kidNode.Data if tag == text { tag = "#text" } if tag == "#text" { text = strings.Replace(text, "\n", "", -1) text = tabsRegEx.ReplaceAllString(text, "") if text == "" { return true } if len(text) > 1 { prev := kidNode.PrevSibling if this.config.debug { log.Printf("PARENT CLASS: %s NODENAME: %s\n", this.config.parser.name("class", div), tag) log.Printf("TEXTREPLACE: %s\n", strings.Replace(text, "\n", "", -1)) } if prev != nil && prev.DataAtom == atom.A { nodeSelection := kid.HasNodes(prev) html, _ := nodeSelection.Html() replacementText = append(replacementText, html) if this.config.debug { log.Printf("SIBLING NODENAME ADDITION: %s TEXT: %s\n", prev.Data, html) } } replacementText = append(replacementText, text) nodesToRemove.PushBack(kidNode) convertedTextNodes++ } } return true }) newNode := new(html.Node) newNode.Type = html.ElementNode newNode.Data = strings.Join(replacementText, "") newNode.DataAtom = atom.P div.First().AddNodes(newNode) for s := nodesToRemove.Front(); s != nil; s = s.Next() { node := s.Value.(*html.Node) if node != nil && node.Parent != nil { node.Parent.RemoveChild(node) } } } }) if this.config.debug { log.Printf("Found %d total divs with %d bad divs replaced and %d textnodes converted inside divs", divs.Size(), badDivs, convertedTextNodes) } return doc }