func (this *contentExtractor) getSiblingsContent(currentSibling *goquery.Selection, baselinescoreSiblingsPara float64) []*goquery.Selection { ps := make([]*goquery.Selection, 0) if currentSibling.Get(0).DataAtom.String() == "p" && len(currentSibling.Text()) > 0 { ps = append(ps, currentSibling) return ps } else { potentialParagraphs := currentSibling.Find("p") potentialParagraphs.Each(func(i int, s *goquery.Selection) { text := s.Text() if len(text) > 0 { ws := this.config.stopWords.stopWordsCount(this.config.targetLanguage, text) paragraphScore := ws.stopWordCount siblingBaselineScore := 0.30 highLinkDensity := this.isHighLinkDensity(s) score := siblingBaselineScore * baselinescoreSiblingsPara if score < float64(paragraphScore) && !highLinkDensity { node := new(html.Node) node.Type = html.TextNode node.Data = text node.DataAtom = atom.P nodes := make([]*html.Node, 1) nodes[0] = node newSelection := new(goquery.Selection) newSelection.Nodes = nodes ps = append(ps, newSelection) } } }) } return ps }
//adds any siblings that may have a decent score to this node func (this *contentExtractor) addSiblings(topNode *goquery.Selection) *goquery.Selection { if this.config.debug { log.Println("Starting to add siblings") } baselinescoreSiblingsPara := this.getSiblingsScore(topNode) results := this.walkSiblings(topNode) for _, currentNode := range results { ps := this.getSiblingsContent(currentNode, float64(baselinescoreSiblingsPara)) for _, p := range ps { nodes := make([]*html.Node, len(topNode.Nodes)+1) nodes[0] = p.Get(0) for i, node := range topNode.Nodes { nodes[i+1] = node } topNode.Nodes = nodes } } return topNode }
// clear empties the given selection in place by replacing its node list with
// a new, zero-length (non-nil) slice.
func (p Parser) clear(selection *goquery.Selection) {
	selection.Nodes = []*html.Node{}
}