func (this *contentExtractor) getSiblingsContent(currentSibling *goquery.Selection, baselinescoreSiblingsPara float64) []*goquery.Selection { ps := make([]*goquery.Selection, 0) if currentSibling.Get(0).DataAtom.String() == "p" && len(currentSibling.Text()) > 0 { ps = append(ps, currentSibling) return ps } else { potentialParagraphs := currentSibling.Find("p") potentialParagraphs.Each(func(i int, s *goquery.Selection) { text := s.Text() if len(text) > 0 { ws := this.config.stopWords.stopWordsCount(this.config.targetLanguage, text) paragraphScore := ws.stopWordCount siblingBaselineScore := 0.30 highLinkDensity := this.isHighLinkDensity(s) score := siblingBaselineScore * baselinescoreSiblingsPara if score < float64(paragraphScore) && !highLinkDensity { node := new(html.Node) node.Type = html.TextNode node.Data = text node.DataAtom = atom.P nodes := make([]*html.Node, 1) nodes[0] = node newSelection := new(goquery.Selection) newSelection.Nodes = nodes ps = append(ps, newSelection) } } }) } return ps }
//checks the density of links within a node, is there not much text and most of it contains bad links? //if so it's no good func (this *contentExtractor) isHighLinkDensity(node *goquery.Selection) bool { links := node.Find("a") if links == nil || links.Size() == 0 { return false } text := node.Text() words := strings.Split(text, " ") nwords := len(words) sb := make([]string, 0) links.Each(func(i int, s *goquery.Selection) { linkText := s.Text() sb = append(sb, linkText) }) linkText := strings.Join(sb, "") linkWords := strings.Split(linkText, " ") nlinkWords := len(linkWords) nlinks := links.Size() linkDivisor := float64(nlinkWords) / float64(nwords) score := linkDivisor * float64(nlinks) if this.config.debug { logText := "" if len(node.Text()) >= 51 { logText = node.Text()[0:50] } else { logText = node.Text() } log.Printf("Calculated link density score as %1.5f for node %s\n", score, logText) } if score > 1.0 { return true } return false }
func (p Parser) getElementsByTags(div *goquery.Selection, tags []string) *goquery.Selection { selection := new(goquery.Selection) for _, tag := range tags { selections := div.Find(tag) if selections != nil { selection = selection.Union(selections) } } return selection }
func (this *contentExtractor) isTableAndNoParaExist(selection *goquery.Selection) bool { subParagraph := selection.Find("p") subParagraph.Each(func(i int, s *goquery.Selection) { txt := s.Text() if len(txt) < 25 { node := s.Get(0) parent := node.Parent parent.RemoveChild(node) } }) subParagraph2 := selection.Find("p") if subParagraph2.Length() == 0 && selection.Get(0).DataAtom.String() != "td" { return true } return false }
//we could have long articles that have tons of paragraphs so if we tried to calculate the base score against //the total text score of those paragraphs it would be unfair. So we need to normalize the score based on the average scoring //of the paragraphs within the top node. For example if our total score of 10 paragraphs was 1000 but each had an average value of //100 then 100 should be our base. func (this *contentExtractor) getSiblingsScore(topNode *goquery.Selection) int { base := 100000 paragraphNumber := 0 paragraphScore := 0 nodesToCheck := topNode.Find("p") nodesToCheck.Each(func(i int, s *goquery.Selection) { textNode := s.Text() ws := this.config.stopWords.stopWordsCount(this.config.targetLanguage, textNode) highLinkDensity := this.isHighLinkDensity(s) if ws.stopWordCount > 2 && !highLinkDensity { paragraphNumber++ paragraphScore += ws.stopWordCount } }) if paragraphNumber > 0 { base = paragraphScore / paragraphNumber } return base }
func (ve *VideoExtractor) getObjectTag(node *goquery.Selection) video { childEmbedTag := node.Find("embed") if ve.candidates.Has(childEmbedTag) { ve.candidates.Remove(childEmbedTag) } srcNode := node.Find(`param[name="movie"]`) if srcNode == nil || srcNode.Length() == 0 { return video{} } src, _ := srcNode.Attr("value") provider := ve.getProvider(src) if provider == "" { return video{} } video := ve.getVideo(node) video.provider = provider video.src = src return video }