Ejemplo n.º 1
0
func (this *contentExtractor) getSiblingsContent(currentSibling *goquery.Selection, baselinescoreSiblingsPara float64) []*goquery.Selection {
	ps := make([]*goquery.Selection, 0)
	if currentSibling.Get(0).DataAtom.String() == "p" && len(currentSibling.Text()) > 0 {
		ps = append(ps, currentSibling)
		return ps
	} else {
		potentialParagraphs := currentSibling.Find("p")
		potentialParagraphs.Each(func(i int, s *goquery.Selection) {
			text := s.Text()
			if len(text) > 0 {
				ws := this.config.stopWords.stopWordsCount(this.config.targetLanguage, text)
				paragraphScore := ws.stopWordCount
				siblingBaselineScore := 0.30
				highLinkDensity := this.isHighLinkDensity(s)
				score := siblingBaselineScore * baselinescoreSiblingsPara
				if score < float64(paragraphScore) && !highLinkDensity {
					node := new(html.Node)
					node.Type = html.TextNode
					node.Data = text
					node.DataAtom = atom.P
					nodes := make([]*html.Node, 1)
					nodes[0] = node
					newSelection := new(goquery.Selection)
					newSelection.Nodes = nodes
					ps = append(ps, newSelection)
				}
			}

		})
	}
	return ps
}
Ejemplo n.º 2
0
//checks the density of links within a node, is there not much text and most of it contains bad links?
//if so it's no good
func (this *contentExtractor) isHighLinkDensity(node *goquery.Selection) bool {
	links := node.Find("a")
	if links == nil || links.Size() == 0 {
		return false
	}
	text := node.Text()
	words := strings.Split(text, " ")
	nwords := len(words)
	sb := make([]string, 0)
	links.Each(func(i int, s *goquery.Selection) {
		linkText := s.Text()
		sb = append(sb, linkText)
	})
	linkText := strings.Join(sb, "")
	linkWords := strings.Split(linkText, " ")
	nlinkWords := len(linkWords)
	nlinks := links.Size()
	linkDivisor := float64(nlinkWords) / float64(nwords)
	score := linkDivisor * float64(nlinks)

	if this.config.debug {
		logText := ""
		if len(node.Text()) >= 51 {
			logText = node.Text()[0:50]
		} else {
			logText = node.Text()
		}
		log.Printf("Calculated link density score as %1.5f for node %s\n", score, logText)
	}
	if score > 1.0 {
		return true
	}
	return false
}
Ejemplo n.º 3
0
func (p Parser) getElementsByTags(div *goquery.Selection, tags []string) *goquery.Selection {
	selection := new(goquery.Selection)
	for _, tag := range tags {
		selections := div.Find(tag)
		if selections != nil {
			selection = selection.Union(selections)
		}
	}
	return selection
}
Ejemplo n.º 4
0
func (this *contentExtractor) isTableAndNoParaExist(selection *goquery.Selection) bool {
	subParagraph := selection.Find("p")
	subParagraph.Each(func(i int, s *goquery.Selection) {
		txt := s.Text()
		if len(txt) < 25 {
			node := s.Get(0)
			parent := node.Parent
			parent.RemoveChild(node)
		}
	})

	subParagraph2 := selection.Find("p")
	if subParagraph2.Length() == 0 && selection.Get(0).DataAtom.String() != "td" {
		return true
	}
	return false
}
Ejemplo n.º 5
0
//we could have long articles that have tons of paragraphs so if we tried to calculate the base score against
//the total text score of those paragraphs it would be unfair. So we need to normalize the score based on the average scoring
//of the paragraphs within the top node. For example if our total score of 10 paragraphs was 1000 but each had an average value of
//100 then 100 should be our base.
func (this *contentExtractor) getSiblingsScore(topNode *goquery.Selection) int {
	base := 100000
	paragraphNumber := 0
	paragraphScore := 0
	nodesToCheck := topNode.Find("p")
	nodesToCheck.Each(func(i int, s *goquery.Selection) {
		textNode := s.Text()
		ws := this.config.stopWords.stopWordsCount(this.config.targetLanguage, textNode)
		highLinkDensity := this.isHighLinkDensity(s)
		if ws.stopWordCount > 2 && !highLinkDensity {
			paragraphNumber++
			paragraphScore += ws.stopWordCount
		}
	})
	if paragraphNumber > 0 {
		base = paragraphScore / paragraphNumber
	}
	return base
}
Ejemplo n.º 6
0
func (ve *VideoExtractor) getObjectTag(node *goquery.Selection) video {
	childEmbedTag := node.Find("embed")
	if ve.candidates.Has(childEmbedTag) {
		ve.candidates.Remove(childEmbedTag)
	}
	srcNode := node.Find(`param[name="movie"]`)
	if srcNode == nil || srcNode.Length() == 0 {
		return video{}
	}

	src, _ := srcNode.Attr("value")
	provider := ve.getProvider(src)
	if provider == "" {
		return video{}
	}
	video := ve.getVideo(node)
	video.provider = provider
	video.src = src
	return video
}