//checks the density of links within a node, is there not much text and most of it contains bad links? //if so it's no good func (this *contentExtractor) isHighLinkDensity(node *goquery.Selection) bool { links := node.Find("a") if links == nil || links.Size() == 0 { return false } text := node.Text() words := strings.Split(text, " ") nwords := len(words) sb := make([]string, 0) links.Each(func(i int, s *goquery.Selection) { linkText := s.Text() sb = append(sb, linkText) }) linkText := strings.Join(sb, "") linkWords := strings.Split(linkText, " ") nlinkWords := len(linkWords) nlinks := links.Size() linkDivisor := float64(nlinkWords) / float64(nwords) score := linkDivisor * float64(nlinks) if this.config.debug { logText := "" if len(node.Text()) >= 51 { logText = node.Text()[0:50] } else { logText = node.Text() } log.Printf("Calculated link density score as %1.5f for node %s\n", score, logText) } if score > 1.0 { return true } return false }
func (p Parser) name(selector string, selection *goquery.Selection) string { value, exists := selection.Attr(selector) if exists { return value } return "" }
func (ve *VideoExtractor) getSrc(node *goquery.Selection) string { value, exists := node.Attr("src") if exists { return value } return "" }
func score(tag *goquery.Selection) int { src, _ := tag.Attr("src") if src == "" { src, _ = tag.Attr("data-src") } if src == "" { src, _ = tag.Attr("data-lazy-src") } if src == "" { return -1 } tagScore := 0 for rule, score := range rules { if rule.MatchString(src) { tagScore += score } } alt, exists := tag.Attr("alt") if exists { if strings.Contains(alt, "thumbnail") { tagScore-- } } id, exists := tag.Attr("id") if exists { if id == "fbPhotoImage" { tagScore++ } } return tagScore }
func (c *Cleaner) replaceWithPara(div *goquery.Selection) { if div.Size() > 0 { node := div.Get(0) node.Data = atom.P.String() node.DataAtom = atom.P } }
func (p Parser) delAttr(selection *goquery.Selection, attr string) { idx := p.indexOfAttribute(selection, attr) if idx > -1 { node := selection.Get(0) node.Attr = append(node.Attr[:idx], node.Attr[idx+1:]...) } }
func (p Parser) dropTag(selection *goquery.Selection) { selection.Each(func(i int, s *goquery.Selection) { node := s.Get(0) node.Data = s.Text() node.Type = html.TextNode }) }
func (p Parser) removeNode(selection *goquery.Selection) { if selection != nil { node := selection.Get(0) if node != nil && node.Parent != nil { node.Parent.RemoveChild(node) } } }
func (ve *VideoExtractor) getHeight(node *goquery.Selection) int { value, exists := node.Attr("height") if exists { nvalue, _ := strconv.Atoi(value) return nvalue } return 0 }
func (this *contentExtractor) isNodescoreThresholdMet(node *goquery.Selection, e *goquery.Selection) bool { topNodeScore := this.getNodeGravityScore(node) currentNodeScore := this.getNodeGravityScore(e) threasholdScore := float64(topNodeScore) * 0.08 if (float64(currentNodeScore) < threasholdScore) && e.Get(0).DataAtom.String() != "td" { return false } return true }
func (p Parser) indexOfAttribute(selection *goquery.Selection, attr string) int { node := selection.Get(0) for i, a := range node.Attr { if a.Key == attr { return i } } return -1 }
func (ve *VideoExtractor) getEmbedTag(node *goquery.Selection) video { parent := node.Parent() if parent != nil { parentTag := parent.Get(0).DataAtom.String() if parentTag == "object" { return ve.getObjectTag(node) } } return ve.getVideo(node) }
func (extr *ContentExtractor) walkSiblings(node *goquery.Selection) []*goquery.Selection { currentSibling := node.Prev() var b []*goquery.Selection for currentSibling.Length() != 0 { b = append(b, currentSibling) previousSibling := currentSibling.Prev() currentSibling = previousSibling } return b }
func (p Parser) getElementsByTags(div *goquery.Selection, tags []string) *goquery.Selection { selection := new(goquery.Selection) for _, tag := range tags { selections := div.Find(tag) if selections != nil { selection = selection.Union(selections) } } return selection }
func (this *contentExtractor) walkSiblings(node *goquery.Selection) []*goquery.Selection { currentSibling := node.Prev() b := make([]*goquery.Selection, 0) for currentSibling.Length() != 0 { b = append(b, currentSibling) previousSibling := currentSibling.Prev() currentSibling = previousSibling } return b }
func (this *contentExtractor) getNodeGravityScore(node *goquery.Selection) int { grvScoreString, exists := node.Attr("gravityScore") if !exists { return 0 } grvScore, err := strconv.Atoi(grvScoreString) if err != nil { return 0 } return grvScore }
func (ve *VideoExtractor) getVideo(node *goquery.Selection) video { src := ve.getSrc(node) video := video{ embedCode: ve.getEmbedCode(node), embedType: node.Get(0).DataAtom.String(), width: ve.getWidth(node), height: ve.getHeight(node), src: src, provider: ve.getProvider(src), } return video }
//adds a score to the gravityScore Attribute we put on divs //we'll get the current score then add the score we're passing in to the current func (extr *ContentExtractor) updateScore(node *goquery.Selection, addToScore int) { currentScore := 0 var err error scoreString, _ := node.Attr("gravityScore") if scoreString != "" { currentScore, err = strconv.Atoi(scoreString) if err != nil { currentScore = 0 } } newScore := currentScore + addToScore extr.config.parser.setAttr(node, "gravityScore", strconv.Itoa(newScore)) }
//stores how many decent nodes are under a parent node func (this *contentExtractor) updateNodeCount(node *goquery.Selection, addToCount int) { currentScore := 0 var err error scoreString, _ := node.Attr("gravityNodes") if scoreString != "" { currentScore, err = strconv.Atoi(scoreString) if err != nil { currentScore = 0 } } newScore := currentScore + addToCount this.config.parser.setAttr(node, "gravityNodes", strconv.Itoa(newScore)) }
//a lot of times the first paragraph might be the caption under an image so we'll want to make sure if we're going to //boost a parent node that it should be connected to other paragraphs, at least for the first n paragraphs //so we'll want to make sure that the next sibling is a paragraph and has at least some substatial weight to it func (this *contentExtractor) isBoostable(node *goquery.Selection) bool { stepsAway := 0 next := node.Next() for next != nil && stepsAway < node.Siblings().Length() { currentNodeTag := node.Get(0).DataAtom.String() if currentNodeTag == "p" { if stepsAway >= 3 { if this.config.debug { log.Println("Next paragraph is too far away, not boosting") } return false } paraText := node.Text() ws := this.config.stopWords.stopWordsCount(this.config.targetLanguage, paraText) if ws.stopWordCount > 5 { if this.config.debug { log.Println("We're gonna boost this node, seems content") } return true } } stepsAway++ next = next.Next() } return false }
func (this *contentExtractor) getSiblingsContent(currentSibling *goquery.Selection, baselinescoreSiblingsPara float64) []*goquery.Selection { ps := make([]*goquery.Selection, 0) if currentSibling.Get(0).DataAtom.String() == "p" && len(currentSibling.Text()) > 0 { ps = append(ps, currentSibling) return ps } else { potentialParagraphs := currentSibling.Find("p") potentialParagraphs.Each(func(i int, s *goquery.Selection) { text := s.Text() if len(text) > 0 { ws := this.config.stopWords.stopWordsCount(this.config.targetLanguage, text) paragraphScore := ws.stopWordCount siblingBaselineScore := 0.30 highLinkDensity := this.isHighLinkDensity(s) score := siblingBaselineScore * baselinescoreSiblingsPara if score < float64(paragraphScore) && !highLinkDensity { node := new(html.Node) node.Type = html.TextNode node.Data = text node.DataAtom = atom.P nodes := make([]*html.Node, 1) nodes[0] = node newSelection := new(goquery.Selection) newSelection.Nodes = nodes ps = append(ps, newSelection) } } }) } return ps }
//we could have long articles that have tons of paragraphs so if we tried to calculate the base score against //the total text score of those paragraphs it would be unfair. So we need to normalize the score based on the average scoring //of the paragraphs within the top node. For example if our total score of 10 paragraphs was 1000 but each had an average value of //100 then 100 should be our base. func (this *contentExtractor) getSiblingsScore(topNode *goquery.Selection) int { base := 100000 paragraphNumber := 0 paragraphScore := 0 nodesToCheck := topNode.Find("p") nodesToCheck.Each(func(i int, s *goquery.Selection) { textNode := s.Text() ws := this.config.stopWords.stopWordsCount(this.config.targetLanguage, textNode) highLinkDensity := this.isHighLinkDensity(s) if ws.stopWordCount > 2 && !highLinkDensity { paragraphNumber++ paragraphScore += ws.stopWordCount } }) if paragraphNumber > 0 { base = paragraphScore / paragraphNumber } return base }
func (p Parser) setAttr(selection *goquery.Selection, attr string, value string) { if selection.Size() > 0 { node := selection.Get(0) var attrs []html.Attribute for _, a := range node.Attr { if a.Key != attr { newAttr := new(html.Attribute) newAttr.Key = a.Key newAttr.Val = a.Val attrs = append(attrs, *newAttr) } } newAttr := new(html.Attribute) newAttr.Key = attr newAttr.Val = value attrs = append(attrs, *newAttr) node.Attr = attrs } }
func (ve *VideoExtractor) getObjectTag(node *goquery.Selection) video { childEmbedTag := node.Find("embed") if ve.candidates.Has(childEmbedTag) { ve.candidates.Remove(childEmbedTag) } srcNode := node.Find(`param[name="movie"]`) if srcNode == nil || srcNode.Length() == 0 { return video{} } src, _ := srcNode.Attr("value") provider := ve.getProvider(src) if provider == "" { return video{} } video := ve.getVideo(node) video.provider = provider video.src = src return video }
func (ve *VideoExtractor) GetVideos(article *Article) *set.Set { doc := article.Doc var nodes *goquery.Selection for _, videoTag := range videoTags { tmpNodes := doc.Find(videoTag) if nodes == nil { nodes = tmpNodes } else { nodes.Union(tmpNodes) } } nodes.Each(func(i int, node *goquery.Selection) { tag := node.Get(0).DataAtom.String() var movie video switch tag { case "video": movie = ve.getVideoTag(node) break case "embed": movie = ve.getEmbedTag(node) break case "object": movie = ve.getObjectTag(node) break case "iframe": movie = ve.getIFrame(node) break default: { } } if movie.src != "" { ve.movies.Add(movie) } }) return ve.movies }
//adds any siblings that may have a decent score to this node func (this *contentExtractor) addSiblings(topNode *goquery.Selection) *goquery.Selection { if this.config.debug { log.Println("Starting to add siblings") } baselinescoreSiblingsPara := this.getSiblingsScore(topNode) results := this.walkSiblings(topNode) for _, currentNode := range results { ps := this.getSiblingsContent(currentNode, float64(baselinescoreSiblingsPara)) for _, p := range ps { nodes := make([]*html.Node, len(topNode.Nodes)+1) nodes[0] = p.Get(0) for i, node := range topNode.Nodes { nodes[i+1] = node } topNode.Nodes = nodes } } return topNode }
func (this *contentExtractor) isTableAndNoParaExist(selection *goquery.Selection) bool { subParagraph := selection.Find("p") subParagraph.Each(func(i int, s *goquery.Selection) { txt := s.Text() if len(txt) < 25 { node := s.Get(0) parent := node.Parent parent.RemoveChild(node) } }) subParagraph2 := selection.Find("p") if subParagraph2.Length() == 0 && selection.Get(0).DataAtom.String() != "td" { return true } return false }
// getEmbedCode returns the text content of node as the embed code.
func (ve *VideoExtractor) getEmbedCode(node *goquery.Selection) string {
	code := node.Text()
	return code
}
// clear empties selection by replacing its node list with a new empty slice.
// The nodes themselves are not modified.
func (p Parser) clear(selection *goquery.Selection) {
	selection.Nodes = []*html.Node{}
}