func (p Parser) delAttr(selection *goquery.Selection, attr string) { idx := p.indexOfAttribute(selection, attr) if idx > -1 { node := selection.Get(0) node.Attr = append(node.Attr[:idx], node.Attr[idx+1:]...) } }
func (c *Cleaner) replaceWithPara(div *goquery.Selection) { if div.Size() > 0 { node := div.Get(0) node.Data = atom.P.String() node.DataAtom = atom.P } }
func (this *contentExtractor) getSiblingsContent(currentSibling *goquery.Selection, baselinescoreSiblingsPara float64) []*goquery.Selection { ps := make([]*goquery.Selection, 0) if currentSibling.Get(0).DataAtom.String() == "p" && len(currentSibling.Text()) > 0 { ps = append(ps, currentSibling) return ps } else { potentialParagraphs := currentSibling.Find("p") potentialParagraphs.Each(func(i int, s *goquery.Selection) { text := s.Text() if len(text) > 0 { ws := this.config.stopWords.stopWordsCount(this.config.targetLanguage, text) paragraphScore := ws.stopWordCount siblingBaselineScore := 0.30 highLinkDensity := this.isHighLinkDensity(s) score := siblingBaselineScore * baselinescoreSiblingsPara if score < float64(paragraphScore) && !highLinkDensity { node := new(html.Node) node.Type = html.TextNode node.Data = text node.DataAtom = atom.P nodes := make([]*html.Node, 1) nodes[0] = node newSelection := new(goquery.Selection) newSelection.Nodes = nodes ps = append(ps, newSelection) } } }) } return ps }
//a lot of times the first paragraph might be the caption under an image so we'll want to make sure if we're going to //boost a parent node that it should be connected to other paragraphs, at least for the first n paragraphs //so we'll want to make sure that the next sibling is a paragraph and has at least some substatial weight to it func (this *contentExtractor) isBoostable(node *goquery.Selection) bool { stepsAway := 0 next := node.Next() for next != nil && stepsAway < node.Siblings().Length() { currentNodeTag := node.Get(0).DataAtom.String() if currentNodeTag == "p" { if stepsAway >= 3 { if this.config.debug { log.Println("Next paragraph is too far away, not boosting") } return false } paraText := node.Text() ws := this.config.stopWords.stopWordsCount(this.config.targetLanguage, paraText) if ws.stopWordCount > 5 { if this.config.debug { log.Println("We're gonna boost this node, seems content") } return true } } stepsAway++ next = next.Next() } return false }
func (p Parser) removeNode(selection *goquery.Selection) { if selection != nil { node := selection.Get(0) if node != nil && node.Parent != nil { node.Parent.RemoveChild(node) } } }
func (p Parser) indexOfAttribute(selection *goquery.Selection, attr string) int { node := selection.Get(0) for i, a := range node.Attr { if a.Key == attr { return i } } return -1 }
func (this *contentExtractor) isNodescoreThresholdMet(node *goquery.Selection, e *goquery.Selection) bool { topNodeScore := this.getNodeGravityScore(node) currentNodeScore := this.getNodeGravityScore(e) threasholdScore := float64(topNodeScore) * 0.08 if (float64(currentNodeScore) < threasholdScore) && e.Get(0).DataAtom.String() != "td" { return false } return true }
func (ve *VideoExtractor) getVideo(node *goquery.Selection) video { src := ve.getSrc(node) video := video{ embedCode: ve.getEmbedCode(node), embedType: node.Get(0).DataAtom.String(), width: ve.getWidth(node), height: ve.getHeight(node), src: src, provider: ve.getProvider(src), } return video }
func (this *contentExtractor) isTableAndNoParaExist(selection *goquery.Selection) bool { subParagraph := selection.Find("p") subParagraph.Each(func(i int, s *goquery.Selection) { txt := s.Text() if len(txt) < 25 { node := s.Get(0) parent := node.Parent parent.RemoveChild(node) } }) subParagraph2 := selection.Find("p") if subParagraph2.Length() == 0 && selection.Get(0).DataAtom.String() != "td" { return true } return false }
func (p Parser) setAttr(selection *goquery.Selection, attr string, value string) { if selection.Size() > 0 { node := selection.Get(0) var attrs []html.Attribute for _, a := range node.Attr { if a.Key != attr { newAttr := new(html.Attribute) newAttr.Key = a.Key newAttr.Val = a.Val attrs = append(attrs, *newAttr) } } newAttr := new(html.Attribute) newAttr.Key = attr newAttr.Val = value attrs = append(attrs, *newAttr) node.Attr = attrs } }