func (this *cleaner) replaceWithPara(div *goquery.Selection) { if div.Size() > 0 { node := div.Get(0) node.Data = atom.P.String() node.DataAtom = atom.P } }
//a lot of times the first paragraph might be the caption under an image so we'll want to make sure if we're going to //boost a parent node that it should be connected to other paragraphs, at least for the first n paragraphs //so we'll want to make sure that the next sibling is a paragraph and has at least some substatial weight to it func (this *contentExtractor) isBoostable(node *goquery.Selection) bool { stepsAway := 0 next := node.Next() for next != nil && stepsAway < node.Siblings().Length() { currentNodeTag := node.Get(0).DataAtom.String() if currentNodeTag == "p" { if stepsAway >= 3 { if this.config.debug { log.Println("Next paragraph is too far away, not boosting") } return false } paraText := node.Text() ws := this.config.stopWords.stopWordsCount(this.config.targetLanguage, paraText) if ws.stopWordCount > 5 { if this.config.debug { log.Println("We're gonna boost this node, seems content") } return true } } stepsAway++ next = next.Next() } return false }
func (this *contentExtractor) getSiblingsContent(currentSibling *goquery.Selection, baselinescoreSiblingsPara float64) []*goquery.Selection { ps := make([]*goquery.Selection, 0) if currentSibling.Get(0).DataAtom.String() == "p" && len(currentSibling.Text()) > 0 { ps = append(ps, currentSibling) return ps } else { potentialParagraphs := currentSibling.Find("p") potentialParagraphs.Each(func(i int, s *goquery.Selection) { text := s.Text() if len(text) > 0 { ws := this.config.stopWords.stopWordsCount(this.config.targetLanguage, text) paragraphScore := ws.stopWordCount siblingBaselineScore := 0.30 highLinkDensity := this.isHighLinkDensity(s) score := siblingBaselineScore * baselinescoreSiblingsPara if score < float64(paragraphScore) && !highLinkDensity { node := new(html.Node) node.Type = html.TextNode node.Data = text node.DataAtom = atom.P nodes := make([]*html.Node, 1) nodes[0] = node newSelection := new(goquery.Selection) newSelection.Nodes = nodes ps = append(ps, newSelection) } } }) } return ps }
// node returns a string representation of the selection. func node(i int, s *goquery.Selection) string { switch node := s.Get(0); { case node.Data == "h1": return fmt.Sprintf(" \033[%dm# %s\033[0m\n\n", blue, text(s)) case node.Data == "h2": return fmt.Sprintf(" \033[%dm## %s\033[0m\n\n", blue, text(s)) case node.Data == "h3": return fmt.Sprintf(" \033[%dm### %s\033[0m\n\n", blue, text(s)) case node.Data == "p": return fmt.Sprintf("\033[%dm%s\033[0m\n\n", none, indent(text(s), 1)) case node.Data == "pre" || s.HasClass("highlight"): return fmt.Sprintf("\033[1m%s\033[0m\n\n", indent(text(s), 2)) case node.Data == "a": return fmt.Sprintf("%s (%s) ", s.Text(), s.AttrOr("href", "missing link")) case node.Data == "li": return fmt.Sprintf(" • %s\n", contents(s)) case node.Data == "ul": return fmt.Sprintf("%s\n", nodes(s)) case node.Data == "code": return fmt.Sprintf("\033[1m%s\033[0m ", s.Text()) case node.Type == html.TextNode: return strings.TrimSpace(node.Data) default: return "" } }
func (this *parser) delAttr(selection *goquery.Selection, attr string) { idx := this.indexOfAttribute(selection, attr) if idx > -1 { node := selection.Get(0) node.Attr = append(node.Attr[:idx], node.Attr[idx+1:]...) } }
func (this *parser) removeNode(selection *goquery.Selection) { if selection != nil { node := selection.Get(0) if node != nil && node.Parent != nil { node.Parent.RemoveChild(node) } } }
func (this *contentExtractor) isNodescoreThresholdMet(node *goquery.Selection, e *goquery.Selection) bool { topNodeScore := this.getNodeGravityScore(node) currentNodeScore := this.getNodeGravityScore(e) threasholdScore := float64(topNodeScore) * 0.08 if (float64(currentNodeScore) < threasholdScore) && e.Get(0).DataAtom.String() != "td" { return false } return true }
func (this *parser) indexOfAttribute(selection *goquery.Selection, attr string) int { node := selection.Get(0) for i, a := range node.Attr { if a.Key == attr { return i } } return -1 }
func removeNodes(s *goquery.Selection) { s.Each(func(i int, s *goquery.Selection) { parent := s.Parent() if parent.Length() == 0 { // TODO??? } else { parent.Get(0).RemoveChild(s.Get(0)) } }) }
func (ve *VideoExtractor) getVideo(node *goquery.Selection) video { src := ve.getSrc(node) video := video{ embedCode: ve.getEmbedCode(node), embedType: node.Get(0).DataAtom.String(), width: ve.getWidth(node), height: ve.getHeight(node), src: src, provider: ve.getProvider(src), } return video }
func (this *contentExtractor) isTableAndNoParaExist(selection *goquery.Selection) bool { subParagraph := selection.Find("p") subParagraph.Each(func(i int, s *goquery.Selection) { txt := s.Text() if len(txt) < 25 { node := s.Get(0) parent := node.Parent parent.RemoveChild(node) } }) subParagraph2 := selection.Find("p") if subParagraph2.Length() == 0 && selection.Get(0).DataAtom.String() != "td" { return true } return false }
func (this *parser) setAttr(selection *goquery.Selection, attr string, value string) { if selection.Size() > 0 { node := selection.Get(0) attrs := make([]html.Attribute, 0) for _, a := range node.Attr { if a.Key != attr { newAttr := new(html.Attribute) newAttr.Key = a.Key newAttr.Val = a.Val attrs = append(attrs, *newAttr) } } newAttr := new(html.Attribute) newAttr.Key = attr newAttr.Val = value attrs = append(attrs, *newAttr) node.Attr = attrs } }
func extractData(tds *goquery.Selection, parsed_url *url.URL, visited_urls map[string]string, result_chan chan string) { val := tds.Get(0).Attr[0] new_path, err := url.Parse(val.Val) if err != nil { panic(err) } recomposed_url := parsed_url.ResolveReference(new_path) if _, ok := visited_urls[recomposed_url.String()]; !ok { var full_url = recomposed_url.String() if !strings.Contains(recomposed_url.Path, ".") { visited_urls[full_url] = full_url newSearch(full_url, &visited_urls, result_chan) } else { result_chan <- full_url } } }
func (d *Document) cleanConditionally(s *goquery.Selection, selector string) { if !d.CleanConditionally { return } s.Find(selector).Each(func(i int, s *goquery.Selection) { node := s.Get(0) weight := float32(d.classWeight(s)) contentScore := float32(0) if c, ok := d.candidates[node]; ok { contentScore = c.score } if weight+contentScore < 0 { removeNodes(s) Logger.Printf("Conditionally cleaned %s%s with weight %f and content score %f\n", node.Data, getName(s), weight, contentScore) return } text := s.Text() if strings.Count(text, ",") < 10 { counts := map[string]int{ "p": s.Find("p").Length(), "img": s.Find("img").Length(), "li": s.Find("li").Length() - 100, "a": s.Find("a").Length(), "embed": s.Find("embed").Length(), "input": s.Find("input").Length(), } contentLength := len(strings.TrimSpace(text)) linkDensity := d.getLinkDensity(s) remove := false reason := "" if counts["img"] > counts["p"] { reason = "too many images" remove = true } else if counts["li"] > counts["p"] && !s.Is("ul,ol") { reason = "more <li>s than <p>s" remove = true } else if counts["input"] > int(counts["p"]/3.0) { reason = "less than 3x <p>s than <input>s" remove = true } else if contentLength < d.MinTextLength && (counts["img"] == 0 || counts["img"] > 2) { reason = "too short content length without a single image" remove = true } else if weight < 25 && linkDensity > 0.2 { reason = fmt.Sprintf("too many links for its weight (%f)", weight) remove = true } else if weight >= 25 && linkDensity > 0.5 { reason = fmt.Sprintf("too many links for its weight (%f)", weight) remove = true } else if (counts["embed"] == 1 && contentLength < 75) || counts["embed"] > 1 { reason = "<embed>s with too short a content length, or too many <embed>s" remove = true } if remove { Logger.Printf("Conditionally cleaned %s%s with weight %f and content score %f because it has %s\n", node.Data, getName(s), weight, contentScore, reason) removeNodes(s) } } }) }