func (this *cleaner) cleanCites(doc *goquery.Document) *goquery.Document { cites := doc.Find("cite") cites.Each(func(i int, s *goquery.Selection) { this.config.parser.removeNode(s) }) return doc }
func (c *Cleaner) cleanDivs(doc *goquery.Document) *goquery.Document { frames := make(map[string]int) framesNodes := make(map[string]*list.List) divs := doc.Find("div") divs.Each(func(i int, s *goquery.Selection) { children := s.Children() if children.Size() == 0 { text := s.Text() text = strings.Trim(text, " ") text = strings.Trim(text, "\t") text = strings.ToLower(text) frames[text]++ if framesNodes[text] == nil { framesNodes[text] = list.New() } framesNodes[text].PushBack(s) } }) for text, freq := range frames { if freq > 1 { selections := framesNodes[text] for s := selections.Front(); s != nil; s = s.Next() { selection := s.Value.(*goquery.Selection) c.config.parser.removeNode(selection) } } } return doc }
func (this *cleaner) cleanBadTags(doc *goquery.Document) *goquery.Document { body := doc.Find("body") children := body.Children() selectors := []string{"id", "class", "name"} for _, selector := range selectors { children.Each(func(i int, s *goquery.Selection) { naughtyList := s.Find("*[" + selector + "]") cont := 0 naughtyList.Each(func(j int, e *goquery.Selection) { attribute, _ := e.Attr(selector) if this.matchNodeRegEx(attribute, REMOVENODES_RE) { if this.config.debug { log.Printf("Cleaning: Removing node with %s: %s\n", selector, this.config.parser.name(selector, e)) } this.config.parser.removeNode(e) cont++ } }) if this.config.debug && cont > 0 { log.Printf("%d naughty %s elements found", cont, selector) } }) } return doc }
func (this *cleaner) cleanAside(doc *goquery.Document) *goquery.Document { aside := doc.Find("aside") aside.Each(func(i int, s *goquery.Selection) { this.config.parser.removeNode(s) }) return doc }
func (this *cleaner) cleanFooter(doc *goquery.Document) *goquery.Document { footer := doc.Find("footer") footer.Each(func(i int, s *goquery.Selection) { this.config.parser.removeNode(s) }) return doc }
func (c *Cleaner) removeTags(doc *goquery.Document, tags *[]string) *goquery.Document { for _, tag := range *tags { node := doc.Find(tag) node.Each(func(i int, s *goquery.Selection) { c.config.parser.removeNode(s) }) } return doc }
func (c *Cleaner) cleanArticleTags(doc *goquery.Document) *goquery.Document { tags := [3]string{"id", "name", "class"} articles := doc.Find("article") articles.Each(func(i int, s *goquery.Selection) { for _, tag := range tags { c.config.parser.delAttr(s, tag) } }) return doc }
func (c *Cleaner) cleanParaSpans(doc *goquery.Document) *goquery.Document { spans := doc.Find("span") spans.Each(func(i int, s *goquery.Selection) { parent := s.Parent() if parent != nil && parent.Length() > 0 && parent.Get(0).DataAtom == atom.P { node := s.Get(0) node.Data = s.Text() node.Type = html.TextNode } }) return doc }
func (c *Cleaner) cleanEMTags(doc *goquery.Document) *goquery.Document { ems := doc.Find("em") ems.Each(func(i int, s *goquery.Selection) { images := s.Find("img") if images.Length() == 0 { c.config.parser.dropTag(s) } }) if c.config.debug { log.Printf("Cleaning %d EM tags\n", ems.Size()) } return doc }
//returns a list of nodes we want to search on like paragraphs and tables func (this *contentExtractor) nodesToCheck(doc *goquery.Document) []*goquery.Selection { output := make([]*goquery.Selection, 0) tags := []string{"p", "pre", "td"} for _, tag := range tags { selections := doc.Children().Find(tag) if selections != nil { selections.Each(func(i int, s *goquery.Selection) { output = append(output, s) }) } } return output }
func (c *Cleaner) removeScriptsStyle(doc *goquery.Document) *goquery.Document { if c.config.debug { log.Println("Starting to remove script tags") } count := 0 // remove scripts := doc.Find("script,noscript,style") scripts.Each(func(i int, s *goquery.Selection) { c.config.parser.removeNode(s) count++ }) if c.config.debug && count > 0 { log.Printf("Removed %d script and style tags\n", scripts.Size()) } return doc }
func (c *Cleaner) dropCaps(doc *goquery.Document) *goquery.Document { items := doc.Find("span") count := 0 // remove items.Each(func(i int, s *goquery.Selection) { attribute, exists := s.Attr("class") if exists && (strings.Contains(attribute, "dropcap") || strings.Contains(attribute, "drop_cap")) { c.config.parser.dropTag(s) count++ } }) if c.config.debug && count > 0 { log.Printf("Cleaned %d dropcap tags\n", count) } return doc }
func (this *cleaner) removeScriptsStyle(doc *goquery.Document) *goquery.Document { if this.config.debug { log.Println("Starting to remove script tags") } scripts := doc.Find("script,noscript,style") scripts.Each(func(i int, s *goquery.Selection) { this.config.parser.removeNode(s) }) if this.config.debug { log.Printf("Removed %d script and style tags\n", scripts.Size()) } //remove comments :) How???? return doc }
func (this *cleaner) removeNodesRegEx(doc *goquery.Document, pattern *regexp.Regexp) *goquery.Document { selectors := [3]string{"id", "class", "name"} for _, selector := range selectors { naughtyList := doc.Find("*[" + selector + "]") cont := 0 naughtyList.Each(func(i int, s *goquery.Selection) { attribute, _ := s.Attr(selector) if this.matchNodeRegEx(attribute, pattern) { cont++ this.config.parser.removeNode(s) } }) if this.config.debug { log.Printf("regExRemoveNodes %d %s elements found against pattern %s\n", cont, selector, pattern.String()) } } return doc }
func (c *Cleaner) cleanBadTags(doc *goquery.Document, pattern *regexp.Regexp, selectors *[]string) *goquery.Document { body := doc.Find("html") children := body.Children() children.Each(func(i int, s *goquery.Selection) { for _, selector := range *selectors { naughtyList := s.Find("*[" + selector + "]") count := 0 naughtyList.Each(func(j int, node *goquery.Selection) { attribute, _ := node.Attr(selector) if pattern.MatchString(attribute) { if c.config.debug { log.Printf("Cleaning: Removing node with %s: %s\n", selector, c.config.parser.name(selector, node)) } c.config.parser.removeNode(node) count++ } }) if c.config.debug && count > 0 { log.Printf("%d naughty %s elements found", count, selector) } } }) return doc }
// convertDivsToParagraphs processes every element in doc matched by the
// selector domType.
//
// For each match, if divToPElementsPattern matches the element's inner HTML the
// element is handed to replaceWithPara and counted as a "bad div". Otherwise
// the element's direct text-node children are collected: their text is joined
// into a newly created node that is appended to the element, and the original
// text nodes are then detached from their parents.
//
// With debug enabled, progress and a final summary are logged. The same
// document is returned so calls can be chained.
func (c *Cleaner) convertDivsToParagraphs(doc *goquery.Document, domType string) *goquery.Document {
	if c.config.debug {
		log.Println("Starting to replace bad divs...")
	}
	badDivs := 0
	convertedTextNodes := 0
	divs := doc.Find(domType)
	divs.Each(func(i int, div *goquery.Selection) {
		// The error from Html() is ignored; divHTML is then matched as-is.
		divHTML, _ := div.Html()
		if divToPElementsPattern.Match([]byte(divHTML)) {
			c.replaceWithPara(div)
			badDivs++
		} else {
			var replacementText []string
			// Text nodes are only queued here and detached after the walk, so
			// the child list is not mutated while it is being iterated.
			nodesToRemove := list.New()
			children := div.Contents()
			if c.config.debug {
				log.Printf("Found %d children of div\n", children.Size())
			}
			// The callback always returns true, so the walk never stops early.
			children.EachWithBreak(func(i int, kid *goquery.Selection) bool {
				text := kid.Text()
				kidNode := kid.Get(0)
				tag := kidNode.Data
				// A node whose Data equals its own text content is treated as
				// a text node.
				if tag == text {
					tag = "#text"
				}
				if tag == "#text" {
					// Strip newlines and whatever tabsRegEx matches.
					text = strings.Replace(text, "\n", "", -1)
					text = tabsRegEx.ReplaceAllString(text, "")
					if text == "" {
						return true
					}
					// Text of a single byte is neither collected nor removed.
					if len(text) > 1 {
						prev := kidNode.PrevSibling
						if c.config.debug {
							log.Printf("PARENT CLASS: %s NODENAME: %s\n", c.config.parser.name("class", div), tag)
							log.Printf("TEXTREPLACE: %s\n", strings.Replace(text, "\n", "", -1))
						}
						// When the text directly follows an <a> element, the
						// HTML of kid.HasNodes(prev) is added before the text.
						// NOTE(review): kid is a text node here, so HasNodes may
						// well yield an empty selection — confirm this captures
						// the link as intended.
						if prev != nil && prev.DataAtom == atom.A {
							nodeSelection := kid.HasNodes(prev)
							// This local shadows the html package, but only
							// inside this if-block.
							html, _ := nodeSelection.Html()
							replacementText = append(replacementText, html)
							if c.config.debug {
								log.Printf("SIBLING NODENAME ADDITION: %s TEXT: %s\n", prev.Data, html)
							}
						}
						replacementText = append(replacementText, text)
						nodesToRemove.PushBack(kidNode)
						convertedTextNodes++
					}
				}
				return true
			})
			// A new node is appended for every non-matching element, even when
			// no text was collected (Data is then the empty string).
			// NOTE(review): the node is an ElementNode with DataAtom P, yet its
			// Data holds the joined text rather than a tag name — confirm this
			// is intended rather than a <p> element wrapping a text child.
			newNode := new(html.Node)
			newNode.Type = html.ElementNode
			newNode.Data = strings.Join(replacementText, "")
			newNode.DataAtom = atom.P
			div.First().AddNodes(newNode)
			// Detach the text nodes whose content was copied above.
			for s := nodesToRemove.Front(); s != nil; s = s.Next() {
				node := s.Value.(*html.Node)
				if node != nil && node.Parent != nil {
					node.Parent.RemoveChild(node)
				}
			}
		}
	})
	if c.config.debug {
		log.Printf("Found %d total divs with %d bad divs replaced and %d textnodes converted inside divs", divs.Size(), badDivs, convertedTextNodes)
	}
	return doc
}