Beispiel #1
0
// cleanCites hands every <cite> element found in doc to the parser's
// removeNode helper and returns the same document so calls can be chained.
func (c *cleaner) cleanCites(doc *goquery.Document) *goquery.Document {
	doc.Find("cite").Each(func(_ int, cite *goquery.Selection) {
		c.config.parser.removeNode(cite)
	})
	return doc
}
Beispiel #2
0
// cleanDivs removes childless <div> elements whose text appears more than
// once in doc (typical of repeated boilerplate frames).
//
// Texts are compared after stripping leading/trailing spaces and tabs and
// lower-casing. Spaces and tabs are trimmed together in one cutset so that
// mixed runs such as "\t foo " normalize to the same key as "foo".
//
// The same document is returned so calls can be chained.
func (c *Cleaner) cleanDivs(doc *goquery.Document) *goquery.Document {
	// Group leaf divs by normalized text; the group size is the frequency.
	groups := make(map[string][]*goquery.Selection)
	doc.Find("div").Each(func(_ int, div *goquery.Selection) {
		if div.Children().Size() != 0 {
			return
		}
		key := strings.ToLower(strings.Trim(div.Text(), " \t"))
		groups[key] = append(groups[key], div)
	})

	// Only texts seen at least twice are considered duplicates.
	for _, group := range groups {
		if len(group) < 2 {
			continue
		}
		for _, div := range group {
			c.config.parser.removeNode(div)
		}
	}
	return doc
}
Beispiel #3
0
// cleanBadTags walks the descendants of each direct child of <body> and
// removes those whose id, class or name attribute is matched by
// REMOVENODES_RE (as decided by matchNodeRegEx). Attributes are processed
// in the order id, class, name. When debug is enabled, each removal and a
// per-child summary are logged.
//
// The same document is returned so calls can be chained.
func (c *cleaner) cleanBadTags(doc *goquery.Document) *goquery.Document {
	topLevel := doc.Find("body").Children()
	for _, attrName := range []string{"id", "class", "name"} {
		query := "*[" + attrName + "]"
		topLevel.Each(func(_ int, child *goquery.Selection) {
			removed := 0
			child.Find(query).Each(func(_ int, candidate *goquery.Selection) {
				// The attribute selector guarantees presence, so the
				// "exists" result is not needed.
				value, _ := candidate.Attr(attrName)
				if !c.matchNodeRegEx(value, REMOVENODES_RE) {
					return
				}
				if c.config.debug {
					log.Printf("Cleaning: Removing node with %s: %s\n", attrName, c.config.parser.name(attrName, candidate))
				}
				c.config.parser.removeNode(candidate)
				removed++
			})
			if removed > 0 && c.config.debug {
				log.Printf("%d naughty %s elements found", removed, attrName)
			}
		})
	}
	return doc
}
Beispiel #4
0
// cleanAside hands every <aside> element found in doc to the parser's
// removeNode helper and returns the same document so calls can be chained.
func (c *cleaner) cleanAside(doc *goquery.Document) *goquery.Document {
	doc.Find("aside").Each(func(_ int, aside *goquery.Selection) {
		c.config.parser.removeNode(aside)
	})
	return doc
}
Beispiel #5
0
// cleanFooter hands every <footer> element found in doc to the parser's
// removeNode helper and returns the same document so calls can be chained.
func (c *cleaner) cleanFooter(doc *goquery.Document) *goquery.Document {
	doc.Find("footer").Each(func(_ int, footer *goquery.Selection) {
		c.config.parser.removeNode(footer)
	})
	return doc
}
Beispiel #6
0
// removeTags removes from doc every element matched by each selector in
// tags, in the order given, by passing it to the parser's removeNode helper.
//
// A nil tags pointer is treated as "nothing to remove" instead of panicking
// on dereference. The same document is returned so calls can be chained.
func (c *Cleaner) removeTags(doc *goquery.Document, tags *[]string) *goquery.Document {
	if tags == nil {
		return doc
	}
	for _, tag := range *tags {
		doc.Find(tag).Each(func(_ int, s *goquery.Selection) {
			c.config.parser.removeNode(s)
		})
	}
	return doc
}
Beispiel #7
0
// cleanArticleTags asks the parser to delete the id, name and class
// attributes (in that order) from every <article> element in doc and
// returns the same document so calls can be chained.
func (c *Cleaner) cleanArticleTags(doc *goquery.Document) *goquery.Document {
	doc.Find("article").Each(func(_ int, article *goquery.Selection) {
		c.config.parser.delAttr(article, "id")
		c.config.parser.delAttr(article, "name")
		c.config.parser.delAttr(article, "class")
	})
	return doc
}
Beispiel #8
0
// cleanParaSpans flattens <span> elements that sit directly inside a <p>:
// the span's node is rewritten in place into a text node whose data is the
// span's combined text. Spans with any other parent are left alone.
//
// The same document is returned so calls can be chained.
func (c *Cleaner) cleanParaSpans(doc *goquery.Document) *goquery.Document {
	doc.Find("span").Each(func(_ int, span *goquery.Selection) {
		parent := span.Parent()
		if parent == nil || parent.Length() == 0 {
			return
		}
		if parent.Get(0).DataAtom != atom.P {
			return
		}
		// Capture the text while the node is still an element, then retag it.
		node := span.Get(0)
		node.Data = span.Text()
		node.Type = html.TextNode
	})
	return doc
}
Beispiel #9
0
// cleanEMTags passes every <em> element that contains no <img> descendant
// to the parser's dropTag helper; <em> elements wrapping images are kept.
// With debug enabled, the total number of <em> elements found is logged.
//
// The same document is returned so calls can be chained.
func (c *Cleaner) cleanEMTags(doc *goquery.Document) *goquery.Document {
	ems := doc.Find("em")
	ems.Each(func(_ int, em *goquery.Selection) {
		if em.Find("img").Length() > 0 {
			return
		}
		c.config.parser.dropTag(em)
	})
	if c.config.debug {
		log.Printf("Cleaning %d EM tags\n", ems.Size())
	}
	return doc
}
Beispiel #10
0
// nodesToCheck collects the candidate elements to search: all <p>, then all
// <pre>, then all <td> elements found beneath the document's children. The
// result is always a non-nil slice, empty when nothing matches.
func (e *contentExtractor) nodesToCheck(doc *goquery.Document) []*goquery.Selection {
	found := []*goquery.Selection{}
	for _, tag := range [...]string{"p", "pre", "td"} {
		matches := doc.Children().Find(tag)
		if matches == nil {
			continue
		}
		matches.Each(func(_ int, match *goquery.Selection) {
			found = append(found, match)
		})
	}
	return found
}
Beispiel #11
0
// removeScriptsStyle hands every <script>, <noscript> and <style> element
// in doc to the parser's removeNode helper. With debug enabled it logs a
// start message and, if anything was removed, the number of matched tags.
//
// The same document is returned so calls can be chained.
func (c *Cleaner) removeScriptsStyle(doc *goquery.Document) *goquery.Document {
	if c.config.debug {
		log.Println("Starting to remove script tags")
	}

	targets := doc.Find("script,noscript,style")
	removed := 0
	targets.Each(func(_ int, target *goquery.Selection) {
		c.config.parser.removeNode(target)
		removed++
	})

	if removed > 0 && c.config.debug {
		log.Printf("Removed %d script and style tags\n", targets.Size())
	}
	return doc
}
Beispiel #12
0
// dropCaps passes every <span> whose class attribute contains "dropcap" or
// "drop_cap" to the parser's dropTag helper. With debug enabled, the number
// of affected spans is logged when it is non-zero.
//
// The same document is returned so calls can be chained.
func (c *Cleaner) dropCaps(doc *goquery.Document) *goquery.Document {
	dropped := 0
	doc.Find("span").Each(func(_ int, span *goquery.Selection) {
		class, ok := span.Attr("class")
		if !ok {
			return
		}
		if !strings.Contains(class, "dropcap") && !strings.Contains(class, "drop_cap") {
			return
		}
		c.config.parser.dropTag(span)
		dropped++
	})
	if dropped > 0 && c.config.debug {
		log.Printf("Cleaned %d dropcap tags\n", dropped)
	}
	return doc
}
Beispiel #13
0
// removeScriptsStyle hands every <script>, <noscript> and <style> element
// in doc to the parser's removeNode helper. With debug enabled it logs a
// start message and the number of matched tags (even when that is zero).
//
// The same document is returned so calls can be chained.
func (c *cleaner) removeScriptsStyle(doc *goquery.Document) *goquery.Document {
	debug := c.config.debug
	if debug {
		log.Println("Starting to remove script tags")
	}

	targets := doc.Find("script,noscript,style")
	targets.Each(func(_ int, target *goquery.Selection) {
		c.config.parser.removeNode(target)
	})

	if debug {
		log.Printf("Removed %d script and style tags\n", targets.Size())
	}

	// TODO: HTML comments are not removed here; how to do so is still open.
	return doc
}
Beispiel #14
0
// removeNodesRegEx removes every element in doc whose id, class or name
// attribute is accepted by matchNodeRegEx for the given pattern. Attributes
// are processed in the order id, class, name. With debug enabled, one
// summary line per attribute is logged, including when nothing matched.
//
// The same document is returned so calls can be chained.
func (c *cleaner) removeNodesRegEx(doc *goquery.Document, pattern *regexp.Regexp) *goquery.Document {
	for _, attrName := range [...]string{"id", "class", "name"} {
		matched := 0
		doc.Find("*[" + attrName + "]").Each(func(_ int, node *goquery.Selection) {
			// The attribute selector guarantees presence, so the
			// "exists" result is not needed.
			value, _ := node.Attr(attrName)
			if !c.matchNodeRegEx(value, pattern) {
				return
			}
			matched++
			c.config.parser.removeNode(node)
		})

		if c.config.debug {
			log.Printf("regExRemoveNodes %d %s elements found against pattern %s\n", matched, attrName, pattern.String())
		}
	}
	return doc
}
Beispiel #15
0
// cleanBadTags walks the descendants of each direct child of <html> and
// removes those carrying one of the attributes named in selectors whose
// value matches pattern. For every child, the attributes are processed in
// the order given. With debug enabled, each removal and a per-child,
// per-attribute summary are logged.
//
// A nil pattern or nil selectors pointer is treated as "nothing to remove"
// rather than panicking on first use. The same document is returned so
// calls can be chained.
func (c *Cleaner) cleanBadTags(doc *goquery.Document, pattern *regexp.Regexp, selectors *[]string) *goquery.Document {
	if pattern == nil || selectors == nil {
		return doc
	}
	// Note: the root searched here is <html>, so both <head> and <body>
	// subtrees are inspected.
	topLevel := doc.Find("html").Children()
	topLevel.Each(func(_ int, child *goquery.Selection) {
		for _, attrName := range *selectors {
			removed := 0
			child.Find("*[" + attrName + "]").Each(func(_ int, node *goquery.Selection) {
				// The attribute selector guarantees presence, so the
				// "exists" result is not needed.
				value, _ := node.Attr(attrName)
				if !pattern.MatchString(value) {
					return
				}
				if c.config.debug {
					log.Printf("Cleaning: Removing node with %s: %s\n", attrName, c.config.parser.name(attrName, node))
				}
				c.config.parser.removeNode(node)
				removed++
			})
			if c.config.debug && removed > 0 {
				log.Printf("%d naughty %s elements found", removed, attrName)
			}
		}
	})
	return doc
}
Beispiel #16
0
// convertDivsToParagraphs processes every element in doc matched by the
// domType selector. An element whose inner HTML matches
// divToPElementsPattern is passed to replaceWithPara; otherwise its
// text-like child nodes are gathered into a new node and then detached from
// their parents. With debug enabled, progress and totals are logged.
//
// The same document is returned so calls can be chained.
func (c *Cleaner) convertDivsToParagraphs(doc *goquery.Document, domType string) *goquery.Document {
	if c.config.debug {
		log.Println("Starting to replace bad divs...")
	}
	// Counters used only for the debug summary at the end.
	badDivs := 0
	convertedTextNodes := 0
	divs := doc.Find(domType)

	divs.Each(func(i int, div *goquery.Selection) {
		// The error from Html() is discarded; divHTML is then the empty string.
		divHTML, _ := div.Html()
		if divToPElementsPattern.Match([]byte(divHTML)) {
			c.replaceWithPara(div)
			badDivs++
		} else {
			// Text fragments collected from this div, in document order.
			var replacementText []string
			// Raw *html.Node values to detach once iteration is finished, so
			// the child list is not mutated while it is being walked.
			nodesToRemove := list.New()
			children := div.Contents()
			if c.config.debug {
				log.Printf("Found %d children of div\n", children.Size())
			}
			// The callback always returns true, so every child is visited.
			children.EachWithBreak(func(i int, kid *goquery.Selection) bool {
				text := kid.Text()
				kidNode := kid.Get(0)
				tag := kidNode.Data
				// Heuristic: a node whose Data equals its own text is treated
				// as a text node.
				if tag == text {
					tag = "#text"
				}
				if tag == "#text" {
					// Strip newlines and whatever tabsRegEx matches.
					text = strings.Replace(text, "\n", "", -1)
					text = tabsRegEx.ReplaceAllString(text, "")
					if text == "" {
						return true
					}
					// len counts bytes, so single-byte fragments are skipped.
					if len(text) > 1 {
						prev := kidNode.PrevSibling
						if c.config.debug {
							log.Printf("PARENT CLASS: %s NODENAME: %s\n", c.config.parser.name("class", div), tag)
							log.Printf("TEXTREPLACE: %s\n", strings.Replace(text, "\n", "", -1))
						}
						// When the text directly follows an <a>, the HTML of
						// kid.HasNodes(prev) is emitted before the text.
						// NOTE(review): this looks intended to keep the
						// link's markup, but HasNodes filters kid rather
						// than selecting prev, so the result may be empty;
						// confirm against the goquery docs.
						if prev != nil && prev.DataAtom == atom.A {
							nodeSelection := kid.HasNodes(prev)
							// This local `html` shadows the html package for
							// the rest of this if-block.
							html, _ := nodeSelection.Html()
							replacementText = append(replacementText, html)
							if c.config.debug {
								log.Printf("SIBLING NODENAME ADDITION: %s TEXT: %s\n", prev.Data, html)
							}
						}
						replacementText = append(replacementText, text)
						nodesToRemove.PushBack(kidNode)
						convertedTextNodes++
					}

				}
				return true
			})

			// Build a node carrying the joined text; it is created even when
			// no text was collected.
			// NOTE(review): this is an ElementNode tagged atom.P whose Data
			// holds the text rather than a tag name; confirm this renders as
			// intended.
			newNode := new(html.Node)
			newNode.Type = html.ElementNode
			newNode.Data = strings.Join(replacementText, "")
			newNode.DataAtom = atom.P
			// NOTE(review): the Selection returned by AddNodes is discarded.
			// AddNodes presumably extends the selection without attaching
			// newNode to the DOM; verify, as the collected text may be lost
			// once the nodes below are removed.
			div.First().AddNodes(newNode)

			// Detach the collected text nodes from whatever parent they have.
			for s := nodesToRemove.Front(); s != nil; s = s.Next() {
				node := s.Value.(*html.Node)
				if node != nil && node.Parent != nil {
					node.Parent.RemoveChild(node)
				}
			}
		}
	})
	if c.config.debug {
		log.Printf("Found %d total divs with %d bad divs replaced and %d textnodes converted inside divs", divs.Size(), badDivs, convertedTextNodes)
	}
	return doc

}