Example #1
1
// cleanseDom performs brute-force reduction and simplification of the
// subtree rooted at n.
//
// Attributes listed in unwantedAttrs are stripped first, then every child
// is cleansed recursively, and only afterwards is n itself handed to the
// unwanted-node handling, to convertExotic and to text normalization.
// lvl is the recursion depth; it is only forwarded to the recursive calls.
func cleanseDom(n *html.Node, lvl int) {
	n.Attr = removeAttr(n.Attr, unwantedAttrs)

	// Depth first: the children are handled before n itself.
	// NOTE(review): the next sibling is read after the recursive call; if the
	// call detaches the child from the tree the walk ends early - confirm
	// against removeUnwanted/convertUnwanted.
	child := n.FirstChild
	for child != nil {
		cleanseDom(child, lvl+1)
		child = child.NextSibling
	}

	// The package-level flag picks between dropping and converting.
	if !directlyRemoveUnwanted {
		convertUnwanted(n)
	} else {
		removeUnwanted(n)
	}

	convertExotic(n)

	// One-time text normalization; only text nodes carry free text.
	if n.Type != html.TextNode {
		return
	}
	n.Data = stringspb.NormalizeInnerWhitespace(n.Data)
}
// toDiv rewrites node in place into a bare <div>: the tag name and atom are
// replaced and all attributes are dropped. The node is then passed to
// parseChildren, whose result is returned unchanged.
func (m *minificationHTML) toDiv(node *html.Node) (*html.Node, error) {
	node.Attr = nil
	node.Data, node.DataAtom = "div", atom.Div

	return m.parseChildren(node)
}
Example #3
0
// FindTitleAndBody_MK searches the tree rooted at node for an article title
// and body and returns them in that order; either may be nil.
//
// An element whose class attribute is exactly "content" is taken as the
// body: it is renamed to "body" and FindTitleMK is asked for its title.
// Results reported by descendants override the ones found on node itself,
// and a title reported by a descendant is renamed to "title". The scan over
// the children stops as soon as both values are known.
func FindTitleAndBody_MK(node *html.Node) (*html.Node, *html.Node) {
	var title, fulltext *html.Node

	if node.Type == html.ElementNode {
		for _, attr := range node.Attr {
			if attr.Key != "class" || attr.Val != "content" {
				continue
			}
			title = FindTitleMK(node)
			node.Data = "body"
			fulltext = node
			break
		}
	}

	child := node.FirstChild
	for child != nil {
		childTitle, childBody := FindTitleAndBody_MK(child)
		if childTitle != nil {
			title = childTitle
			title.Data = "title"
		}
		if childBody != nil {
			fulltext = childBody
		}
		if title != nil && fulltext != nil {
			break
		}
		child = child.NextSibling
	}

	return title, fulltext
}
Example #4
0
// mergeNodes joins the neighbouring children prev and next of parent,
// optionally separated by a single space, and returns the node the caller
// should continue with.
//
//   - Both are text nodes: next's text is appended to prev, next is removed
//     from parent and prev's new next sibling is returned.
//   - Only one is a text node: the separator is appended to prev or
//     prepended to next.
//   - Neither is a text node and addSeparator is set: a new text node
//     holding the separator is inserted before next.
//
// In every case but the first, next is returned.
func (u *parserUtils) mergeNodes(parent, prev, next *html.Node, addSeparator bool) *html.Node {
	var delim string
	if addSeparator {
		delim = " "
	}

	prevIsText := prev != nil && prev.Type == html.TextNode
	nextIsText := next != nil && next.Type == html.TextNode

	switch {
	case prevIsText && nextIsText:
		prev.Data += delim + next.Data
		parent.RemoveChild(next)
		return prev.NextSibling
	case prevIsText:
		prev.Data += delim
	case nextIsText:
		next.Data = delim + next.Data
	case addSeparator:
		parent.InsertBefore(&html.Node{Type: html.TextNode, Data: delim}, next)
	}

	return next
}
Example #5
0
// FindTitleAndBody_Ria finds an article's title and body in ria.ru style
// HTML and returns them in that order; either may be nil.
// Worked cleanly on 15.12.2015.
//
// An element carrying itemprop="articleBody" is renamed to "body" and taken
// as the body; one carrying itemprop="name" is renamed to "title" and taken
// as the title. Results reported by descendants override the ones found on
// node itself, and the scan over the children stops once both are known.
func FindTitleAndBody_Ria(node *html.Node) (*html.Node, *html.Node) {
	var title, fulltext *html.Node

	if node.Type == html.ElementNode {
		for _, attr := range node.Attr {
			if attr.Key != "itemprop" {
				continue
			}
			if attr.Val == "articleBody" {
				node.Data = "body"
				fulltext = node
				break
			}
			if attr.Val == "name" {
				node.Data = "title"
				title = node
				break
			}
		}
	}

	child := node.FirstChild
	for child != nil {
		childTitle, childBody := FindTitleAndBody_Ria(child)
		if childTitle != nil {
			title = childTitle
		}
		if childBody != nil {
			fulltext = childBody
		}
		if title != nil && fulltext != nil {
			break
		}
		child = child.NextSibling
	}

	return title, fulltext
}
Example #6
0
// reIndent walks the subtree rooted at n and rewrites its whitespace:
// newline/tab text nodes are inserted around element and comment nodes, and
// the padding of existing text nodes is normalized.
//
// lvl is the depth of n. Indentation is only added for nodes deeper than
// cScaffoldLvls and uses lvl-2 tab characters. A node deeper than
// cScaffoldLvls that has no parent is logged and skipped together with its
// subtree.
func reIndent(n *html.Node, lvl int) {

	// Orphaned node below the scaffold levels: report it and give up on this
	// subtree. The printed subtree is computed but currently unused.
	if lvl > cScaffoldLvls && n.Parent == nil {
		bb := dom.PrintSubtree(n)
		_ = bb
		// log.Printf("%s", bb.Bytes())
		hint := ""
		if ml3[n] > 0 {
			hint = "   from ml3"
		}
		log.Print("reIndent: no parent ", hint)
		return
	}

	// Before children processing
	switch n.Type {
	case html.ElementNode:
		// n.Parent is non-nil here: the guard above returned otherwise.
		if lvl > cScaffoldLvls && n.Parent.Type == html.ElementNode {
			ind := strings.Repeat("\t", lvl-2)
			// presumably places the new text node directly before n - confirm in dom.InsertBefore
			dom.InsertBefore(n, &html.Node{Type: html.TextNode, Data: "\n" + ind})
		}
	case html.CommentNode:
		dom.InsertBefore(n, &html.Node{Type: html.TextNode, Data: "\n"})
	case html.TextNode:
		// Collapse surrounding whitespace to exactly one trailing space ...
		n.Data = strings.TrimSpace(n.Data) + " "
		// ... and one leading space, unless the text starts with "," or ".".
		if !strings.HasPrefix(n.Data, ",") && !strings.HasPrefix(n.Data, ".") {
			n.Data = " " + n.Data
		}
		// link texts without trailing space
		if n.Parent != nil && n.Parent.Data == "a" {
			n.Data = strings.TrimSpace(n.Data)
		}
	}

	// Children
	// Text nodes inserted before a child do not disturb this walk because the
	// walk only follows NextSibling.
	for c := n.FirstChild; c != nil; c = c.NextSibling {
		reIndent(c, lvl+1)
	}

	// After children processing
	switch n.Type {
	case html.ElementNode:
		// I don't know why,
		// but this needs to happen AFTER the children
		if lvl > cScaffoldLvls && n.Parent.Type == html.ElementNode {
			ind := strings.Repeat("\t", lvl-2)
			ind = "\n" + ind
			// link texts without new line
			if n.Data == "a" {
				ind = ""
			}
			// Only elements that have children get the trailing text node.
			if n.LastChild != nil {
				dom.InsertAfter(n.LastChild, &html.Node{Type: html.TextNode, Data: ind})
			}
		}
	}

}
Example #7
0
// getSiblingsContent collects the paragraphs of currentSibling that are
// worth keeping.
//
// If currentSibling is itself a non-empty <p>, it is returned as the only
// element. Otherwise every non-empty <p> below it is scored by its stop-word
// count; a paragraph is kept when that count exceeds 0.30 times
// baselinescoreSiblingsPara and the paragraph is not link heavy. Kept
// paragraphs are returned as fresh single-node selections that hold a text
// node with the paragraph's text. The result is never nil.
func (this *contentExtractor) getSiblingsContent(currentSibling *goquery.Selection, baselinescoreSiblingsPara float64) []*goquery.Selection {
	if currentSibling.Get(0).DataAtom.String() == "p" && len(currentSibling.Text()) > 0 {
		return []*goquery.Selection{currentSibling}
	}

	const siblingBaselineScore = 0.30
	threshold := siblingBaselineScore * baselinescoreSiblingsPara

	paragraphs := []*goquery.Selection{}
	currentSibling.Find("p").Each(func(_ int, para *goquery.Selection) {
		text := para.Text()
		if len(text) == 0 {
			return
		}

		stopWords := this.config.stopWords.stopWordsCount(this.config.targetLanguage, text)
		linkHeavy := this.isHighLinkDensity(para)
		if threshold < float64(stopWords.stopWordCount) && !linkHeavy {
			textNode := &html.Node{
				Type:     html.TextNode,
				Data:     text,
				DataAtom: atom.P,
			}
			paragraphs = append(paragraphs, &goquery.Selection{Nodes: []*html.Node{textNode}})
		}
	})

	return paragraphs
}
Example #8
0
// img2Link turns an <img> node into an <a> node in place.
//
// Nodes whose Data is not "img" are left alone. For an image, every "src"
// attribute is renamed to "href", the attribute cfrom="img" is set and a
// text child of the form "[img] <label> <url> | " is appended. The label is
// the image title, or the marker "[ctdr]" when closureTextNodeExists reports
// true for the node.
func img2Link(img *html.Node) {
	if img.Data != "img" {
		return
	}

	img.Data = "a"
	for i := range img.Attr {
		if img.Attr[i].Key == "src" {
			img.Attr[i].Key = "href"
		}
	}

	label := "[ctdr]" // content title double removed
	hasDouble := closureTextNodeExists(img)
	title := attrX(img.Attr, "title")
	if !hasDouble {
		label = title
	}
	imgContent := fmt.Sprintf("[img] %v %v | ", label, urlBeautify(attrX(img.Attr, "href")))

	img.Attr = attrSet(img.Attr, "cfrom", "img")
	img.AppendChild(dom.Nd("text", imgContent))
}
Example #9
0
// copyNode copies the payload fields of from (type, atom, data, namespace
// and attributes) into to. The tree links of to (parent, siblings,
// children) are not touched. The attribute slice is assigned, not
// duplicated, so both nodes share it afterwards.
func copyNode(to, from *html.Node) {
	to.Type, to.DataAtom, to.Data = from.Type, from.DataAtom, from.Data
	to.Namespace = from.Namespace
	to.Attr = from.Attr
}
Example #10
0
// parseText runs the node's text through processText. A non-empty result
// replaces the node's data; an empty result removes the node from its
// parent. The node that followed the text node before any removal is
// returned; the error is always nil.
func (m *minificationText) parseText(node *html.Node) (*html.Node, error) {
	// Remember the successor first: after RemoveChild the node no longer has
	// any siblings.
	following := node.NextSibling

	if cleaned := m.processText(node.Data); len(cleaned) == 0 {
		node.Parent.RemoveChild(node)
	} else {
		node.Data = cleaned
	}

	return following, nil
}
Example #11
0
// Nd is a shorthand constructor for html nodes.
//
// With ntype "text" it returns a text node whose data is content[0], if
// given. Any other ntype yields an element node with that tag name; content
// makes no sense there, so passing it only triggers a stack trace and a log
// line.
func Nd(ntype string, content ...string) *html.Node {
	hasContent := len(content) > 0

	if ntype == "text" {
		textNode := &html.Node{Type: html.TextNode}
		if hasContent {
			textNode.Data = content[0]
		}
		return textNode
	}

	if hasContent {
		runtimepb.StackTrace(4)
		log.Printf("Element nodes can't have content")
	}
	return &html.Node{Type: html.ElementNode, Data: ntype}
}
Example #12
0
// walkPrint writes an HTML-formatted debug dump of n, its following
// siblings and all their descendants to w.
//
// i is the indentation depth (number of leading tabs). Whitespace-only text
// nodes are skipped. Elements are printed with their MaxChild/Count
// statistics and attributes, text nodes as quoted, escaped strings. Nodes
// that are their parent's recorded Child, or that are marked Chosen or
// ChosenBy, are wrapped in coloured <div> containers. Node labels are
// shortened to 40 runes in the output only; the tree itself is not
// modified. Write errors are ignored because the output is best-effort
// debugging aid.
func walkPrint(w io.Writer, i int, n *html.Node) {
	const maxLabelRunes = 40

	for ; n != nil; n = n.NextSibling {
		if n.Type == html.TextNode && strings.TrimSpace(n.Data) == "" {
			continue
		}

		d := getData(n)
		isMostChild := getData(n.Parent).Child == n
		highlighted := d.Chosen || d.ChosenBy

		if isMostChild {
			io.WriteString(w, `<div style="background: rgba(0, 0, 100, 0.1)">`)
		}
		if highlighted {
			color := "rgb(40, 79, 40)"
			if d.ChosenBy {
				color = "rgba(90, 60, 30, 0.8)"
			}
			io.WriteString(w, `<div id="chosen" style="background: `+color+`;color: #fff">`)
		}

		// Share of the largest child in percent; zero when nothing was counted.
		factor := 0
		if d.Count > 0 {
			factor = d.MaxChild * 100 / d.Count
		}

		// Shorten a copy of the label. Writing the shortened value back into
		// n.Data would silently corrupt the document that is being inspected.
		label := n.Data
		if runes := []rune(label); len(runes) > maxLabelRunes {
			label = string(runes[:maxLabelRunes])
		}

		indent := strings.Repeat("\t", i)
		if n.Type == html.ElementNode {
			fmt.Fprintf(w, "%v&lt;%v&gt;", indent, label)
			fmt.Fprintf(w, " (%v/%v = <b>%v%%</b>) - %v\n",
				d.MaxChild,
				d.Count,
				factor,
				n.Attr,
			)
		} else {
			fmt.Fprintf(w, "%v%v\n", indent, strconv.Quote(ghtml.EscapeString(label)))
		}

		if n.FirstChild != nil {
			walkPrint(w, i+1, n.FirstChild)
		}

		// Close the containers in the same order the original output used.
		if isMostChild {
			io.WriteString(w, `</div>`)
		}
		if highlighted {
			io.WriteString(w, "</div>")
		}
	}
}
Example #13
0
// TestParseATagNoHref checks that an <a> tag without an href attribute does
// not add anything to the page's links.
func TestParseATagNoHref(t *testing.T) {
	anchor := &html.Node{Data: "a"}

	page := newWebPage(startUrl)
	page.parseATag(anchor)

	const want = 0
	if got := page.links.Len(); got != want {
		t.Error("Expected:", want, " Got:", got)
	}
}
Example #14
0
// CloneNode makes a copy of a Node with all descendants.
//
// The clone carries the same type, atom, data, namespace and a private copy
// of the attributes; every child is cloned recursively and appended in the
// original order. The returned node has no parent and no siblings.
func CloneNode(n *exphtml.Node) *exphtml.Node {
	clone := new(exphtml.Node)
	clone.Type = n.Type
	clone.DataAtom = n.DataAtom
	clone.Data = n.Data
	// Without the namespace, cloned foreign content (e.g. svg or math
	// elements) would lose its namespace.
	clone.Namespace = n.Namespace
	// Copy the attributes so that later edits to one tree cannot leak into
	// the other.
	clone.Attr = make([]exphtml.Attribute, len(n.Attr))
	copy(clone.Attr, n.Attr)
	for c := n.FirstChild; c != nil; c = c.NextSibling {
		clone.AppendChild(CloneNode(c))
	}
	return clone
}
Example #15
0
// TestParseScriptTagNoSrc checks that a <script> tag without a src
// attribute does not add anything to the page's script files.
func TestParseScriptTagNoSrc(t *testing.T) {
	script := &html.Node{Data: "script"}

	page := newWebPage(startUrl)
	page.parseScriptTag(script)

	const want = 0
	if got := page.scriptFiles.Len(); got != want {
		t.Error("Expected:", want, " Got:", got)
	}
}
Example #16
0
// removeEmptyNodes prunes meaningless elements from the subtree rooted at
// n, children first.
//
// Removed from their parent are: <img> without a src, childless <a> whose
// href is empty or "#", and childless <em>, <strong>, <div>, <span>, <li>
// and <p>. A <span> whose only child is a text node of fewer than three
// bytes (after trimming) is flattened into a text node itself.
// lvl is the recursion depth; it is only forwarded to the recursive calls.
func removeEmptyNodes(n *html.Node, lvl int) {
	// Snapshot the children first, so that removals performed by the
	// recursive calls cannot cut the walk short.
	var snapshot []*html.Node
	for child := n.FirstChild; child != nil; child = child.NextSibling {
		snapshot = append(snapshot, child)
	}
	for _, child := range snapshot {
		removeEmptyNodes(child, lvl+1)
	}

	// Everything below applies to element nodes only.
	if n.Type != html.ElementNode {
		return
	}

	childless := n.FirstChild == nil
	switch n.Data {
	case "img":
		if attrX(n.Attr, "src") == "" {
			n.Parent.RemoveChild(n)
		}
	case "a":
		if childless {
			if href := attrX(n.Attr, "href"); href == "#" || href == "" {
				n.Parent.RemoveChild(n)
			}
		}
	case "em", "strong", "div", "span", "li", "p":
		if childless {
			n.Parent.RemoveChild(n)
		}
	}

	// Spans holding next to nothing => flatten to text.
	if n.Data != "span" {
		return
	}
	only := n.FirstChild
	if only == nil || only != n.LastChild || only.Type != html.TextNode {
		return
	}
	if len(strings.TrimSpace(only.Data)) < 3 {
		n.Type = html.TextNode
		n.Data = only.Data
		n.RemoveChild(only)
	}
}
Example #17
0
// TestParseATagAbsoluteDiffHost checks that an absolute link pointing to a
// different host is not added to the page's links.
func TestParseATagAbsoluteDiffHost(t *testing.T) {
	anchor := &html.Node{
		Data: "a",
		Attr: []html.Attribute{{Key: "href", Val: "http://www.google.com"}},
	}

	page := newWebPage(startUrl)
	page.parseATag(anchor)

	const want = 0
	if got := page.links.Len(); got != want {
		t.Error("Expected:", want, " Got:", got)
	}
}
Example #18
0
// TestParseLinkTagNoRel checks that a <link> tag without a rel attribute is
// not registered as a style sheet.
func TestParseLinkTagNoRel(t *testing.T) {
	link := &html.Node{
		Data: "link",
		Attr: []html.Attribute{{Key: "href", Val: "1.css"}},
	}

	page := newWebPage(startUrl)
	page.parseLinkTag(link)

	const want = 0
	if got := page.styleSheets.Len(); got != want {
		t.Error("Expected:", want, " Got:", got)
	}
}
Example #19
0
// TestParseATagInvalidUrl checks that an <a> tag whose href cannot be
// parsed as a URL does not add anything to the page's links.
func TestParseATagInvalidUrl(t *testing.T) {
	anchor := &html.Node{
		Data: "a",
		Attr: []html.Attribute{{Key: "href", Val: "%gh&%ij"}},
	}

	page := newWebPage(startUrl)
	page.parseATag(anchor)

	const want = 0
	if got := page.links.Len(); got != want {
		t.Error("Expected:", want, " Got:", got)
	}
}
Example #20
0
// TestParseLinkTagInvalidUrl checks that a stylesheet <link> whose href
// cannot be parsed as a URL is not registered as a style sheet.
func TestParseLinkTagInvalidUrl(t *testing.T) {
	link := &html.Node{
		Data: "link",
		Attr: []html.Attribute{
			{Key: "href", Val: "%gh&%ij"},
			{Key: "rel", Val: "stylesheet"},
		},
	}

	page := newWebPage(startUrl)
	page.parseLinkTag(link)

	const want = 0
	if got := page.styleSheets.Len(); got != want {
		t.Error("Expected:", want, " Got:", got)
	}
}
Example #21
0
// dedupApply walks the subtree rooted at n, children first, and turns every
// element whose outline key is flagged in dedups into a comment node. The
// key is the element's "ol" attribute followed by a dot; the resulting
// comment text is the former tag name plus " replaced".
func dedupApply(n *html.Node, dedups map[string]bool) {
	child := n.FirstChild
	for child != nil {
		dedupApply(child, dedups)
		child = child.NextSibling
	}

	if n.Type != html.ElementNode {
		return
	}

	key := attrX(n.Attr, "ol") + "."
	if !dedups[key] {
		return
	}
	n.Type = html.CommentNode
	n.Data += " replaced"
}
Example #22
0
// clean normalises styles/colspan and removes any CleanTags specified, along with newlines;
// but also makes all the character handling (for example "&#160;" as utf-8) the same.
// It returns the estimated number of treeRunes that will be used.
//
// For element nodes, empty style attributes and colspan="1" on <td> are
// deleted, and non-empty styles lose their spaces and gain a trailing ";".
// For text nodes the data is unescaped and its rune count is added to the
// estimate. Child elements whose tag name is listed in c.CleanTags are
// removed; all other children are cleaned recursively.
//
// Every child is visited exactly once. The walk remembers the next sibling
// before a child may be removed, instead of restarting from the first child
// after each removal, which would re-clean (and re-count, and re-unescape)
// the children that had already been processed.
// TODO more cleaning of the input HTML, as required.
func (c *Config) clean(n *html.Node) int {
	size := 1
	switch n.Type {
	case html.ElementNode:
		for ai := 0; ai < len(n.Attr); ai++ {
			a := n.Attr[ai]
			key := strings.ToLower(a.Key)
			switch {
			case key == "style":
				if strings.TrimSpace(a.Val) == "" { // delete empty styles
					n.Attr = delAttr(n.Attr, ai)
					ai-- // stay on this index, it now holds the next attribute
				} else { // tidy non-empty styles
					// TODO there could be more here to make sure the style entries are in the same order etc.
					val := strings.Replace(a.Val, " ", "", -1)
					if !strings.HasSuffix(val, ";") {
						val += ";"
					}
					n.Attr[ai].Val = val
				}
			case n.DataAtom == atom.Td && key == "colspan" && strings.TrimSpace(a.Val) == "1":
				n.Attr = delAttr(n.Attr, ai)
				ai-- // stay on this index, it now holds the next attribute
			}
		}
	case html.TextNode:
		n.Data = htm.UnescapeString(n.Data)
		size += utf8.RuneCountInString(n.Data) - 1 // len(n.Data) would be faster, but use more memory
	}

	for ch := n.FirstChild; ch != nil; {
		// RemoveChild clears ch's sibling links, so fetch the successor first.
		next := ch.NextSibling

		unwanted := false
		if ch.Type == html.ElementNode {
			for _, tag := range c.CleanTags {
				if tag == ch.Data {
					unwanted = true
					break
				}
			}
		}

		if unwanted {
			n.RemoveChild(ch)
		} else {
			size += c.clean(ch)
		}
		ch = next
	}
	return size
}
Example #23
0
// TestParseATagRelative checks that a relative href is resolved against the
// start URL and recorded as the page's only link.
func TestParseATagRelative(t *testing.T) {
	node := &html.Node{
		Data: "a",
		Attr: []html.Attribute{{Key: "href", Val: "1.html"}},
	}

	page := newWebPage(startUrl)
	page.parseATag(node)

	expected1 := 1
	val1 := page.links.Len()
	if val1 != expected1 {
		// Fatal rather than Error: with an empty list the Front() call below
		// would presumably yield nil (as container/list does) and the test
		// would panic instead of reporting the failure.
		t.Fatal("Expected:", expected1, " Got:", val1)
	}
	expected2 := startUrl + "1.html"
	val2 := page.links.Front().Value
	if val2 != expected2 {
		t.Error("Expected:", expected2, " Got:", val2)
	}
}
Example #24
0
// TestParseScriptTagAbsolute checks that an absolute src on the start host
// is recorded unchanged as the page's only script file.
func TestParseScriptTagAbsolute(t *testing.T) {
	node := &html.Node{
		Data: "script",
		Attr: []html.Attribute{{Key: "src", Val: startUrl + "1.js"}},
	}

	page := newWebPage(startUrl)
	page.parseScriptTag(node)

	expected1 := 1
	val1 := page.scriptFiles.Len()
	if val1 != expected1 {
		// Fatal rather than Error: with an empty list the Front() call below
		// would presumably yield nil (as container/list does) and the test
		// would panic instead of reporting the failure.
		t.Fatal("Expected:", expected1, " Got:", val1)
	}
	expected2 := startUrl + "1.js"
	val2 := page.scriptFiles.Front().Value
	if val2 != expected2 {
		t.Error("Expected:", expected2, " Got:", val2)
	}
}
Example #25
0
// forceMaxDepth limits the tree rooted at n to depth levels.
//
// When the budget is used up (depth == 0), n is turned into a childless
// text node reading "[omitted]" and all siblings that follow it are removed
// from the parent. Otherwise the children of element nodes are processed
// with the budget reduced by one; non-element nodes are left untouched.
func forceMaxDepth(n *html.Node, depth int) {
	switch {
	case depth == 0:
		n.Type = html.TextNode
		n.FirstChild, n.LastChild = nil, nil
		n.Attr = nil
		n.Data = "[omitted]"
		// Drop everything that follows on this level; one placeholder stands
		// in for the whole remainder.
		for sib := n.NextSibling; sib != nil; sib = n.NextSibling {
			n.Parent.RemoveChild(sib)
		}
	case n.Type == html.ElementNode:
		for child := n.FirstChild; child != nil; child = child.NextSibling {
			forceMaxDepth(child, depth-1)
		}
	}
}
Example #26
0
// append1 actually appends to the merged HTML node tree.
//
// proto is the node to reproduce; for text nodes its data is replaced by
// text. action selects the markup: '=' appends the copy as is, while any
// other action wraps it in a <span> that carries the attributes configured
// for inserted ('+'), deleted ('-') or replaced ('~') content. The copy is
// then wrapped in copies of proto's ancestors up to, but not including, the
// ancestor reported by lastMatchingLeaf, and finally appended at the append
// point. Nothing happens when proto is nil, when no append point or
// ancestor is found, or when their atoms differ.
func (ap *appendContext) append1(action rune, text string, proto *html.Node, pos posT) {
	if proto == nil {
		return
	}
	appendPoint, protoAncestor := ap.lastMatchingLeaf(proto, action, pos)
	if appendPoint == nil || protoAncestor == nil || appendPoint.DataAtom != protoAncestor.DataAtom {
		return
	}

	leaf := new(html.Node)
	copyNode(leaf, proto)
	if proto.Type == html.TextNode {
		leaf.Data = text
	}

	if action != '=' {
		wrapper := &html.Node{
			Type:     html.ElementNode,
			DataAtom: atom.Span,
			Data:     "span",
		}
		switch action {
		case '+':
			wrapper.Attr = convertAttributes(ap.c.InsertedSpan)
		case '-':
			wrapper.Attr = convertAttributes(ap.c.DeletedSpan)
		case '~':
			wrapper.Attr = convertAttributes(ap.c.ReplacedSpan)
		}
		wrapper.AppendChild(leaf)
		leaf = wrapper
	}

	// Rebuild the chain of ancestors between proto and the matched ancestor,
	// innermost first.
	for anc := proto.Parent; anc != nil && anc != protoAncestor; anc = anc.Parent {
		shell := new(html.Node)
		copyNode(shell, anc)
		shell.AppendChild(leaf)
		leaf = shell
	}

	appendPoint.AppendChild(leaf)
}
Example #27
0
// TestParseLinkTagRelative checks that a relative stylesheet href is
// resolved against the start URL and recorded as the page's only style
// sheet.
func TestParseLinkTagRelative(t *testing.T) {
	node := &html.Node{
		Data: "link",
		Attr: []html.Attribute{
			{Key: "href", Val: "1.css"},
			{Key: "rel", Val: "stylesheet"},
		},
	}

	page := newWebPage(startUrl)
	page.parseLinkTag(node)

	expected1 := 1
	val1 := page.styleSheets.Len()
	if val1 != expected1 {
		// Fatal rather than Error: with an empty list the Front() call below
		// would presumably yield nil (as container/list does) and the test
		// would panic instead of reporting the failure.
		t.Fatal("Expected:", expected1, " Got:", val1)
	}
	expected2 := startUrl + "1.css"
	val2 := page.styleSheets.Front().Value
	if val2 != expected2 {
		t.Error("Expected:", expected2, " Got:", val2)
	}
}
Example #28
0
// convertExotic standardizes <section> or <header> nodes
// towards <div> nodes.
//
// If n is an element whose tag name has an entry in exotics, the tag name
// is replaced by that entry and the original name is preserved in a "cfrm"
// attribute. Non-element nodes are never touched: the Data of a text,
// comment or doctype node is content, not a tag name, and must not be
// rewritten just because it happens to equal an exotic tag name.
// NOTE(review): n.DataAtom is left as it was - confirm nothing downstream
// relies on the atom matching the new tag name.
func convertExotic(n *html.Node) {
	if n.Type != html.ElementNode {
		return
	}
	repl, ok := exotics[n.Data]
	if !ok {
		return
	}
	n.Attr = append(n.Attr, html.Attribute{Key: "cfrm", Val: n.Data})
	n.Data = repl
}
Example #29
0
// convertDivsToParagraphs processes every element of doc that matches the
// selector domType and returns doc.
//
// Elements whose inner HTML matches divToPElementsPattern are handed to
// replaceWithPara and counted as bad divs. For all other elements the text
// of their direct text-node children is collected (newlines and whatever
// tabsRegEx matches are stripped first), a new node with atom P carrying the
// collected text is built, and the collected text nodes are removed from the
// tree. Progress is logged when c.config.debug is set.
func (c *Cleaner) convertDivsToParagraphs(doc *goquery.Document, domType string) *goquery.Document {
	if c.config.debug {
		log.Println("Starting to replace bad divs...")
	}
	badDivs := 0
	convertedTextNodes := 0
	divs := doc.Find(domType)

	divs.Each(func(i int, div *goquery.Selection) {
		// The error from Html() is ignored; divHTML is then matched as is.
		divHTML, _ := div.Html()
		if divToPElementsPattern.Match([]byte(divHTML)) {
			c.replaceWithPara(div)
			badDivs++
		} else {
			var replacementText []string
			nodesToRemove := list.New()
			children := div.Contents()
			if c.config.debug {
				log.Printf("Found %d children of div\n", children.Size())
			}
			children.EachWithBreak(func(i int, kid *goquery.Selection) bool {
				text := kid.Text()
				kidNode := kid.Get(0)
				tag := kidNode.Data
				// A node whose Data equals its text is treated as a text node.
				// NOTE(review): this looks like a heuristic stand-in for a node type check - confirm.
				if tag == text {
					tag = "#text"
				}
				if tag == "#text" {
					text = strings.Replace(text, "\n", "", -1)
					text = tabsRegEx.ReplaceAllString(text, "")
					if text == "" {
						return true
					}
					// Texts of a single byte are neither collected nor removed.
					if len(text) > 1 {
						prev := kidNode.PrevSibling
						if c.config.debug {
							log.Printf("PARENT CLASS: %s NODENAME: %s\n", c.config.parser.name("class", div), tag)
							log.Printf("TEXTREPLACE: %s\n", strings.Replace(text, "\n", "", -1))
						}
						// A directly preceding <a> contributes its HTML ahead of the text.
						if prev != nil && prev.DataAtom == atom.A {
							nodeSelection := kid.HasNodes(prev)
							// Note: this local shadows the html package for the rest of the block.
							html, _ := nodeSelection.Html()
							replacementText = append(replacementText, html)
							if c.config.debug {
								log.Printf("SIBLING NODENAME ADDITION: %s TEXT: %s\n", prev.Data, html)
							}
						}
						replacementText = append(replacementText, text)
						nodesToRemove.PushBack(kidNode)
						convertedTextNodes++
					}

				}
				// Always continue with the next child.
				return true
			})

			// The collected text is stored in Data of an element node with atom P.
			newNode := new(html.Node)
			newNode.Type = html.ElementNode
			newNode.Data = strings.Join(replacementText, "")
			newNode.DataAtom = atom.P
			// NOTE(review): the result of AddNodes is discarded - confirm that
			// this call attaches newNode to the document as intended.
			div.First().AddNodes(newNode)

			// Detach the text nodes whose content was collected above.
			for s := nodesToRemove.Front(); s != nil; s = s.Next() {
				node := s.Value.(*html.Node)
				if node != nil && node.Parent != nil {
					node.Parent.RemoveChild(node)
				}
			}
		}
	})
	if c.config.debug {
		log.Printf("Found %d total divs with %d bad divs replaced and %d textnodes converted inside divs", divs.Size(), badDivs, convertedTextNodes)
	}
	return doc

}
Example #30
0
// findNodeformNodesbyIndexOrPro locates a node among nodes, by index or by
// attributes, and applies the values in m to it.
//
// For OPTION and RADIO types the first node whose VALUE attribute equals
// m[VALUE] is marked as selected (OPTION) or checked (RADIO). If no node
// matches and visible is set, a new node modelled on nodes[0] (same type
// and tag name, carrying the value, the marker and the type attribute) is
// appended to the parent of nodes[0]; with an empty nodes slice there is
// nothing to model it on and nothing is appended. index is not used for
// these types.
//
// For every other type the node at *index receives all entries of m as
// attributes and *index is advanced by one. If *index is past the end of
// nodes, nothing happens and *index stays as it is.
func findNodeformNodesbyIndexOrPro(nodes []*goquery.Selection, index *int, m map[string]string, Type string, visible bool) {
	if Type == OPTION || Type == RADIO {
		marker := "selected"
		if Type == RADIO {
			marker = "checked"
		}

		for _, sel := range nodes {
			for _, attr := range sel.Get(0).Attr {
				if attr.Key == VALUE && attr.Val == m[VALUE] {
					sel.SetAttr(marker, marker)
					return
				}
			}
		}

		// No existing node matched. The length check guards nodes[0], which
		// would panic on an empty slice.
		if visible && len(nodes) > 0 {
			template := nodes[0].Get(0)
			node := &html.Node{
				Type: template.Type,
				Data: template.Data,
				Attr: []html.Attribute{
					{Key: VALUE, Val: m[VALUE]},
					{Key: marker, Val: marker},
					{Key: TYPE, Val: Type},
				},
			}
			nodes[0].Parent().AppendNodes(node)
		}
		return
	}

	if len(nodes) <= *index {
		return
	}
	for k, v := range m {
		nodes[*index].SetAttr(k, v)
	}

	*index++
}