Пример #1
1
// cleansDom performs brute reduction and simplification
//
func cleanseDom(n *html.Node, lvl int) {

	n.Attr = removeAttr(n.Attr, unwantedAttrs)

	// Children
	for c := n.FirstChild; c != nil; c = c.NextSibling {
		cleanseDom(c, lvl+1)
	}

	if directlyRemoveUnwanted {
		removeUnwanted(n)
	} else {
		convertUnwanted(n)
	}

	// ---

	convertExotic(n)

	// one time text normalization
	if n.Type == html.TextNode {
		n.Data = stringspb.NormalizeInnerWhitespace(n.Data)
	}

}
Пример #2
1
func (m *minificationHTML) toDiv(node *html.Node) (*html.Node, error) {
	node.DataAtom = atom.Div
	node.Data = "div"
	node.Attr = nil

	return m.parseChildren(node)
}
Пример #3
1
// Attache attaches these attributes to an html.Node.
func (g HTML) Attach(node *html.Node) {
	attrs := []html.Attribute{}
	if g.ContentEditable > 0 {
		if g.ContentEditable == OTrue {
			attrs = attr(attrs, "contenteditable", "true")
		} else {
			attrs = attr(attrs, "contenteditable", "false")
		}
	}
	if g.Hidden > 0 {
		if g.Hidden == OTrue {
			attrs = attr(attrs, "hidden", "true")
		} else {
			attrs = attr(attrs, "hidden", "false")
		}
	}

	if len(g.Data) > 0 {
		for k, v := range g.Data {
			attrs = attr(attrs, k, v)
		}
	}

	if len(g.Class) > 0 {
		v := strings.Join(g.Class, " ")
		attrs = attr(attrs, "class", v)
	}

	s := []string{"AccessKey", "Id", "Dir", "Lang", "Style", "TabIndex", "Title", "Translate"}
	attrs = append(attrs, structToAttrs(g, s...)...)

	node.Attr = append(node.Attr, attrs...)
}
Пример #4
0
// convert nodes to /x/net/html.Node siblings.
// Document node children are integrated as siblings.
// Nils are skipped.
func (s Siblings) convert(parent *html.Node) (first, last *html.Node) {
	var prev *html.Node
	for _, n := range s {
		if n == nil {
			continue
		}
		if n.Type == html.DocumentNode {
			start, end := n.Children.convert(parent)
			if prev != nil {
				prev.NextSibling = start
			} else {
				first = start
			}
			prev = end
			continue
		}
		h := n.convert()
		h.Parent = parent
		h.PrevSibling = prev
		if prev != nil {
			prev.NextSibling = h
		} else {
			first = h
		}
		prev = h
	}
	return first, prev
}
Пример #5
0
// finds article's title and body in ria.ru html style
// works cleary on 15.12.2015
func FindTitleAndBody_Ria(node *html.Node) (*html.Node, *html.Node) {
	var title, fulltext *html.Node

	if node.Type == html.ElementNode {
		for _, tag := range node.Attr {
			if tag.Key == "itemprop" {
				if tag.Val == "articleBody" {
					node.Data = "body"
					fulltext = node
					break
				}
				if tag.Val == "name" {
					node.Data = "title"
					title = node
					break
				}
			}
		}
	}

	for c := node.FirstChild; c != nil; c = c.NextSibling {
		ptitle, pfulltext := FindTitleAndBody_Ria(c)
		if ptitle != nil {
			title = ptitle
		}
		if pfulltext != nil {
			fulltext = pfulltext
		}
		if title != nil && fulltext != nil {
			break
		}
	}
	return title, fulltext

}
Пример #6
0
func FindTitleAndBody_MK(node *html.Node) (*html.Node, *html.Node) {
	var title, fulltext *html.Node

	if node.Type == html.ElementNode {
		for _, tag := range node.Attr {
			if tag.Key == "class" {
				if tag.Val == "content" {
					title = FindTitleMK(node)
					node.Data = "body"
					fulltext = node
					break
				}
			}
		}
	}

	for c := node.FirstChild; c != nil; c = c.NextSibling {
		ptitle, pfulltext := FindTitleAndBody_MK(c)
		if ptitle != nil {
			title = ptitle
			title.Data = "title"
		}
		if pfulltext != nil {
			fulltext = pfulltext
		}
		if title != nil && fulltext != nil {
			break
		}
	}
	return title, fulltext

}
Пример #7
0
func (this *contentExtractor) getSiblingsContent(currentSibling *goquery.Selection, baselinescoreSiblingsPara float64) []*goquery.Selection {
	ps := make([]*goquery.Selection, 0)
	if currentSibling.Get(0).DataAtom.String() == "p" && len(currentSibling.Text()) > 0 {
		ps = append(ps, currentSibling)
		return ps
	} else {
		potentialParagraphs := currentSibling.Find("p")
		potentialParagraphs.Each(func(i int, s *goquery.Selection) {
			text := s.Text()
			if len(text) > 0 {
				ws := this.config.stopWords.stopWordsCount(this.config.targetLanguage, text)
				paragraphScore := ws.stopWordCount
				siblingBaselineScore := 0.30
				highLinkDensity := this.isHighLinkDensity(s)
				score := siblingBaselineScore * baselinescoreSiblingsPara
				if score < float64(paragraphScore) && !highLinkDensity {
					node := new(html.Node)
					node.Type = html.TextNode
					node.Data = text
					node.DataAtom = atom.P
					nodes := make([]*html.Node, 1)
					nodes[0] = node
					newSelection := new(goquery.Selection)
					newSelection.Nodes = nodes
					ps = append(ps, newSelection)
				}
			}

		})
	}
	return ps
}
Пример #8
0
func wrapText(nodes []*html.Node) []*html.Node {
	wrapped := make([]*html.Node, 0, len(nodes))
	var wrapper *html.Node
	appendWrapper := func() {
		if wrapper != nil {
			// render and re-parse so p-inline-p expands
			wrapped = append(wrapped, ParseDepth(Render(wrapper), 0)...)
			wrapper = nil
		}
	}
	for _, n := range nodes {
		if n.Type == html.ElementNode && isBlockElement[n.DataAtom] {
			appendWrapper()
			wrapped = append(wrapped, n)
			continue
		}
		if wrapper == nil && n.Type == html.TextNode && strings.TrimSpace(n.Data) == "" {
			wrapped = append(wrapped, n)
			continue
		}
		if wrapper == nil {
			wrapper = &html.Node{
				Type:     html.ElementNode,
				Data:     "p",
				DataAtom: atom.P,
			}
		}

		wrapper.AppendChild(n)
	}
	appendWrapper()
	return wrapped
}
Пример #9
0
func CompactNode(n *html.Node) {
	var appendNodes []*html.Node
	for c := n.FirstChild; c != nil; {
		CompactNode(c)
		if _mergeTextElements[c.Data] {
			appendNodes = append(appendNodes, GetChildNodes(c)...)
			log.Info("delete", c.Data)
			c = RemoveNode(c)
		} else if c.Type == html.ElementNode && c.FirstChild == nil && !_voidElements[c.Data] {
			log.Info("delete", c.Data)
			c = RemoveNode(c)
		} else {
			c = c.NextSibling
		}
	}

	DetachNodes(appendNodes)
	AppendChildNodes(n, appendNodes)
	if n.FirstChild != nil && n.FirstChild.NextSibling == nil {
		if n.FirstChild.Data == n.Data || (n.FirstChild.Data == "br" && (n.Data == "p" || n.Data == "div")) {
			childNodes := GetChildNodes(n.FirstChild)
			log.Info("delete", n.FirstChild.Data)
			n.RemoveChild(n.FirstChild)
			DetachNodes(childNodes)
			AppendChildNodes(n, childNodes)
		} else if n.FirstChild.Data == "img" && n.Data == "a" {
			*n = *n.FirstChild
		}
	}
}
Пример #10
0
func img2Link(img *html.Node) {

	if img.Data == "img" {

		img.Data = "a"
		for i := 0; i < len(img.Attr); i++ {
			if img.Attr[i].Key == "src" {
				img.Attr[i].Key = "href"
			}
		}

		double := closureTextNodeExists(img)
		imgContent := ""
		title := attrX(img.Attr, "title")

		if double {
			imgContent = fmt.Sprintf("[img] %v %v | ",
				"[ctdr]", // content title double removed
				urlBeautify(attrX(img.Attr, "href")))

		} else {
			imgContent = fmt.Sprintf("[img] %v %v | ",
				title,
				urlBeautify(attrX(img.Attr, "href")))
		}

		img.Attr = attrSet(img.Attr, "cfrom", "img")
		nd := dom.Nd("text", imgContent)
		img.AppendChild(nd)
	}

}
Пример #11
0
func copyNode(to, from *html.Node) {
	to.Attr = from.Attr
	to.Data = from.Data
	to.DataAtom = from.DataAtom
	to.Namespace = from.Namespace
	to.Type = from.Type
}
Пример #12
0
func removeNegativeAttributeMatches(n *html.Node) *html.Node {
	for c := n.FirstChild; c != nil; c = c.NextSibling {
		if c.Type != html.TextNode && containerregrex.MatchString(c.Data) {
			for _, attr := range c.Attr {
				key := strings.ToLower(attr.Key)
				if key == "id" || key == "class" {
					val := strings.ToLower(attr.Val)
					values := nonwordregex.Split(val, -1)
					penalty := 0
					for _, value := range values {
						if negativeregex.MatchString(value) {
							penalty = penalty + 4
						}
					}
					if penalty > 0 {
						if c.PrevSibling != nil {
							c.PrevSibling.NextSibling = c.NextSibling
						} else {
							n.FirstChild = c.NextSibling
						}
					} else {
						d := removeNegativeAttributeMatches(c)
						if c.PrevSibling != nil {
							c.PrevSibling.NextSibling = d
						} else {
							n.FirstChild = c.NextSibling
						}
					}
				}
			}
		}
	}
	return n
}
Пример #13
0
func reIndent(n *html.Node, lvl int) {

	if lvl > cScaffoldLvls && n.Parent == nil {
		bb := dom.PrintSubtree(n)
		_ = bb
		// log.Printf("%s", bb.Bytes())
		hint := ""
		if ml3[n] > 0 {
			hint = "   from ml3"
		}
		log.Print("reIndent: no parent ", hint)
		return
	}

	// Before children processing
	switch n.Type {
	case html.ElementNode:
		if lvl > cScaffoldLvls && n.Parent.Type == html.ElementNode {
			ind := strings.Repeat("\t", lvl-2)
			dom.InsertBefore(n, &html.Node{Type: html.TextNode, Data: "\n" + ind})
		}
	case html.CommentNode:
		dom.InsertBefore(n, &html.Node{Type: html.TextNode, Data: "\n"})
	case html.TextNode:
		n.Data = strings.TrimSpace(n.Data) + " "
		if !strings.HasPrefix(n.Data, ",") && !strings.HasPrefix(n.Data, ".") {
			n.Data = " " + n.Data
		}
		// link texts without trailing space
		if n.Parent != nil && n.Parent.Data == "a" {
			n.Data = strings.TrimSpace(n.Data)
		}
	}

	// Children
	for c := n.FirstChild; c != nil; c = c.NextSibling {
		reIndent(c, lvl+1)
	}

	// After children processing
	switch n.Type {
	case html.ElementNode:
		// I dont know why,
		// but this needs to happend AFTER the children
		if lvl > cScaffoldLvls && n.Parent.Type == html.ElementNode {
			ind := strings.Repeat("\t", lvl-2)
			ind = "\n" + ind
			// link texts without new line
			if n.Data == "a" {
				ind = ""
			}
			if n.LastChild != nil {
				dom.InsertAfter(n.LastChild, &html.Node{Type: html.TextNode, Data: ind})
			}
		}
	}

}
Пример #14
0
/*
   div                     div
       div                     p
           p         TO        img
           img                 p
           p


	Operates from the *middle* div.
	Saves all children in inverted slice.
	Removes each child and reattaches it one level higher.
	Finally the intermediary, now childless div is removed.




   \                  /
    \       /\       /
     \_____/  \_____/

     \              /
      \_____/\_____/

       \__________/     => Breaks are gone


       \p1___p2___/     => Wrapping preserves breaks




*/
func topDownV1(n *html.Node, couple []string, parentType string) {

	if noParent(n) {
		return
	}
	p := n.Parent

	parDiv := p.Type == html.ElementNode && p.Data == couple[0] // Parent is a div
	iAmDiv := n.Type == html.ElementNode && n.Data == couple[1] // I am a div

	noSiblings := n.PrevSibling == nil && n.NextSibling == nil

	only1Child := n.FirstChild != nil && n.FirstChild == n.LastChild
	svrlChildn := n.FirstChild != nil && n.FirstChild != n.LastChild
	noChildren := n.FirstChild == nil

	_, _ = noSiblings, noChildren

	if parDiv && iAmDiv {

		if only1Child || svrlChildn {

			var children []*html.Node
			for c := n.FirstChild; c != nil; c = c.NextSibling {
				children = append([]*html.Node{c}, children...) // order inversion
			}

			insertionPoint := n.NextSibling
			for _, c1 := range children {

				n.RemoveChild(c1)

				if c1.Type == html.TextNode || c1.Data == "a" {
					// pf("wrapping %v\n", NodeTypeStr(c1.Type))
					wrap := html.Node{Type: html.ElementNode, Data: "p",
						Attr: []html.Attribute{html.Attribute{Key: "cfrm", Val: "div"}}}
					wrap.FirstChild = c1
					p.InsertBefore(&wrap, insertionPoint)
					c1.Parent = &wrap
					insertionPoint = &wrap

				} else {
					p.InsertBefore(c1, insertionPoint)
					insertionPoint = c1
				}

			}
			p.RemoveChild(n)
			if p.Data != parentType {
				p.Data = parentType
			}

		}

	}

}
Пример #15
0
func runMergeNodes(parent, prev, next *html.Node, addSeparator bool) *html.Node {
	var u parserUtils
	if prev != nil {
		parent.AppendChild(prev)
	}
	if next != nil {
		parent.AppendChild(next)
	}
	return u.mergeNodes(parent, prev, next, addSeparator)
}
Пример #16
0
// We want to remove some children.
// A direct loop is impossible,
// since "NextSibling" is set to nil during Remove().
// Therefore:
//   First assemble children separately.
//   Then remove them.
func removeUnwanted(n *html.Node) {
	cc := []*html.Node{}
	for c := n.FirstChild; c != nil; c = c.NextSibling {
		cc = append(cc, c)
	}
	for _, c := range cc {
		if unwanteds[c.Data] {
			n.RemoveChild(c)
		}
	}
}
Пример #17
0
// Replace the given node's children with the given string.
func setNodeText(node *html.Node, s string) {
	// remove all existing children
	for node.FirstChild != nil {
		node.RemoveChild(node.FirstChild)
	}
	// add the text
	node.AppendChild(&html.Node{
		Type: html.TextNode,
		Data: s,
	})
}
Пример #18
0
func (m *minificationText) openTag(node *html.Node) {
	parent := node.Parent
	for it := node.FirstChild; it != nil; it = it.NextSibling {
		it.Parent = parent
	}
	parent.FirstChild = node.FirstChild
	parent.LastChild = node.LastChild
	node.FirstChild = nil
	node.LastChild = nil
	node.Parent = nil
}
Пример #19
0
func removeUnwanted(n *html.Node) {
	cc := []*html.Node{}
	for c := n.FirstChild; c != nil; c = c.NextSibling {
		cc = append(cc, c)
	}
	for _, c := range cc {
		if n.Type == html.ElementNode && n.Data == "script" || n.Type == html.CommentNode {
			n.RemoveChild(c)
		}
	}
}
Пример #20
0
func TestIsIcoNode(t *testing.T) {
	n := html.Node{Data: "link"}
	if isIcoNode(&n) {
		t.Errorf("Exepected node to not be a ico node")
	}

	n.Attr = []html.Attribute{
		html.Attribute{Key: "rel", Val: "icon"},
	}
	if !isIcoNode(&n) {
		t.Errorf("Exepected node to be a ico node")
	}
}
Пример #21
0
func TestIsStylesheetNode(t *testing.T) {
	n := html.Node{Data: "link"}
	if isStylesheetNode(&n) {
		t.Errorf("Exepected node to not be a stylesheet node")
	}

	n.Attr = []html.Attribute{
		html.Attribute{Key: "rel", Val: "stylesheet"},
	}
	if !isStylesheetNode(&n) {
		t.Errorf("Exepected node to be a stylesheet node")
	}
}
Пример #22
0
func replaceNodeWithChildren(n *html.Node) {
	var next *html.Node
	parent := n.Parent

	for c := n.FirstChild; c != nil; c = next {
		next = c.NextSibling
		n.RemoveChild(c)

		parent.InsertBefore(c, n)
	}

	parent.RemoveChild(n)
}
Пример #23
0
func TestParseScriptTagNoSrc(t *testing.T) {
	node := new(html.Node)
	node.Data = "script"

	page := newWebPage(startUrl)
	page.parseScriptTag(node)

	expected1 := 0
	val1 := page.scriptFiles.Len()
	if val1 != expected1 {
		t.Error("Expected:", expected1, " Got:", val1)
	}
}
Пример #24
0
func TestParseATagNoHref(t *testing.T) {
	node := new(html.Node)
	node.Data = "a"

	page := newWebPage(startUrl)
	page.parseATag(node)

	expected1 := 0
	val1 := page.links.Len()
	if val1 != expected1 {
		t.Error("Expected:", expected1, " Got:", val1)
	}
}
Пример #25
0
func (u *parserUtils) addChildTextNodeToBegining(node *html.Node, text string) {
	if node.FirstChild != nil && node.FirstChild.Type == html.TextNode {
		node.FirstChild.Data = text + node.FirstChild.Data
	} else {
		newNode := &html.Node{
			Type: html.TextNode,
			Data: text}
		if node.FirstChild == nil {
			node.AppendChild(newNode)
		} else {
			node.InsertBefore(newNode, node.FirstChild)
		}
	}
}
Пример #26
0
func removeEmptyNodes(n *html.Node, lvl int) {

	// children
	cc := []*html.Node{}
	for c := n.FirstChild; c != nil; c = c.NextSibling {
		cc = append(cc, c)
	}
	for _, c := range cc {
		removeEmptyNodes(c, lvl+1)
	}

	// processing
	// empty element nodes
	if n.Type == html.ElementNode && n.Data == "img" {
		src := attrX(n.Attr, "src")
		if src == "" {
			n.Parent.RemoveChild(n)
		}
	}

	if n.Type == html.ElementNode && n.FirstChild == nil && n.Data == "a" {
		href := attrX(n.Attr, "href")
		if href == "#" || href == "" {
			n.Parent.RemoveChild(n)
		}
	}

	if n.Type == html.ElementNode && n.FirstChild == nil &&
		(n.Data == "em" || n.Data == "strong") {
		n.Parent.RemoveChild(n)
	}

	if n.Type == html.ElementNode && n.FirstChild == nil &&
		(n.Data == "div" || n.Data == "span" || n.Data == "li" || n.Data == "p") {
		n.Parent.RemoveChild(n)
	}

	// spans with less than 2 characters inside => flatten to text
	only1Child := n.FirstChild != nil && n.FirstChild == n.LastChild
	if n.Type == html.ElementNode &&
		n.Data == "span" &&
		only1Child &&
		n.FirstChild.Type == html.TextNode &&
		len(strings.TrimSpace(n.FirstChild.Data)) < 3 {
		n.Type = html.TextNode
		n.Data = n.FirstChild.Data
		n.RemoveChild(n.FirstChild)
	}

}
Пример #27
0
func TestParseLinkTagNoRel(t *testing.T) {
	node := new(html.Node)
	node.Data = "link"
	attr1 := html.Attribute{"", "href", "1.css"}
	node.Attr = []html.Attribute{attr1}

	page := newWebPage(startUrl)
	page.parseLinkTag(node)

	expected1 := 0
	val1 := page.styleSheets.Len()
	if val1 != expected1 {
		t.Error("Expected:", expected1, " Got:", val1)
	}
}
Пример #28
0
func TestParseATagAbsoluteDiffHost(t *testing.T) {
	node := new(html.Node)
	node.Data = "a"
	attr := html.Attribute{"", "href", "http://www.google.com"}
	node.Attr = []html.Attribute{attr}

	page := newWebPage(startUrl)
	page.parseATag(node)

	expected := 0
	val := page.links.Len()
	if val != expected {
		t.Error("Expected:", expected, " Got:", val)
	}
}
Пример #29
0
func TestParseATagInvalidUrl(t *testing.T) {
	node := new(html.Node)
	node.Data = "a"
	attr := html.Attribute{"", "href", "%gh&%ij"}
	node.Attr = []html.Attribute{attr}

	page := newWebPage(startUrl)
	page.parseATag(node)

	expected1 := 0
	val1 := page.links.Len()
	if val1 != expected1 {
		t.Error("Expected:", expected1, " Got:", val1)
	}
}
Пример #30
0
func RemoveAttributes(n *html.Node, keepAttrs []string) {
	attrs := make([]html.Attribute, 0, len(n.Attr))
	dataAttrs := make(map[string]html.Attribute)
	originalAttrs := make(map[string]html.Attribute)
	for _, a := range n.Attr {
		if indexOfString(keepAttrs, a.Key) >= 0 {
			attrs = append(attrs, a)
			originalAttrs[a.Key] = a
		} else if i := strings.Index(a.Key, "data-"); i == 0 {
			a.Key = a.Key[5:]
			if len(a.Key) > 0 && indexOfString(keepAttrs, a.Key) >= 0 {
				dataAttrs[a.Key] = a
			}
		}
	}

	for k, v := range dataAttrs {
		if _, found := originalAttrs[k]; !found {
			attrs = append(attrs, v)
		}
	}

	n.Attr = attrs
	for c := n.FirstChild; c != nil; c = c.NextSibling {
		RemoveAttributes(c, keepAttrs)
	}
}