Example #1
1
// convert_flowdocument rewrites the subtree rooted at frag, in place, using
// FlowDocument element names.
//
// Text nodes are left untouched. "img", "object", "video", "audio" and
// "embed" nodes become childless comment nodes without attributes. "a" maps
// to "Hyperlink", "article" maps to "FlowDocument", and every other node
// (including "p") becomes an attribute-free "Paragraph". The first node turned
// into a paragraph is recorded in this.first_paragraph.
func (this *flowdocument_maker) convert_flowdocument(frag *html.Node) {
	if frag.Type == html.TextNode {
		return
	}

	switch frag.Data {
	case "img", "object", "video", "audio", "embed":
		// Media is not converted: only an empty comment placeholder remains.
		frag.Type = html.CommentNode
		node_clear_children(frag)
		frag.Attr = nil
	case "a":
		frag.Data = "Hyperlink"
		frag.Attr = extract_ahref_attr(frag.Attr)
	case "article":
		frag.Data = "FlowDocument"
		// Original note: setting the namespace did not work, so the
		// namespace is written as a plain xmlns attribute instead.
		frag.Attr = []html.Attribute{{Key: "xmlns", Val: fdocns}}
	default:
		frag.Data = "Paragraph"
		frag.Attr = nil
		if this.first_paragraph == nil {
			this.first_paragraph = frag
		}
	}

	// Media nodes were emptied above, so this loop is a no-op for them.
	for kid := frag.FirstChild; kid != nil; kid = kid.NextSibling {
		this.convert_flowdocument(kid)
	}
}
Example #2
0
// getSiblingsContent returns the paragraph selections worth keeping from
// currentSibling.
//
// If the sibling's first node is a <p> with non-empty text, the sibling itself
// is the only result. Otherwise each non-empty <p> descendant is scored by its
// stop word count; it is kept when that count exceeds
// 0.30*baselinescoreSiblingsPara and the paragraph is not link-dense. Kept
// paragraphs are returned as fresh single-node selections carrying only the
// paragraph text.
func (this *contentExtractor) getSiblingsContent(currentSibling *goquery.Selection, baselinescoreSiblingsPara float64) []*goquery.Selection {
	ps := make([]*goquery.Selection, 0)

	if currentSibling.Get(0).DataAtom.String() == "p" && len(currentSibling.Text()) > 0 {
		return append(ps, currentSibling)
	}

	const siblingBaselineScore = 0.30
	currentSibling.Find("p").Each(func(_ int, para *goquery.Selection) {
		text := para.Text()
		if len(text) == 0 {
			return
		}

		ws := this.config.stopWords.stopWordsCount(this.config.targetLanguage, text)
		paragraphScore := ws.stopWordCount
		// Evaluated unconditionally, as before, so any side effects of the
		// link-density check are unchanged.
		highLinkDensity := this.isHighLinkDensity(para)
		threshold := siblingBaselineScore * baselinescoreSiblingsPara
		if highLinkDensity || threshold >= float64(paragraphScore) {
			return
		}

		// The result is a detached text node tagged with the P atom.
		node := &html.Node{
			Type:     html.TextNode,
			Data:     text,
			DataAtom: atom.P,
		}
		newSelection := new(goquery.Selection)
		newSelection.Nodes = []*html.Node{node}
		ps = append(ps, newSelection)
	})
	return ps
}
Example #3
0
// node_clear_children detaches every child of frag, front to back, leaving
// frag without children.
func node_clear_children(frag *html.Node) {
	for frag.FirstChild != nil {
		frag.RemoveChild(frag.FirstChild)
	}
}
Example #4
0
// clean_element_before_header removes the leading element children of body
// until it reaches an element whose tag is name. It also stops at the first
// non-element child (for example a text node).
func clean_element_before_header(body *html.Node, name string) {
	for cur := body.FirstChild; cur != nil; {
		if cur.Type != html.ElementNode || cur.Data == name {
			return
		}
		// Remember the successor before unlinking cur.
		following := cur.NextSibling
		body.RemoveChild(cur)
		cur = following
	}
}
Example #5
0
// remove_decentant removes every descendant element of n whose tag equals
// tag. A removed element takes its whole subtree with it; other children are
// searched recursively.
func remove_decentant(n *html.Node, tag string) {
	var following *html.Node
	for cur := n.FirstChild; cur != nil; cur = following {
		// Capture the successor first: RemoveChild clears cur's sibling links.
		following = cur.NextSibling
		if cur.Type == html.ElementNode && cur.Data == tag {
			n.RemoveChild(cur)
			continue
		}
		remove_decentant(cur, tag)
	}
}
Example #6
0
// html_clean_root runs the cleaning pipeline over the document rooted at root
// and returns the node chosen as the article together with the iframes the
// cleaner collected.
//
// uribase is parsed into the cleaner's current URL; a parse error is ignored.
// The article is looked up through a unique h1, h2, h3 or h4 heading (in that
// order of preference). When no article is found, an empty <body> element is
// appended to root and used instead.
func html_clean_root(root *html.Node, uribase string) (*html.Node, []*html.Node) {
	cleaner := &html_cleaner{}
	cleaner.current_url, _ = url.Parse(uribase)
	cleaner.html_drop_unprintable(root)
	cleaner.remove_head()

	// Heading counts are sampled once, before any article detection runs.
	h1count := len(cleaner.header1s)
	h2count := len(cleaner.header2s)
	h3count := len(cleaner.header3s)
	h4count := len(cleaner.header4s)

	updated := false

	// With exactly one h1, the div holding that h1 is usually the content.
	if h1count == 1 {
		candidate := find_article_via_header_i(cleaner.header1s[0])
		if cleaner.try_update_article(candidate) {
			updated = true
		} else if cleaner.title_similar(cleaner.header1s[0].Data) {
			updated = true
			cleaner.article = candidate
		}
	}

	// With no h1 and exactly one h2, the h2 is the title and the div holding
	// it is the content. updated is necessarily false at this point.
	if h1count == 0 && h2count == 1 {
		candidate := find_article_via_header_i(cleaner.header2s[0])
		updated = cleaner.try_update_article(candidate)
	}

	if !updated && h3count == 1 {
		candidate := find_article_via_header_i(cleaner.header3s[0])
		updated = cleaner.try_update_article(candidate)
	}

	if !updated && h4count == 1 {
		candidate := find_article_via_header_i(cleaner.header4s[0])
		updated = cleaner.try_update_article(candidate)
	}

	if cleaner.article == nil {
		fallback := &html.Node{
			Type:     html.ElementNode,
			DataAtom: atom.Body,
			Data:     "body",
		}
		cleaner.article = fallback
		root.AppendChild(fallback)
	}

	// fix_forms may turn a form into a div, so it must run before
	// try_catch_phpwnd.
	cleaner.fix_forms()
	cleaner.try_catch_phpwnd()
	cleaner.clean_body()
	cleaner.clean_empty_nodes(cleaner.article)
	cleaner.clean_attributes(cleaner.article)

	return cleaner.article, cleaner.iframes
}
Example #7
0
// node_append_children copies a flattened view of src's children into target.
//
// Text children are re-created as new text nodes. Children that are "a"
// elements, or that node_is_object accepts, are shallow-cloned and filled
// recursively. Any other child is dropped while its own children are
// flattened directly into target.
func node_append_children(src *html.Node, target *html.Node) {
	foreach_child(src, func(kid *html.Node) {
		if kid.Type == html.TextNode {
			target.AppendChild(create_text(kid.Data))
			return
		}
		if kid.Data == "a" || node_is_object(kid) {
			// Keep the element itself, but flatten whatever it contains.
			wrapper := shallow_clone_element(kid)
			node_append_children(kid, wrapper)
			target.AppendChild(wrapper)
			return
		}
		node_append_children(kid, target)
	})
}
Example #8
0
// trim_small_image flags img for dropping by renaming it to "input" and
// reports whether it did so.
//
// An image is flagged when it sits directly inside an "a" element with a
// positive area below small_image_t squared, or when it is exactly 1x1. An
// image without a parent is never flagged.
func trim_small_image(img *html.Node) (drop bool) {
	width, height, _ := media_get_dim(img)
	if img.Parent == nil {
		return false
	}

	tiny_link_image := width > 0 && height > 0 &&
		width*height < small_image_t*small_image_t &&
		img.Parent.Data == "a"
	single_pixel := width == 1 && height == 1
	if !tiny_link_image && !single_pixel {
		return false
	}

	img.Data = "input"
	return true
}
Example #9
0
// clean_inline_node replaces the children of the element node n with the
// flat sequence returned by flatten_inline_node(n).
//
// The flattened list is computed first, then n is emptied, and finally each
// flattened node is detached from whatever parent it still has and appended
// to n in order.
func (this *html_cleaner) clean_inline_node(n *html.Node) {
	flattened := this.flatten_inline_node(n)

	for n.FirstChild != nil {
		n.RemoveChild(n.FirstChild)
	}

	for _, item := range flattened {
		// A node must be parentless before it can be appended elsewhere.
		if owner := item.Parent; owner != nil {
			owner.RemoveChild(item)
		}
		n.AppendChild(item)
	}
}
Example #10
0
// trim_empty_spaces_func strips blank content from the front of n's child
// list.
//
// Starting at the first child, text nodes are passed through trim and other
// nodes are processed recursively. A child that node_is_not_empty rejects
// afterwards is removed and the next child is examined; the walk stops at the
// first child that is not empty.
func (this *html_cleaner) trim_empty_spaces_func(n *html.Node, trim func(string) string) {
	for cur := n.FirstChild; cur != nil; {
		switch cur.Type {
		case html.TextNode:
			cur.Data = trim(cur.Data)
		default:
			this.trim_empty_spaces_func(cur, trim)
		}

		if node_is_not_empty(cur) {
			return
		}

		following := cur.NextSibling
		n.RemoveChild(cur)
		cur = following
	}
}
Example #11
0
// cleanupTree removes text nodes with empty Data from the tree below node.
// The splitting process can leave such nodes behind because keeping them
// around simplifies the data manipulation. Element children are cleaned
// recursively; all other node types are ignored.
func cleanupTree(node *html.Node) {
	for cur := node.FirstChild; cur != nil; {
		// Saved up front because removing cur clears its sibling links.
		following := cur.NextSibling
		if cur.Type == html.ElementNode {
			cleanupTree(cur)
		} else if cur.Type == html.TextNode && cur.Data == "" {
			node.RemoveChild(cur)
		}
		cur = following
	}
}
Example #12
0
// trim_display_none renames n to "input" when its inline style attribute
// sets the CSS display property to none.
//
// The style is parsed declaration by declaration rather than searched for the
// substrings "display" and "none", so a style such as
// "display:block; float:none" no longer marks a visible node.
func trim_display_none(n *html.Node) {
	if style_hides_node(get_attribute(n, "style")) {
		n.Data = "input"
	}
}

// style_hides_node reports whether the inline CSS in style declares
// "display: none". Property names and values are compared case-insensitively,
// a trailing "!important" is accepted, and when display is declared more than
// once the last declaration decides.
func style_hides_node(style string) bool {
	hidden := false
	for _, decl := range strings.Split(style, ";") {
		colon := strings.Index(decl, ":")
		if colon < 0 {
			continue
		}
		if strings.ToLower(strings.TrimSpace(decl[:colon])) != "display" {
			continue
		}
		fields := strings.Fields(strings.ToLower(decl[colon+1:]))
		if len(fields) == 0 {
			continue
		}
		// Handles both "none !important" and "none!important".
		hidden = strings.TrimSuffix(fields[0], "!important") == "none"
	}
	return hidden
}
Example #13
0
// processTextNode scans node.Data for shortcodes introduced by '['.
//
// An escape code (parseShortcode reports a size but an empty tag) is replaced
// in place by its inner text and scanning resumes after it. The first real
// shortcode is delegated to handleShortcode, whose results are returned
// directly. When no shortcode is found, tags is returned unchanged together
// with node.NextSibling and a nil error.
func processTextNode(node *html.Node, tags []openTag) (outTags []openTag, next *html.Node, err error) {
	for pos := 0; pos < len(node.Data); {
		r, width := utf8.DecodeRuneInString(node.Data[pos:])
		if r != '[' {
			pos += width
			continue
		}

		size, openClose, tag, rest := parseShortcode(node.Data[pos+1:])
		if size == 0 {
			// Just a literal bracket.
			pos += width
			continue
		}
		if tag != "" {
			return handleShortcode(node, tags, pos, pos+1+size, openClose, tag, rest)
		}

		// Escape code: drop the outer brackets and skip the unwrapped text.
		node.Data = node.Data[:pos] + rest + node.Data[pos+1+size:]
		pos += len(rest)
	}

	return tags, node.NextSibling, nil
}
Example #14
0
// try_update_class_attr appends a class attribute with the value class to b.
// An empty class leaves b untouched. The attributes are copied into a new
// slice, so the previous backing array is never written to.
func try_update_class_attr(b *html.Node, class string) {
	if len(class) == 0 {
		return
	}
	extended := make([]html.Attribute, 0, len(b.Attr)+1)
	extended = append(extended, b.Attr...)
	extended = append(extended, html.Attribute{Key: "class", Val: class})
	b.Attr = extended
}
Example #15
0
File: deploy.go Project: gdb/Stout
// addFiles appends one element per entry of files to parent: a
// <script src=...> for SCRIPT or a <link rel="stylesheet" href=...> for
// STYLE. It panics on any other form value as soon as a file is processed.
func addFiles(form uint8, parent *html.Node, files []string) {
	for _, name := range files {
		var (
			tag   string
			attrs []html.Attribute
		)

		switch form {
		case SCRIPT:
			tag = "script"
			attrs = []html.Attribute{
				{Key: "src", Val: name},
			}
		case STYLE:
			tag = "link"
			attrs = []html.Attribute{
				{Key: "rel", Val: "stylesheet"},
				{Key: "href", Val: name},
			}
		default:
			panic("Type not understood")
		}

		parent.AppendChild(&html.Node{
			Type: html.ElementNode,
			Data: tag,
			Attr: attrs,
		})
	}
}
Example #16
0
// splitTextNode cuts the text node "node" in two. The node itself is
// shortened in place to Data[:splitBefore]; a new text node holding
// Data[splitAfter:] is inserted right after it under the same parent and
// returned.
func splitTextNode(node *html.Node, splitBefore, splitAfter int) *html.Node {
	tail := &html.Node{Type: html.TextNode}
	// The tail must be taken before node.Data is truncated.
	tail.Data = node.Data[splitAfter:]
	node.Data = node.Data[:splitBefore]

	node.Parent.InsertBefore(tail, node.NextSibling)
	return tail
}
Example #17
0
// flatten_block_node appends a flattened form of b to article.
//
//   - "form", "inputbox" and "textarea" nodes contribute nothing.
//   - With flatt set, a node accepted by is_unflatten_node is re-created as an
//     empty element of the same tag, filled by a non-flattening recursive
//     call, and appended.
//   - A node that has inline nodes is turned into a paragraph via create_p.
//   - Otherwise each child is flattened into article in turn, receiving the
//     class string computed by cat_class(b, class).
func flatten_block_node(b *html.Node, article *html.Node, flatt bool, class string) {
	cur_class := cat_class(b, class)

	if b.Data == "form" || b.Data == "inputbox" || b.Data == "textarea" {
		return
	}

	if flatt && is_unflatten_node(b) {
		holder := create_element(b.Data)
		flatten_block_node(b, holder, false, class)
		article.AppendChild(holder)
		return
	}

	if hasInlineNodes(b) {
		article.AppendChild(create_p(b))
		return
	}

	foreach_child(b, func(kid *html.Node) {
		flatten_block_node(kid, article, true, cur_class)
	})
}
Example #18
0
// processNode walks the children of node and applies shortcode processing.
//
// Text children go through processTextNode, which may change the stack of
// open tags and the next node to visit; element children are processed
// recursively; other node types are skipped. While a tag was open before a
// child was examined, that child is moved under the node of the innermost
// open tag. An error is returned if processing a child fails or if tags are
// still open once all children have been visited.
func processNode(node *html.Node) (err error) {
	// Backing storage for the open-tag stack.
	var backing [16]openTag
	tags := backing[:0]

	var next *html.Node
	for cur := node.FirstChild; cur != nil; cur = next {
		next = cur.NextSibling

		// The reparenting target is fixed before cur is processed, so a tag
		// opened by cur itself does not capture cur.
		var target *html.Node
		if depth := len(tags); depth > 0 {
			target = tags[depth-1].node
		}

		if cur.Type == html.TextNode {
			tags, next, err = processTextNode(cur, tags)
		} else if cur.Type == html.ElementNode {
			err = processNode(cur)
		}
		if err != nil {
			return err
		}

		if target != nil {
			node.RemoveChild(cur)
			target.AppendChild(cur)
		}
	}

	if len(tags) != 0 {
		err = fmt.Errorf("shortcodes still open at end of surrounding HTML tag: %+v", tags)
	}
	return err
}
Example #19
0
// CleanHtml cleans the document rooted at root in place.
//
// It removes the nodes that clean_unprintable_element collects for dropping,
// detaches the recorded head node, and tries to locate the article through a
// unique h1 (or, when there is no h1, a unique h2). If no article is set
// afterwards, an empty <body> element is appended to root and used as the
// article. Finally the phpwind, form, body, empty-node and attribute cleaning
// passes run.
func (cleaner *HtmlCleaner) CleanHtml(root *html.Node) {
	dropping := []*html.Node{}
	cleaner.clean_unprintable_element(&dropping, root)
	for _, doomed := range dropping {
		doomed.Parent.RemoveChild(doomed)
	}

	if head := cleaner.head; head != nil {
		head.Parent.RemoveChild(head)
	}

	// With exactly one h1, the div holding that h1 is usually the content.
	if len(cleaner.header1s) == 1 {
		cleaner.try_update_article(find_article_via_header_i(cleaner.header1s[0]))
	}
	// With no h1 and exactly one h2, the h2 is the title and the div holding
	// it is the content.
	if len(cleaner.header1s) == 0 && len(cleaner.header2s) == 1 {
		cleaner.try_update_article(find_article_via_header_i(cleaner.header2s[0]))
	}

	if cleaner.Article == nil {
		fallback := &html.Node{
			Type:     html.ElementNode,
			DataAtom: atom.Body,
			Data:     "body",
		}
		cleaner.Article = fallback
		root.AppendChild(fallback)
	}

	cleaner.try_catch_phpwnd()
	cleaner.fix_forms()
	cleaner.clean_body()
	cleaner.clean_empty_nodes(cleaner.Article)
	cleaner.clean_attributes(cleaner.Article)
}
Example #20
0
// sanitizeAttributes drops every attribute of n that the whitelist does not
// allow for n's element name. The surviving attributes keep their order and
// are stored in a newly allocated slice.
func (w *Whitelist) sanitizeAttributes(n *html.Node) {
	allowed := make([]html.Attribute, 0, len(n.Attr))
	for _, attr := range n.Attr {
		if !w.HasAttributeForElement(n.Data, attr.Key) {
			continue
		}
		allowed = append(allowed, attr)
	}
	n.Attr = allowed
}
Example #21
0
// sanitizeUnwrap traverses pre-order over the nodes. Any node that is not
// whitelisted is replaced by its children: they are moved, in order, into the
// position the node occupied in its parent, and the node itself is removed.
// Whitelisted nodes and nodes without a parent are left in place.
//
// The result is whatever sanitizeNode returns.
func (w *Whitelist) sanitizeUnwrap(n *html.Node) error {
	return w.sanitizeNode(n, func(n *html.Node) bool {
		if w.HasElement(n.Data) || n.Parent == nil {
			return true
		}

		insertBefore := n.NextSibling
		firstChild := n.FirstChild
		for c := n.FirstChild; c != nil; {
			nodeToUnwrap := c
			c = c.NextSibling

			n.RemoveChild(nodeToUnwrap)
			n.Parent.InsertBefore(nodeToUnwrap, insertBefore)
		}
		n.Parent.RemoveChild(n)

		// RemoveChild cleared n.NextSibling. Restore a continuation point so a
		// linked-list style traversal of the parent's children can go on
		// (assumes sanitizeNode follows n.NextSibling afterwards, as
		// sanitizeRemove also arranges; confirm against sanitizeNode).
		// Normally that is the first unwrapped child. A childless node has
		// none, so continue with its original next sibling; leaving nil here
		// would end the walk and skip every following sibling.
		if firstChild != nil {
			n.NextSibling = firstChild
		} else {
			n.NextSibling = insertBefore
		}
		return false
	})
}
Example #22
0
// clean_attributes strips, throughout the subtree rooted at n, every
// attribute other than id, class, href and src. Children are processed before
// n itself, and n.Attr is only replaced when something was actually dropped.
func (this *HtmlCleaner) clean_attributes(n *html.Node) {
	for kid := n.FirstChild; kid != nil; kid = kid.NextSibling {
		this.clean_attributes(kid)
	}

	kept := []html.Attribute{}
	for _, attr := range n.Attr {
		switch attr.Key {
		case "id", "class", "href", "src":
			kept = append(kept, attr)
		}
	}

	if len(kept) == len(n.Attr) {
		return
	}
	n.Attr = kept
}
Example #23
0
// setAttributeValue sets attribute attrName of n to val. The first attribute
// with a matching key is updated in place; if none matches, a new attribute
// is appended. A nil n is ignored.
func setAttributeValue(attrName string, val string, n *html.Node) {
	if n == nil {
		return
	}

	for idx, existing := range n.Attr {
		if existing.Key != attrName {
			continue
		}
		// Write through the slice: the range variable is only a copy.
		n.Attr[idx].Val = val
		return
	}

	n.Attr = append(n.Attr, html.Attribute{Key: attrName, Val: val})
}
Example #24
0
// trim_invisible_image flags img for dropping by renaming it to "input" when
// its width or height attribute is explicitly zero, and reports whether it
// did so. Images without a parent, or whose width or height does not parse as
// an integer, are left alone.
func trim_invisible_image(img *html.Node) (drop bool) {
	width, werr := strconv.ParseInt(node_get_attribute(img, "width"), 0, 32)
	height, herr := strconv.ParseInt(node_get_attribute(img, "height"), 0, 32)

	both_parsed := werr == nil && herr == nil
	if !both_parsed || img.Parent == nil {
		return false
	}
	if width != 0 && height != 0 {
		return false
	}

	// At least one dimension was explicitly set to zero.
	img.Data = "input"
	return true
}
Example #25
0
// clean_block_node tidies the subtree rooted at n so that a node no longer
// mixes block-level and inline-level children.
//
//   - Mixed children: every run of inline children is wrapped in a "p"
//     element, after which the node is treated as containing only blocks.
//   - Only inline children: the node is flattened with clean_inline_node and
//     its blank edges are trimmed with trim_empty_spaces.
//   - Only block children: each child is cleaned recursively.
func (this *html_cleaner) clean_block_node(n *html.Node) {
	has_blocks := node_has_block_children(n)
	has_inlines := node_has_inline_children(n)

	if has_blocks && has_inlines {
		for cur := n.FirstChild; cur != nil; {
			if !node_is_inline(cur) {
				cur = cur.NextSibling
				continue
			}

			// Reuse a directly preceding "p" so that consecutive inline
			// nodes end up in the same paragraph.
			wrapper := cur.PrevSibling
			if wrapper == nil || wrapper.Data != "p" {
				wrapper = create_element("p")
				n.InsertBefore(wrapper, cur)
			}
			n.RemoveChild(cur)
			wrapper.AppendChild(cur)
			cur = wrapper.NextSibling
		}
	}

	switch {
	case has_blocks:
		// Either there were no inline children, or they were just wrapped.
		for kid := n.FirstChild; kid != nil; kid = kid.NextSibling {
			this.clean_block_node(kid)
		}
	case has_inlines:
		this.clean_inline_node(n)
		this.trim_empty_spaces(n)
	}
}
Example #26
0
// CloneNode returns a deep copy of n: its Type, DataAtom, Data and a copy of
// its attributes, with every descendant cloned recursively. The copy has no
// parent or siblings.
func CloneNode(n *exphtml.Node) *exphtml.Node {
	dup := &exphtml.Node{
		Type:     n.Type,
		DataAtom: n.DataAtom,
		Data:     n.Data,
		Attr:     make([]exphtml.Attribute, len(n.Attr)),
	}
	copy(dup.Attr, n.Attr)

	for kid := n.FirstChild; kid != nil; kid = kid.NextSibling {
		dup.AppendChild(CloneNode(kid))
	}
	return dup
}
Example #27
0
// clean_attributes strips, throughout the subtree rooted at n, every
// attribute other than id, class, href, src, width, height and alt.
//
// Per the original notes, class and id are used later when judging the main
// content, and width, height and alt when judging whether an image belongs to
// it. Children are processed before n itself, and n.Attr is only replaced
// when something was actually dropped.
func (this *html_cleaner) clean_attributes(n *html.Node) {
	for kid := n.FirstChild; kid != nil; kid = kid.NextSibling {
		this.clean_attributes(kid)
	}

	var kept []html.Attribute
	for _, attr := range n.Attr {
		key := attr.Key
		if key == "id" || key == "class" || key == "href" || key == "src" ||
			key == "width" || key == "height" || key == "alt" {
			kept = append(kept, attr)
		}
	}

	if len(kept) == len(n.Attr) {
		return
	}
	n.Attr = kept
}
Example #28
0
// sanitizeRemove traverses pre-order over the nodes and removes, together
// with its subtree, every node whose name is not whitelisted. A
// non-whitelisted node without a parent is left in place, but the callback
// still reports false for it.
//
// The result is whatever sanitizeNode returns.
func (w *Whitelist) sanitizeRemove(n *html.Node) error {
	return w.sanitizeNode(n, func(n *html.Node) bool {
		if w.HasElement(n.Data) {
			return true
		}

		if parent := n.Parent; parent != nil {
			following := n.NextSibling
			parent.RemoveChild(n)

			// RemoveChild cleared the link; put it back so a linked-list
			// style traversal of the parent's children can continue from n.
			n.NextSibling = following
		}
		return false
	})
}
Example #29
0
// flatten_block_node appends a flattened form of b to article.
//
//   - A media node (node_is_media) is wrapped in a paragraph.
//   - With flatt set, a node accepted by node_is_unflatten is re-created as an
//     empty element of the same tag, filled by a non-flattening recursive
//     call, and appended.
//   - With flatt unset, a node accepted by node_is_unflatten contributes
//     nothing.
//   - A node with inline children is wrapped in a paragraph.
//   - Otherwise each child is flattened into article in turn, receiving the
//     class string computed by node_cat_class(b, class).
func flatten_block_node(b *html.Node, article *html.Node, flatt bool, class string) {
	cur_class := node_cat_class(b, class)

	if node_is_media(b) {
		article.AppendChild(create_p_with_child(b))
		return
	}

	if flatt && node_is_unflatten(b) {
		holder := create_element(b.Data)
		flatten_block_node(b, holder, false, class)
		article.AppendChild(holder)
		return
	}

	if node_is_unflatten(b) {
		return
	}

	if node_has_inline_children(b) {
		article.AppendChild(create_p_with_child(b))
		return
	}

	foreach_child(b, func(kid *html.Node) {
		flatten_block_node(kid, article, true, cur_class)
	})
}
Example #30
0
File: cleaner.go Project: ngs/GoOse
// convertDivsToParagraphs processes every element of doc matched by the
// selector domType and returns doc.
//
// A matched element containing none of the tags listed in tags is handed to
// replaceWithPara and counted as a "bad div". For any other matched element,
// the text-like children longer than one byte (after newline and tab
// stripping) are collected into one replacement string, a new node carrying
// that string is added via AddNodes, and the collected children are removed
// from the tree. Progress is logged when this.config.debug is set.
func (this *cleaner) convertDivsToParagraphs(doc *goquery.Document, domType string) *goquery.Document {
	if this.config.debug {
		log.Println("Starting to replace bad divs...")
	}
	// Counters are captured and updated by the closures below; they are only
	// reported in the final debug message.
	badDivs := 0
	convertedTextNodes := 0
	divs := doc.Find(domType)
	tags := []string{"a", "blockquote", "dl", "div", "img", "ol", "p", "pre", "table", "ul"}

	divs.Each(func(i int, div *goquery.Selection) {
		if this.config.parser.getElementsByTags(div, tags).Size() == 0 {
			// None of the listed tags inside: convert the whole element.
			this.replaceWithPara(div)
			badDivs++
		} else {
			replacementText := make([]string, 0)
			// Removal is deferred until the iteration over the children is done.
			nodesToRemove := list.New()
			children := div.Contents()
			if this.config.debug {
				log.Printf("Found %d children of div\n", children.Size())
			}
			// The callback always returns true, so every child is visited.
			children.EachWithBreak(func(i int, kid *goquery.Selection) bool {
				text := kid.Text()
				kidNode := kid.Get(0)
				tag := kidNode.Data
				// A child whose node Data equals its text is treated as a text
				// node (presumably because text nodes keep their text in Data).
				if tag == text {
					tag = "#text"
				}
				if tag == "#text" {
					text = strings.Replace(text, "\n", "", -1)
					text = tabsRegEx.ReplaceAllString(text, "")
					if text == "" {
						return true
					}
					// len counts bytes here, not runes.
					if len(text) > 1 {
						prev := kidNode.PrevSibling
						if this.config.debug {
							log.Printf("PARENT CLASS: %s NODENAME: %s\n", this.config.parser.name("class", div), tag)
							log.Printf("TEXTREPLACE: %s\n", strings.Replace(text, "\n", "", -1))
						}
						// When the preceding sibling is an <a>, the HTML obtained
						// from kid.HasNodes(prev) is put in front of the text.
						if prev != nil && prev.DataAtom == atom.A {
							nodeSelection := kid.HasNodes(prev)
							// NOTE: this local shadows the html package inside this
							// block only; the error from Html() is discarded.
							html, _ := nodeSelection.Html()
							replacementText = append(replacementText, html)
							if this.config.debug {
								log.Printf("SIBLING NODENAME ADDITION: %s TEXT: %s\n", prev.Data, html)
							}
						}
						replacementText = append(replacementText, text)
						nodesToRemove.PushBack(kidNode)
						convertedTextNodes++
					}

				}
				return true
			})

			// NOTE(review): the new node is an ElementNode with the P atom, but
			// its Data is the joined replacement text rather than "p"; it is
			// added even when no text was collected. Confirm this is intended.
			newNode := new(html.Node)
			newNode.Type = html.ElementNode
			newNode.Data = strings.Join(replacementText, "")
			newNode.DataAtom = atom.P
			div.First().AddNodes(newNode)

			// Detach the collected children, skipping any already without a parent.
			for s := nodesToRemove.Front(); s != nil; s = s.Next() {
				node := s.Value.(*html.Node)
				if node != nil && node.Parent != nil {
					node.Parent.RemoveChild(node)
				}
			}
		}
	})
	if this.config.debug {
		log.Printf("Found %d total divs with %d bad divs replaced and %d textnodes converted inside divs", divs.Size(), badDivs, convertedTextNodes)
	}
	return doc

}