Пример #1
0
// addFiles appends one new element per entry of files to parent.
//
// When form is SCRIPT each element is <script src=FILE>; when form is STYLE
// each element is <link rel="stylesheet" href=FILE>. Any other form panics
// while handling the first file, so an empty files slice never panics.
func addFiles(form uint8, parent *html.Node, files []string) {
	for i := range files {
		path := files[i]
		elem := &html.Node{Type: html.ElementNode}

		if form == SCRIPT {
			elem.Data = "script"
			elem.Attr = []html.Attribute{{Key: "src", Val: path}}
		} else if form == STYLE {
			elem.Data = "link"
			elem.Attr = []html.Attribute{
				{Key: "rel", Val: "stylesheet"},
				{Key: "href", Val: path},
			}
		} else {
			panic("Type not understood")
		}

		parent.AppendChild(elem)
	}
}
Пример #2
0
// CloneNode returns a deep copy of n: the node's type, atom, data, namespace
// and attributes are copied, and every descendant is cloned recursively and
// attached to the copy in the same order. The returned node has no parent or
// siblings. A nil n yields nil.
func CloneNode(n *exphtml.Node) *exphtml.Node {
	if n == nil {
		return nil
	}

	clone := &exphtml.Node{
		Type:     n.Type,
		DataAtom: n.DataAtom,
		Data:     n.Data,
		// Namespace must be carried over, otherwise foreign-content elements
		// (e.g. SVG or MathML) would silently become plain HTML elements.
		Namespace: n.Namespace,
		// A fresh slice keeps the clone's attributes independent of n's.
		Attr: make([]exphtml.Attribute, len(n.Attr)),
	}
	copy(clone.Attr, n.Attr)

	for c := n.FirstChild; c != nil; c = c.NextSibling {
		clone.AppendChild(CloneNode(c))
	}
	return clone
}
Пример #3
0
// html_clean_root runs the cleaner pipeline over root and returns the node
// selected as the article together with the iframes the cleaner collected.
//
// uribase is parsed into the cleaner's current_url; a parse error is
// discarded. The article is chosen from a unique h1, else a unique h2 (when
// there is no h1), else a unique h3, else a unique h4. If none of these
// yields an article, an empty <body> element is appended to root and used.
func html_clean_root(root *html.Node, uribase string) (*html.Node, []*html.Node) {
	cleaner := new(html_cleaner)
	cleaner.current_url, _ = url.Parse(uribase)
	cleaner.html_drop_unprintable(root)
	cleaner.remove_head()

	// Header counts are snapshotted once, before any article selection runs.
	h1Count, h2Count := len(cleaner.header1s), len(cleaner.header2s)
	h3Count, h4Count := len(cleaner.header3s), len(cleaner.header4s)

	found := false
	switch {
	case h1Count == 1:
		// With exactly one h1 in the document, the div containing it is
		// usually the document content.
		candidate := find_article_via_header_i(cleaner.header1s[0])
		found = cleaner.try_update_article(candidate)
		if !found && cleaner.title_similar(cleaner.header1s[0].Data) {
			cleaner.article = candidate
			found = true
		}
	case h1Count == 0 && h2Count == 1:
		// No h1 and exactly one h2: the h2 is the title and the div
		// containing it is the document content.
		candidate := find_article_via_header_i(cleaner.header2s[0])
		found = cleaner.try_update_article(candidate)
	}

	if !found && h3Count == 1 {
		candidate := find_article_via_header_i(cleaner.header3s[0])
		found = cleaner.try_update_article(candidate)
	}
	if !found && h4Count == 1 {
		candidate := find_article_via_header_i(cleaner.header4s[0])
		found = cleaner.try_update_article(candidate)
	}

	if cleaner.article == nil {
		// Nothing was selected: fall back to a fresh, empty <body>.
		fallback := &html.Node{
			Type:     html.ElementNode,
			DataAtom: atom.Body,
			Data:     "body",
		}
		cleaner.article = fallback
		root.AppendChild(fallback)
	}

	// fix_forms may turn a form into a div, so it has to run before
	// try_catch_phpwnd.
	cleaner.fix_forms()
	cleaner.try_catch_phpwnd()
	cleaner.clean_body()
	cleaner.clean_empty_nodes(cleaner.article)
	cleaner.clean_attributes(cleaner.article)

	return cleaner.article, cleaner.iframes
}
Пример #4
0
// clean_inline_node replaces the children of n with the nodes returned by
// flatten_inline_node(n), in the order that call returns them.
//
// n is expected to be an element node. The flattened list is computed first;
// then every existing child is detached from n, and each flattened node is
// detached from whatever parent it still has before being appended to n.
func (this *html_cleaner) clean_inline_node(n *html.Node) {
	flattened := this.flatten_inline_node(n)

	// Empty n completely before re-populating it.
	for n.FirstChild != nil {
		n.RemoveChild(n.FirstChild)
	}

	for i := range flattened {
		node := flattened[i]
		// A node nested deeper than a direct child is still attached to its
		// own parent and must be unlinked before it can be appended.
		if owner := node.Parent; owner != nil {
			owner.RemoveChild(node)
		}
		n.AppendChild(node)
	}
}
Пример #5
0
// node_append_children walks the children of src (via foreach_child) and
// appends a simplified copy of them to target:
//
//   - a text node is re-created with create_text and appended;
//   - an "a" node, or one for which node_is_object reports true, is copied
//     with shallow_clone_element, filled recursively from the original's
//     children, and appended;
//   - any other node is unwrapped: its own children are processed
//     recursively straight into target.
func node_append_children(src *html.Node, target *html.Node) {
	foreach_child(src, func(child *html.Node) {
		if child.Type == html.TextNode {
			target.AppendChild(create_text(child.Data))
			return
		}

		if child.Data != "a" && !node_is_object(child) {
			// Wrapper element: drop the element itself, keep its content.
			node_append_children(child, target)
			return
		}

		// Preserved element: clone it, then rebuild its content inside the
		// clone using the same rules.
		clone := shallow_clone_element(child)
		node_append_children(child, clone)
		target.AppendChild(clone)
	})
}
Пример #6
0
// flatten_block_node flattens block node b into article.
//
// Decisions are taken in this order:
//  1. "form", "inputbox" and "textarea" nodes contribute nothing.
//  2. When flatt is true and is_unflatten_node(b) holds, a new element with
//     the same tag is created, b is flattened into it with flatt=false, and
//     the new element is appended to article.
//  3. When hasInlineNodes(b) holds, create_p(b) is appended to article.
//  4. Otherwise every child of b is flattened into article with flatt=true,
//     carrying the class string produced by cat_class(b, class).
//
// NOTE(review): "inputbox" is not a standard HTML tag; presumably "input" was
// meant — confirm against callers before changing.
func flatten_block_node(b *html.Node, article *html.Node, flatt bool, class string) {
	merged := cat_class(b, class)

	if b.Data == "form" || b.Data == "inputbox" || b.Data == "textarea" {
		return
	}

	if flatt && is_unflatten_node(b) {
		wrapper := create_element(b.Data)
		// The wrapper receives the caller's class, not the merged one.
		flatten_block_node(b, wrapper, false, class)
		article.AppendChild(wrapper)
		return
	}

	if hasInlineNodes(b) {
		article.AppendChild(create_p(b))
		return
	}

	foreach_child(b, func(child *html.Node) {
		flatten_block_node(child, article, true, merged)
	})
}
Пример #7
0
// CleanHtml cleans the document rooted at root and leaves the selected
// article node in cleaner.Article.
//
// Per the original author's description, the pass strips link/style/css
// content, removes /html/head, lower-cases tag names, locates the
// body/article node, and uses the number of h1/h2 headers to decide which
// node becomes the article. The steps visible here are:
//
//  1. collect nodes via clean_unprintable_element and detach them;
//  2. detach cleaner.head if one was recorded;
//  3. try to pick the article from a unique h1, or from a unique h2 when
//     there is no h1;
//  4. fall back to a new empty <body> appended to root;
//  5. run try_catch_phpwnd, fix_forms, clean_body, clean_empty_nodes and
//     clean_attributes.
func (cleaner *HtmlCleaner) CleanHtml(root *html.Node) {
	dropping := []*html.Node{}
	cleaner.clean_unprintable_element(&dropping, root)

	for _, drop := range dropping {
		// A node that is already detached (for example root itself, or an
		// entry listed twice) has no parent; skip it instead of
		// dereferencing nil.
		if p := drop.Parent; p != nil {
			p.RemoveChild(drop)
		}
	}

	// The head may already have been detached by the loop above.
	if head := cleaner.head; head != nil && head.Parent != nil {
		head.Parent.RemoveChild(head)
	}

	// With exactly one h1 in the document, the div containing it is usually
	// the document content.
	if len(cleaner.header1s) == 1 {
		ab := find_article_via_header_i(cleaner.header1s[0])
		cleaner.try_update_article(ab)
	}
	// No h1 and exactly one h2: the h2 is the title and the div containing
	// it is the document content.
	if len(cleaner.header1s) == 0 && len(cleaner.header2s) == 1 {
		ab := find_article_via_header_i(cleaner.header2s[0])
		cleaner.try_update_article(ab)
	}

	if cleaner.Article == nil {
		// Nothing was selected: fall back to a fresh, empty <body>.
		cleaner.Article = &html.Node{
			Type:     html.ElementNode,
			DataAtom: atom.Body,
			Data:     "body",
		}
		root.AppendChild(cleaner.Article)
	}

	cleaner.try_catch_phpwnd()
	cleaner.fix_forms()

	cleaner.clean_body()

	cleaner.clean_empty_nodes(cleaner.Article)
	cleaner.clean_attributes(cleaner.Article)
}
Пример #8
0
// processNode walks the direct children of node in order. Text children are
// handed to processTextNode together with the current list of open tags;
// element children are processed recursively; every other node type is left
// alone. If a tag was open when a child was reached, that child is moved
// from node to the open tag's node.
//
// It returns the first error reported by processTextNode or by a recursive
// call, or an error if any tags are still open once all children have been
// visited.
func processNode(node *html.Node) (err error) {
	// Backing storage for the open-tag stack; tags starts out empty on top
	// of this fixed-size array.
	var stackTags [16]openTag
	tags := stackTags[:0]

	n := node.FirstChild
	for n != nil {
		var next, newParent *html.Node

		// Remember the sibling now: n may be moved to another parent below.
		next = n.NextSibling
		// newParent is taken from the last open tag as it was BEFORE n is
		// processed, so a tag opened by n itself does not capture n.
		if l := len(tags); l != 0 {
			newParent = tags[l-1].node
		}

		switch n.Type {
		case html.TextNode:
			// processTextNode returns the updated tag list and the node to
			// continue from, replacing the sibling recorded above.
			if tags, next, err = processTextNode(n, tags); err != nil {
				return
			}
		case html.ElementNode:
			// Each recursive call starts with its own empty tag list.
			if err = processNode(n); err != nil {
				return
			}
		default:
			// Other node types are just ignored.
		}

		// reparent the active node if necessary
		if newParent != nil {
			node.RemoveChild(n)
			newParent.AppendChild(n)
		}

		n = next
	}

	// Anything left in tags was never closed within this element.
	if len(tags) != 0 {
		err = fmt.Errorf("shortcodes still open at end of surrounding HTML tag: %+v", tags)
	}

	return
}
Пример #9
0
// flatten_block_node flattens block node b into article.
//
// Decisions are taken in this order:
//  1. When node_is_media(b) holds, create_p_with_child(b) is appended.
//  2. When flatt is true and node_is_unflatten(b) holds, a new element with
//     the same tag is created, b is passed through this function again with
//     that element as the destination and flatt=false, and the new element
//     is appended to article.
//  3. When node_is_unflatten(b) holds (and flatt is false), nothing is
//     emitted.
//  4. When node_has_inline_children(b) holds, create_p_with_child(b) is
//     appended.
//  5. Otherwise every child of b is flattened into article with flatt=true,
//     carrying the class string produced by node_cat_class(b, class).
//
// NOTE(review): the recursive call in step 2 passes the same b with
// flatt=false, so unless node_is_unflatten changes its answer between calls
// it lands in step 3 and the new element stays empty — confirm this is
// intended.
func flatten_block_node(b *html.Node, article *html.Node, flatt bool, class string) {
	merged := node_cat_class(b, class)

	if node_is_media(b) {
		article.AppendChild(create_p_with_child(b))
		return
	}

	if flatt && node_is_unflatten(b) {
		wrapper := create_element(b.Data)
		// The wrapper receives the caller's class, not the merged one.
		flatten_block_node(b, wrapper, false, class)
		article.AppendChild(wrapper)
		return
	}

	if node_is_unflatten(b) {
		return
	}

	if node_has_inline_children(b) {
		article.AppendChild(create_p_with_child(b))
		return
	}

	foreach_child(b, func(child *html.Node) {
		flatten_block_node(child, article, true, merged)
	})
}
Пример #10
0
// CreationDate parses the HTML document at path and returns a FileInfo
// describing it, including its title, its creation time and the file's
// modification time.
//
// The creation time is read from a <meta name="created" value="..."> element
// whose value parses with either format or format_no_tz; if several such
// elements parse, the last one wins. When no element parses, a new meta
// element carrying the current time is appended to the parsed document's
// head and that time is used instead.
//
// The returned bool is true if the meta creation element had to be added.
// Only the in-memory tree (FileInfo.Node) is modified; nothing is written
// back to path here. On error the FileInfo is nil.
func CreationDate(path string) (*FileInfo, bool, error) {
	f, err := os.Open(path)
	if err != nil {
		return nil, false, err
	}
	defer f.Close()

	stat, err := f.Stat()
	if err != nil {
		return nil, false, err
	}

	doc, err := html.Parse(f)
	if err != nil {
		return nil, false, err
	}

	var (
		title   string
		head    *html.Node
		created time.Time
		hasMeta bool
	)

	var walk func(*html.Node)
	walk = func(n *html.Node) {
		if n.Type == html.ElementNode {
			switch n.Data {
			case "head":
				head = n
			case "title":
				// The title is the concatenation of the element's direct
				// text children.
				for c := n.FirstChild; c != nil; c = c.NextSibling {
					if c.Type == html.TextNode {
						title += c.Data
					}
				}
			case "meta":
				if name, err := getAttrByName(n, "name"); err == nil && name == "created" {
					if value, err := getAttrByName(n, "value"); err == nil {
						// Parse into a local so that a malformed value
						// cannot overwrite a timestamp already accepted
						// from an earlier meta element.
						t, err := time.Parse(format, value)
						if err != nil {
							t, err = time.Parse(format_no_tz, value)
						}
						if err == nil {
							created = t
							hasMeta = true
						}
					}
				}
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			walk(c)
		}
	}
	walk(doc)

	if !hasMeta {
		now := time.Now()
		// head is expected to be non-nil here: the HTML parser builds a
		// head element even when the input has none.
		head.AppendChild(&html.Node{
			Type: html.ElementNode,
			Data: "meta",
			Attr: []html.Attribute{
				{Key: "value", Val: now.Format(format)},
				{Key: "name", Val: "created"},
			},
		})
		created = now
	}

	fi := &FileInfo{
		Path:    path,
		Node:    doc,
		Title:   title,
		Created: created,
		Updated: stat.ModTime(),
	}
	return fi, !hasMeta, nil
}