func addFiles(form uint8, parent *html.Node, files []string) { for _, file := range files { node := html.Node{ Type: html.ElementNode, } switch form { case SCRIPT: node.Data = "script" node.Attr = []html.Attribute{ html.Attribute{ Key: "src", Val: file, }, } case STYLE: node.Data = "link" node.Attr = []html.Attribute{ html.Attribute{ Key: "rel", Val: "stylesheet", }, html.Attribute{ Key: "href", Val: file, }, } default: panic("Type not understood") } parent.AppendChild(&node) } }
// CloneNode makes a copy of a Node with all descendants. func CloneNode(n *exphtml.Node) *exphtml.Node { clone := new(exphtml.Node) clone.Type = n.Type clone.DataAtom = n.DataAtom clone.Data = n.Data clone.Attr = make([]exphtml.Attribute, len(n.Attr)) copy(clone.Attr, n.Attr) for c := n.FirstChild; c != nil; c = c.NextSibling { nc := CloneNode(c) clone.AppendChild(nc) } return clone }
func html_clean_root(root *html.Node, uribase string) (*html.Node, []*html.Node) { cleaner := &html_cleaner{} cleaner.current_url, _ = url.Parse(uribase) cleaner.html_drop_unprintable(root) cleaner.remove_head() var ( h1l = len(cleaner.header1s) h2l = len(cleaner.header2s) h3l = len(cleaner.header3s) h4l = len(cleaner.header4s) ) alter := false //文档中如果只有一个h1,通常这个h1所在的div就是文档内容 if h1l == 1 { // only one h1 ab := find_article_via_header_i(cleaner.header1s[0]) alter = cleaner.try_update_article(ab) if !alter && cleaner.title_similar(cleaner.header1s[0].Data) { alter = true cleaner.article = ab } } //如果文档中只有一个h2,这时又没有h1,h2就是其中的标题,所在的div就是文档内容 if h1l == 0 && h2l == 1 { ab := find_article_via_header_i(cleaner.header2s[0]) alter = alter || cleaner.try_update_article(ab) } if alter == false && h3l == 1 { ab := find_article_via_header_i(cleaner.header3s[0]) alter = alter || cleaner.try_update_article(ab) } if alter == false && h4l == 1 { ab := find_article_via_header_i(cleaner.header4s[0]) alter = alter || cleaner.try_update_article(ab) } if cleaner.article == nil { cleaner.article = &html.Node{Type: html.ElementNode, DataAtom: atom.Body, Data: "body"} root.AppendChild(cleaner.article) } cleaner.fix_forms() // may alter form to div, so do this before try_catch_phpwind cleaner.try_catch_phpwnd() cleaner.clean_body() cleaner.clean_empty_nodes(cleaner.article) cleaner.clean_attributes(cleaner.article) return cleaner.article, cleaner.iframes }
// flatten inlines text image a object video audio seq // n is element-node // inline node may have div element func (this *html_cleaner) clean_inline_node(n *html.Node) { inlines := this.flatten_inline_node(n) for child := n.FirstChild; child != nil; child = n.FirstChild { n.RemoveChild(child) } for _, inline := range inlines { p := inline.Parent if p != nil { p.RemoveChild(inline) // this.article.RemoveChild(child) } n.AppendChild(inline) } }
func node_append_children(src *html.Node, target *html.Node) { foreach_child(src, func(child *html.Node) { switch { case child.Type == html.TextNode: target.AppendChild(create_text(child.Data)) case child.Data == "a" || node_is_object(child): // ommit all children elements a := shallow_clone_element(child) node_append_children(child, a) target.AppendChild(a) default: node_append_children(child, target) } }) }
// text-node // <a> // <img> <object> <embed> <video> <audio> // <ul> <ol> <form> <textarea> <input> will be reserved func flatten_block_node(b *html.Node, article *html.Node, flatt bool, class string) { cur_class := cat_class(b, class) switch { case b.Data == "form" || b.Data == "inputbox" || b.Data == "textarea": case flatt && is_unflatten_node(b): nb := create_element(b.Data) // try_update_class_attr(nb, cur_class) flatten_block_node(b, nb, false, class) article.AppendChild(nb) case hasInlineNodes(b): p := create_p(b) // try_update_class_attr(p, cur_class) article.AppendChild(p) default: foreach_child(b, func(child *html.Node) { flatten_block_node(child, article, true, cur_class) }) } }
//CleanHtml 清洗掉所有的link/style/css // 删除/html/head // 转换所有的tag为小写字母 // 找到body/article节点 // 找到h1节点或者h2节点,根据数目设置body func (cleaner *HtmlCleaner) CleanHtml(root *html.Node) { var ( dropping []*html.Node = []*html.Node{} ) cleaner.clean_unprintable_element(&dropping, root) for _, drop := range dropping { p := drop.Parent p.RemoveChild(drop) } if cleaner.head != nil { cleaner.head.Parent.RemoveChild(cleaner.head) } //文档中如果只有一个h1,通常这个h1所在的div就是文档内容 if len(cleaner.header1s) == 1 { // only one h1 ab := find_article_via_header_i(cleaner.header1s[0]) cleaner.try_update_article(ab) } //如果文档中只有一个h2,这时又没有h1,h2就是其中的标题,所在的div就是文档内容 if len(cleaner.header1s) == 0 && len(cleaner.header2s) == 1 { ab := find_article_via_header_i(cleaner.header2s[0]) cleaner.try_update_article(ab) } if cleaner.Article == nil { cleaner.Article = &html.Node{Type: html.ElementNode, DataAtom: atom.Body, Data: "body"} root.AppendChild(cleaner.Article) } cleaner.try_catch_phpwnd() cleaner.fix_forms() cleaner.clean_body() cleaner.clean_empty_nodes(cleaner.Article) cleaner.clean_attributes(cleaner.Article) }
func processNode(node *html.Node) (err error) { var stackTags [16]openTag tags := stackTags[:0] n := node.FirstChild for n != nil { var next, newParent *html.Node next = n.NextSibling if l := len(tags); l != 0 { newParent = tags[l-1].node } switch n.Type { case html.TextNode: if tags, next, err = processTextNode(n, tags); err != nil { return } case html.ElementNode: if err = processNode(n); err != nil { return } default: // Other node types are just ignored. } // reparent the active node if necessary if newParent != nil { node.RemoveChild(n) newParent.AppendChild(n) } n = next } if len(tags) != 0 { err = fmt.Errorf("shortcodes still open at end of surrounding HTML tag: %+v", tags) } return }
// text-node // <a> // <img> <object> <embed> <video> <audio> // <ul> <ol> <form> <textarea> <input> will be reserved func flatten_block_node(b *html.Node, article *html.Node, flatt bool, class string) { cur_class := node_cat_class(b, class) switch { case node_is_media(b): mp := create_p_with_child(b) article.AppendChild(mp) case flatt && node_is_unflatten(b): // make unflatten nodes flatted nb := create_element(b.Data) // try_update_class_attr(nb, cur_class) flatten_block_node(b, nb, false, class) article.AppendChild(nb) case node_is_unflatten(b): case node_has_inline_children(b): p := create_p_with_child(b) // try_update_class_attr(p, cur_class) article.AppendChild(p) default: foreach_child(b, func(child *html.Node) { flatten_block_node(child, article, true, cur_class) }) } }
// CreationDate returns the time an HTML document was created. // // It also returns a FileInfo for the document, with the time added in the // header if it was missing. The bool returned is true the meta creation // element has been added to the header. func CreationDate(path string) (*FileInfo, bool, error) { title := "" f, err := os.Open(path) if err != nil { return nil, false, err } defer f.Close() stat, err := f.Stat() if err != nil { return nil, false, err } doc, err := html.Parse(f) if err != nil { return nil, false, err } hasMeta := false var head *html.Node var found func(*html.Node) var created time.Time found = func(n *html.Node) { if n.Type == html.ElementNode && n.Data == "head" { head = n } if n.Type == html.ElementNode && n.Data == "title" { for c := n.FirstChild; c != nil; c = c.NextSibling { if c.Type == html.TextNode { title = title + c.Data } } } if n.Type == html.ElementNode && n.Data == "meta" { name, err := getAttrByName(n, "name") if err == nil { value, err := getAttrByName(n, "value") if err == nil && name == "created" { created, err = time.Parse(format, value) if err != nil { created, err = time.Parse(format_no_tz, value) if err == nil { hasMeta = true } } else { hasMeta = true } } } } for c := n.FirstChild; c != nil; c = c.NextSibling { found(c) } } found(doc) if !hasMeta { now := time.Now() meta := &html.Node{ Type: html.ElementNode, Data: "meta", Attr: []html.Attribute{ {Key: "value", Val: now.Format(format)}, {Key: "name", Val: "created"}, }} head.AppendChild(meta) created = now } fi := &FileInfo{ Path: path, Node: doc, Title: title, Created: created, Updated: stat.ModTime(), } return fi, !hasMeta, nil }