func (this *flowdocument_maker) convert_flowdocument(frag *html.Node) { if frag.Type == html.TextNode { return } ignore_children := false switch frag.Data { case "img": frag.Type = html.CommentNode node_clear_children(frag) frag.Attr = nil case "a": frag.Data = "Hyperlink" frag.Attr = extract_ahref_attr(frag.Attr) case "article": frag.Data = "FlowDocument" // set namespace dont work frag.Attr = []html.Attribute{html.Attribute{Key: "xmlns", Val: fdocns}} case "object", "video", "audio", "embed": frag.Type = html.CommentNode node_clear_children(frag) frag.Attr = nil case "p": fallthrough default: frag.Data = "Paragraph" frag.Attr = nil if this.first_paragraph == nil { this.first_paragraph = frag } } for child := frag.FirstChild; ignore_children == false && child != nil; child = child.NextSibling { this.convert_flowdocument(child) } }
func (this *contentExtractor) getSiblingsContent(currentSibling *goquery.Selection, baselinescoreSiblingsPara float64) []*goquery.Selection { ps := make([]*goquery.Selection, 0) if currentSibling.Get(0).DataAtom.String() == "p" && len(currentSibling.Text()) > 0 { ps = append(ps, currentSibling) return ps } else { potentialParagraphs := currentSibling.Find("p") potentialParagraphs.Each(func(i int, s *goquery.Selection) { text := s.Text() if len(text) > 0 { ws := this.config.stopWords.stopWordsCount(this.config.targetLanguage, text) paragraphScore := ws.stopWordCount siblingBaselineScore := 0.30 highLinkDensity := this.isHighLinkDensity(s) score := siblingBaselineScore * baselinescoreSiblingsPara if score < float64(paragraphScore) && !highLinkDensity { node := new(html.Node) node.Type = html.TextNode node.Data = text node.DataAtom = atom.P nodes := make([]*html.Node, 1) nodes[0] = node newSelection := new(goquery.Selection) newSelection.Nodes = nodes ps = append(ps, newSelection) } } }) } return ps }
func node_clear_children(frag *html.Node) { for child := frag.FirstChild; child != nil; { next := child.NextSibling frag.RemoveChild(child) child = next } }
func clean_element_before_header(body *html.Node, name string) { child := body.FirstChild for child != nil { if child.Type == html.ElementNode && child.Data != name { next := child.NextSibling body.RemoveChild(child) child = next } else { break } } }
func remove_decentant(n *html.Node, tag string) { child := n.FirstChild for child != nil { if child.Type == html.ElementNode && child.Data == tag { next := child.NextSibling n.RemoveChild(child) child = next } else { remove_decentant(child, tag) child = child.NextSibling } } }
func html_clean_root(root *html.Node, uribase string) (*html.Node, []*html.Node) { cleaner := &html_cleaner{} cleaner.current_url, _ = url.Parse(uribase) cleaner.html_drop_unprintable(root) cleaner.remove_head() var ( h1l = len(cleaner.header1s) h2l = len(cleaner.header2s) h3l = len(cleaner.header3s) h4l = len(cleaner.header4s) ) alter := false //文档中如果只有一个h1,通常这个h1所在的div就是文档内容 if h1l == 1 { // only one h1 ab := find_article_via_header_i(cleaner.header1s[0]) alter = cleaner.try_update_article(ab) if !alter && cleaner.title_similar(cleaner.header1s[0].Data) { alter = true cleaner.article = ab } } //如果文档中只有一个h2,这时又没有h1,h2就是其中的标题,所在的div就是文档内容 if h1l == 0 && h2l == 1 { ab := find_article_via_header_i(cleaner.header2s[0]) alter = alter || cleaner.try_update_article(ab) } if alter == false && h3l == 1 { ab := find_article_via_header_i(cleaner.header3s[0]) alter = alter || cleaner.try_update_article(ab) } if alter == false && h4l == 1 { ab := find_article_via_header_i(cleaner.header4s[0]) alter = alter || cleaner.try_update_article(ab) } if cleaner.article == nil { cleaner.article = &html.Node{Type: html.ElementNode, DataAtom: atom.Body, Data: "body"} root.AppendChild(cleaner.article) } cleaner.fix_forms() // may alter form to div, so do this before try_catch_phpwind cleaner.try_catch_phpwnd() cleaner.clean_body() cleaner.clean_empty_nodes(cleaner.article) cleaner.clean_attributes(cleaner.article) return cleaner.article, cleaner.iframes }
func node_append_children(src *html.Node, target *html.Node) { foreach_child(src, func(child *html.Node) { switch { case child.Type == html.TextNode: target.AppendChild(create_text(child.Data)) case child.Data == "a" || node_is_object(child): // ommit all children elements a := shallow_clone_element(child) node_append_children(child, a) target.AppendChild(a) default: node_append_children(child, target) } }) }
func trim_small_image(img *html.Node) (drop bool) { width, height, _ := media_get_dim(img) if img.Parent == nil { return } if width > 0 && height > 0 && width*height < small_image_t*small_image_t && img.Parent.Data == "a" { img.Data = "input" drop = true } else if width == 1 && height == 1 { img.Data = "input" drop = true } return }
// flatten inlines text image a object video audio seq // n is element-node // inline node may have div element func (this *html_cleaner) clean_inline_node(n *html.Node) { inlines := this.flatten_inline_node(n) for child := n.FirstChild; child != nil; child = n.FirstChild { n.RemoveChild(child) } for _, inline := range inlines { p := inline.Parent if p != nil { p.RemoveChild(inline) // this.article.RemoveChild(child) } n.AppendChild(inline) } }
//删除行前后空白 func (this *html_cleaner) trim_empty_spaces_func(n *html.Node, trim func(string) string) { child := n.FirstChild for child != nil { if child.Type == html.TextNode { child.Data = trim(child.Data) } else { this.trim_empty_spaces_func(child, trim) } if node_is_not_empty(child) { break } next := child.NextSibling n.RemoveChild(child) child = next } }
// The splitting process may leave TextNodes with no Data, which we keep // around to make the data manipulation simpler. This function removes // them. func cleanupTree(node *html.Node) { var next *html.Node for n := node.FirstChild; n != nil; n = next { next = n.NextSibling switch n.Type { case html.TextNode: if len(n.Data) == 0 { node.RemoveChild(n) } case html.ElementNode: cleanupTree(n) default: // ignore other node types. } } }
func trim_display_none(n *html.Node) { st := get_attribute(n, "style") if strings.Contains(st, "display") && (strings.Contains(st, "none")) { // log.Println("hide-node display:none", n.Data) n.Data = "input" } }
func processTextNode(node *html.Node, tags []openTag) (outTags []openTag, next *html.Node, err error) { i := 0 for i < len(node.Data) { r, rsize := utf8.DecodeRuneInString(node.Data[i:]) switch r { case '[': size, openClose, tag, rest := parseShortcode(node.Data[i+1:]) if size != 0 { // looks like we found a shortcode! if tag == "" { // escape code? // remove the outer [] and continue node.Data = node.Data[:i] + rest + node.Data[i+1+size:] i += len(rest) } else { return handleShortcode(node, tags, i, i+1+size, openClose, tag, rest) } } else { i += rsize } default: i += rsize } } // default: no shortcode found outTags = tags next = node.NextSibling err = nil return }
func try_update_class_attr(b *html.Node, class string) { if len(class) > 0 { ca := make([]html.Attribute, len(b.Attr)+1) copy(ca, b.Attr) ca[len(b.Attr)] = html.Attribute{Key: "class", Val: class} b.Attr = ca } }
func addFiles(form uint8, parent *html.Node, files []string) { for _, file := range files { node := html.Node{ Type: html.ElementNode, } switch form { case SCRIPT: node.Data = "script" node.Attr = []html.Attribute{ html.Attribute{ Key: "src", Val: file, }, } case STYLE: node.Data = "link" node.Attr = []html.Attribute{ html.Attribute{ Key: "rel", Val: "stylesheet", }, html.Attribute{ Key: "href", Val: file, }, } default: panic("Type not understood") } parent.AppendChild(&node) } }
// Splits the html.TextNode "node" into two nodes: one that holds // Data[:splitBefore], and one that holds Data[splitAfter:]. "node" // is modified in place to be the first result node; the second node // is the return value. func splitTextNode(node *html.Node, splitBefore, splitAfter int) *html.Node { newNode := &html.Node{ Type: html.TextNode, Data: node.Data[splitAfter:], } node.Data = node.Data[:splitBefore] node.Parent.InsertBefore(newNode, node.NextSibling) return newNode }
// text-node // <a> // <img> <object> <embed> <video> <audio> // <ul> <ol> <form> <textarea> <input> will be reserved func flatten_block_node(b *html.Node, article *html.Node, flatt bool, class string) { cur_class := cat_class(b, class) switch { case b.Data == "form" || b.Data == "inputbox" || b.Data == "textarea": case flatt && is_unflatten_node(b): nb := create_element(b.Data) // try_update_class_attr(nb, cur_class) flatten_block_node(b, nb, false, class) article.AppendChild(nb) case hasInlineNodes(b): p := create_p(b) // try_update_class_attr(p, cur_class) article.AppendChild(p) default: foreach_child(b, func(child *html.Node) { flatten_block_node(child, article, true, cur_class) }) } }
func processNode(node *html.Node) (err error) { var stackTags [16]openTag tags := stackTags[:0] n := node.FirstChild for n != nil { var next, newParent *html.Node next = n.NextSibling if l := len(tags); l != 0 { newParent = tags[l-1].node } switch n.Type { case html.TextNode: if tags, next, err = processTextNode(n, tags); err != nil { return } case html.ElementNode: if err = processNode(n); err != nil { return } default: // Other node types are just ignored. } // reparent the active node if necessary if newParent != nil { node.RemoveChild(n) newParent.AppendChild(n) } n = next } if len(tags) != 0 { err = fmt.Errorf("shortcodes still open at end of surrounding HTML tag: %+v", tags) } return }
//CleanHtml 清洗掉所有的link/style/css // 删除/html/head // 转换所有的tag为小写字母 // 找到body/article节点 // 找到h1节点或者h2节点,根据数目设置body func (cleaner *HtmlCleaner) CleanHtml(root *html.Node) { var ( dropping []*html.Node = []*html.Node{} ) cleaner.clean_unprintable_element(&dropping, root) for _, drop := range dropping { p := drop.Parent p.RemoveChild(drop) } if cleaner.head != nil { cleaner.head.Parent.RemoveChild(cleaner.head) } //文档中如果只有一个h1,通常这个h1所在的div就是文档内容 if len(cleaner.header1s) == 1 { // only one h1 ab := find_article_via_header_i(cleaner.header1s[0]) cleaner.try_update_article(ab) } //如果文档中只有一个h2,这时又没有h1,h2就是其中的标题,所在的div就是文档内容 if len(cleaner.header1s) == 0 && len(cleaner.header2s) == 1 { ab := find_article_via_header_i(cleaner.header2s[0]) cleaner.try_update_article(ab) } if cleaner.Article == nil { cleaner.Article = &html.Node{Type: html.ElementNode, DataAtom: atom.Body, Data: "body"} root.AppendChild(cleaner.Article) } cleaner.try_catch_phpwnd() cleaner.fix_forms() cleaner.clean_body() cleaner.clean_empty_nodes(cleaner.Article) cleaner.clean_attributes(cleaner.Article) }
// Remove all attributes on the provided node // that are not contained within this whitelist func (w *Whitelist) sanitizeAttributes(n *html.Node) { attributes := make([]html.Attribute, len(n.Attr)) i := 0 for _, attribute := range n.Attr { if w.HasAttributeForElement(n.Data, attribute.Key) { attributes[i] = attribute i += 1 } } n.Attr = attributes[0:i] }
// sanitizeUnwrap traverses pre-order over the nodes, reattaching // the whitelisted children of any element nodes that are not // whitelisted to the parent of the unwhitelisted node func (w *Whitelist) sanitizeUnwrap(n *html.Node) error { return w.sanitizeNode(n, func(n *html.Node) bool { if w.HasElement(n.Data) || n.Parent == nil { return true } insertBefore := n.NextSibling firstChild := n.FirstChild for c := n.FirstChild; c != nil; { nodeToUnwrap := c c = c.NextSibling n.RemoveChild(nodeToUnwrap) n.Parent.InsertBefore(nodeToUnwrap, insertBefore) } n.Parent.RemoveChild(n) // reset next sibling to support continuation // of linked-list style traversal of parent node's children n.NextSibling = firstChild return false }) }
// reserve id, class, href, src func (this *HtmlCleaner) clean_attributes(n *html.Node) { for child := n.FirstChild; child != nil; child = child.NextSibling { this.clean_attributes(child) } attrs := []html.Attribute{} for _, attr := range n.Attr { if attr.Key == "id" || attr.Key == "class" || attr.Key == "href" || attr.Key == "src" { attrs = append(attrs, attr) } } if len(attrs) != len(n.Attr) { n.Attr = attrs } }
func setAttributeValue(attrName string, val string, n *html.Node) { if n == nil { return } for i := range n.Attr { if attr := &n.Attr[i]; attr.Key == attrName { attr.Val = val return } } n.Attr = append(n.Attr, html.Attribute{Key: attrName, Val: val}) }
func trim_invisible_image(img *html.Node) (drop bool) { width, werr := strconv.ParseInt(node_get_attribute(img, "width"), 0, 32) height, herr := strconv.ParseInt(node_get_attribute(img, "height"), 0, 32) if werr != nil || herr != nil || img.Parent == nil { return } // set width height explicit zero if width == 0 || height == 0 { img.Data = "input" drop = true } return }
//整理html文档,将block-level/inline-level混合的节点改成只有block-level的节点 //对已只有inline-level的节点,删除行前后的空白符 //将包含inline-level的节点展开成更为简单的形式,去掉想<font><span><strong>等等格式节点 func (this *html_cleaner) clean_block_node(n *html.Node) { blks := node_has_block_children(n) inlines := node_has_inline_children(n) // has bocks and inlines if blks && inlines { child := n.FirstChild for child != nil { if node_is_inline(child) { p := child.PrevSibling if p == nil || p.Data != "p" { p = create_element("p") n.InsertBefore(p, child) } n.RemoveChild(child) p.AppendChild(child) child = p.NextSibling } else { child = child.NextSibling } } inlines = false } // only inlines if blks == false && inlines { this.clean_inline_node(n) this.trim_empty_spaces(n) } // only blocks if blks && !inlines { for child := n.FirstChild; child != nil; child = child.NextSibling { this.clean_block_node(child) } } }
// CloneNode makes a copy of a Node with all descendants. func CloneNode(n *exphtml.Node) *exphtml.Node { clone := new(exphtml.Node) clone.Type = n.Type clone.DataAtom = n.DataAtom clone.Data = n.Data clone.Attr = make([]exphtml.Attribute, len(n.Attr)) copy(clone.Attr, n.Attr) for c := n.FirstChild; c != nil; c = c.NextSibling { nc := CloneNode(c) clone.AppendChild(nc) } return clone }
// reserve id, class, href, src, width, height, alt // class,id会用于后面正文内容的判定 // width/height/alt会用于判定image时候是正文 func (this *html_cleaner) clean_attributes(n *html.Node) { for child := n.FirstChild; child != nil; child = child.NextSibling { this.clean_attributes(child) } var attrs []html.Attribute for _, attr := range n.Attr { switch attr.Key { case "id", "class", "href", "src", "width", "height", "alt": attrs = append(attrs, attr) } } if len(attrs) != len(n.Attr) { n.Attr = attrs } }
// sanitizeRemove traverses pre-order over the nodes, // removing any element nodes that are not whitelisted // and and removing any attributes that are not whitelisted // from a given element node func (w *Whitelist) sanitizeRemove(n *html.Node) error { return w.sanitizeNode(n, func(n *html.Node) bool { if !w.HasElement(n.Data) { if n.Parent != nil { nextSibling := n.NextSibling n.Parent.RemoveChild(n) // reset next sibling to support continuation // of linked-list style traversal of parent node's children n.NextSibling = nextSibling } return false } return true }) }
// text-node // <a> // <img> <object> <embed> <video> <audio> // <ul> <ol> <form> <textarea> <input> will be reserved func flatten_block_node(b *html.Node, article *html.Node, flatt bool, class string) { cur_class := node_cat_class(b, class) switch { case node_is_media(b): mp := create_p_with_child(b) article.AppendChild(mp) case flatt && node_is_unflatten(b): // make unflatten nodes flatted nb := create_element(b.Data) // try_update_class_attr(nb, cur_class) flatten_block_node(b, nb, false, class) article.AppendChild(nb) case node_is_unflatten(b): case node_has_inline_children(b): p := create_p_with_child(b) // try_update_class_attr(p, cur_class) article.AppendChild(p) default: foreach_child(b, func(child *html.Node) { flatten_block_node(child, article, true, cur_class) }) } }
func (this *cleaner) convertDivsToParagraphs(doc *goquery.Document, domType string) *goquery.Document { if this.config.debug { log.Println("Starting to replace bad divs...") } badDivs := 0 convertedTextNodes := 0 divs := doc.Find(domType) tags := []string{"a", "blockquote", "dl", "div", "img", "ol", "p", "pre", "table", "ul"} divs.Each(func(i int, div *goquery.Selection) { if this.config.parser.getElementsByTags(div, tags).Size() == 0 { this.replaceWithPara(div) badDivs++ } else { replacementText := make([]string, 0) nodesToRemove := list.New() children := div.Contents() if this.config.debug { log.Printf("Found %d children of div\n", children.Size()) } children.EachWithBreak(func(i int, kid *goquery.Selection) bool { text := kid.Text() kidNode := kid.Get(0) tag := kidNode.Data if tag == text { tag = "#text" } if tag == "#text" { text = strings.Replace(text, "\n", "", -1) text = tabsRegEx.ReplaceAllString(text, "") if text == "" { return true } if len(text) > 1 { prev := kidNode.PrevSibling if this.config.debug { log.Printf("PARENT CLASS: %s NODENAME: %s\n", this.config.parser.name("class", div), tag) log.Printf("TEXTREPLACE: %s\n", strings.Replace(text, "\n", "", -1)) } if prev != nil && prev.DataAtom == atom.A { nodeSelection := kid.HasNodes(prev) html, _ := nodeSelection.Html() replacementText = append(replacementText, html) if this.config.debug { log.Printf("SIBLING NODENAME ADDITION: %s TEXT: %s\n", prev.Data, html) } } replacementText = append(replacementText, text) nodesToRemove.PushBack(kidNode) convertedTextNodes++ } } return true }) newNode := new(html.Node) newNode.Type = html.ElementNode newNode.Data = strings.Join(replacementText, "") newNode.DataAtom = atom.P div.First().AddNodes(newNode) for s := nodesToRemove.Front(); s != nil; s = s.Next() { node := s.Value.(*html.Node) if node != nil && node.Parent != nil { node.Parent.RemoveChild(node) } } } }) if this.config.debug { log.Printf("Found %d total divs with %d bad divs replaced and %d textnodes converted inside divs", divs.Size(), badDivs, convertedTextNodes) } return doc }