func Partial(r io.Reader) ([]*html.Node, error) { b := &html.Node{} b.Data = "body" b.DataAtom = atom.Body b.Type = html.ElementNode return html.ParseFragment(r, b) }
// GoodAsText applies some heuristics to make the data look good when displayed // as simple text. For example, if the data is escaped HTML then other heuristics are // applied to remove the HTML. For example if the data contains an HTML image tag, // goodAsText will return the alt text. If nothing good is found then an empty slice is // returned. func goodAsText(d []byte) []byte { unesc := html.UnescapeString(string(d)) nodes, err := html.ParseFragment(strings.NewReader(unesc), bodyNode) if err != nil { log.Printf("failed to parse [%s] as HTML: %v", unesc, err) return d } var buf bytes.Buffer for _, root := range nodes { walk(root, func(n *html.Node) { if n.Type == html.TextNode { buf.WriteString(strings.TrimSpace(n.Data)) return } if n := buf.Len(); n > 0 && buf.Bytes()[n-1] != ' ' { buf.WriteString(" ") } if n.DataAtom == atom.Img { if alt := altTextOrEmpty(n); alt != "" { buf.WriteString(alt) } } }) } return buf.Bytes() }
func strip_html_tags(htm string) string { reader := strings.NewReader(htm) root := &html.Node{Type: html.ElementNode, Data: "article", DataAtom: atom.Article} frags, _ := html.ParseFragment(reader, root) var txt string for _, f := range frags { txt += extract_html_text(f) } return txt }
func (t *minionTransport) ProcessResponse(req *http.Request, resp *http.Response) (*http.Response, error) { body, err := ioutil.ReadAll(resp.Body) if err != nil { // copying the response body did not work return nil, err } bodyNode := &html.Node{ Type: html.ElementNode, Data: "body", DataAtom: atom.Body, } nodes, err := html.ParseFragment(bytes.NewBuffer(body), bodyNode) if err != nil { glog.Errorf("Failed to found <body> node: %v", err) return resp, err } // Define the method to traverse the doc tree and update href node to // point to correct minion var updateHRef func(*html.Node) updateHRef = func(n *html.Node) { if n.Type == html.ElementNode && n.Data == "a" { for i, attr := range n.Attr { if attr.Key == "href" { Url := &url.URL{ Path: "/proxy/minion/" + req.URL.Host + req.URL.Path + attr.Val, } n.Attr[i].Val = Url.String() break } } } for c := n.FirstChild; c != nil; c = c.NextSibling { updateHRef(c) } } newContent := &bytes.Buffer{} for _, n := range nodes { updateHRef(n) err = html.Render(newContent, n) if err != nil { glog.Errorf("Failed to render: %v", err) } } resp.Body = ioutil.NopCloser(newContent) // Update header node with new content-length // TODO: Remove any hash/signature headers here? resp.Header.Del("Content-Length") resp.ContentLength = int64(newContent.Len()) return resp, err }
// TestCleaner parses a sample HTML fragment in the context of an <article>
// element, appends the parsed nodes to that element, runs
// NewExtractor("").MakeFragmentReadable on it and passes the first result to
// print_html_doc. It makes no assertions: the parse error and the extra
// return values of MakeFragmentReadable are discarded.
func TestCleaner(t *testing.T) {
	root := &html.Node{Type: html.ElementNode, Data: "article", DataAtom: atom.Article}
	// Sample input: an embedded player, footer links, a "related items" table
	// and tracking images.
	frag := `<p><embed src="http://player.56.com/v_MTAwODQ1MzE2.swf/1030_justin0842.swf" type="application/x-shockwave-flash" width="480" height="405" allowfullscreen="true" allownetworking="all" allowscriptaccess="always"></embed></p> <a href="http://juetuzhi.net/2013/11/jue-tu-9055.html#comments" style="background-color:#FFFFCC; color:#CC0000;" title="看看大家都说了些什么"><strong>已有 0 人发表评论,猛击这里参与讨论</strong></a>。 <hr style="height:1px;"> © 2007-2011 <a href="http://juetuzhi.net/" target="_blank">掘图志</a> | <a href="http://feeds.juetuzhi.net/" target="_blank" title="订阅掘图志,精彩图片送上门">订阅</a> | <a href="http://juetuzhi.net/2013/11/jue-tu-9055.html" target="_blank" title="本文链接">本文链接</a> | <a href="http://weibo.com/geuro" title="关注掘图志的新浪微博">新浪微博</a> | <a href="http://t.qq.com/juetuzhi" title="关注掘图志的腾讯微博">腾讯微博</a><br /><table class="wumii-related-items" cellspacing="0" cellpadding="3" border="0" style="clear: both;"> <tr> <td colspan="5"><b><font size="-1" style="display: block !important; padding: 20px 0 5px !important;">您也许还喜欢:</font></b></td> </tr> <tr> <td width="126" valign="top" style="padding: 5px !important; margin: 0 !important;"> <a target="_blank" title="[视频]年度最佳光棍节歌曲!看,这就是好男人的下场!" 
style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect?url=http%3A%2F%2Fjuetuzhi.net%2F2011%2F09%2Fhao-nan-ren-de-xia-chang.html&from=http%3A%2F%2Fjuetuzhi.net%2F2013%2F11%2Fjue-tu-9055.html"> <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 120px !important; height: 120px !important;" src="http://static.wumii.cn/site_images/ti/jDuuAFV4.jpg?i=MG2m21AQ" width="120px" height="120px" /><br /> <font size="-1" style="display: block !important; line-height: 15px !important; width: 126px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">[视频]年度最佳光棍节歌曲!看,这就是好男人的下场!</font> </a> </td> <td width="126" valign="top" style="padding: 5px !important; margin: 0 !important; border-left: 1px solid #DDDDDD !important;"> <a target="_blank" title="你就一辈子做宅男吧!" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect?url=http%3A%2F%2Fjuetuzhi.net%2F2012%2F06%2Flei-ren-88.html&from=http%3A%2F%2Fjuetuzhi.net%2F2013%2F11%2Fjue-tu-9055.html"> <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 120px !important; height: 120px !important;" src="http://static.wumii.cn/resources/images/related_item_default/31.jpg" width="120px" height="120px" /><br /> <font size="-1" style="display: block !important; line-height: 15px !important; width: 126px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">你就一辈子做宅男吧!</font> </a> </td> <td width="126" valign="top" style="padding: 5px !important; margin: 0 !important; border-left: 1px solid #DDDDDD !important;"> <a target="_blank" title="好萌的小车!" 
style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect?url=http%3A%2F%2Fjuetuzhi.net%2F2012%2F09%2Flei-ren-761.html&from=http%3A%2F%2Fjuetuzhi.net%2F2013%2F11%2Fjue-tu-9055.html"> <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 120px !important; height: 120px !important;" src="http://static.wumii.cn/resources/images/related_item_default/18.jpg" width="120px" height="120px" /><br /> <font size="-1" style="display: block !important; line-height: 15px !important; width: 126px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">好萌的小车!</font> </a> </td> <td width="126" valign="top" style="padding: 5px !important; margin: 0 !important; border-left: 1px solid #DDDDDD !important;"> <a target="_blank" title="好萌的牌子" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect?url=http%3A%2F%2Fjuetuzhi.net%2F2013%2F10%2Fjue-tu-8007.html&from=http%3A%2F%2Fjuetuzhi.net%2F2013%2F11%2Fjue-tu-9055.html"> <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 120px !important; height: 120px !important;" src="http://static.wumii.cn/site_images/ti/8cEhGbW.jpg?i=2VfiBF1v" width="120px" height="120px" /><br /> <font size="-1" style="display: block !important; line-height: 15px !important; width: 126px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">好萌的牌子</font> </a> </td> <td width="126" valign="top" style="padding: 5px !important; margin: 0 !important; border-left: 1px solid #DDDDDD !important;"> <a target="_blank" title="好!有骨气!!!" 
style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect?url=http%3A%2F%2Fjuetuzhi.net%2F2012%2F09%2Flei-ren-733.html&from=http%3A%2F%2Fjuetuzhi.net%2F2013%2F11%2Fjue-tu-9055.html"> <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 120px !important; height: 120px !important;" src="http://static.wumii.cn/site_images/ti/Omwy91sa.jpg?i=CfjL4Eba" width="120px" height="120px" /><br /> <font size="-1" style="display: block !important; line-height: 15px !important; width: 126px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">好!有骨气!!!</font> </a> </td> </tr> <tr> <td colspan="5" align="right"> <a style="text-decoration: none !important;" href="http://www.wumii.com/widget/relatedItems" target="_blank" title="无觅关联推荐"> <font size="-1" color="#bbbbbb" style="display: block !important; font-family: arial !important; padding: 5px 0 !important; font-size: 12px !important; color: #bbb !important;">无觅</font> </a> </td> </tr> </table><img width='1' height='1' src='http://juetuzhi.feedsportal.com/c/34719/f/638620/s/33ccfe03/sc/17/mf.gif' border='0'/><br clear='all'/><br/><br/><a href="http://da.feedsportal.com/r/180264446218/u/0/f/638620/c/34719/s/33ccfe03/a2.htm"><img src="http://da.feedsportal.com/r/180264446218/u/0/f/638620/c/34719/s/33ccfe03/a2.img" border="0"/></a><img width="1" height="1" src="http://pi.feedsportal.com/r/180264446218/u/0/f/638620/c/34719/s/33ccfe03/a2t.img" border="0"/>`
	nodes, _ := html.ParseFragment(strings.NewReader(frag), root)
	// Attach the parsed nodes to the context element so root forms a tree.
	for _, n := range nodes {
		root.AppendChild(n)
	}
	article, _, _ := NewExtractor("").MakeFragmentReadable(root)
	// log.Println(sum)
	print_html_doc(article)
}
func html_create_fragment(fulltext string) (*html.Node, error) { reader := strings.NewReader(fulltext) v := &html.Node{Type: html.ElementNode, Data: "article", DataAtom: atom.Article} frags, err := html.ParseFragment(reader, v) if err != nil { return v, err } for _, frag := range frags { v.AppendChild(frag) } return v, err }
func tidyHtml(input []byte) ([]byte, error) { // tidy nodes, err := html.ParseFragment(bytes.NewReader(input), nil) if err != nil { return nil, err } buf := new(bytes.Buffer) for _, node := range nodes { err = html.Render(buf, node) if err != nil { return nil, err } } return buf.Bytes(), nil }
// GetBodyNode returns an BODY node nested within an HTML node. func getBodyNode() *html.Node { ns, err := html.ParseFragment(strings.NewReader("<html><body></body></html>"), nil) if err != nil { panic("error generating context") } if len(ns) == 0 { panic("no nodes generating context") } h := ns[0] if h.Type != html.ElementNode || h.DataAtom != atom.Html { panic("expected an HTML node, got " + pretty.String(h)) } b := h.LastChild if b.Type != html.ElementNode || b.DataAtom != atom.Body { panic("expected a BODY node, got " + pretty.String(b)) } return b }
func ConvertHtmlToMarkdown(in []byte, rewriteUrl UrlRewriter) ([]byte, error) { // parse it! body := &html.Node{ Type: html.ElementNode, DataAtom: atom.Body, Data: "body", } reader := bytes.NewReader(in) elems, err := html.ParseFragment(reader, body) if err != nil { return nil, err } if reader.Len() != 0 { return nil, errors.New("Post couldn't be fully parsed!") } // stuff it all into the body node so we have a proper tree. for _, elem := range elems { body.AppendChild(elem) } // process shortcodes and WP-LaTeX markup. if err = shortcode.ProcessShortcodes(body); err != nil { return nil, err } shortcode.ProcessWpLatex(body) // render it back wr := &writer{RewriteUrl: rewriteUrl} for elem := body.FirstChild; elem != nil; elem = elem.NextSibling { err = renderElement(wr, elem, -1) if err != nil { return nil, err } } wr.handleDelayedLf() return wr.Bytes(), nil }
func (c *Context) htmlParseFragment(call otto.FunctionCall) otto.Value { fragment := call.Argument(0).String() var ctx *html.Node arg1, _ := call.Argument(1).Export() if c, ok := arg1.(*node); ok { ctx = c.node } nodes, err := html.ParseFragment(strings.NewReader(fragment), ctx) if err != nil { c.Errorf("error parsing HTML fragment: %s\n", err) return otto.Value{} } values := make([]*node, len(nodes)) for ii, v := range nodes { values[ii] = asNode(v, c.vm) } val, err := c.vm.ToValue(values) if err != nil { panic(err) } return val }
func ParseHTML5(filename string) ([]*html.Node, error) { reader, err := os.Open(filename) frag, err := html.ParseFragment(reader, bodyNode) return frag, err }