Example #1
0
// Partial parses the HTML read from r as a fragment in the context of a
// <body> element and returns the resulting top-level nodes, or the error
// reported by the parser.
func Partial(r io.Reader) ([]*html.Node, error) {
	ctx := &html.Node{
		Type:     html.ElementNode,
		DataAtom: atom.Body,
		Data:     "body",
	}
	return html.ParseFragment(r, ctx)
}
Example #2
0
// goodAsText turns d into something that reads well as plain text.
//
// The input is HTML-unescaped and parsed as a fragment in the context of
// bodyNode. For every node handed to the callback by walk:
//   - a text node contributes its whitespace-trimmed data;
//   - any other node contributes a single separating space, unless the output
//     is still empty or already ends in a space;
//   - an <img> node additionally contributes the value of altTextOrEmpty when
//     that value is non-empty.
//
// If parsing fails, the failure is logged and d is returned unchanged.
func goodAsText(d []byte) []byte {
	unescaped := html.UnescapeString(string(d))
	roots, err := html.ParseFragment(strings.NewReader(unescaped), bodyNode)
	if err != nil {
		log.Printf("failed to parse [%s] as HTML: %v", unescaped, err)
		return d
	}

	var out bytes.Buffer
	visit := func(node *html.Node) {
		if node.Type == html.TextNode {
			out.WriteString(strings.TrimSpace(node.Data))
			return
		}

		// Keep the output of neighbouring elements apart without ever
		// emitting a leading space or two spaces in a row.
		if out.Len() > 0 && !bytes.HasSuffix(out.Bytes(), []byte(" ")) {
			out.WriteString(" ")
		}
		if node.DataAtom != atom.Img {
			return
		}
		if alt := altTextOrEmpty(node); alt != "" {
			out.WriteString(alt)
		}
	}
	for _, root := range roots {
		walk(root, visit)
	}
	return out.Bytes()
}
Example #3
0
// strip_html_tags parses htm as an HTML fragment in the context of an
// <article> element and returns the concatenation of extract_html_text
// applied to each resulting top-level node, in document order.
func strip_html_tags(htm string) string {
	root := &html.Node{Type: html.ElementNode, Data: "article", DataAtom: atom.Article}

	// Best effort: the parse error is deliberately discarded and whatever
	// nodes came back are used.
	frags, _ := html.ParseFragment(strings.NewReader(htm), root)

	// A Builder avoids re-copying the accumulated text for every fragment.
	var txt strings.Builder
	for _, f := range frags {
		txt.WriteString(extract_html_text(f))
	}
	return txt.String()
}
Example #4
0
// ProcessResponse rewrites the HTML body of resp so that the href of every
// <a> element is routed through the minion proxy: each href becomes
// "/proxy/minion/" + req.URL.Host + req.URL.Path + <original href>.
//
// The whole body is read, parsed as a fragment in a <body> context, rewritten
// and rendered into a new buffer that replaces resp.Body. The Content-Length
// header is removed and resp.ContentLength is set to the new size.
//
// Returns:
//   - (nil, err) if the body cannot be read;
//   - (resp, err) with the original body bytes restored if parsing fails;
//   - (resp, err) with the rewritten body otherwise, where err is the first
//     render error encountered (nil if every node rendered).
func (t *minionTransport) ProcessResponse(req *http.Request, resp *http.Response) (*http.Response, error) {
	body, err := ioutil.ReadAll(resp.Body)
	// The upstream body is replaced below in every path that returns resp, so
	// release it here instead of leaking it.
	resp.Body.Close()
	if err != nil {
		// copying the response body did not work
		return nil, err
	}

	bodyNode := &html.Node{
		Type:     html.ElementNode,
		Data:     "body",
		DataAtom: atom.Body,
	}
	nodes, err := html.ParseFragment(bytes.NewReader(body), bodyNode)
	if err != nil {
		glog.Errorf("Failed to parse response body as HTML: %v", err)
		// Hand back the bytes that were already consumed so the returned
		// response is still readable.
		resp.Body = ioutil.NopCloser(bytes.NewReader(body))
		return resp, err
	}

	// Define the method to traverse the doc tree and update href node to
	// point to correct minion
	var updateHRef func(*html.Node)
	updateHRef = func(n *html.Node) {
		if n.Type == html.ElementNode && n.Data == "a" {
			for i, attr := range n.Attr {
				if attr.Key == "href" {
					rewritten := &url.URL{
						Path: "/proxy/minion/" + req.URL.Host + req.URL.Path + attr.Val,
					}
					n.Attr[i].Val = rewritten.String()
					// Only the first href attribute is rewritten.
					break
				}
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			updateHRef(c)
		}
	}

	newContent := &bytes.Buffer{}
	// Rendering stays best effort (later nodes are still attempted), but the
	// first failure is remembered so a later success cannot mask it.
	var renderErr error
	for _, n := range nodes {
		updateHRef(n)
		if err := html.Render(newContent, n); err != nil {
			glog.Errorf("Failed to render: %v", err)
			if renderErr == nil {
				renderErr = err
			}
		}
	}

	resp.Body = ioutil.NopCloser(newContent)
	// Update header node with new content-length
	// TODO: Remove any hash/signature headers here?
	resp.Header.Del("Content-Length")
	resp.ContentLength = int64(newContent.Len())

	return resp, renderErr
}
Example #5
0
// TestCleaner feeds a captured feed-item fragment through
// MakeFragmentReadable and prints the resulting document. It makes no
// assertions: the parse error and the extra return values of
// MakeFragmentReadable are discarded, so the output has to be inspected by
// hand.
func TestCleaner(t *testing.T) {
	// Context element the fragment is parsed in; it also becomes the parent
	// of the parsed nodes below.
	root := &html.Node{Type: html.ElementNode, Data: "article", DataAtom: atom.Article}
	// Sample markup: an embed, a run of links, a table of related items and
	// several tracking images.
	frag := `<p><embed src="http://player.56.com/v_MTAwODQ1MzE2.swf/1030_justin0842.swf" type="application/x-shockwave-flash" width="480" height="405" allowfullscreen="true" allownetworking="all" allowscriptaccess="always"></embed></p> <a href="http://juetuzhi.net/2013/11/jue-tu-9055.html#comments" style="background-color:#FFFFCC; color:#CC0000;" title="看看大家都说了些什么"><strong>已有 0 人发表评论,猛击这里参与讨论</strong></a>。 <hr style="height:1px;"> © 2007-2011 <a href="http://juetuzhi.net/" target="_blank">掘图志</a> | <a href="http://feeds.juetuzhi.net/" target="_blank" title="订阅掘图志,精彩图片送上门">订阅</a> | <a href="http://juetuzhi.net/2013/11/jue-tu-9055.html" target="_blank" title="本文链接">本文链接</a> | <a href="http://weibo.com/geuro" title="关注掘图志的新浪微博">新浪微博</a> | <a href="http://t.qq.com/juetuzhi" title="关注掘图志的腾讯微博">腾讯微博</a><br /><table class="wumii-related-items" cellspacing="0" cellpadding="3" border="0" style="clear: both;"> <tr> <td colspan="5"><b><font size="-1" style="display: block !important; padding: 20px 0 5px !important;">您也许还喜欢:</font></b></td> </tr> <tr> <td width="126" valign="top" style="padding: 5px !important; margin: 0 !important;"> <a target="_blank" title="[视频]年度最佳光棍节歌曲!看,这就是好男人的下场!" 
style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect?url=http%3A%2F%2Fjuetuzhi.net%2F2011%2F09%2Fhao-nan-ren-de-xia-chang.html&from=http%3A%2F%2Fjuetuzhi.net%2F2013%2F11%2Fjue-tu-9055.html"> <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 120px !important; height: 120px !important;" src="http://static.wumii.cn/site_images/ti/jDuuAFV4.jpg?i=MG2m21AQ" width="120px" height="120px" /><br /> <font size="-1" style="display: block !important; line-height: 15px !important; width: 126px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">[视频]年度最佳光棍节歌曲!看,这就是好男人的下场!</font> </a> </td> <td width="126" valign="top" style="padding: 5px !important; margin: 0 !important; border-left: 1px solid #DDDDDD !important;"> <a target="_blank" title="你就一辈子做宅男吧!" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect?url=http%3A%2F%2Fjuetuzhi.net%2F2012%2F06%2Flei-ren-88.html&from=http%3A%2F%2Fjuetuzhi.net%2F2013%2F11%2Fjue-tu-9055.html"> <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 120px !important; height: 120px !important;" src="http://static.wumii.cn/resources/images/related_item_default/31.jpg" width="120px" height="120px" /><br /> <font size="-1" style="display: block !important; line-height: 15px !important; width: 126px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">你就一辈子做宅男吧!</font> </a> </td> <td width="126" valign="top" style="padding: 5px !important; margin: 0 !important; border-left: 1px solid #DDDDDD !important;"> <a target="_blank" title="好萌的小车!" 
style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect?url=http%3A%2F%2Fjuetuzhi.net%2F2012%2F09%2Flei-ren-761.html&from=http%3A%2F%2Fjuetuzhi.net%2F2013%2F11%2Fjue-tu-9055.html"> <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 120px !important; height: 120px !important;" src="http://static.wumii.cn/resources/images/related_item_default/18.jpg" width="120px" height="120px" /><br /> <font size="-1" style="display: block !important; line-height: 15px !important; width: 126px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">好萌的小车!</font> </a> </td> <td width="126" valign="top" style="padding: 5px !important; margin: 0 !important; border-left: 1px solid #DDDDDD !important;"> <a target="_blank" title="好萌的牌子" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect?url=http%3A%2F%2Fjuetuzhi.net%2F2013%2F10%2Fjue-tu-8007.html&from=http%3A%2F%2Fjuetuzhi.net%2F2013%2F11%2Fjue-tu-9055.html"> <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 120px !important; height: 120px !important;" src="http://static.wumii.cn/site_images/ti/8cEhGbW.jpg?i=2VfiBF1v" width="120px" height="120px" /><br /> <font size="-1" style="display: block !important; line-height: 15px !important; width: 126px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">好萌的牌子</font> </a> </td> <td width="126" valign="top" style="padding: 5px !important; margin: 0 !important; border-left: 1px solid #DDDDDD !important;"> <a target="_blank" title="好!有骨气!!!" 
style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect?url=http%3A%2F%2Fjuetuzhi.net%2F2012%2F09%2Flei-ren-733.html&from=http%3A%2F%2Fjuetuzhi.net%2F2013%2F11%2Fjue-tu-9055.html"> <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 120px !important; height: 120px !important;" src="http://static.wumii.cn/site_images/ti/Omwy91sa.jpg?i=CfjL4Eba" width="120px" height="120px" /><br /> <font size="-1" style="display: block !important; line-height: 15px !important; width: 126px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">好!有骨气!!!</font> </a> </td> </tr> <tr> <td colspan="5" align="right"> <a style="text-decoration: none !important;" href="http://www.wumii.com/widget/relatedItems" target="_blank" title="无觅关联推荐"> <font size="-1" color="#bbbbbb" style="display: block !important; font-family: arial !important; padding: 5px 0 !important; font-size: 12px !important; color: #bbb !important;">无觅</font> </a> </td> </tr> </table><img width='1' height='1' src='http://juetuzhi.feedsportal.com/c/34719/f/638620/s/33ccfe03/sc/17/mf.gif' border='0'/><br clear='all'/><br/><br/><a href="http://da.feedsportal.com/r/180264446218/u/0/f/638620/c/34719/s/33ccfe03/a2.htm"><img src="http://da.feedsportal.com/r/180264446218/u/0/f/638620/c/34719/s/33ccfe03/a2.img" border="0"/></a><img width="1" height="1" src="http://pi.feedsportal.com/r/180264446218/u/0/f/638620/c/34719/s/33ccfe03/a2t.img" border="0"/>`
	// The parse error is ignored here.
	nodes, _ := html.ParseFragment(strings.NewReader(frag), root)
	// Attach the parsed nodes to root so the extractor receives one tree.
	for _, n := range nodes {
		root.AppendChild(n)
	}
	// Only the first result is used; the other two are dropped.
	article, _, _ := NewExtractor("").MakeFragmentReadable(root)
	//	log.Println(sum)
	print_html_doc(article)
}
Example #6
0
// html_create_fragment parses fulltext as an HTML fragment inside a fresh
// <article> element and returns that element with the parsed nodes attached
// as its children. If parsing fails, the <article> node is returned without
// children together with the parse error.
func html_create_fragment(fulltext string) (*html.Node, error) {
	article := &html.Node{
		Type:     html.ElementNode,
		DataAtom: atom.Article,
		Data:     "article",
	}

	children, err := html.ParseFragment(strings.NewReader(fulltext), article)
	if err != nil {
		return article, err
	}
	for i := range children {
		article.AppendChild(children[i])
	}
	return article, nil
}
Example #7
0
// tidyHtml normalises input by parsing it as an HTML fragment with no context
// element and rendering the parsed nodes back to HTML, in order. The first
// parse or render error is returned with a nil result.
func tidyHtml(input []byte) ([]byte, error) {
	parsed, err := html.ParseFragment(bytes.NewReader(input), nil)
	if err != nil {
		return nil, err
	}

	var out bytes.Buffer
	for i := range parsed {
		if err := html.Render(&out, parsed[i]); err != nil {
			return nil, err
		}
	}
	return out.Bytes(), nil
}
Example #8
0
// getBodyNode parses a minimal "<html><body></body></html>" document and
// returns its BODY element, taken as the last child of the top-level HTML
// element. It panics if parsing fails, yields no nodes, or produces a tree
// that does not have that shape.
func getBodyNode() *html.Node {
	parsed, err := html.ParseFragment(strings.NewReader("<html><body></body></html>"), nil)
	switch {
	case err != nil:
		panic("error generating context")
	case len(parsed) == 0:
		panic("no nodes generating context")
	}

	root := parsed[0]
	if !(root.Type == html.ElementNode && root.DataAtom == atom.Html) {
		panic("expected an HTML node, got " + pretty.String(root))
	}

	body := root.LastChild
	if !(body.Type == html.ElementNode && body.DataAtom == atom.Body) {
		panic("expected a BODY node, got " + pretty.String(body))
	}
	return body
}
Example #9
0
// ConvertHtmlToMarkdown parses in as an HTML fragment in a <body> context,
// runs shortcode and WP-LaTeX processing over the resulting tree, and renders
// every top-level node through a writer configured with rewriteUrl.
//
// It returns the writer's accumulated bytes. An error is returned if parsing
// fails, if the parser left part of the input unread, or if shortcode
// processing or rendering of any node fails.
func ConvertHtmlToMarkdown(in []byte, rewriteUrl UrlRewriter) ([]byte, error) {
	src := bytes.NewReader(in)
	body := &html.Node{Type: html.ElementNode, Data: "body", DataAtom: atom.Body}

	parsed, err := html.ParseFragment(src, body)
	if err != nil {
		return nil, err
	}
	// Anything still unread means the parser stopped before the end.
	if src.Len() > 0 {
		return nil, errors.New("Post couldn't be fully parsed!")
	}

	// Hang the parsed nodes off body so the later passes see a single tree.
	for i := range parsed {
		body.AppendChild(parsed[i])
	}

	// Shortcodes first, then WP-LaTeX markup.
	err = shortcode.ProcessShortcodes(body)
	if err != nil {
		return nil, err
	}
	shortcode.ProcessWpLatex(body)

	// Render the processed tree, one top-level node at a time.
	out := &writer{RewriteUrl: rewriteUrl}
	node := body.FirstChild
	for node != nil {
		err = renderElement(out, node, -1)
		if err != nil {
			return nil, err
		}
		node = node.NextSibling
	}
	out.handleDelayedLf()

	return out.Bytes(), nil
}
Example #10
0
// htmlParseFragment parses the string form of the first call argument as an
// HTML fragment. When the second argument exports to a *node, that node's
// underlying html node is used as the parse context; otherwise the context is
// nil.
//
// A parse error is reported through c.Errorf and an empty otto.Value is
// returned. On success each parsed node is wrapped with asNode and the
// resulting slice is converted with c.vm.ToValue; a conversion failure panics.
func (c *Context) htmlParseFragment(call otto.FunctionCall) otto.Value {
	src := call.Argument(0).String()

	// The export error is ignored: a value that is not a *node simply leaves
	// the parse context nil.
	var parseCtx *html.Node
	exported, _ := call.Argument(1).Export()
	if wrapped, isNode := exported.(*node); isNode {
		parseCtx = wrapped.node
	}

	parsed, err := html.ParseFragment(strings.NewReader(src), parseCtx)
	if err != nil {
		c.Errorf("error parsing HTML fragment: %s\n", err)
		return otto.Value{}
	}

	wrappedNodes := make([]*node, 0, len(parsed))
	for _, p := range parsed {
		wrappedNodes = append(wrappedNodes, asNode(p, c.vm))
	}

	result, err := c.vm.ToValue(wrappedNodes)
	if err != nil {
		panic(err)
	}
	return result
}
Example #11
0
File: ui.go Project: tav/oldproto
// ParseHTML5 opens filename and parses its contents as an HTML fragment in
// the context of bodyNode, returning the resulting top-level nodes.
//
// An error from opening the file is returned as-is with a nil slice, rather
// than being overwritten by the parser's error. The file is closed before
// returning.
func ParseHTML5(filename string) ([]*html.Node, error) {
	f, err := os.Open(filename)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	return html.ParseFragment(f, bodyNode)
}