Beispiel #1
0
// CleanHTML runs htm through clean_html and returns the resulting node tree
// rendered back to an HTML string. The error reported by html.Render is handed
// to panic_error.
func CleanHTML(htm string) string {
	var out bytes.Buffer
	cleaned := clean_html(htm)
	panic_error(html.Render(&out, cleaned))
	return out.String()
}
Beispiel #2
0
// pruneFiltered walks the subtree below n. Each child accepted by
// fpr.Selector is rendered to HTML and phrase-scanned; if the ACL action
// chosen for its significant categories (judged against fpr.Threshold and
// acls) is "block" or "block-invisible", the child is recorded in toDelete.
// Children already in toDelete are skipped, and every child that is kept is
// searched recursively.
func (c *config) pruneFiltered(n *html.Node, fpr filteredPruningRule, acls map[string]bool, toDelete map[*html.Node]bool) {
	// blocked reports whether the rendered content of node earns a blocking
	// action. The render error is ignored, as the scan works on whatever was
	// written.
	blocked := func(node *html.Node) bool {
		var rendered bytes.Buffer
		html.Render(&rendered, node)

		counts := make(map[rule]int)
		c.scanContent(rendered.Bytes(), "text/html", "utf-8", counts)
		significant := significantCategories(c.categoryScores(counts), fpr.Threshold)

		chosen, _ := c.ChooseACLCategoryAction(acls, significant, "allow", "block", "block-invisible")
		switch chosen.Action {
		case "block", "block-invisible":
			return true
		}
		return false
	}

	for child := n.FirstChild; child != nil; child = child.NextSibling {
		if toDelete[child] {
			continue
		}

		if fpr.Selector.Selector(child) && blocked(child) {
			toDelete[child] = true
			continue
		}

		c.pruneFiltered(child, fpr, acls, toDelete)
	}
}
Beispiel #3
0
// fixHtml parses wild as HTML, passes the tree to fixImgs together with
// linkUrl, renders it again, and returns the bytes found between the first
// "<body>" and the first "</body>" that follows it.
//
// The HTML-escaped form of wild is returned instead when parsing or rendering
// fails, when either body marker is missing, or when rendering panics with
// bytes.ErrTooLarge. Any other panic is re-raised.
func fixHtml(linkUrl string, wild []byte) (well []byte) {
	// escaped is the fallback used by every failure path.
	escaped := func() []byte {
		return []byte(html.EscapeString(string(wild)))
	}

	tree, err := html.Parse(bytes.NewReader(wild))
	if err != nil {
		return escaped()
	}

	fixImgs(linkUrl, tree)

	// The named result lets the recovery path substitute the fallback.
	defer func() {
		switch r := recover(); {
		case r == bytes.ErrTooLarge:
			well = escaped()
		case r != nil:
			panic(r)
		}
	}()

	out := bytes.NewBuffer(make([]byte, 0, len(wild)*2))
	if err := html.Render(out, tree); err != nil {
		return escaped()
	}

	const openTag, closeTag = "<body>", "</body>"

	rendered := out.Bytes()
	start := bytes.Index(rendered, []byte(openTag))
	if start < 0 {
		return escaped()
	}
	rendered = rendered[start+len(openTag):]

	end := bytes.Index(rendered, []byte(closeTag))
	if end < 0 {
		return escaped()
	}
	return rendered[:end]
}
Beispiel #4
0
// Process tidies the description of every article in f and returns the feed.
//
// Each description is trimmed of surrounding whitespace and parsed as an HTML
// fragment. When nodesCleanup returns true for the parsed nodes, they are
// rendered again and the markup between "<body>" and the last "</body>"
// replaces the description. An article keeps its trimmed description when
// parsing or rendering fails, when no nodes remain, or when the rendered
// output carries no body wrapper; the remaining articles are still processed.
func (p Cleanup) Process(f parser.Feed) parser.Feed {
	p.logger.Infof("Cleaning up feed '%s'\n", f.Title)

	for i := range f.Articles {
		f.Articles[i].Description = strings.TrimSpace(f.Articles[i].Description)

		nodes, err := html.ParseFragment(strings.NewReader(f.Articles[i].Description), nil)
		if err != nil || !nodesCleanup(nodes) || len(nodes) == 0 {
			// Skip only this article; later ones still need cleaning.
			continue
		}

		content, ok := func() (string, bool) {
			buf := util.BufferPool.GetBuffer()
			// Scoped to this closure so the buffer goes back to the pool after
			// each article instead of piling up until Process returns.
			defer util.BufferPool.Put(buf)

			for _, n := range nodes {
				if err := html.Render(buf, n); err != nil {
					return "", false
				}
			}

			// The HTML parser wraps the fragment in html, head and body
			// elements; keep only what sits inside the body.
			const openTag, closeTag = "<body>", "</body>"
			rendered := buf.String()
			start := strings.Index(rendered, openTag)
			end := strings.LastIndex(rendered, closeTag)
			if start < 0 || end < start+len(openTag) {
				// Slicing with a missing marker would panic.
				return "", false
			}
			return rendered[start+len(openTag) : end], true
		}()
		if ok {
			f.Articles[i].Description = content
		}
	}

	return f
}
Beispiel #5
0
// StrFromNodes renders every node in nodes, in order, into one buffer and
// returns the concatenated HTML. Errors from html.Render are ignored.
func StrFromNodes(nodes []*html.Node) string {
	var out bytes.Buffer
	for i := range nodes {
		html.Render(&out, nodes[i])
	}
	return out.String()
}
Beispiel #6
0
// getArticle parses data as HTML, runs the tree through the tag and attribute
// filtering helpers in the order below, and returns the rendered result after
// utils has removed newlines, replaced tabs with a space and trimmed spaces.
//
// The empty string is returned when data cannot be parsed, when the tree
// cannot be rendered, when nothing is left after rendering, or when the value
// from calcContent is below 20% of the rendered length.
func getArticle(data []byte) string {
	doc, err := html.Parse(bytes.NewReader(data))
	if err != nil {
		// Without a tree the filters below would receive a nil node.
		return ""
	}

	// Tags
	doc = removeNegativeCandidates(doc)
	doc = removeNegativeMatches(doc)
	doc = getBodyElement(doc)
	// Attributes
	doc = removeNegativeAttributeMatches(doc)
	doc, _ = retriveMainRole(doc)
	doc = removeNonMainContent(doc)
	doc = clearClassesAndIDs(doc)
	c := calcContent(doc)

	var buff bytes.Buffer
	if err := html.Render(&buff, doc); err != nil {
		return ""
	}

	articlestr := buff.String()
	articlestr = utils.RemoveNewLines(articlestr)
	articlestr = utils.ReplaceTabsWithASpace(articlestr)
	articlestr = utils.TrimSpaces(articlestr)

	// An empty rendering has no ratio to compute; it also yields "".
	if len(articlestr) == 0 {
		return ""
	}
	if float64(c)/float64(len(articlestr)) < 0.2 {
		// At least 20% of the article should be text
		return ""
	}
	return articlestr
}
Beispiel #7
0
func loadXpath(response *http.Response, xpath string) ([]byte, error) {
	body, err := ioutil.ReadAll(response.Body)
	panicError(err)

	// Parse body to see if login worked
	//	reader := strings.NewReader(body)
	root, err := html.Parse(bytes.NewBuffer(body))
	if err != nil {
		return nil, err
	}

	var b bytes.Buffer
	html.Render(&b, root)
	fixedHtml := b

	//	body = bytes.NewReader(fixedHtml)
	xmlroot, xmlerr := xmlpath.ParseHTML(bytes.NewReader(fixedHtml.Bytes()))

	if xmlerr != nil {
		return nil, xmlerr
	}

	path := xmlpath.MustCompile(xpath)
	if value, ok := path.Bytes(xmlroot); ok {
		return value, nil
	}

	return nil, errors.New("Could not find xpath")
}
// getRootNode fetches urlToOpen with a single GET round trip on
// http.DefaultTransport, normalizes the response body by parsing and
// re-rendering it as HTML, and returns the xmlpath root node of the result.
//
// Every failure is reported through log and yields nil.
func getRootNode(urlToOpen string) *xmlpath.Node {
	req, err := http.NewRequest("GET", urlToOpen, nil)
	if err != nil {
		log("error1: " + err.Error())
		return nil
	}

	response, err := http.DefaultTransport.RoundTrip(req)
	if err != nil {
		log("Error 3: " + err.Error())
		return nil
	}
	defer response.Body.Close()

	content, err := ioutil.ReadAll(response.Body)
	if err != nil {
		log("Error 2: " + err.Error())
		return nil
	}

	root, err := html.Parse(bytes.NewReader(content))
	if err != nil {
		log("Parse error: " + err.Error())
		// There is no tree to render; continuing would dereference nil.
		return nil
	}

	var b bytes.Buffer
	if err := html.Render(&b, root); err != nil {
		log("Render error: " + err.Error())
		return nil
	}

	rootNode, err := xmlpath.ParseHTML(strings.NewReader(b.String()))
	if err != nil {
		log("Error 4: " + err.Error())
		return nil
	}
	return rootNode
}
// stripHTML rewrites the file at filename so that it contains only the
// rendered children of its <html> element, dropping the doctype and the
// <html> tag itself.
//
// The file is parsed completely before it is truncated. If the parsed document
// has no <html> element, os.ErrInvalid is returned and the file is left as it
// was. An error from closing the file is returned when nothing failed earlier,
// since it can indicate that the rewritten content was not stored.
func stripHTML(filename string) (err error) {
	targetHTML, err := os.OpenFile(filename, os.O_RDWR, 0666)
	if err != nil {
		return err
	}
	defer func() {
		if cerr := targetHTML.Close(); err == nil {
			err = cerr
		}
	}()

	target, err := html.Parse(targetHTML)
	if err != nil {
		return err
	}

	// Find <html> by type and name rather than by position: the doctype is
	// optional and comments may precede the root element, so it is not always
	// the second child of the document.
	var htmlNode *html.Node
	for c := target.FirstChild; c != nil; c = c.NextSibling {
		if c.Type == html.ElementNode && c.Data == "html" {
			htmlNode = c
			break
		}
	}
	if htmlNode == nil {
		return os.ErrInvalid
	}

	err = targetHTML.Truncate(0)
	if err != nil {
		return err
	}
	_, err = targetHTML.Seek(0, 0)
	if err != nil {
		return err
	}

	// Write each child of <html> in document order.
	for node := htmlNode.FirstChild; node != nil; node = node.NextSibling {
		err = html.Render(targetHTML, node)
		if err != nil {
			return err
		}
	}
	return nil
}
// Test2 parses testDocs[0], applies removeCommentsAndIntertagWhitespace,
// breakoutImagesFromAnchorTrees and reIndent to the tree, and expects the
// rendered result to equal testDocs[1]. The input, the tree and the expected
// output are also written to files for manual comparison.
func Test2(t *testing.T) {

	lg, lge := loghttp.Logger(nil, nil)

	doc, err := html.Parse(strings.NewReader(testDocs[0]))
	if err != nil {
		lge(err)
		// A fixture that cannot be parsed must fail the test, not pass it.
		t.Fatalf("parsing input document: %v", err)
	}
	removeCommentsAndIntertagWhitespace(NdX{doc, 0})

	breakoutImagesFromAnchorTrees(doc)

	removeCommentsAndIntertagWhitespace(NdX{doc, 0})
	reIndent(doc, 0)
	var b bytes.Buffer
	err = html.Render(&b, doc)
	lge(err)
	if b.String() != testDocs[1] {
		t.Errorf("output unexpted")
	}

	osutilpb.Bytes2File("outp1_inp.html", []byte(testDocs[0]))
	osutilpb.Dom2File("outp2_got.html", doc)
	osutilpb.Bytes2File("outp3_want.html", []byte(testDocs[1]))

	lg("end")

}
Beispiel #11
0
func SpoonerizeHTML(r io.Reader, extraHTML string) io.ReadCloser {
	doc, _ := html.Parse(r)
	var f func(*html.Node)
	f = func(n *html.Node) {
		switch n.Type {
		case html.TextNode:
			n.Data = string(Spoonerize([]byte(n.Data)))
		case html.ElementNode:
			switch n.DataAtom {
			case atom.Style, atom.Script:
				return
			}
		}

		for c := n.FirstChild; c != nil; c = c.NextSibling {
			f(c)
		}

		if n.DataAtom == atom.Body {
			if extraHTML != "" {
				nodes, _ := html.ParseFragment(bytes.NewBufferString(extraHTML), n)
				for _, node := range nodes {
					n.AppendChild(node)
				}
			}
		}
	}
	f(doc)

	d := &bufferCloser{}
	html.Render(d, doc)
	return d
}
Beispiel #12
0
// RenderClean is a convenience wrapper around html.Render: it renders node
// into memory and returns the markup as a string, or an empty string together
// with the render error.
func RenderClean(node *html.Node) (htmlStr string, err error) {
	var out bytes.Buffer
	if err = html.Render(&out, node); err != nil {
		return "", err
	}
	return out.String(), nil
}
Beispiel #13
0
// createIndexMinHTMLFile renders document into the file "index.min.html"
// inside dir, creating or truncating that file.
//
// Failing to create the file terminates the program through log.Fatalf. A
// failure while rendering is logged and the function returns normally, so the
// deferred closeFile call still runs.
func createIndexMinHTMLFile(document *html.Node, dir string) {
	wrtr, err := os.Create(filepath.Join(dir, "index.min.html"))
	if err != nil {
		log.Fatalf("Error: could not open file for write: %v", err)
	}
	defer closeFile(wrtr, true)

	if err := html.Render(wrtr, document); err != nil {
		log.Printf("Error: could not write file: %v", err)
	}
}
Beispiel #14
0
// saveChapter renders root and adds the result to the book under chapters,
// then sets the blank flag. It does nothing when the blank flag is already
// set. The error from html.Render is ignored.
func (m *EpubMaker) saveChapter(root *html.Node, chapters []Chapter) {
	if m.blank {
		return
	}

	var rendered bytes.Buffer
	html.Render(&rendered, root)
	m.book.AddChapter(chapters, rendered.Bytes())
	m.blank = true
}
Beispiel #15
0
// Dom2File renders node and writes the resulting bytes to the file named fn
// through Bytes2File. The render error is passed to the logger and the file is
// written regardless of it.
func Dom2File(fn string, node *html.Node) {
	lg, _ := loghttp.BuffLoggerUniversal(nil, nil)

	rendered := new(bytes.Buffer)
	lg(html.Render(rendered, node))
	Bytes2File(fn, rendered.Bytes())
}
Beispiel #16
0
// renderNode returns the HTML rendering of node, or an empty string and the
// error reported by html.Render.
func renderNode(node *html.Node) (string, error) {
	out := new(bytes.Buffer)
	if err := html.Render(out, node); err != nil {
		return "", err
	}
	return out.String(), nil
}
Beispiel #17
0
// Render writes the siblings to w as HTML. The nodes produced by s.convert are
// attached as the children of a temporary document node, which is then
// rendered. Per the original note, nil nodes are skipped (presumably by
// convert; confirm there).
func (s Siblings) Render(w io.Writer) error {
	doc := &html.Node{Type: html.DocumentNode}
	doc.FirstChild, doc.LastChild = s.convert(doc)
	return html.Render(w, doc)
}
Beispiel #18
0
// renderTree renders node and everything below it to an HTML string. On
// failure it returns an empty string and the error from html.Render.
func renderTree(node *html.Node) (string, error) {
	out := &bytes.Buffer{}
	if renderErr := html.Render(out, node); renderErr != nil {
		return "", renderErr
	}
	return out.String(), nil
}
Beispiel #19
0
// buildPlain renders n to HTML and returns it as s. When rendering fails, s is
// empty and err holds the error from html.Render.
func buildPlain(n *html.Node) (s string, err error) {
	var out bytes.Buffer
	if err = html.Render(&out, n); err != nil {
		return "", err
	}
	return out.String(), nil
}
Beispiel #20
0
// RenderNodes renders the nodes in ns to w one after another, stopping at and
// returning the first error from html.Render. Nodes written before a failure
// remain in w.
func RenderNodes(w io.Writer, ns []*html.Node) error {
	for i := range ns {
		if err := html.Render(w, ns[i]); err != nil {
			return err
		}
	}
	return nil
}
Beispiel #21
0
// Render wraps html.Render for callers that want a string rather than an
// io.Writer: the nodes are rendered in order into one buffer and the combined
// markup is returned. Each render error is checked with expectError(err, nil).
func Render(nodes ...*html.Node) string {
	out := new(bytes.Buffer)

	for i := range nodes {
		expectError(html.Render(out, nodes[i]), nil)
	}

	return out.String()
}
Beispiel #22
0
// RenderComponents renders components into HTML, writing the result to w.
// Context-aware escaping is done just like in html/template when rendering
// nodes.
//
// Components are processed in order, each contributing the nodes returned by
// its Render method. The first error from html.Render stops the process and is
// returned; output written up to that point stays in w.
func RenderComponents(w io.Writer, components ...Component) error {
	for _, component := range components {
		nodes := component.Render()
		for _, node := range nodes {
			if err := html.Render(w, node); err != nil {
				return err
			}
		}
	}
	return nil
}
Beispiel #23
0
// RenderComponentsContext renders components into HTML, writing the result to
// w. Context-aware escaping is done just like in html/template when rendering
// nodes.
//
// Components are processed in order, each contributing the nodes returned by
// its RenderContext method, which receives ctx. The first error from
// html.Render stops the process and is returned.
//
// RenderComponentsContext is experimental and may be changed or removed.
func RenderComponentsContext(ctx context.Context, w io.Writer, components ...ComponentContext) error {
	for _, component := range components {
		nodes := component.RenderContext(ctx)
		for _, node := range nodes {
			if err := html.Render(w, node); err != nil {
				return err
			}
		}
	}
	return nil
}
Beispiel #24
0
// fmtHTML canonicalizes in by parsing it as an HTML document and rendering the
// parsed tree back to a string. It panics if parsing or rendering fails.
func fmtHTML(in string) string {
	tree, parseErr := html.Parse(strings.NewReader(in))
	if parseErr != nil {
		panic(parseErr)
	}

	var out bytes.Buffer
	if renderErr := html.Render(&out, tree); renderErr != nil {
		panic(renderErr)
	}
	return out.String()
}
// getTagName renders a detached copy of node and returns the resulting HTML.
// The copy keeps the node's type, atom, data, namespace and attributes but has
// no parent, siblings or children, so only the node itself appears in the
// output. The error from html.Render is ignored.
func getTagName(node *html.Node) string {
	var orphan html.Node
	orphan.Type = node.Type
	orphan.DataAtom = node.DataAtom
	orphan.Data = node.Data
	orphan.Namespace = node.Namespace
	orphan.Attr = node.Attr

	out := new(bytes.Buffer)
	html.Render(out, &orphan)
	return out.String()
}
Beispiel #26
0
// OuterHtml returns the outer HTML rendering of the first item in the
// selection, that is, the HTML including the first element's tag and
// attributes. An empty selection yields an empty string and a nil error.
//
// Unlike InnerHtml, this is a function and not a method on the Selection,
// because this is not a jQuery method (in javascript-land, this is a property
// provided by the DOM).
func OuterHtml(s *Selection) (string, error) {
	if s.Length() == 0 {
		return "", nil
	}

	out := new(bytes.Buffer)
	if err := html.Render(out, s.Get(0)); err != nil {
		return "", err
	}
	return out.String(), nil
}
Beispiel #27
0
// writeHTML renders root into a file that mirrors the relative path orig
// underneath dest, creating the intermediate directories with mode 0755 and
// creating or truncating the file itself.
//
// It returns the first error from creating the directories, creating the
// file, or rendering. If all of those succeed, an error from closing the file
// is returned, since it can mean the output did not reach the disk.
func writeHTML(orig, dest string, root *html.Node) (err error) {
	dir := filepath.Dir(orig)
	base := filepath.Base(orig)

	err = os.MkdirAll(filepath.Join(dest, dir), 0755)
	if err != nil {
		return err
	}

	out, err := os.Create(filepath.Join(dest, dir, base))
	if err != nil {
		return err
	}
	defer func() {
		if cerr := out.Close(); err == nil {
			err = cerr
		}
	}()

	return html.Render(out, root)
}
Beispiel #28
0
// Fuzz parses data as an HTML fragment and renders every resulting node to a
// discarding writer. It returns 0 when parsing fails and 1 otherwise; a render
// error causes a panic.
func Fuzz(data []byte) int {
	fragment, parseErr := html.ParseFragment(bytes.NewReader(data), nil)
	if parseErr != nil {
		return 0
	}

	for i := range fragment {
		if renderErr := html.Render(ioutil.Discard, fragment[i]); renderErr != nil {
			panic(renderErr)
		}
	}
	return 1
}
Beispiel #29
0
// Render returns a pointer to the HTML rendering of the wrapped node. It
// returns nil when the receiver is nil or when html.Render reports an error.
func (n *Node) Render() *string {
	if n == nil {
		return nil
	}

	out := new(bytes.Buffer)
	if err := html.Render(out, &n.n); err != nil {
		return nil
	}

	rendered := out.String()
	return &rendered
}
Beispiel #30
0
// Render renders HTML nodes, in order, and returns the combined result as
// template.HTML. Context-aware escaping is done just like in html/template
// when rendering nodes. It panics if html.Render returns an error.
//
// TODO: Return string instead of template.HTML; returning template.HTML has proven to be unhelpful (since so many consumers expect a simple string).
func Render(nodes ...*html.Node) template.HTML {
	out := new(bytes.Buffer)
	for i := range nodes {
		if err := html.Render(out, nodes[i]); err != nil {
			// html.Render should only fail when writing to the supplied
			// io.Writer fails. That is not expected for an in-memory buffer
			// (short of running out of memory), so treat it as a bug in this
			// library that should be reported and fixed.
			panic(fmt.Errorf("internal error: html.Render returned non-nil error, this is not expected to happen: %v", err))
		}
	}
	return template.HTML(out.String())
}