// CleanHTML ... func CleanHTML(htm string) string { root := clean_html(htm) var writer = &bytes.Buffer{} err := html.Render(writer, root) panic_error(err) return writer.String() }
// pruneFiltered phrase-scans children of n that match fpr.Selector, and adds // to toDelete those that should be removed according to fpr.Threshold and acls. func (c *config) pruneFiltered(n *html.Node, fpr filteredPruningRule, acls map[string]bool, toDelete map[*html.Node]bool) { for child := n.FirstChild; child != nil; child = child.NextSibling { if toDelete[child] { continue } remove := false if fpr.Selector.Selector(child) { buf := new(bytes.Buffer) html.Render(buf, child) tally := make(map[rule]int) c.scanContent(buf.Bytes(), "text/html", "utf-8", tally) scores := c.categoryScores(tally) categories := significantCategories(scores, fpr.Threshold) rule, _ := c.ChooseACLCategoryAction(acls, categories, "allow", "block", "block-invisible") remove = rule.Action == "block" || rule.Action == "block-invisible" } if remove { toDelete[child] = true } else { c.pruneFiltered(child, fpr, acls, toDelete) } } }
// FixHtml parses bytes as HTML and returns well-formed HTML if the parse // was successful, or escaped HTML, if not. func fixHtml(linkUrl string, wild []byte) (well []byte) { n, err := html.Parse(bytes.NewReader(wild)) if err != nil { return []byte(html.EscapeString(string(wild))) } fixImgs(linkUrl, n) defer func() { if err := recover(); err == bytes.ErrTooLarge { well = []byte(html.EscapeString(string(wild))) } else if err != nil { panic(err) } }() buf := bytes.NewBuffer(make([]byte, 0, len(wild)*2)) if err := html.Render(buf, n); err != nil { return []byte(html.EscapeString(string(wild))) } well = buf.Bytes() openBody := []byte("<body>") i := bytes.Index(well, openBody) if i < 0 { return []byte(html.EscapeString(string(wild))) } well = well[i+len(openBody):] closeBody := []byte("</body>") i = bytes.Index(well, closeBody) if i < 0 { return []byte(html.EscapeString(string(wild))) } return well[:i] }
func (p Cleanup) Process(f parser.Feed) parser.Feed { p.logger.Infof("Cleaning up feed '%s'\n", f.Title) for i := range f.Articles { f.Articles[i].Description = strings.TrimSpace(f.Articles[i].Description) if nodes, err := html.ParseFragment(strings.NewReader(f.Articles[i].Description), nil); err == nil { if nodesCleanup(nodes) { if len(nodes) == 0 { break } buf := util.BufferPool.GetBuffer() defer util.BufferPool.Put(buf) for _, n := range nodes { err = html.Render(buf, n) if err != nil { break } } content := buf.String() // net/http tries to provide valid html, adding html, head and body tags content = content[strings.Index(content, "<body>")+6 : strings.LastIndex(content, "</body>")] f.Articles[i].Description = content } } } return f }
// StrFromNodes returns the string of the rendered html.Nodes. func StrFromNodes(nodes []*html.Node) string { buf := bytes.NewBuffer([]byte{}) for _, h := range nodes { html.Render(buf, h) } return buf.String() }
func getArticle(data []byte) string { r := bytes.NewReader(data) doc, _ := html.Parse(r) // Tags doc = removeNegativeCandidates(doc) doc = removeNegativeMatches(doc) doc = getBodyElement(doc) // Attributes doc = removeNegativeAttributeMatches(doc) doc, _ = retriveMainRole(doc) doc = removeNonMainContent(doc) doc = clearClassesAndIDs(doc) c := calcContent(doc) var buff bytes.Buffer html.Render(&buff, doc) articlestr := buff.String() articlestr = utils.RemoveNewLines(articlestr) articlestr = utils.ReplaceTabsWithASpace(articlestr) articlestr = utils.TrimSpaces(articlestr) if float64(c)/float64(len(articlestr)) < 0.2 { // At least 20% of the article should be text return "" } return articlestr }
func loadXpath(response *http.Response, xpath string) ([]byte, error) { body, err := ioutil.ReadAll(response.Body) panicError(err) // Parse body to see if login worked // reader := strings.NewReader(body) root, err := html.Parse(bytes.NewBuffer(body)) if err != nil { return nil, err } var b bytes.Buffer html.Render(&b, root) fixedHtml := b // body = bytes.NewReader(fixedHtml) xmlroot, xmlerr := xmlpath.ParseHTML(bytes.NewReader(fixedHtml.Bytes())) if xmlerr != nil { return nil, xmlerr } path := xmlpath.MustCompile(xpath) if value, ok := path.Bytes(xmlroot); ok { return value, nil } return nil, errors.New("Could not find xpath") }
func getRootNode(urlToOpen string) *xmlpath.Node { req, err := http.NewRequest("GET", urlToOpen, nil) if err != nil { log("error1: " + err.Error()) } else { response, err := http.DefaultTransport.RoundTrip(req) if err != nil { log("Error 3: " + err.Error()) } else { defer response.Body.Close() content, err := ioutil.ReadAll(response.Body) if err != nil { log("Error 2: " + err.Error()) } else { root, err := html.Parse(bytes.NewReader(content)) if err != nil { log("Parse error: " + err.Error()) } var b bytes.Buffer html.Render(&b, root) fixedHtml := b.String() reader := strings.NewReader(fixedHtml) rootNode, err := xmlpath.ParseHTML(reader) if err != nil { log("Error 4: " + err.Error()) } else { return rootNode } } } } return nil }
//strip the <!DOCTYPE html> and <html> tag func stripHTML(filename string) error { targetHTML, err := os.OpenFile(filename, os.O_RDWR, 0666) if err != nil { return err } defer targetHTML.Close() target, err := html.Parse(targetHTML) if err != nil { return err } //node for <head> headNode := target.FirstChild.NextSibling.FirstChild err = targetHTML.Truncate(0) if err != nil { return err } _, err = targetHTML.Seek(0, 0) if err != nil { return err } for node := headNode; node != nil; node = node.NextSibling { err = html.Render(targetHTML, node) if err != nil { return err } } return nil }
func Test2(t *testing.T) { lg, lge := loghttp.Logger(nil, nil) doc, err := html.Parse(strings.NewReader(testDocs[0])) if err != nil { lge(err) return } removeCommentsAndIntertagWhitespace(NdX{doc, 0}) breakoutImagesFromAnchorTrees(doc) removeCommentsAndIntertagWhitespace(NdX{doc, 0}) reIndent(doc, 0) var b bytes.Buffer err = html.Render(&b, doc) lge(err) if b.String() != testDocs[1] { t.Errorf("output unexpted") } osutilpb.Bytes2File("outp1_inp.html", []byte(testDocs[0])) osutilpb.Dom2File("outp2_got.html", doc) osutilpb.Bytes2File("outp3_want.html", []byte(testDocs[1])) lg("end") }
func SpoonerizeHTML(r io.Reader, extraHTML string) io.ReadCloser { doc, _ := html.Parse(r) var f func(*html.Node) f = func(n *html.Node) { switch n.Type { case html.TextNode: n.Data = string(Spoonerize([]byte(n.Data))) case html.ElementNode: switch n.DataAtom { case atom.Style, atom.Script: return } } for c := n.FirstChild; c != nil; c = c.NextSibling { f(c) } if n.DataAtom == atom.Body { if extraHTML != "" { nodes, _ := html.ParseFragment(bytes.NewBufferString(extraHTML), n) for _, node := range nodes { n.AppendChild(node) } } } } f(doc) d := &bufferCloser{} html.Render(d, doc) return d }
// RenderClean renders the provided HTML node and returns it as a string. It is // a convenience function for html.Render. func RenderClean(node *html.Node) (htmlStr string, err error) { w := new(bytes.Buffer) err = html.Render(w, node) if err != nil { return "", err } return string(w.Bytes()), nil }
func createIndexMinHTMLFile(document *html.Node, dir string) { wrtr, err := os.Create(filepath.Join(dir, "index.min.html")) if err != nil { log.Fatalf("Error: could not open file for write: %v", err) } defer closeFile(wrtr, true) html.Render(wrtr, document) }
func (this *EpubMaker) saveChapter(root *html.Node, chapters []Chapter) { if !this.blank { buf := new(bytes.Buffer) html.Render(buf, root) this.book.AddChapter(chapters, buf.Bytes()) this.blank = true } }
// Dom2File writes DOM to file func Dom2File(fn string, node *html.Node) { lg, _ := loghttp.BuffLoggerUniversal(nil, nil) var b bytes.Buffer err := html.Render(&b, node) lg(err) Bytes2File(fn, b.Bytes()) }
func renderNode(node *html.Node) (string, error) { var buf bytes.Buffer err := html.Render(&buf, node) if err != nil { return "", err } return buf.String(), nil }
// Render nodes to a writer. // nil nodes are skipped. func (s Siblings) Render(w io.Writer) error { doc := &html.Node{ Type: html.DocumentNode, } first, last := s.convert(doc) doc.FirstChild = first doc.LastChild = last return html.Render(w, doc) }
func renderTree(node *html.Node) (string, error) { var b bytes.Buffer err := html.Render(&b, node) if err != nil { return "", err } return b.String(), nil }
func buildPlain(n *html.Node) (s string, err error) { buf := new(bytes.Buffer) err = html.Render(buf, n) if err != nil { return } s = buf.String() return }
func RenderNodes(w io.Writer, ns []*html.Node) error { for _, n := range ns { err := html.Render(w, n) if err != nil { return err } } return nil }
// Render is a convenience function that wraps html.Render and renders to a // string instead of an io.Writer. func Render(nodes ...*html.Node) string { var buf bytes.Buffer for _, n := range nodes { err := html.Render(&buf, n) expectError(err, nil) } return string(buf.Bytes()) }
// RenderComponents renders components into HTML, writing result to w. // Context-aware escaping is done just like in html/template when rendering nodes. func RenderComponents(w io.Writer, components ...Component) error { for _, c := range components { for _, node := range c.Render() { err := html.Render(w, node) if err != nil { return err } } } return nil }
// RenderComponentsContext renders components into HTML, writing result to w. // Context-aware escaping is done just like in html/template when rendering nodes. // // RenderComponentsContext is experimental and may be changed or removed. func RenderComponentsContext(ctx context.Context, w io.Writer, components ...ComponentContext) error { for _, c := range components { for _, node := range c.RenderContext(ctx) { err := html.Render(w, node) if err != nil { return err } } } return nil }
// fmtHTML parses and re-emits 'in', effectively canonicalizing it. func fmtHTML(in string) string { doc, err := html.Parse(strings.NewReader(in)) if err != nil { panic(err) } out := &bytes.Buffer{} if err := html.Render(out, doc); err != nil { panic(err) } return string(out.Bytes()) }
func getTagName(node *html.Node) string { orphanNode := &html.Node{ Type: node.Type, DataAtom: node.DataAtom, Data: node.Data, Namespace: node.Namespace, Attr: node.Attr, } var buffer bytes.Buffer html.Render(&buffer, orphanNode) return buffer.String() }
// OuterHtml returns the outer HTML rendering of the first item in // the selection - that is, the HTML including the first element's // tag and attributes. // // Unlike InnerHtml, this is a function and not a method on the Selection, // because this is not a jQuery method (in javascript-land, this is // a property provided by the DOM). func OuterHtml(s *Selection) (string, error) { var buf bytes.Buffer if s.Length() == 0 { return "", nil } n := s.Get(0) if err := html.Render(&buf, n); err != nil { return "", err } return buf.String(), nil }
func writeHTML(orig, dest string, root *html.Node) error { dir := filepath.Dir(orig) base := filepath.Base(orig) os.MkdirAll(filepath.Join(dest, dir), 0755) out, err := os.Create(filepath.Join(dest, dir, base)) if err != nil { return err } defer out.Close() return html.Render(out, root) }
func Fuzz(data []byte) int { nodes, err := html.ParseFragment(bytes.NewReader(data), nil) if err != nil { return 0 } for _, n := range nodes { if err := html.Render(ioutil.Discard, n); err != nil { panic(err) } } return 1 }
func (n *Node) Render() *string { if n == nil { return nil } var b bytes.Buffer err := html.Render(&b, &n.n) if err != nil { return nil } s := b.String() return &s }
// Render renders HTML nodes, returning result as template.HTML. // Context-aware escaping is done just like in html/template when rendering nodes. // // TODO: Return string instead of template.HTML; returning template.HTML has proven to be unhelpful (since so many consumers expect a simple string). func Render(nodes ...*html.Node) template.HTML { var buf bytes.Buffer for _, node := range nodes { err := html.Render(&buf, node) if err != nil { // html.Render should only return a non-nil error if there's a problem writing to the supplied io.Writer. // We don't expect that to ever be the case (unless there's not enough memory), so panic. // If this ever happens in other situations, it's a bug in this library that should be reported and fixed. panic(fmt.Errorf("internal error: html.Render returned non-nil error, this is not expected to happen: %v", err)) } } return template.HTML(buf.String()) }