// Archive / upload final entity HTML func finalizeEntity(entity *Entity, doc *html.Node, entDir string) { log.Println("Entity tmp directory: ", entDir) var finalHTML bytes.Buffer bl := html.Render(&finalHTML, doc) if bl != nil { log.Println(bl) } err := StoreHTML(finalHTML, entDir) if err != nil { log.Println("Error in StoreHTML: ", err) } zipName, err := ArchiveFinalFiles(entDir) if err != nil { log.Println("Error archiving files: ", err) } err = Update(entity, bson.M{"uuid": entity.UUID}, bson.M{"$set": bson.M{"status": UploadingStatus}}) if err != nil { log.Println("Error updating entity: ", err) } awsLink, err := UploadEntity(zipName, entity) if err != nil { log.Println("Error uploading final files: ", err) } err = Update(entity, bson.M{"uuid": entity.UUID}, bson.M{"$set": bson.M{"aws_link": awsLink, "status": CompleteStatus}}) if err != nil { log.Println("Error updating entity: ", err) } }
func FetchFullDescription(link string) string { res, err := http.Get(link) if err != nil { log.Fatal(err) } body, err := ioutil.ReadAll(res.Body) res.Body.Close() if err != nil { log.Fatal(err) } doc, err := html.Parse(strings.NewReader(string(body))) content := "" var f func(*html.Node) f = func(n *html.Node) { if n.Type == html.ElementNode && n.Data == "section" { for _, a := range n.Attr { if a.Key == "class" && a.Val == "entry-content cf" { var buf bytes.Buffer html.Render(&buf, n) content = buf.String() break } } } for c := n.FirstChild; c != nil; c = c.NextSibling { f(c) } } f(doc) return content }
// FixHtml parses bytes as HTML and returns well-formed HTML if the parse
// was successful, or escaped HTML, if not.
//
// More precisely: the input is parsed into a full document, image
// references are fixed up relative to linkUrl, and the document is
// re-rendered; only the content between the <body> and </body> tags of the
// rendering is returned. Every failure path falls back to returning the
// original input HTML-escaped, so the result is always safe to embed.
//
// The deferred recover handles bytes.Buffer growing past its limit during
// Render (bytes.Buffer panics with bytes.ErrTooLarge); in that case the
// escaped input is returned via the named result `well`. Any other panic
// value is re-raised. Note the defer is installed after the parse so parse
// failures take the plain error path.
func fixHtml(linkUrl string, wild []byte) (well []byte) {
	n, err := html.Parse(bytes.NewReader(wild))
	if err != nil {
		return []byte(html.EscapeString(string(wild)))
	}
	// Rewrite image references relative to linkUrl before rendering.
	fixImgs(linkUrl, n)
	defer func() {
		if err := recover(); err == bytes.ErrTooLarge {
			well = []byte(html.EscapeString(string(wild)))
		} else if err != nil {
			panic(err)
		}
	}()
	// Pre-size generously: rendering can expand the input (escaping, implied
	// tags), so start at twice the original length.
	buf := bytes.NewBuffer(make([]byte, 0, len(wild)*2))
	if err := html.Render(buf, n); err != nil {
		return []byte(html.EscapeString(string(wild)))
	}
	well = buf.Bytes()
	// html.Parse always produces a complete document; strip everything
	// outside the <body>…</body> wrapper so only the fragment is returned.
	openBody := []byte("<body>")
	i := bytes.Index(well, openBody)
	if i < 0 {
		return []byte(html.EscapeString(string(wild)))
	}
	well = well[i+len(openBody):]
	closeBody := []byte("</body>")
	i = bytes.Index(well, closeBody)
	if i < 0 {
		return []byte(html.EscapeString(string(wild)))
	}
	return well[:i]
}
// StrFromNodes returns the string of the rendered html.Nodes. func StrFromNodes(nodes []*html.Node) string { buf := bytes.NewBuffer([]byte{}) for _, h := range nodes { html.Render(buf, h) } return buf.String() }
func (n *node) String() string { var buf bytes.Buffer if err := html.Render(&buf, n.node); err != nil { panic(err) } return buf.String() }
func DomTree2HTML(DOMTree []*html.Node) { HTML5 := bytes.NewBuffer([]byte{}) for _, node := range DOMTree { html.Render(HTML5, node) } fmt.Printf("Node: %s", HTML5) }
func Redact(r io.Reader) (string, error) { doc, err := html.Parse(r) if err != nil { return "", err } var f func(*html.Node) f = func(n *html.Node) { for c := n.FirstChild; c != nil; c = c.NextSibling { if c.Type == html.ElementNode { switch c.Data { case "style", "script", "head", "meta": n.RemoveChild(c) return case "img": for i, attr := range c.Attr { if attr.Key == "src" { c.Attr[i].Key = "data-redacted-src" } } } } else if c.Type == html.CommentNode { n.RemoveChild(c) return } f(c) } } f(doc) buf := bytes.NewBufferString("") err = html.Render(buf, doc) return buf.String(), err }
func renderHtml(tree *html.Node, t *testing.T) string { var wr bytes.Buffer if err := html.Render(&wr, tree); err != nil { t.Errorf("html rendering error: %s", err.Error()) } return wr.String() }
func (p Xpath) Print(w io.Writer, n *html.Node) error { node, err := p.Parse(n) if err != nil { return err } html.Render(w, node) return nil }
func html_write_file(article *html.Node, dir string) (string, error) { f, err := ioutil.TempFile(dir, "html.") if err != nil { return "", err } defer f.Close() err = html.Render(f, article) return f.Name(), err }
func renderHTML(options Options, file HTMLFile) string { handle := must(os.Open(file.File.LocalPath)).(*os.File) defer handle.Close() doc := must(html.Parse(handle)).(*html.Node) var f func(*html.Node) f = func(n *html.Node) { for c := n.FirstChild; c != nil; c = c.NextSibling { f(c) } if n.Type == html.ElementNode { switch n.Data { case "script": for i, a := range n.Attr { if a.Key == "src" { for _, dep := range file.Deps { if dep.InstPath == a.Val { n.Attr[i].Val = formatHref(dep.File.UploadedPath) break } } } } case "link": stylesheet := false for _, a := range n.Attr { if a.Key == "rel" { stylesheet = a.Val == "stylesheet" break } } if !stylesheet { return } for i, a := range n.Attr { if a.Key == "href" { for _, dep := range file.Deps { if dep.InstPath == a.Val { n.Attr[i].Val = formatHref(dep.File.UploadedPath) break } } } } } } } f(doc) buf := bytes.NewBuffer([]byte{}) panicIf(html.Render(buf, doc)) return buf.String() }
func RenderNodes(w io.Writer, ns []*html.Node) error { for _, n := range ns { err := html.Render(w, n) if err != nil { return err } } return nil }
// write html.Node to tmp file // return tmp_filename, utf-8 encoded func WriteHtmlFile2(doc *html.Node) (string, error) { of, err := ioutil.TempFile(tmp_dir, prefix) if err != nil { return "", err } defer of.Close() html.Render(of, doc) return of.Name(), nil }
func write_file(doc *html.Node, temp string) (string, error) { of, err := ioutil.TempFile(temp, "html.") if err != nil { return "", err } defer of.Close() html.Render(of, doc) return of.Name(), nil }
func writeXpaths(w io.Writer, doc *html.Node, xpath []string) error { for _, p := range xpath { c, err := NewXpath(p).Parse(doc) if err != nil { return err } html.Render(w, c) } return nil }
func (t *minionTransport) ProcessResponse(req *http.Request, resp *http.Response) (*http.Response, error) { body, err := ioutil.ReadAll(resp.Body) if err != nil { // copying the response body did not work return nil, err } bodyNode := &html.Node{ Type: html.ElementNode, Data: "body", DataAtom: atom.Body, } nodes, err := html.ParseFragment(bytes.NewBuffer(body), bodyNode) if err != nil { glog.Errorf("Failed to found <body> node: %v", err) return resp, err } // Define the method to traverse the doc tree and update href node to // point to correct minion var updateHRef func(*html.Node) updateHRef = func(n *html.Node) { if n.Type == html.ElementNode && n.Data == "a" { for i, attr := range n.Attr { if attr.Key == "href" { Url := &url.URL{ Path: "/proxy/minion/" + req.URL.Host + req.URL.Path + attr.Val, } n.Attr[i].Val = Url.String() break } } } for c := n.FirstChild; c != nil; c = c.NextSibling { updateHRef(c) } } newContent := &bytes.Buffer{} for _, n := range nodes { updateHRef(n) err = html.Render(newContent, n) if err != nil { glog.Errorf("Failed to render: %v", err) } } resp.Body = ioutil.NopCloser(newContent) // Update header node with new content-length // TODO: Remove any hash/signature headers here? resp.Header.Del("Content-Length") resp.ContentLength = int64(newContent.Len()) return resp, err }
// fmtHTML parses and re-emits 'in', effectively canonicalizing it. func fmtHTML(in string) string { doc, err := html.Parse(strings.NewReader(in)) if err != nil { panic(err) } out := &bytes.Buffer{} if err := html.Render(out, doc); err != nil { panic(err) } return string(out.Bytes()) }
func MustParse(url string) *html.Node { body := MustGet(url) node, err := html.Parse(body) checkError(err) filename := strings.Split(url, "/") f, err := os.Create(filename[len(filename)-1] + ".html") checkError(err) defer f.Close() html.Render(f, node) return node }
//return content and docsummary func clean_fragment(cont, uri string) (string, *DocumentSummary) { doc, err := html.Parse(strings.NewReader(cont)) if err != nil { return cont, &DocumentSummary{} } article, _ := html_clean_root(doc, uri) _, body := flat_html(article) body.Data = "div" // remvoe body var buf bytes.Buffer err = html.Render(&buf, body) return buf.String(), new_docsummary(body, nil) }
func (this *flowdocument_maker) make(frag *html.Node, imgs []feed.FeedMedia) string { if frag == nil || frag.Type != html.ElementNode { return empty_flowdocument } this.convert_flowdocument(frag) this.insert_images(imgs) node_clean_empty(frag) var buffer bytes.Buffer html.Render(&buffer, frag) // ignore return error body := buffer.String() return body }
func tidyHtml(input []byte) ([]byte, error) { // tidy nodes, err := html.ParseFragment(bytes.NewReader(input), nil) if err != nil { return nil, err } buf := new(bytes.Buffer) for _, node := range nodes { err = html.Render(buf, node) if err != nil { return nil, err } } return buf.Bytes(), nil }
//return local_filepath, words, images func CleanFragment(cont, uri string) (string, *SummaryScore) { doc, err := html.Parse(strings.NewReader(cont)) if err != nil { return cont, &SummaryScore{} } cleaner := NewHtmlCleaner(uri) cleaner.CleanHtml(doc) _, body := FlattenHtmlDocument(cleaner.Article) body.Data = "div" // remvoe body var buf bytes.Buffer err = html.Render(&buf, body) return buf.String(), NewSummaryScore(body) }
// unwrap non whitelisted elements from a full HTML document func (w *Whitelist) SanitizeUnwrap(reader io.Reader) (string, error) { var buffer bytes.Buffer doc, err := html.Parse(reader) if err != nil { return buffer.String(), err } err = w.sanitizeUnwrap(doc) if err != nil { return buffer.String(), err } err = html.Render(&buffer, doc) return buffer.String(), err }
// Html gets the HTML contents of the first element in the set of matched // elements. It includes text and comment nodes. func (s *Selection) Html() (ret string, e error) { // Since there is no .innerHtml, the HTML content must be re-created from // the nodes using html.Render. var buf bytes.Buffer if len(s.Nodes) > 0 { for c := s.Nodes[0].FirstChild; c != nil; c = c.NextSibling { e = html.Render(&buf, c) if e != nil { return } } ret = buf.String() } return }
// render for every child of the node provided, // render that node into the provided buffer // after performing the provided function on it func renderForEachChild(n *html.Node, buffer *bytes.Buffer, fn func(*html.Node) error) error { for c := n.FirstChild; c != nil; c = c.NextSibling { err := fn(c) if err != nil { return err } if c.Parent == n { // this node wasn't removed err = html.Render(buffer, c) if err != nil { return err } } } return nil }
// CreationDateSaved gets the creation date of an HTML file, and also writes // that HTML file back in place with an updated meta element with the // creation time if that information doesn't already exist. func CreationDateSaved(path string) (*FileInfo, error) { fileinfo, update, err := CreationDate(path) if err != nil { return nil, err } if update { f, err := os.Create(path) if err != nil { return nil, err } defer f.Close() err = html.Render(f, fileinfo.Node) if err != nil { return nil, err } } return fileinfo, err }
// Make a GET request to the given URL and start parsing // its HTML. func ExtractData(entity *Entity, url string) { // Parsing completion channel. done := make(chan bool, 1) res, err := http.Get(url) if err != nil { log.Println("Error requesting URL data: ", err) } defer res.Body.Close() doc, err := html.Parse(res.Body) if err != nil { log.Println("Error parsing URL body: ", err) } go ParseHTML(doc, entity, done) for { select { case <-done: var finalHTML bytes.Buffer bl := html.Render(&finalHTML, doc) if bl != nil { log.Println(bl) } err := StoreHTML(finalHTML, EntityDir) if err != nil { log.Println("Error in StoreHTML: ", err) } // err = ArchiveFinalFiles(EntityDir) // if err != nil { // log.Println("Error in archive final files: ", err) // } default: } } }
// fixLinks modifies links in an HTML file such that they will be redirected through the proxy if needed. func (t *proxyTransport) fixLinks(req *http.Request, resp *http.Response) (*http.Response, error) { defer resp.Body.Close() doc, err := html.Parse(resp.Body) if err != nil { glog.Errorf("Parse failed: %v", err) return resp, err } newContent := &bytes.Buffer{} t.scan(doc, func(n *html.Node) { t.updateURLs(n, req.URL) }) if err := html.Render(newContent, doc); err != nil { glog.Errorf("Failed to render: %v", err) } resp.Body = ioutil.NopCloser(newContent) // Update header node with new content-length // TODO: Remove any hash/signature headers here? resp.Header.Del("Content-Length") resp.ContentLength = int64(newContent.Len()) return resp, err }
func fixRelativeLinks(doc, repo, ref, body string) (string, error) { repoAndRef := repo if ref != "master" { repoAndRef += "~" + ref } n, err := html.Parse(strings.NewReader(string(body))) if err != nil { return "", err } var f func(*html.Node) f = func(n *html.Node) { if n.Type == html.ElementNode && n.Data == "a" { for i, a := range n.Attr { if a.Key == "href" { fs := strings.Index(a.Val, "/") fc := strings.Index(a.Val, ":") fh := strings.Index(a.Val, "#") if fs == 0 || fh == 0 || (fc >= 0 && fc < fs) || (fh >= 0 && fh < fs) { continue } n.Attr[i].Val = "/" + repoAndRef + "/" + a.Val } } } for c := n.FirstChild; c != nil; c = c.NextSibling { f(c) } } f(n) b := new(bytes.Buffer) if err := html.Render(b, n); err != nil { return "", err } return b.String(), nil }
func ModifyHTML(r *http.Request, s string) string { UnsyncedGlobalReq = r var docRoot *html.Node var err error r1 := strings.NewReader(s) log.Printf("len is %v\n", len(s)) docRoot, err = html.Parse(r1) if err != nil { panic(fmt.Sprintf("3 %v \n", err)) } fRecurse(docRoot) var b bytes.Buffer err = html.Render(&b, docRoot) if err != nil { panic(fmt.Sprintf("4 %v \n", err)) } log.Printf("len is %v\n", b.Len()) return b.String() }