Beispiel #1
0
// finalizeEntity renders doc to HTML, hands it to StoreHTML for entDir,
// archives the result with ArchiveFinalFiles and uploads the archive with
// UploadEntity, updating the entity's status before and after the upload.
//
// The render, store, archive and upload steps each depend on the previous
// one, so the first failure among them is logged and the function returns.
// The entity is therefore only marked CompleteStatus after a successful
// upload. A failure to record the intermediate UploadingStatus is logged but
// does not prevent the upload.
func finalizeEntity(entity *Entity, doc *html.Node, entDir string) {
	log.Println("Entity tmp directory: ", entDir)

	var finalHTML bytes.Buffer
	if err := html.Render(&finalHTML, doc); err != nil {
		log.Println(err)
		return
	}

	if err := StoreHTML(finalHTML, entDir); err != nil {
		log.Println("Error in StoreHTML: ", err)
		return
	}

	zipName, err := ArchiveFinalFiles(entDir)
	if err != nil {
		log.Println("Error archiving files: ", err)
		return
	}

	err = Update(entity, bson.M{"uuid": entity.UUID}, bson.M{"$set": bson.M{"status": UploadingStatus}})
	if err != nil {
		// Only the progress marker is affected; the upload can still proceed.
		log.Println("Error updating entity: ", err)
	}

	awsLink, err := UploadEntity(zipName, entity)
	if err != nil {
		// Without a link there is nothing to record as complete.
		log.Println("Error uploading final files: ", err)
		return
	}

	err = Update(entity, bson.M{"uuid": entity.UUID}, bson.M{"$set": bson.M{"aws_link": awsLink, "status": CompleteStatus}})
	if err != nil {
		log.Println("Error updating entity: ", err)
	}
}
Beispiel #2
0
// FetchFullDescription downloads the page at link and returns the rendered
// HTML of its <section class="entry-content cf"> element. It returns "" when
// no section carries exactly that class value. When several sections match,
// the last one visited in document order wins.
//
// Any failure to fetch, read, parse or render the page terminates the
// program through log.Fatal.
func FetchFullDescription(link string) string {
	res, err := http.Get(link)
	if err != nil {
		log.Fatal(err)
	}
	body, err := ioutil.ReadAll(res.Body)
	res.Body.Close()
	if err != nil {
		log.Fatal(err)
	}
	doc, err := html.Parse(strings.NewReader(string(body)))
	if err != nil {
		// Without this check a failed parse would leave doc nil and the
		// traversal below would dereference it.
		log.Fatal(err)
	}
	content := ""
	var f func(*html.Node)
	f = func(n *html.Node) {
		if n.Type == html.ElementNode && n.Data == "section" {
			for _, a := range n.Attr {
				if a.Key == "class" && a.Val == "entry-content cf" {
					var buf bytes.Buffer
					if err := html.Render(&buf, n); err != nil {
						log.Fatal(err)
					}
					content = buf.String()
					break
				}
			}
		}
		// Children are still visited after a match, so a later (or nested)
		// matching section replaces the earlier result.
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			f(c)
		}
	}
	f(doc)
	return content
}
Beispiel #3
0
// fixHtml parses wild as HTML, passes the tree through fixImgs together with
// linkUrl, re-renders it and returns the markup found between the first
// "<body>" and the following "</body>" of the rendered output.
//
// Whenever that is not possible (parse error, render error, missing body
// markers, or a bytes.ErrTooLarge panic while rendering) the result is the
// HTML-escaped form of wild instead.
func fixHtml(linkUrl string, wild []byte) (well []byte) {
	// escaped builds the fallback result: the raw input with HTML escaped.
	escaped := func() []byte {
		return []byte(html.EscapeString(string(wild)))
	}

	root, parseErr := html.Parse(bytes.NewReader(wild))
	if parseErr != nil {
		return escaped()
	}

	fixImgs(linkUrl, root)

	// bytes.Buffer panics with ErrTooLarge when it cannot grow; turn that one
	// panic into the escaped fallback and let every other panic propagate.
	defer func() {
		switch r := recover(); {
		case r == bytes.ErrTooLarge:
			well = escaped()
		case r != nil:
			panic(r)
		}
	}()

	out := bytes.NewBuffer(make([]byte, 0, len(wild)*2))
	if renderErr := html.Render(out, root); renderErr != nil {
		return escaped()
	}

	const openTag, closeTag = "<body>", "</body>"

	rendered := out.Bytes()
	start := bytes.Index(rendered, []byte(openTag))
	if start < 0 {
		return escaped()
	}
	inner := rendered[start+len(openTag):]

	end := bytes.Index(inner, []byte(closeTag))
	if end < 0 {
		return escaped()
	}
	return inner[:end]
}
Beispiel #4
0
// StrFromNodes renders every node in nodes, in order, and returns the
// concatenated HTML. Render errors are discarded because the function has no
// way to report them.
func StrFromNodes(nodes []*html.Node) string {
	var out bytes.Buffer
	for i := range nodes {
		_ = html.Render(&out, nodes[i])
	}
	return out.String()
}
Beispiel #5
0
// String returns the HTML rendering of the wrapped node. It panics if the
// node cannot be rendered.
func (n *node) String() string {
	out := new(bytes.Buffer)
	err := html.Render(out, n.node)
	if err != nil {
		panic(err)
	}
	return out.String()
}
Beispiel #6
0
// DomTree2HTML renders every node of DOMTree into one buffer and prints the
// result to standard output, prefixed with "Node: ". Render errors are not
// checked.
func DomTree2HTML(DOMTree []*html.Node) {
	var out bytes.Buffer
	for i := range DOMTree {
		html.Render(&out, DOMTree[i])
	}
	fmt.Printf("Node: %s", out.String())
}
Beispiel #7
0
// Redact parses the HTML read from r, strips parts of it and returns the
// re-rendered document:
//
//   - <style>, <script>, <head> and <meta> elements are removed together
//     with their subtrees;
//   - comment nodes are removed;
//   - every "src" attribute of an <img> is renamed to "data-redacted-src".
//
// It returns the parse error, if any, or the error reported by html.Render
// alongside whatever was rendered up to that point.
func Redact(r io.Reader) (string, error) {
	doc, err := html.Parse(r)
	if err != nil {
		return "", err
	}
	var walk func(*html.Node)
	walk = func(n *html.Node) {
		var next *html.Node
		for c := n.FirstChild; c != nil; c = next {
			// RemoveChild clears c.NextSibling, so remember the successor
			// first. Returning after a removal would abandon all remaining
			// siblings, e.g. skip <body> once <head> has been removed.
			next = c.NextSibling

			switch c.Type {
			case html.CommentNode:
				n.RemoveChild(c)
				continue
			case html.ElementNode:
				switch c.Data {
				case "style", "script", "head", "meta":
					n.RemoveChild(c)
					continue
				case "img":
					for i := range c.Attr {
						if c.Attr[i].Key == "src" {
							c.Attr[i].Key = "data-redacted-src"
						}
					}
				}
			}
			walk(c)
		}
	}
	walk(doc)

	var buf bytes.Buffer
	err = html.Render(&buf, doc)
	return buf.String(), err
}
Beispiel #8
0
// renderHtml is a test helper that renders tree to a string. A render
// failure is reported through t.Errorf and whatever was rendered so far is
// still returned.
func renderHtml(tree *html.Node, t *testing.T) string {
	out := new(bytes.Buffer)
	err := html.Render(out, tree)
	if err != nil {
		t.Errorf("html rendering error: %s", err.Error())
	}
	return out.String()
}
Beispiel #9
0
// Print evaluates p against n via p.Parse and writes the HTML rendering of
// the resulting node to w. It returns the error from p.Parse or, failing
// that, the error from rendering/writing.
func (p Xpath) Print(w io.Writer, n *html.Node) error {
	node, err := p.Parse(n)
	if err != nil {
		return err
	}
	// Propagate write/render failures instead of reporting success.
	return html.Render(w, node)
}
Beispiel #10
0
// html_write_file renders article into a new temporary file created in dir
// (name prefix "html.") and returns the file's name.
//
// If the file cannot be created it returns "" and the error. If rendering or
// closing the file fails, the file name is still returned together with the
// error so the caller can inspect or remove the file.
func html_write_file(article *html.Node, dir string) (string, error) {
	f, err := ioutil.TempFile(dir, "html.")
	if err != nil {
		return "", err
	}
	name := f.Name()

	if err := html.Render(f, article); err != nil {
		f.Close()
		return name, err
	}
	// Close can report a failed write-back; do not lose that error.
	return name, f.Close()
}
Beispiel #11
0
// renderHTML parses the HTML file at file.File.LocalPath, points the "src" of
// every <script> and the "href" of every <link> whose first "rel" attribute is
// "stylesheet" at the uploaded location of the matching dependency, and
// returns the re-rendered document.
//
// A dependency matches when its InstPath equals the attribute value; the
// replacement is formatHref(dep.File.UploadedPath). Attributes with no
// matching dependency are left untouched. Open, parse and render failures are
// routed through must / panicIf. options is not used.
func renderHTML(options Options, file HTMLFile) string {
	src := must(os.Open(file.File.LocalPath)).(*os.File)
	defer src.Close()

	root := must(html.Parse(src)).(*html.Node)

	// relink rewrites every attribute of n named key whose value matches a
	// dependency's InstPath; the first matching dependency wins.
	relink := func(n *html.Node, key string) {
		for i := range n.Attr {
			if n.Attr[i].Key != key {
				continue
			}
			target := n.Attr[i].Val
			for _, dep := range file.Deps {
				if dep.InstPath == target {
					n.Attr[i].Val = formatHref(dep.File.UploadedPath)
					break
				}
			}
		}
	}

	// isStylesheet looks only at the first "rel" attribute of n.
	isStylesheet := func(n *html.Node) bool {
		for _, a := range n.Attr {
			if a.Key == "rel" {
				return a.Val == "stylesheet"
			}
		}
		return false
	}

	var walk func(*html.Node)
	walk = func(n *html.Node) {
		// Children first, then the node itself.
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			walk(c)
		}

		if n.Type != html.ElementNode {
			return
		}
		switch {
		case n.Data == "script":
			relink(n, "src")
		case n.Data == "link" && isStylesheet(n):
			relink(n, "href")
		}
	}
	walk(root)

	var out bytes.Buffer
	panicIf(html.Render(&out, root))

	return out.String()
}
Beispiel #12
0
// RenderNodes writes the HTML rendering of each node in ns to w, in order.
// It stops at, and returns, the first render error.
func RenderNodes(w io.Writer, ns []*html.Node) error {
	for i := range ns {
		if err := html.Render(w, ns[i]); err != nil {
			return err
		}
	}
	return nil
}
Beispiel #13
0
// WriteHtmlFile2 renders doc into a new temporary file created with
// ioutil.TempFile(tmp_dir, prefix) and returns the file's name. The original
// note describes the file as UTF-8 encoded.
//
// If the file cannot be created it returns "" and the error. If rendering or
// closing the file fails, the file name is returned together with the error
// instead of silently reporting success.
func WriteHtmlFile2(doc *html.Node) (string, error) {
	of, err := ioutil.TempFile(tmp_dir, prefix)
	if err != nil {
		return "", err
	}
	name := of.Name()

	if err := html.Render(of, doc); err != nil {
		of.Close()
		return name, err
	}
	return name, of.Close()
}
Beispiel #14
0
// write_file renders doc into a new temporary file created in directory temp
// (name prefix "html.") and returns the file's name.
//
// If the file cannot be created it returns "" and the error. If rendering or
// closing the file fails, the file name is returned together with the error
// instead of silently reporting success.
func write_file(doc *html.Node, temp string) (string, error) {
	of, err := ioutil.TempFile(temp, "html.")
	if err != nil {
		return "", err
	}
	name := of.Name()

	if err := html.Render(of, doc); err != nil {
		of.Close()
		return name, err
	}
	return name, of.Close()
}
Beispiel #15
0
// writeXpaths evaluates each expression in xpath against doc, in order, and
// writes the HTML rendering of every result to w. It stops at, and returns,
// the first evaluation or render/write error.
func writeXpaths(w io.Writer, doc *html.Node, xpath []string) error {
	for _, p := range xpath {
		c, err := NewXpath(p).Parse(doc)
		if err != nil {
			return err
		}
		// A failed write must not be reported as success.
		if err := html.Render(w, c); err != nil {
			return err
		}
	}
	return nil
}
Beispiel #16
0
// ProcessResponse rewrites the HTML body of resp so that the href of every
// <a> element becomes "/proxy/minion/" + request host + request path +
// original href, then installs the rewritten content as the new body and
// updates the content length.
//
// The original body is always read to the end and closed. If reading fails
// the function returns (nil, err). If the content cannot be parsed, resp is
// returned with its original bytes restored as the body, together with the
// parse error. If rendering a node fails, the error is logged, rendering
// continues with the remaining nodes, and the first such error is returned
// along with resp.
func (t *minionTransport) ProcessResponse(req *http.Request, resp *http.Response) (*http.Response, error) {
	body, err := ioutil.ReadAll(resp.Body)
	// The body is replaced below, so this is the only place where the
	// original can be closed.
	resp.Body.Close()
	if err != nil {
		// copying the response body did not work
		return nil, err
	}

	bodyNode := &html.Node{
		Type:     html.ElementNode,
		Data:     "body",
		DataAtom: atom.Body,
	}
	nodes, err := html.ParseFragment(bytes.NewReader(body), bodyNode)
	if err != nil {
		glog.Errorf("Failed to found <body> node: %v", err)
		// Hand the untouched content back so the response stays readable.
		resp.Body = ioutil.NopCloser(bytes.NewReader(body))
		return resp, err
	}

	// Define the method to traverse the doc tree and update href node to
	// point to correct minion
	var updateHRef func(*html.Node)
	updateHRef = func(n *html.Node) {
		if n.Type == html.ElementNode && n.Data == "a" {
			for i, attr := range n.Attr {
				if attr.Key == "href" {
					Url := &url.URL{
						Path: "/proxy/minion/" + req.URL.Host + req.URL.Path + attr.Val,
					}
					n.Attr[i].Val = Url.String()
					break
				}
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			updateHRef(c)
		}
	}

	newContent := &bytes.Buffer{}
	// Keep the first render failure; a later successful render must not
	// overwrite it.
	var renderErr error
	for _, n := range nodes {
		updateHRef(n)
		if err := html.Render(newContent, n); err != nil {
			glog.Errorf("Failed to render: %v", err)
			if renderErr == nil {
				renderErr = err
			}
		}
	}

	resp.Body = ioutil.NopCloser(newContent)
	// Update header node with new content-length
	// TODO: Remove any hash/signature headers here?
	resp.Header.Del("Content-Length")
	resp.ContentLength = int64(newContent.Len())

	return resp, renderErr
}
Beispiel #17
0
// fmtHTML canonicalizes in by parsing it as an HTML document and rendering
// the resulting tree back to a string. It panics if parsing or rendering
// fails.
func fmtHTML(in string) string {
	root, parseErr := html.Parse(strings.NewReader(in))
	if parseErr != nil {
		panic(parseErr)
	}

	var rendered bytes.Buffer
	renderErr := html.Render(&rendered, root)
	if renderErr != nil {
		panic(renderErr)
	}
	return rendered.String()
}
Beispiel #18
0
// MustParse fetches url with MustGet, parses the result as HTML and returns
// the document root. As a side effect it writes the rendered document to a
// file in the current directory, named after the last "/"-separated segment
// of url plus ".html".
//
// Parse, file-creation and render errors are all passed to checkError; what
// happens on failure is defined by that helper.
func MustParse(url string) *html.Node {
	body := MustGet(url)
	node, err := html.Parse(body)
	checkError(err)

	filename := strings.Split(url, "/")
	f, err := os.Create(filename[len(filename)-1] + ".html")
	checkError(err)
	defer f.Close()

	// Treat a failed dump like every other failure in this function instead
	// of leaving a truncated file behind unnoticed.
	checkError(html.Render(f, node))
	return node
}
Beispiel #19
0
// clean_fragment parses cont as HTML, runs it through html_clean_root (with
// uri) and flat_html, and returns the rendered result together with a
// document summary built by new_docsummary from the resulting body node.
//
// If cont cannot be parsed, or the cleaned tree cannot be rendered, the
// input is returned unchanged with an empty DocumentSummary.
func clean_fragment(cont, uri string) (string, *DocumentSummary) {
	doc, err := html.Parse(strings.NewReader(cont))
	if err != nil {
		return cont, &DocumentSummary{}
	}

	article, _ := html_clean_root(doc, uri)
	_, body := flat_html(article)
	body.Data = "div" // emit the body element as a <div>

	var buf bytes.Buffer
	if err := html.Render(&buf, body); err != nil {
		// Same fallback as a parse failure: partial markup is worse than
		// the untouched input.
		return cont, &DocumentSummary{}
	}
	return buf.String(), new_docsummary(body, nil)
}
Beispiel #20
0
// make turns frag into a flow document string: it runs convert_flowdocument
// on frag, inserts imgs via insert_images, applies node_clean_empty and
// returns the rendered HTML of frag.
//
// When frag is nil or not an element node, empty_flowdocument is returned.
// A render error is deliberately ignored.
func (m *flowdocument_maker) make(frag *html.Node, imgs []feed.FeedMedia) string {
	usable := frag != nil && frag.Type == html.ElementNode
	if !usable {
		return empty_flowdocument
	}

	m.convert_flowdocument(frag)
	m.insert_images(imgs)
	node_clean_empty(frag)

	out := new(bytes.Buffer)
	_ = html.Render(out, frag) // error intentionally ignored
	return out.String()
}
Beispiel #21
0
// tidyHtml parses input as an HTML fragment (with no context element) and
// returns the re-rendered nodes concatenated in order. It returns a nil
// slice and the error if parsing or rendering fails.
func tidyHtml(input []byte) ([]byte, error) {
	fragments, parseErr := html.ParseFragment(bytes.NewReader(input), nil)
	if parseErr != nil {
		return nil, parseErr
	}

	var out bytes.Buffer
	for i := range fragments {
		if renderErr := html.Render(&out, fragments[i]); renderErr != nil {
			return nil, renderErr
		}
	}
	return out.Bytes(), nil
}
Beispiel #22
0
// CleanFragment parses cont as HTML, cleans it with an HtmlCleaner created
// for uri, flattens the cleaner's Article with FlattenHtmlDocument, and
// returns the rendered result together with a SummaryScore computed from the
// resulting body node.
//
// If cont cannot be parsed, or the cleaned tree cannot be rendered, the
// input is returned unchanged with an empty SummaryScore.
func CleanFragment(cont, uri string) (string, *SummaryScore) {
	doc, err := html.Parse(strings.NewReader(cont))
	if err != nil {
		return cont, &SummaryScore{}
	}

	cleaner := NewHtmlCleaner(uri)
	cleaner.CleanHtml(doc)
	_, body := FlattenHtmlDocument(cleaner.Article)
	body.Data = "div" // emit the body element as a <div>

	var buf bytes.Buffer
	if err := html.Render(&buf, body); err != nil {
		// Same fallback as a parse failure: partial markup is worse than
		// the untouched input.
		return cont, &SummaryScore{}
	}
	return buf.String(), NewSummaryScore(body)
}
Beispiel #23
0
// SanitizeUnwrap parses a full HTML document from reader, applies the
// whitelist's sanitizeUnwrap pass to it and returns the rendered result.
//
// A parse or sanitize failure yields "" and the error. A render failure
// yields whatever was rendered so far together with the error.
func (w *Whitelist) SanitizeUnwrap(reader io.Reader) (string, error) {
	doc, err := html.Parse(reader)
	if err != nil {
		return "", err
	}

	if err = w.sanitizeUnwrap(doc); err != nil {
		return "", err
	}

	var out bytes.Buffer
	err = html.Render(&out, doc)
	return out.String(), err
}
Beispiel #24
0
// Html returns the inner HTML of the first element in the selection: the
// rendering of all its child nodes, text and comment nodes included.
//
// An empty selection yields "" and a nil error. If rendering any child
// fails, "" and that error are returned.
func (s *Selection) Html() (ret string, e error) {
	if len(s.Nodes) == 0 {
		return "", nil
	}

	// There is no innerHTML accessor on html.Node, so each child is rendered
	// in turn into one buffer.
	var out bytes.Buffer
	for child := s.Nodes[0].FirstChild; child != nil; child = child.NextSibling {
		if renderErr := html.Render(&out, child); renderErr != nil {
			return "", renderErr
		}
	}
	return out.String(), nil
}
Beispiel #25
0
// renderForEachChild calls fn on every child of n, in order, and renders
// each child that is still attached to n afterwards into buffer. It stops
// at, and returns, the first error from fn or from rendering.
//
// fn may detach the child it is given. Detaching clears the child's sibling
// links, so the successor is captured before fn runs; otherwise the
// iteration would end at the first removed child and skip the rest.
func renderForEachChild(n *html.Node, buffer *bytes.Buffer, fn func(*html.Node) error) error {
	for c := n.FirstChild; c != nil; {
		next := c.NextSibling

		if err := fn(c); err != nil {
			return err
		}

		if c.Parent == n {
			// this node wasn't removed
			if err := html.Render(buffer, c); err != nil {
				return err
			}
			// Still attached: follow the live link so siblings inserted by
			// fn right after c are visited as well.
			next = c.NextSibling
		}
		c = next
	}

	return nil
}
Beispiel #26
0
// CreationDateSaved gets the creation date of the HTML file at path via
// CreationDate. When CreationDate reports that an update is needed, the file
// is rewritten in place from fileinfo.Node (per the original description:
// with an updated meta element carrying the creation time).
//
// It returns nil and the error if CreationDate fails, or if the file cannot
// be created, rendered or closed.
func CreationDateSaved(path string) (*FileInfo, error) {
	fileinfo, update, err := CreationDate(path)
	if err != nil {
		return nil, err
	}
	if !update {
		return fileinfo, nil
	}

	f, err := os.Create(path)
	if err != nil {
		return nil, err
	}
	if err := html.Render(f, fileinfo.Node); err != nil {
		f.Close()
		return nil, err
	}
	// Close can report a failed write-back of the new content.
	if err := f.Close(); err != nil {
		return nil, err
	}
	return fileinfo, nil
}
Beispiel #27
0
// ExtractData makes a GET request to url, parses the response as HTML, runs
// ParseHTML on the document in a separate goroutine and, once ParseHTML
// signals completion, renders the document and passes it to StoreHTML with
// EntityDir.
//
// Request, parse, render and store failures are logged; the first three end
// the function early because the later steps have nothing to work with.
// The function blocks until ParseHTML sends on the completion channel
// (assumed to happen exactly once) and then returns.
func ExtractData(entity *Entity, url string) {
	res, err := http.Get(url)
	if err != nil {
		// res is nil here; continuing would dereference it.
		log.Println("Error requesting URL data: ", err)
		return
	}
	defer res.Body.Close()

	doc, err := html.Parse(res.Body)
	if err != nil {
		log.Println("Error parsing URL body: ", err)
		return
	}

	// Parsing completion channel.
	done := make(chan bool, 1)
	go ParseHTML(doc, entity, done)

	// Block until parsing is finished instead of spinning on a
	// select/default loop that never exits.
	<-done

	var finalHTML bytes.Buffer
	if err := html.Render(&finalHTML, doc); err != nil {
		log.Println(err)
		return
	}

	if err := StoreHTML(finalHTML, EntityDir); err != nil {
		log.Println("Error in StoreHTML: ", err)
	}

	// err = ArchiveFinalFiles(EntityDir)
	// if err != nil {
	// 	log.Println("Error in archive final files: ", err)
	// }
}
Beispiel #28
0
// fixLinks modifies links in an HTML file such that they will be redirected
// through the proxy if needed.
//
// The body of resp is parsed as HTML, every node is passed to t.updateURLs
// together with the request URL, and the re-rendered document replaces the
// body; the content length is updated to match. The original body is closed
// on return.
//
// If parsing or rendering fails the error is logged and returned together
// with resp, whose body has not been replaced in that case.
func (t *proxyTransport) fixLinks(req *http.Request, resp *http.Response) (*http.Response, error) {
	// The receiver is evaluated now, so this closes the original body even
	// though resp.Body is reassigned below.
	defer resp.Body.Close()

	doc, err := html.Parse(resp.Body)
	if err != nil {
		glog.Errorf("Parse failed: %v", err)
		return resp, err
	}

	newContent := &bytes.Buffer{}
	t.scan(doc, func(n *html.Node) { t.updateURLs(n, req.URL) })
	if err := html.Render(newContent, doc); err != nil {
		glog.Errorf("Failed to render: %v", err)
		// Do not install partial content while reporting success.
		return resp, err
	}

	resp.Body = ioutil.NopCloser(newContent)
	// Update header node with new content-length
	// TODO: Remove any hash/signature headers here?
	resp.Header.Del("Content-Length")
	resp.ContentLength = int64(newContent.Len())

	return resp, nil
}
Beispiel #29
0
// fixRelativeLinks parses body as HTML, prefixes the href of every relative
// <a> link with "/<repo>/" (or "/<repo>~<ref>/" when ref is not "master")
// and returns the re-rendered document. doc is not used.
//
// An href is left untouched when it
//   - starts with "/" or "#",
//   - has a ":" or "#" before its first "/", or
//   - starts with a URI scheme even without any "/" (e.g. "mailto:", "tel:").
//
// It returns "" and the error if parsing or rendering fails.
func fixRelativeLinks(doc, repo, ref, body string) (string, error) {
	repoAndRef := repo
	if ref != "master" {
		repoAndRef += "~" + ref
	}
	n, err := html.Parse(strings.NewReader(body))
	if err != nil {
		return "", err
	}

	// hasScheme reports whether href begins with "<scheme>:", where scheme
	// is a letter followed by letters, digits, "+", "-" or ".".
	hasScheme := func(href string) bool {
		colon := strings.Index(href, ":")
		if colon <= 0 {
			return false
		}
		for i := 0; i < colon; i++ {
			ch := href[i]
			switch {
			case 'a' <= ch && ch <= 'z', 'A' <= ch && ch <= 'Z':
			case i > 0 && ('0' <= ch && ch <= '9' || ch == '+' || ch == '-' || ch == '.'):
			default:
				return false
			}
		}
		return true
	}

	var f func(*html.Node)
	f = func(n *html.Node) {
		if n.Type == html.ElementNode && n.Data == "a" {
			for i, a := range n.Attr {
				if a.Key == "href" {
					fs := strings.Index(a.Val, "/")
					fc := strings.Index(a.Val, ":")
					fh := strings.Index(a.Val, "#")
					if fs == 0 || fh == 0 ||
						(fc >= 0 && fc < fs) ||
						(fh >= 0 && fh < fs) ||
						hasScheme(a.Val) {
						continue
					}
					n.Attr[i].Val = "/" + repoAndRef + "/" + a.Val
				}
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			f(c)
		}
	}
	f(n)
	b := new(bytes.Buffer)
	if err := html.Render(b, n); err != nil {
		return "", err
	}
	return b.String(), nil
}
Beispiel #30
0
// ModifyHTML stores r in the package-level UnsyncedGlobalReq, parses s as an
// HTML document, applies fRecurse to the root and returns the re-rendered
// document. The input and output lengths are logged. It panics if parsing or
// rendering fails.
func ModifyHTML(r *http.Request, s string) string {
	UnsyncedGlobalReq = r

	log.Printf("len is %v\n", len(s))

	root, parseErr := html.Parse(strings.NewReader(s))
	if parseErr != nil {
		panic(fmt.Sprintf("3 %v \n", parseErr))
	}
	fRecurse(root)

	out := new(bytes.Buffer)
	if renderErr := html.Render(out, root); renderErr != nil {
		panic(fmt.Sprintf("4 %v \n", renderErr))
	}
	log.Printf("len is %v\n", out.Len())

	return out.String()
}