Exemplo n.º 1
0
// sectionFromElement looks up the first node matching sel (searching from
// root) and returns its text content after passing it through compressSpace
// (presumably collapsing whitespace so the result fits on one line).
// It returns "" when nothing matches.
func sectionFromElement(root *html.Node, sel cascadia.Selector) string {
	if match := sel.MatchFirst(root); match != nil {
		text := getTextContent(match)
		return compressSpace(text)
	}
	return ""
}
Exemplo n.º 2
0
// closest walks from n upwards through the Parent links and returns the first
// node (n itself included) that sel matches. It returns nil when n is nil or
// no node on the way to the root matches.
func closest(n *html.Node, sel cascadia.Selector) *html.Node {
	for cur := n; cur != nil; cur = cur.Parent {
		if sel.Match(cur) {
			return cur
		}
	}
	return nil
}
Exemplo n.º 3
0
// getElemText finds the first node matching sel (searching from n) and
// returns htmlutil.InnerText of it, or "" when there is no match.
func getElemText(n *html.Node, sel cascadia.Selector) string {
	match := sel.MatchFirst(n)
	if match == nil {
		return ""
	}
	return htmlutil.InnerText(match)
}
Exemplo n.º 4
0
// tidyNode cleans up an extracted content subtree in place so that it
// produces reasonable HTML when rendered:
//   - removes comment nodes
//   - collapses leading/trailing whitespace runs in text nodes (to a single
//     "\n" if the run contained a newline, otherwise a single " ") and
//     removes text nodes that contain only whitespace
//   - removes elements that are not keys of elementWhitelist, and strips any
//     attribute not listed for the element that is kept
//   - removes <img> elements whose src attribute is longer than 1024 bytes
//     (eg embedded 'data:' + base64 encoded images)
//   - TODO make links absolute
//
// A node without a parent (typically the root passed in) cannot be detached
// and is left where it is.
func tidyNode(node *html.Node) {
	var commentSel cascadia.Selector = func(n *html.Node) bool {
		return n.Type == html.CommentNode
	}
	var textSel cascadia.Selector = func(n *html.Node) bool {
		return n.Type == html.TextNode
	}
	var elementSel cascadia.Selector = func(n *html.Node) bool {
		return n.Type == html.ElementNode
	}
	var imgSel cascadia.Selector = func(n *html.Node) bool {
		return n.Type == html.ElementNode && n.DataAtom == atom.Img
	}

	// detach unlinks n from its parent. MatchAll may return the root itself,
	// which has no parent to be removed from, so guard against nil.
	detach := func(n *html.Node) {
		if n.Parent != nil {
			n.Parent.RemoveChild(n)
		}
	}

	// remove all comments
	for _, n := range commentSel.MatchAll(node) {
		detach(n)
	}

	leadingSpace := regexp.MustCompile(`^\s+`)
	trailingSpace := regexp.MustCompile(`\s+$`)
	// collapse reduces a whitespace run to one character, keeping a line
	// break if the run contained one.
	collapse := func(ws string) string {
		if strings.Contains(ws, "\n") {
			return "\n"
		}
		return " "
	}

	// trim excessive leading/trailing space in text nodes, and cull empty ones
	for _, n := range textSel.MatchAll(node) {
		txt := leadingSpace.ReplaceAllStringFunc(n.Data, collapse)
		// Operate on txt (not n.Data) so the leading-space collapse above is
		// not thrown away.
		txt = trailingSpace.ReplaceAllStringFunc(txt, collapse)
		if len(strings.TrimSpace(txt)) == 0 {
			detach(n)
			continue
		}
		n.Data = txt
	}

	// remove any elements or attrs not on the whitelist
	for _, n := range elementSel.MatchAll(node) {
		allowedAttrs, whiteListed := elementWhitelist[n.DataAtom]
		if !whiteListed {
			detach(n)
			continue
		}
		filterAttrs(n, func(attr *html.Attribute) bool {
			for _, allowed := range allowedAttrs {
				if attr.Key == allowed.String() {
					return true
				}
			}
			return false
		})
	}

	// special pass for images - strip out ones with huge URIs (eg embedded
	// 'data:' + base64 encoded images)
	const maxSrcURI = 1024
	for _, img := range imgSel.MatchAll(node) {
		if src := getAttr(img, "src"); len(src) > maxSrcURI {
			detach(img)
		}
	}
}