Esempio n. 1
0
// Whitespace runs at the start and end of a text node. Compiled once at
// package scope so tidyNode does not recompile them on every call.
var (
	tidyLeadingSpace  = regexp.MustCompile(`^\s+`)
	tidyTrailingSpace = regexp.MustCompile(`\s+$`)
)

// tidyNode tidies up extracted content into something that'll produce
// reasonable html when rendered. The tree rooted at node is modified in place:
//   - comments are removed
//   - leading/trailing whitespace in text nodes is collapsed to a single
//     newline (if the run contained one) or a single space, and text nodes
//     that are whitespace-only are removed
//   - elements not in elementWhitelist are removed (along with their
//     subtree), and attributes not allowed for a whitelisted element are
//     filtered out
//   - images whose src attribute is longer than 1024 bytes are removed
//     (eg embedded 'data:' + base64 encoded images)
//   - TODO make links absolute
//
// A matched node without a parent (e.g. node itself) cannot be detached, so
// it is left in place rather than removed.
func tidyNode(node *html.Node) {
	var commentSel cascadia.Selector = func(n *html.Node) bool {
		return n.Type == html.CommentNode
	}
	var textSel cascadia.Selector = func(n *html.Node) bool {
		return n.Type == html.TextNode
	}
	var elementSel cascadia.Selector = func(n *html.Node) bool {
		return n.Type == html.ElementNode
	}
	var imgSel cascadia.Selector = func(n *html.Node) bool {
		return n.Type == html.ElementNode && n.DataAtom == atom.Img
	}

	// detach removes n from its parent, if it has one.
	detach := func(n *html.Node) {
		if n.Parent != nil {
			n.Parent.RemoveChild(n)
		}
	}

	// remove all comments
	for _, n := range commentSel.MatchAll(node) {
		detach(n)
	}

	// collapse reduces a whitespace run to one character, keeping a line
	// break if the run had one.
	collapse := func(ws string) string {
		if strings.Contains(ws, "\n") {
			return "\n"
		}
		return " "
	}

	// trim excessive leading/trailing space in text nodes, and cull empty ones
	for _, n := range textSel.MatchAll(node) {
		txt := tidyLeadingSpace.ReplaceAllStringFunc(n.Data, collapse)
		// Operate on txt (not n.Data) so the leading-space pass is kept.
		txt = tidyTrailingSpace.ReplaceAllStringFunc(txt, collapse)
		if strings.TrimSpace(txt) == "" {
			detach(n)
			continue
		}
		n.Data = txt
	}

	// remove any elements or attrs not on the whitelist
	for _, n := range elementSel.MatchAll(node) {
		allowedAttrs, whiteListed := elementWhitelist[n.DataAtom]
		if !whiteListed {
			detach(n)
			continue
		}
		filterAttrs(n, func(attr *html.Attribute) bool {
			for _, allowed := range allowedAttrs {
				if attr.Key == allowed.String() {
					return true
				}
			}
			return false
		})
	}

	// special pass for images - strip out ones with huge URIs (eg embedded
	// 'data:' + base64 encoded images)
	const maxSrcURI = 1024
	for _, img := range imgSel.MatchAll(node) {
		if len(getAttr(img, "src")) > maxSrcURI {
			detach(img)
		}
	}
}