// grab text of matching selector, return as one line func sectionFromElement(root *html.Node, sel cascadia.Selector) string { el := sel.MatchFirst(root) if el == nil { return "" } return compressSpace(getTextContent(el)) }
func closest(n *html.Node, sel cascadia.Selector) *html.Node { for n != nil { if sel.Match(n) { break } n = n.Parent } return n }
// getElemText returns the innerText of the selected node. func getElemText(n *html.Node, sel cascadia.Selector) string { if node := sel.MatchFirst(n); node != nil { return htmlutil.InnerText(node) } return "" }
// Tidy up extracted content into something that'll produce reasonable html when // rendered // - remove comments // - trim empty text nodes // - TODO make links absolute func tidyNode(node *html.Node) { var commentSel cascadia.Selector = func(n *html.Node) bool { return n.Type == html.CommentNode } var textSel cascadia.Selector = func(n *html.Node) bool { return n.Type == html.TextNode } var elementSel cascadia.Selector = func(n *html.Node) bool { return n.Type == html.ElementNode } var imgSel cascadia.Selector = func(n *html.Node) bool { return n.Type == html.ElementNode && n.DataAtom == atom.Img } // remove all comments for _, n := range commentSel.MatchAll(node) { n.Parent.RemoveChild(n) } leadingSpace := regexp.MustCompile(`^\s+`) trailingSpace := regexp.MustCompile(`\s+$`) // trim excessive leading/trailing space in text nodes, and cull empty ones for _, n := range textSel.MatchAll(node) { txt := leadingSpace.ReplaceAllStringFunc(n.Data, func(in string) string { if strings.Contains(in, "\n") { return "\n" } else { return " " } }) txt = trailingSpace.ReplaceAllStringFunc(n.Data, func(in string) string { if strings.Contains(in, "\n") { return "\n" } else { return " " } }) if len(strings.TrimSpace(txt)) == 0 { n.Parent.RemoveChild(n) } else { n.Data = txt } } // remove any elements or attrs not on the whitelist for _, n := range elementSel.MatchAll(node) { allowedAttrs, whiteListed := elementWhitelist[n.DataAtom] if !whiteListed { if n.Parent != nil { n.Parent.RemoveChild(n) } continue } filterAttrs(n, func(attr *html.Attribute) bool { for _, allowed := range allowedAttrs { if attr.Key == allowed.String() { return true } } return false }) } // special pass for images - strip out ones with huge URIs (eg embedded // 'data:' + base64 encoded images) const maxSrcURI = 1024 for _, img := range imgSel.MatchAll(node) { src := getAttr(img, "src") if len(src) > maxSrcURI { img.Parent.RemoveChild(img) continue } } }