// compileMatcher compiles the selector string s and returns // the corresponding Matcher. If s is an invalid selector string, // it returns a Matcher that fails all matches. func compileMatcher(s string) Matcher { cs, err := cascadia.Compile(s) if err != nil { return invalidMatcher{} } return cs }
func newSelector(s string) (selector, error) { cs, err := cascadia.Compile(s) if err != nil { return selector{}, err } return selector{ Selector: cs, s: s, }, nil }
func Fuzz(data []byte) int { sel, err := cascadia.Compile(string(data)) if err != nil { if sel != nil { panic("sel != nil on error") } return 0 } return 1 }
// If the opengraph image exists on the page, that's probably the comic func (e extractComic) checkOpenGraph(a *Article) bool { ogimg := a.Meta.OpenGraph["image"] if ogimg == "" { return false } m, err := cascadia.Compile(fmt.Sprintf("img[src=\"%s\"]", ogimg)) if err != nil { return false } return e.setImage(a, a.Doc.FindMatcher(m)) }
// Extract - Selects html node by Path and extracts using Extractor. func (s *Selector) Extract(node *html.Node) interface{} { // We can't do magic (hehe...) if node == nil { return nil } // If no Extractor was set we will extract text from this node :) if s.Extractor == nil { s.Extractor = TextExtractor } // If path is empty extract on current node if s.Path == "" { return s.Extractor.Extract(node) } // Compile Path to real selector selector, err := cascadia.Compile(s.Path) if err != nil { return nil } // Select all nodes nodes := selector.MatchAll(node) // If no nodes was found return nil if len(nodes) == 0 { return nil } // If found only one node extract first if len(nodes) == 1 { first := nodes[0] return s.Extractor.Extract(first) } // Extract from all found nodes result := make([]interface{}, len(nodes)) for num, n := range nodes { result[num] = s.Extractor.Extract(n) } return result }
func NewDiscoverer(cfg DiscovererDef) (*Discoverer, error) { disc := &Discoverer{} u, err := url.Parse(cfg.URL) if err != nil { return nil, err } disc.Name = cfg.Name disc.StartURL = *u disc.ArtPats = make([]*regexp.Regexp, 0, len(cfg.ArtPat)) for _, pat := range cfg.ArtPat { re, err := regexp.Compile(pat) if err != nil { return nil, err } disc.ArtPats = append(disc.ArtPats, re) } if cfg.NavSel == "" { disc.NavLinkSel = nil } else { sel, err := cascadia.Compile(cfg.NavSel) if err != nil { return nil, err } disc.NavLinkSel = sel } disc.BaseErrorThreshold = cfg.BaseErrorThreshold if cfg.HostPat != "" { re, err := regexp.Compile(cfg.HostPat) if err != nil { return nil, err } disc.HostPat = re } // defaults disc.StripFragments = true disc.StripQuery = !cfg.NoStripQuery disc.ErrorLog = NullLogger{} disc.InfoLog = NullLogger{} return disc, nil }
func main() { x := util.CheckErr resp, err := http.Get("http://pebbe.tumblr.com") x(err) doc, err := html.Parse(resp.Body) x(err) resp.Body.Close() sel, err := cascadia.Compile("div.post") x(err) for i, n := range sel.MatchAll(doc) { var b bytes.Buffer html.Render(&b, n) fmt.Println(i, b.String()) } }
// Select from the retrived page source the CSS selection defined in c4c.ini. func (p *Page) makeSelection(htmlNode *html.Node) (selection string, err error) { // --- [ CSS selection ] --------------------------------------------------/ // Write results into an array of nodes. var result []*html.Node // Append the whole page (htmlNode) to results if no selector where chosen. if p.Settings.Selection == "" { result = append(result, htmlNode) } else { // Make a selector from the user specified string. s, err := cascadia.Compile(p.Settings.Selection) if err != nil { return "", errutil.Err(err) } // Find all nodes that matches selection s. result = s.MatchAll(htmlNode) } // Loop through all the hits and render them to string. for _, hit := range result { s, err := htmlutil.RenderClean(hit) if err != nil { return "", errutil.Err(err) } selection += s } // --- [ /CSS selection ] -------------------------------------------------/ // --- [ Strip funcs ] ----------------------------------------------------/ for _, stripFunc := range p.Settings.StripFuncs { doc, err := html.Parse(strings.NewReader(selection)) if err != nil { return "", errutil.Err(err) } stripFunc = strings.ToLower(stripFunc) switch stripFunc { case "numbers": strip.Numbers(doc) case "attrs": strip.Attrs(doc) case "html": strip.HTML(doc) case "scripts": strip.Scripts(doc) } selection, err = htmlutil.RenderClean(doc) if err != nil { return "", errutil.Err(err) } } // --- [ /Strip funcs ] ---------------------------------------------------/ // --- [ Regexp ] ---------------------------------------------------------/ if p.Settings.Regexp != "" { re, err := regexp.Compile(p.Settings.Regexp) if err != nil { return "", errutil.Err(err) } // -1 means to find all. result := re.FindAllString(selection, -1) selection = "" for _, res := range result { selection += res + settings.Newline } } // --- [ /Regexp ] --------------------------------------------------------/ // --- [ Negexp ] ---------------------------------------------------------/ if p.Settings.Negexp != "" { ne, err := regexp.Compile(p.Settings.Negexp) if err != nil { return "", errutil.Err(err) } // Remove all that matches the regular expression ne selection = ne.ReplaceAllString(selection, "") } // --- [ /Negexp ] --------------------------------------------------------/ return selection, nil }