Пример #1
0
// compileMatcher turns the selector string s into a Matcher.
// When s does not compile, the error is discarded and an invalidMatcher
// (a Matcher that fails all matches) is returned in its place.
func compileMatcher(s string) Matcher {
	if compiled, err := cascadia.Compile(s); err == nil {
		return compiled
	}
	return invalidMatcher{}
}
Пример #2
0
// newSelector compiles s and returns a selector holding both the compiled
// form and the original source string. A compile failure is returned as-is
// together with the zero selector.
func newSelector(s string) (selector, error) {
	compiled, err := cascadia.Compile(s)
	if err != nil {
		return selector{}, err
	}

	var sel selector
	sel.Selector = compiled
	sel.s = s
	return sel, nil
}
Пример #3
0
// Fuzz feeds data to cascadia.Compile as a selector string.
// It returns 1 when the input compiles and 0 when it does not, and panics
// if Compile reports an error while also returning a non-nil selector.
func Fuzz(data []byte) int {
	sel, err := cascadia.Compile(string(data))
	switch {
	case err == nil:
		return 1
	case sel != nil:
		panic("sel != nil on error")
	default:
		return 0
	}
}
Пример #4
0
// checkOpenGraph looks for an <img> element in the article's document whose
// src attribute equals the article's OpenGraph "image" value and passes the
// match to setImage. If the OpenGraph image exists on the page, that's
// probably the comic.
//
// It returns false when there is no OpenGraph image or when the selector
// cannot be compiled; otherwise it returns whatever setImage reports.
func (e extractComic) checkOpenGraph(a *Article) bool {
	ogimg := a.Meta.OpenGraph["image"]
	if ogimg == "" {
		return false
	}

	// The value comes from page metadata and is placed inside a double-quoted
	// CSS string, so backslashes and double quotes must be backslash-escaped
	// or they would terminate the string early or change its meaning.
	// Both are ASCII, so a byte-wise scan is safe for UTF-8 input.
	escaped := make([]byte, 0, len(ogimg))
	for i := 0; i < len(ogimg); i++ {
		if c := ogimg[i]; c == '"' || c == '\\' {
			escaped = append(escaped, '\\')
		}
		escaped = append(escaped, ogimg[i])
	}

	m, err := cascadia.Compile(fmt.Sprintf("img[src=\"%s\"]", escaped))
	if err != nil {
		return false
	}

	return e.setImage(a, a.Doc.FindMatcher(m))
}
Пример #5
0
// Extract - Selects html nodes below node using s.Path and runs s.Extractor
// on them.
//
// Result shape:
//   - nil when node is nil, when s.Path fails to compile, or when nothing
//     matches;
//   - the Extractor's result for node itself when s.Path is empty;
//   - the Extractor's result for the match when exactly one node matches;
//   - a []interface{} with one Extractor result per match otherwise.
//
// As a side effect, a nil s.Extractor is replaced by TextExtractor.
func (s *Selector) Extract(node *html.Node) interface{} {
	if node == nil {
		return nil
	}

	// Fall back to the text extractor when none was configured.
	if s.Extractor == nil {
		s.Extractor = TextExtractor
	}

	// An empty path means "extract from the node we were given".
	if s.Path == "" {
		return s.Extractor.Extract(node)
	}

	compiled, err := cascadia.Compile(s.Path)
	if err != nil {
		return nil
	}

	matched := compiled.MatchAll(node)
	switch len(matched) {
	case 0:
		return nil
	case 1:
		// A single match is returned unwrapped rather than as a slice.
		return s.Extractor.Extract(matched[0])
	}

	out := make([]interface{}, 0, len(matched))
	for _, m := range matched {
		out = append(out, s.Extractor.Extract(m))
	}
	return out
}
Пример #6
0
// NewDiscoverer builds a Discoverer from cfg.
//
// cfg.URL is parsed into the start URL, every entry of cfg.ArtPat and a
// non-empty cfg.HostPat are compiled as regular expressions, and a non-empty
// cfg.NavSel is compiled as a CSS selector. The first parse or compile
// failure is returned unchanged with a nil Discoverer.
//
// Defaults applied: StripFragments is true, StripQuery is the inverse of
// cfg.NoStripQuery, and both loggers are NullLogger values.
func NewDiscoverer(cfg DiscovererDef) (*Discoverer, error) {
	startURL, err := url.Parse(cfg.URL)
	if err != nil {
		return nil, err
	}

	artPats := make([]*regexp.Regexp, 0, len(cfg.ArtPat))
	for _, expr := range cfg.ArtPat {
		compiled, err := regexp.Compile(expr)
		if err != nil {
			return nil, err
		}
		artPats = append(artPats, compiled)
	}

	d := &Discoverer{
		Name:               cfg.Name,
		StartURL:           *startURL,
		ArtPats:            artPats,
		BaseErrorThreshold: cfg.BaseErrorThreshold,
		StripFragments:     true,
		StripQuery:         !cfg.NoStripQuery,
		ErrorLog:           NullLogger{},
		InfoLog:            NullLogger{},
	}

	// NavLinkSel stays at its zero value when no selector is configured.
	if cfg.NavSel != "" {
		navSel, err := cascadia.Compile(cfg.NavSel)
		if err != nil {
			return nil, err
		}
		d.NavLinkSel = navSel
	}

	// HostPat stays nil when no host pattern is configured.
	if cfg.HostPat != "" {
		hostRe, err := regexp.Compile(cfg.HostPat)
		if err != nil {
			return nil, err
		}
		d.HostPat = hostRe
	}

	return d, nil
}
Пример #7
0
// main fetches http://pebbe.tumblr.com, parses the response as HTML, and
// prints every element matching the CSS selector "div.post" together with
// its index in the match list.
//
// Every error is passed to util.CheckErr.
// NOTE(review): CheckErr presumably aborts the program on a non-nil error;
// confirm against the util package.
func main() {
	x := util.CheckErr

	resp, err := http.Get("http://pebbe.tumblr.com")
	x(err)
	// Deferred right after the error check so the body is released on every
	// path that returns or panics out of main.
	defer resp.Body.Close()

	doc, err := html.Parse(resp.Body)
	x(err)

	sel, err := cascadia.Compile("div.post")
	x(err)

	for i, n := range sel.MatchAll(doc) {
		var b bytes.Buffer
		// Render can fail; check it instead of printing a possibly partial
		// buffer.
		x(html.Render(&b, n))
		fmt.Println(i, b.String())
	}
}
Пример #8
0
// makeSelection reduces the retrieved page (htmlNode) to the text selected
// by the page settings (per the original comment, configured in c4c.ini).
//
// The steps run in this order:
//  1. CSS selection: if p.Settings.Selection is empty the whole page is
//     used; otherwise every node matching the selector is used. The chosen
//     nodes are rendered with htmlutil.RenderClean and concatenated.
//  2. Strip funcs: for each name in p.Settings.StripFuncs (compared
//     case-insensitively) the current text is re-parsed as HTML, the
//     matching strip function ("numbers", "attrs", "html", "scripts") is
//     applied, and the result is rendered again. Unknown names still cause
//     a parse/render round trip but strip nothing.
//  3. Regexp: if set, the text is replaced by all matches of the
//     expression, each followed by settings.Newline.
//  4. Negexp: if set, everything matching the expression is removed.
//
// Any compile, parse or render error is wrapped with errutil.Err and
// returned with an empty selection.
func (p *Page) makeSelection(htmlNode *html.Node) (selection string, err error) {

	// --- [ CSS selection ] --------------------------------------------------/

	// Write results into an array of nodes.
	var result []*html.Node

	// Append the whole page (htmlNode) to results if no selector was chosen.
	if p.Settings.Selection == "" {
		result = append(result, htmlNode)
	} else {

		// Make a selector from the user specified string.
		s, err := cascadia.Compile(p.Settings.Selection)
		if err != nil {
			return "", errutil.Err(err)
		}

		// Find all nodes that match selection s.
		result = s.MatchAll(htmlNode)
	}

	// Loop through all the hits and render them to string. A Builder avoids
	// re-copying the accumulated text on every hit.
	var rendered strings.Builder
	for _, hit := range result {
		s, err := htmlutil.RenderClean(hit)
		if err != nil {
			return "", errutil.Err(err)
		}
		rendered.WriteString(s)
	}
	selection = rendered.String()

	// --- [ /CSS selection ] -------------------------------------------------/

	// --- [ Strip funcs ] ----------------------------------------------------/

	for _, stripFunc := range p.Settings.StripFuncs {
		// Each strip function works on a node tree, so the current text is
		// parsed again before every step.
		doc, err := html.Parse(strings.NewReader(selection))
		if err != nil {
			return "", errutil.Err(err)
		}
		switch strings.ToLower(stripFunc) {
		case "numbers":
			strip.Numbers(doc)
		case "attrs":
			strip.Attrs(doc)
		case "html":
			strip.HTML(doc)
		case "scripts":
			strip.Scripts(doc)
		}

		selection, err = htmlutil.RenderClean(doc)
		if err != nil {
			return "", errutil.Err(err)
		}
	}

	// --- [ /Strip funcs ] ---------------------------------------------------/

	// --- [ Regexp ] ---------------------------------------------------------/

	if p.Settings.Regexp != "" {
		re, err := regexp.Compile(p.Settings.Regexp)
		if err != nil {
			return "", errutil.Err(err)
		}

		// -1 means to find all. Only the matches are kept, one per line.
		var matches strings.Builder
		for _, res := range re.FindAllString(selection, -1) {
			matches.WriteString(res)
			matches.WriteString(settings.Newline)
		}
		selection = matches.String()
	}

	// --- [ /Regexp ] --------------------------------------------------------/

	// --- [ Negexp ] ---------------------------------------------------------/

	if p.Settings.Negexp != "" {
		ne, err := regexp.Compile(p.Settings.Negexp)
		if err != nil {
			return "", errutil.Err(err)
		}

		// Remove everything that matches the regular expression ne.
		selection = ne.ReplaceAllString(selection, "")
	}

	// --- [ /Negexp ] --------------------------------------------------------/

	return selection, nil
}