Esempio n. 1
0
// Handle implements Controller. When a handler is registered in the mux for
// the response's URL, the response is delegated to that handler. Otherwise,
// if mux.follow accepts the response at its current depth, the links found
// in the response body are extracted into ch.
func (mux *Mux) Handle(r *crawler.Response, ch chan<- *url.URL) {
	target := r.URL.String()

	// A registered handler takes full responsibility for the response.
	if h, found := mux.matcher[muxHANDLE].Get(target); found {
		h.(Handler).Handle(r, ch)
		return
	}

	// Default behaviour: follow the links if the mux allows it.
	if mux.follow(r, r.Context().Depth()) {
		crawler.ExtractHref(r.NewURL, r.Body, ch)
	}
}
Esempio n. 2
0
// Resched implements Controller. It reports done when the visit count of
// the response cannot be read, or when the count has reached the frequency
// registered in the mux for the response's URL (a single visit when no
// frequency entry matches). Otherwise done is false and, if a score entry
// matches the URL, ticket.Score is set from it.
func (mux *Mux) Resched(r *crawler.Response) (done bool, ticket crawler.Ticket) {
	target := r.URL.String()
	ctx := r.Context()

	freq, hasFreq := mux.matcher[muxFREQ].Get(target)
	visits, err := ctx.NumVisit()
	switch {
	case err != nil:
		// An unreadable visit count is treated as "do not reschedule".
		return true, ticket
	case hasFreq && visits >= freq.(int):
		return true, ticket
	case !hasFreq && visits >= 1:
		// URLs without a frequency entry are visited only once.
		return true, ticket
	}

	if score, found := mux.matcher[muxSCORE].Get(target); found {
		ticket.Score = score.(int)
	}
	return false, ticket
}
Esempio n. 3
0
// Extract parses the HTML document read from body, collects the URLs found
// in it and sends to ch those that pass the extractor's filters.
//
// Nothing is extracted (and nil is returned) when e.MaxDepth is positive and
// the response's depth has reached it. When e.Redirect is set, copies of
// r.NewURL and, if present, r.Refresh.URL are filtered like any other
// extracted URL.
//
// A candidate URL is dropped when:
//   - e.SameOrigin is set and its scheme differs from the response's scheme;
//   - its host differs from the response's host and e.SpanHosts is unset;
//   - its host differs, e.SpanHosts is set, and none of the sub-domain check
//     (e.SubDomain), the shared-IP check (e.ResolveIP) or
//     e.Matcher.MatchPart(u, PartHost) accepts it;
//   - e.Matcher.Match rejects it.
//
// The returned error is the value received from the tokenizer goroutine.
func (e *Extractor) Extract(
	r *crawler.Response, body io.Reader, ch chan<- *url.URL,
) error {
	if e.MaxDepth > 0 && r.Context().Depth() >= e.MaxDepth {
		return nil
	}

	// The buffer lets the redirect targets be queued before the tokenizer
	// goroutine is started.
	chURL := make(chan *url.URL, 32)
	if e.Redirect {
		newurl := *r.NewURL
		chURL <- &newurl
		if r.Refresh.URL != nil {
			refresh := *r.Refresh.URL
			chURL <- &refresh
		}
	}
	chErr := make(chan error, 1)
	// NOTE(review): the loop below only ends when chURL is closed, and the
	// final receive needs one value on chErr; presumably tokenLoop does
	// both — confirm.
	go e.tokenLoop(r, body, chURL, chErr)

	scheme, host := r.URL.Scheme, r.URL.Host
	// Loop-invariant: the labels of the response's host.
	hostLabels := strings.Split(host, ".")

	// The response's host is resolved at most once, on first use; both the
	// addresses and a lookup failure are remembered for the rest of the call.
	var (
		baseIPs      []net.IP
		baseErr      error
		baseResolved bool
	)

	// foreignHostOK reports whether a URL whose host differs from the
	// response's host may still be followed.
	foreignHostOK := func(u *url.URL) bool {
		if e.SubDomain {
			// Accept when the last two labels of both hosts are equal.
			labels := strings.Split(u.Host, ".")
			if lh, lu := len(hostLabels), len(labels); lh > 1 && lu > 1 &&
				hostLabels[lh-2] == labels[lu-2] &&
				hostLabels[lh-1] == labels[lu-1] {
				return true
			}
		}
		if e.ResolveIP {
			// URL.Host may carry a ":port" suffix, which net.LookupIP
			// does not accept; Hostname strips it.
			if !baseResolved {
				baseIPs, baseErr = net.LookupIP(r.URL.Hostname())
				baseResolved = true
			}
			if baseErr != nil {
				return false
			}
			ips, err := net.LookupIP(u.Hostname())
			if err != nil {
				return false
			}
			// Accept when both hosts share at least one address.
			for _, a := range baseIPs {
				for _, b := range ips {
					if a.Equal(b) {
						return true
					}
				}
			}
		}
		return e.Matcher.MatchPart(u, PartHost)
	}

	for u := range chURL {
		if e.SameOrigin && u.Scheme != scheme {
			continue
		}
		if u.Host != host && (!e.SpanHosts || !foreignHostOK(u)) {
			continue
		}
		if e.Matcher.Match(u) {
			ch <- u
		}
	}
	return <-chErr
}