// Handle implements Controller.
func (mux *Mux) Handle(r *crawler.Response, ch chan<- *url.URL) {
	url := r.URL.String()
	if f, ok := mux.matcher[muxHANDLE].Get(url); ok {
		// A handler is registered for this URL: delegate to it.
		f.(Handler).Handle(r, ch)
	} else {
		// No handler registered: fall back to link extraction if the
		// follow policy allows it at this depth.
		depth := r.Context().Depth()
		if mux.follow(r, depth) {
			crawler.ExtractHref(r.NewURL, r.Body, ch)
		}
	}
}
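// A minimal sketch of a Handler, the interface asserted in Handle above
// (its Handle(r, ch) signature is inferred from that assertion). The type
// below is hypothetical and for illustration only: it logs the visited
// page and then extracts links, mirroring the default branch. It assumes
// the standard log package is imported.
type logHandler struct{}

func (logHandler) Handle(r *crawler.Response, ch chan<- *url.URL) {
	log.Printf("visited %s at depth %d", r.URL, r.Context().Depth())
	crawler.ExtractHref(r.NewURL, r.Body, ch) // keep following links
}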
// Resched implements Controller.
func (mux *Mux) Resched(r *crawler.Response) (done bool, ticket crawler.Ticket) {
	url := r.URL.String()
	ctx := r.Context()
	// A URL is done once it has been visited as many times as the
	// registered frequency allows; without a muxFREQ entry it is
	// visited only once.
	maxVisit := 1
	if t, ok := mux.matcher[muxFREQ].Get(url); ok {
		maxVisit = t.(int)
	}
	if cnt, err := ctx.NumVisit(); err != nil || cnt >= maxVisit {
		done = true
		return
	}
	// Not done yet: attach the registered score, if any, to the ticket.
	if sc, ok := mux.matcher[muxSCORE].Get(url); ok {
		ticket.Score = sc.(int)
	}
	return
}
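// The revisit policy above, isolated as a pure function for clarity.
// numVisit corresponds to ctx.NumVisit() and maxVisit to the muxFREQ
// entry; this helper is a hypothetical sketch, not part of the package.
func reschedDone(numVisit, maxVisit int) bool {
	if maxVisit <= 0 {
		maxVisit = 1 // no frequency registered: visit the URL once
	}
	return numVisit >= maxVisit
}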
// Extract parses the HTML document, extracts URLs and filters them using
// the matcher.
func (e *Extractor) Extract(
	r *crawler.Response, body io.Reader, ch chan<- *url.URL,
) error {
	if e.MaxDepth > 0 {
		if r.Context().Depth() >= e.MaxDepth {
			return nil
		}
	}
	chURL := make(chan *url.URL, 32)
	if e.Redirect {
		// Treat the post-redirect URL, and the meta-refresh target if
		// present, as if they were extracted links. Copies are taken so
		// the response's URLs are not aliased.
		newurl := *r.NewURL
		chURL <- &newurl
		if r.Refresh.URL != nil {
			refresh := *r.Refresh.URL
			chURL <- &refresh
		}
	}
	chErr := make(chan error, 1)
	go e.tokenLoop(r, body, chURL, chErr)

	scheme, host := r.URL.Scheme, r.URL.Host
	for u := range chURL {
		if e.SameOrigin && u.Scheme != scheme {
			continue
		} else if !e.SpanHosts && u.Host != host {
			continue
		} else if e.SpanHosts && u.Host != host {
			// A cross-host link is kept if the host is a sibling
			// subdomain, shares an IP with the source host, or matches
			// the host part of the matcher.
			if e.SubDomain {
				hs := strings.Split(host, ".")
				us := strings.Split(u.Host, ".")
				if lh, lu := len(hs), len(us); lh > 1 && lu > 1 {
					if hs[lh-2] == us[lu-2] && hs[lh-1] == us[lu-1] {
						goto MATCH
					}
				}
			}
			if e.ResolveIP {
				if ip0, err := net.LookupIP(host); err != nil {
					continue
				} else if ip1, err := net.LookupIP(u.Host); err != nil {
					continue
				} else {
					for _, i0 := range ip0 {
						for _, i1 := range ip1 {
							if i0.Equal(i1) {
								goto MATCH
							}
						}
					}
				}
			}
			if !e.Matcher.MatchPart(u, PartHost) {
				continue
			}
		}
	MATCH:
		if e.Matcher.Match(u) {
			ch <- u
		}
	}
	return <-chErr
}
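// A minimal configuration sketch for the fields Extract consults. It
// mutates an existing Extractor so that no assumption is made about the
// Matcher field's concrete type; the helper itself is hypothetical and
// the values are illustrative.
func configureExtractor(e *Extractor) {
	e.MaxDepth = 3     // stop extracting once the crawl depth reaches 3
	e.Redirect = true  // also emit redirect and meta-refresh targets
	e.SpanHosts = true // allow cross-host links, subject to the checks in Extract
	e.SubDomain = true // accept sibling subdomains of the source host
	e.ResolveIP = true // accept hosts that resolve to a shared IP
}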