func (dl *myPageDownloader) Download(req base.Request) (*base.Response, error) { httpReq := req.HttpReq() log.Info("Do the request (url=%s)... \n", httpReq.URL) httpResp, err := dl.httpClient.Do(httpReq) if err != nil { return nil, err } return base.NewResponse(httpResp, req.Depth()), nil }
// 把请求存放到请求缓存。 func (sched *myScheduler) saveReqToCache(req base.Request, code string) bool { httpReq := req.HttpReq() if httpReq == nil { log.Warnf("Ignore the request! It's HTTP request is invalid!") return false } reqUrl := httpReq.URL if reqUrl == nil { log.Warnf("Ignore the request! It's url is is invalid!") return false } if strings.ToLower(reqUrl.Scheme) != "http" { log.Warnf("Ignore the request! It's url scheme '%s', but should be 'http'!\n", reqUrl.Scheme) return false } if _, ok := sched.urlMap[reqUrl.String()]; ok { log.Warnf("Ignore the request! It's url is repeated. (requestUrl=%s)\n", reqUrl) return false } if pd, _ := getPrimaryDomain(httpReq.Host); pd != sched.primaryDomain { log.Warnf("Ignore the request! It's host '%s' not in primary domain '%s'. (requestUrl=%s)\n", httpReq.Host, sched.primaryDomain, reqUrl) return false } if req.Depth() > sched.crawlDepth { log.Warnf("Ignore the request! It's depth %d greater than %d. (requestUrl=%s)\n", req.Depth(), sched.crawlDepth, reqUrl) return false } if sched.stopSign.Signed() { sched.stopSign.Deal(code) return false } sched.reqCache.put(&req) sched.urlMap[reqUrl.String()] = true return true }