// 添加请求值或条目值到列表。 func appendDataList(dataList []base.Data, data base.Data, respDepth uint32) []base.Data { if data == nil { return dataList } req, ok := data.(*base.Request) if !ok { return append(dataList, data) } newDepth := respDepth + 1 if req.Depth() != newDepth { req = base.NewRequest(req.HttpReq(), newDepth) } return append(dataList, req) }
// 响应解析函数。只解析“A”标签。 func parseForATag(httpResp *http.Response, respDepth uint32) ([]base.Data, []error) { // TODO 支持更多的HTTP响应状态 if httpResp.StatusCode != 200 { err := errors.New( fmt.Sprintf("Unsupported status code %d. (httpResponse=%v)", httpResp)) return nil, []error{err} } var reqUrl *url.URL = httpResp.Request.URL var httpRespBody io.ReadCloser = httpResp.Body defer func() { if httpRespBody != nil { httpRespBody.Close() } }() dataList := make([]base.Data, 0) errs := make([]error, 0) // 开始解析 doc, err := goquery.NewDocumentFromReader(httpRespBody) if err != nil { errs = append(errs, err) return dataList, errs } // 查找“A”标签并提取链接地址 doc.Find("a").Each(func(index int, sel *goquery.Selection) { href, exists := sel.Attr("href") // 前期过滤 if !exists || href == "" || href == "#" || href == "/" { return } href = strings.TrimSpace(href) lowerHref := strings.ToLower(href) // 暂不支持对Javascript代码的解析。 if href != "" && !strings.HasPrefix(lowerHref, "javascript") { aUrl, err := url.Parse(href) if err != nil { errs = append(errs, err) return } if !aUrl.IsAbs() { aUrl = reqUrl.ResolveReference(aUrl) } httpReq, err := http.NewRequest("GET", aUrl.String(), nil) if err != nil { errs = append(errs, err) } else { req := base.NewRequest(httpReq, respDepth) dataList = append(dataList, req) } } text := strings.TrimSpace(sel.Text()) if text != "" { imap := make(map[string]interface{}) imap["parent_url"] = reqUrl imap["a.text"] = text imap["a.index"] = index item := base.Item(imap) dataList = append(dataList, &item) } }) return dataList, errs }
func (sched *myScheduler) Start( channelArgs base.ChannelArgs, poolBaseArgs base.PoolBaseArgs, crawlDepth uint32, httpClientGenerator GenHttpClient, respParsers []anlz.ParseResponse, itemProcessors []ipl.ProcessItem, firstHttpReq *http.Request) (err error) { defer func() { if p := recover(); p != nil { errMsg := fmt.Sprintf("Fatal Scheduler Error: %s\n", p) log.Error(errMsg) err = errors.New(errMsg) } }() if atomic.LoadUint32(&sched.running) == 1 { return errors.New("The scheduler has been started!\n") } atomic.StoreUint32(&sched.running, 1) if err := channelArgs.Check(); err != nil { return err } sched.channelArgs = channelArgs if err := poolBaseArgs.Check(); err != nil { return err } sched.poolBaseArgs = poolBaseArgs sched.crawlDepth = crawlDepth sched.chanman = generateChannelManager(sched.channelArgs) if httpClientGenerator == nil { return errors.New("The HTTP client generator list is invalid!") } dlpool, err := generatePageDownloaderPool( sched.poolBaseArgs.PageDownloaderPoolSize(), httpClientGenerator) if err != nil { errMsg := fmt.Sprintf("Occur error when get page downloader pool: %s\n", err) return errors.New(errMsg) } sched.dlpool = dlpool analyzerPool, err := generateAnalyzerPool(sched.poolBaseArgs.AnalyzerPoolSize()) if err != nil { if err != nil { errMsg := fmt.Sprintf("Occur error when get analyzer pool: %s\n", err) return errors.New(errMsg) } } sched.analyzerPool = analyzerPool if itemProcessors == nil { return errors.New("The item processor list is invalid!") } for i, ip := range itemProcessors { if ip == nil { return errors.New(fmt.Sprintf("The %dth item processor is invalid!", i)) } } sched.itemPipeline = generateItemPipeline(itemProcessors) if sched.stopSign == nil { sched.stopSign = mdw.NewStopSign() } else { sched.stopSign.Reset() } sched.reqCache = newRequestCache() sched.urlMap = make(map[string]bool) sched.startDownloading() sched.activateAnalyzers(respParsers) sched.openItemPipeline() sched.schedule(10 * time.Millisecond) if firstHttpReq == nil { return errors.New("The first HTTP request is invalid!") } pd, err := getPrimaryDomain(firstHttpReq.Host) if err != nil { return err } sched.primaryDomain = pd firstReq := base.NewRequest(firstHttpReq, 0) sched.reqCache.put(firstReq) return nil }