// 添加请求到队列 func (self *scheduler) Push(req *context.Request) { self.RWMutex.RLock() defer self.RWMutex.RUnlock() if self.status == status.STOP { return } // 当req不可重复时,有重复则返回 if !req.GetDuplicatable() && self.Deduplicate(req.GetUrl()+req.GetMethod()) { return } self.SrcManage.Push(req) }
// core processer func (self *crawler) Process(req *context.Request) { defer func() { if err := recover(); err != nil { // do not affect other scheduler.Sdl.DelDeduplication(req.GetUrl() + req.GetMethod()) // 统计失败数 cache.PageFailCount() // 提示错误 logs.Log.Error(" * Fail [process panic]: %v", err) } }() // download page resp := self.Downloader.Download(req) // if fail do not need process if resp.GetError() != nil { // 删除该请求的去重样本 scheduler.Sdl.DelDeduplication(req.GetUrl() + req.GetMethod()) // 统计失败数 cache.PageFailCount() // 提示错误 logs.Log.Error(" * Fail [download]: %v", resp.GetError()) return } // 过程处理,提炼数据 self.Spider.Parse(resp, resp.GetRuleName()) // 统计成功页数 cache.PageSuccCount() // 提示抓取成功 logs.Log.Informational(" * Success: %v", req.GetUrl()) // 该条请求文本结果存入pipeline for _, data := range resp.GetItems() { self.Pipeline.CollectData( resp.GetRuleName(), //DataCell.RuleName data, //DataCell.Data resp.GetUrl(), //DataCell.Url resp.GetReferer(), //DataCell.ParentUrl time.Now().Format("2006-01-02 15:04:05"), ) } // 该条请求文件结果存入pipeline for _, img := range resp.GetFiles() { self.Pipeline.CollectFile( resp.GetRuleName(), img["Name"].(string), img["Body"].(io.ReadCloser), ) } }