// Push adds a request to the queue.
func (self *scheduler) Push(req *context.Request) {
	// The request must carry the id of the spider whose queue it belongs to.
	spiderId, ok := req.GetSpiderId()
	if !ok {
		return
	}

	self.RLock()
	defer self.RUnlock()

	if self.status == status.STOP {
		return
	}

	// When the request may not repeat, return if it is a duplicate.
	if !req.GetDuplicatable() && self.Deduplicate(req.GetUrl()+req.GetMethod()) {
		return
	}

	// Initialize this spider's queue for this priority.
	priority := req.GetPriority()
	if !self.foundPriority(spiderId, priority) {
		self.addPriority(spiderId, priority)
	}

	// Swallow any panic raised by the append below (e.g. if the queue is
	// torn down concurrently).
	defer func() {
		recover()
	}()

	// Add the request to the queue.
	self.queue[spiderId][priority] = append(self.queue[spiderId][priority], req)
}
func (self *Surfer) Download(cReq *context.Request) *context.Response {
	cResp := context.NewResponse(nil)

	var resp *http.Response
	var err error

	switch cReq.GetDownloaderID() {
	case SURF_ID:
		resp, err = self.surf.Download(cReq)
	case PHANTOM_ID:
		resp, err = self.phantom.Download(cReq)
	}

	if resp != nil {
		// Ensure the URL strings in Response and Request are equal.
		resp.Request.URL, _ = url.Parse(cReq.GetUrl())
	}

	cResp.SetRequest(cReq)
	cResp.SetResponse(resp)
	cResp.SetError(err)

	return cResp
}
// Push adds a request to the queue.
func (self *Matrix) Push(req *context.Request) {
	sdl.RLock()
	defer sdl.RUnlock()

	if sdl.status == status.STOP ||
		self.maxPage >= 0 ||
		// When the request may not be re-downloaded, return if a success record already exists.
		!req.IsReloadable() && !UpsertSuccess(req) {
		return
	}

	// Roughly limit the number of requests pushed to the queue;
	// under concurrency the real count may exceed maxPage.
	atomic.AddInt64(&self.maxPage, 1)

	priority := req.GetPriority()

	// Initialize this spider's queue for this priority.
	if _, found := self.reqs[priority]; !found {
		self.priorities = append(self.priorities, priority)
		sort.Ints(self.priorities) // sort ascending
		self.reqs[priority] = []*context.Request{}
	}

	// Add the request to the queue.
	self.reqs[priority] = append(self.reqs[priority], req)
}
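Because priorities is kept sorted ascending on insert, the consuming side can serve the highest priority first by simply walking the slice backwards. Below is a minimal sketch of such a Pull, relying only on the reqs/priorities fields visible above; the real Pull is not among these excerpts, and imports are elided as in the other snippets.

func (self *Matrix) Pull() (req *context.Request) {
	self.Lock()
	defer self.Unlock()
	// Walk the ascending-sorted priorities from the back,
	// so the highest priority is drained first.
	for i := len(self.priorities) - 1; i >= 0; i-- {
		p := self.priorities[i]
		if len(self.reqs[p]) > 0 {
			req = self.reqs[p][0]
			self.reqs[p] = self.reqs[p][1:] // FIFO within one priority level
			return
		}
	}
	return
}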
// BulkAddQueue generates a request per URL in the batch and adds each to the queue.
// Note that the same *Request instance is mutated and pushed for every url.
func (self *Context) BulkAddQueue(urls []string, req *context.Request) *Context {
	for _, url := range urls {
		req.SetUrl(url)
		self.AddQueue(req)
	}
	return self
}
// core processor
func (self *crawler) Process(req *context.Request) {
	defer func() {
		if err := recover(); err != nil {
			// do not affect others
			scheduler.Sdl.DelDeduplication(req.GetUrl() + req.GetMethod())
			// count the failed page
			cache.PageFailCount()
			// report the error
			logs.Log.Error(" * Fail [process panic]: %v", err)
		}
	}()

	// download the page
	resp := self.Downloader.Download(req)

	// if the download failed, there is nothing to process
	if resp.GetError() != nil {
		// remove this request's deduplication sample
		scheduler.Sdl.DelDeduplication(req.GetUrl() + req.GetMethod())
		// count the failed page
		cache.PageFailCount()
		// report the error
		logs.Log.Error(" * Fail [download]: %v", resp.GetError())
		return
	}

	// run the rule processing to extract data
	spider.NewContext(self.Spider, resp).Parse(resp.GetRuleName())

	// count the successful page
	cache.PageSuccCount()
	// report the successful fetch
	logs.Log.Informational(" * Success: %v", req.GetUrl())

	// send this request's text results to the pipeline
	for _, data := range resp.GetItems() {
		self.Pipeline.CollectData(
			resp.GetRuleName(), // DataCell.RuleName
			data,               // DataCell.Data
			resp.GetUrl(),      // DataCell.Url
			resp.GetReferer(),  // DataCell.ParentUrl
			time.Now().Format("2006-01-02 15:04:05"), // Go reference time layout
		)
	}

	// send this request's file results to the pipeline
	for _, img := range resp.GetFiles() {
		self.Pipeline.CollectFile(
			resp.GetRuleName(),
			img["Name"].(string),
			img["Body"].(io.ReadCloser),
		)
	}
}
// SetFailure reports whether the failed request should be retried:
// true on the first failure (retry once at the end of the task queue),
// false afterwards (the request goes into the historical failure records).
func (self *Matrix) SetFailure(req *context.Request) bool {
	self.Lock()
	defer self.Unlock()

	unique := makeUnique(req)
	if _, ok := self.failures[unique]; !ok {
		// On the first failure, re-run the request once at the end of the task queue.
		self.failures[unique] = req
		logs.Log.Informational(" * + failed request: [%v]\n", req.GetUrl())
		return true
	}

	// After failing twice, add it to the historical failure records.
	UpsertFailure(req)
	return false
}
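A hypothetical caller illustrating the retry contract of SetFailure: the first failure re-enqueues the request, the second one only archives it. This helper is not part of the excerpts above.

func handleFailure(m *Matrix, req *context.Request) {
	if m.SetFailure(req) {
		// First failure: run it once more at the end of the queue.
		m.Push(req)
		return
	}
	// Second failure: SetFailure has already archived the request via
	// UpsertFailure, so it is not retried again.
}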
// core processor
func (self *crawler) Process(req *context.Request) {
	defer func() {
		if err := recover(); err != nil {
			// do not affect others
			logs.Log.Error(" * Process panic: %v", err)
		}
	}()

	// logs.Log.Debug("************** breakpoint 1 ***********")

	// download the page
	resp := self.Downloader.Download(req)

	// logs.Log.Debug("************** breakpoint 2 ***********")

	// if the download failed, there is nothing to process
	if resp.GetError() != nil {
		// cancel this request's deduplication sample
		scheduler.Sdl.DelDeduplication(req.GetUrl() + req.GetMethod())
		logs.Log.Error(" * %v", resp.GetError())
		// count the page that failed to download
		cache.PageFailCount()
		return
	}

	// logs.Log.Debug("************** breakpoint 3 ***********")

	// run the rule processing to extract data
	self.Spider.ExecParse(resp)

	// logs.Log.Debug("************** breakpoint 5 ***********")

	// send this request's text results to the pipeline
	for _, data := range resp.GetItems() {
		self.Pipeline.CollectData(
			resp.GetRuleName(), // DataCell.RuleName
			data,               // DataCell.Data
			resp.GetUrl(),      // DataCell.Url
			resp.GetReferer(),  // DataCell.ParentUrl
			time.Now().Format("2006-01-02 15:04:05"),
		)
	}

	// send this request's file results to the pipeline
	for _, img := range resp.GetFiles() {
		self.Pipeline.CollectFile(
			resp.GetRuleName(),
			img["Name"].(string),
			img["Body"].(io.ReadCloser),
		)
	}

	// logs.Log.Debug("************** breakpoint end ***********")
}
// UpsertFailure updates or inserts a failure record.
// It checks whether the record already exists and records it if not;
// the return value reports whether a new record was added.
func (self *Failure) UpsertFailure(req *context.Request) bool {
	self.RWMutex.Lock()
	defer self.RWMutex.Unlock()

	spName := req.GetSpiderName()
	s := req.Serialize()

	if failures, ok := self.list[spName]; !ok {
		self.list[spName] = make(map[string]bool)
	} else if failures[s] {
		return false
	}
	self.list[spName][s] = true
	return true
}
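The nested map access above implies a container shaped roughly like the following sketch. The actual declaration of Failure is not among these excerpts, so take the field layout as an assumption.

type Failure struct {
	sync.RWMutex
	// spider name -> serialized request -> recorded
	list map[string]map[string]bool
}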
// AddQueue generates a request and adds it to the queue.
// Request.Url and Request.Rule must be set.
// Request.Spider need not be set manually (the system sets it automatically).
// Request.EnableCookie is set uniformly on the Spider; a value given in a rule's request is ignored.
// The following fields have defaults and may be left unset:
// Request.Method defaults to GET;
// Request.DialTimeout defaults to the constant context.DefaultDialTimeout; a negative value means no limit on the response wait;
// Request.ConnTimeout defaults to the constant context.DefaultConnTimeout; a negative value means no download timeout;
// Request.TryTimes defaults to the constant context.DefaultTryTimes;
// Request.RedirectTimes defaults to unlimited redirects; a negative value disables redirects;
// Request.RetryPause defaults to the constant context.DefaultRetryPause.
func (self *Spider) AddQueue(req *context.Request) {
	req.
		SetSpiderName(self.Name).
		SetSpiderId(self.GetId()).
		SetEnableCookie(self.EnableCookie).
		Prepare()
	scheduler.Sdl.Push(req)
}
// AddQueue generates a request and adds it to the queue.
// Request.Url and Request.Rule must be set.
// Request.Spider need not be set manually (the system sets it automatically).
// Request.EnableCookie is set uniformly on the Spider; a value given in a rule's request is ignored.
// The following fields have defaults and may be left unset:
// Request.Method defaults to GET;
// Request.DialTimeout defaults to the constant context.DefaultDialTimeout; a negative value means no limit on the response wait;
// Request.ConnTimeout defaults to the constant context.DefaultConnTimeout; a negative value means no download timeout;
// Request.TryTimes defaults to the constant context.DefaultTryTimes; a negative value means unlimited retries;
// Request.RedirectTimes defaults to unlimited redirects; a negative value disables redirects;
// Request.RetryPause defaults to the constant context.DefaultRetryPause;
// Request.DownloaderID selects the downloader: 0 is the default Surf downloader, high-concurrency and full-featured; 1 is the PhantomJS downloader, stronger at breaking through anti-scraping defenses but slow and low-concurrency.
// The Referer is filled in automatically by default.
func (self *Context) AddQueue(req *context.Request) *Context {
	err := req.
		SetSpiderName(self.Spider.GetName()).
		SetSpiderId(self.Spider.GetId()).
		SetEnableCookie(self.Spider.GetEnableCookie()).
		Prepare()
	if err != nil {
		logs.Log.Error("%v", err)
		return self
	}
	if req.GetReferer() == "" && self.Response != nil {
		req.SetReferer(self.Response.GetUrl())
	}
	scheduler.Sdl.Push(req)
	return self
}
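A minimal usage sketch of Context.AddQueue inside a rule's parse function, relying only on the defaults documented above. The rule name and URL are hypothetical, and Request is assumed to be constructed as a plain struct literal with exported Url/Rule/DownloaderID fields; that construction style is not confirmed by these excerpts.

func parseList(ctx *Context) {
	// Url and Rule are the only required fields; everything else
	// (Method=GET, timeouts, TryTimes, RetryPause, Referer) falls back
	// to the documented defaults.
	ctx.AddQueue(&context.Request{
		Url:          "http://example.com/list?page=2", // hypothetical URL
		Rule:         "parseList",                      // re-enter this rule for the next page
		DownloaderID: 0,                                // 0: Surf (default); 1: PhantomJS
	})
}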
func (self *Surfer) Download(cReq *context.Request) *context.Response {
	cResp := context.NewResponse(nil)

	var resp *http.Response
	var err error

	switch cReq.GetDownloaderID() {
	case SURF_ID:
		resp, err = self.surf.Download(cReq)
	case PHANTOM_ID:
		resp, err = self.phantom.Download(cReq)
	}

	cResp.Prepare(resp, cReq)
	cResp.SetError(err)

	return cResp
}
func (self *Surfer) Download(cReq *context.Request) *context.Response {
	cResp := context.NewResponse(nil)

	var resp *http.Response
	var err error

	if cReq.GetUsePhantomJS() {
		resp, err = self.phantom.Download(cReq)
	} else {
		resp, err = self.surf.Download(cReq)
	}

	cResp.SetRequest(cReq)
	cResp.SetResponse(resp)
	cResp.SetError(err)

	return cResp
}
// Push adds a request to the queue.
func (self *scheduler) Push(req *context.Request) {
	pushMutex.Lock()
	defer func() {
		pushMutex.Unlock()
	}()

	if self.status == status.STOP {
		return
	}

	// Return if it is a duplicate.
	if self.Compare(req.GetUrl() + req.GetMethod()) {
		return
	}

	// Reserved for distributing requests in the future.
	// if pholcus.Self.GetRunMode() == config.SERVER || req.CanOutsource() {
	// 	return
	// }

	self.SrcManage.Push(req)
}
func (self *SrcManage) Push(req *context.Request) {
	spiderId, ok := req.GetSpiderId()
	if !ok {
		return
	}

	// Initialize this spider's queue.
	if _, ok := self.queue[spiderId]; !ok {
		self.mutex[spiderId] = new(sync.Mutex)
		self.queue[spiderId] = make(map[int][]*context.Request)
	}

	priority := req.GetPriority()

	// Register the queue for this priority under this spider.
	if _, ok := self.queue[spiderId][priority]; !ok {
		self.uIndex(spiderId, priority)
	}

	// Add the request to the queue.
	self.queue[spiderId][priority] = append(self.queue[spiderId][priority], req)
}
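Push above implies a two-level queue, keyed first by spider and then by priority. A sketch of that shape follows; the real declaration is not shown in these excerpts, and the spider-id key type is an assumption.

type SrcManage struct {
	// spider id -> lock guarding that spider's queues
	mutex map[int]*sync.Mutex
	// spider id -> priority -> FIFO of pending requests
	queue map[int]map[int][]*context.Request
}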
func (self *Surfer) Download(cReq *context.Request) *context.Response {
	cResp := context.NewResponse(nil)

	resp, err := self.download.Download(
		cReq.GetMethod(),
		cReq.GetUrl(),
		cReq.GetReferer(),
		cReq.GetPostData(),
		cReq.GetHeader(),
		cReq.GetCookies(),
	)

	cResp.SetRequest(cReq)
	cResp.SetResponse(resp)
	cResp.SetError(err)

	return cResp
}
// Push adds a request to the queue.
func (self *scheduler) Push(req *context.Request) {
	self.pushMutex.Lock()
	defer self.pushMutex.Unlock()

	if self.status == status.STOP {
		return
	}

	// When the request may not repeat, return if it is a duplicate.
	if !req.GetDuplicatable() && self.Deduplicate(req.GetUrl()+req.GetMethod()) {
		return
	}

	self.SrcManage.Push(req)
}
func (self *Surfer) Download(cReq *context.Request) *context.Response {
	cResp := context.NewResponse(nil)

	resp, err := self.download.Download(
		cReq.GetMethod(),
		cReq.GetUrl(),
		cReq.GetReferer(),
		cReq.GetPostData(),
		cReq.GetHeader(),
		cReq.GetCookies(),
	)

	cResp.SetRequest(cReq)
	cResp.SetResponse(resp)

	if err != nil {
		logs.Log.Error(" * %v", err)
		// cResp.SetStatus(false, err.Error())
		// return cResp
	}
	// NOTE: the failure branch is commented out, so the response is marked
	// successful even when err != nil.
	cResp.SetStatus(true, "")

	return cResp
}
// makeUnique builds the deduplication key for a request from its URL and method.
func makeUnique(req *context.Request) string {
	return util.MakeUnique(req.GetUrl() + req.GetMethod())
}
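A hypothetical snippet showing what this key buys us: requests that share URL and method collapse to one key regardless of their other fields, which is exactly what scheduler.Push checks before enqueueing a non-duplicatable request. The struct-literal field names are assumptions, and imports are elided as in the other snippets.

func dedupDemo() {
	a := &context.Request{Url: "http://example.com/item/1", Method: "GET", Rule: "parseItem"}
	b := &context.Request{Url: "http://example.com/item/1", Method: "GET", Rule: "otherRule"}
	// Same URL + method => same dedup key, so the second request would be
	// dropped by scheduler.Push unless it is marked duplicatable.
	fmt.Println(makeUnique(a) == makeUnique(b)) // true
}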
// BulkAddQueue generates a request per URL in the batch and adds each to the queue.
func (self *Spider) BulkAddQueue(urls []string, req *context.Request) {
	for _, url := range urls {
		req.SetUrl(url)
		self.AddQueue(req)
	}
}
// DeleteFailure removes a failure record.
func (self *Failure) DeleteFailure(req *context.Request) {
	self.RWMutex.Lock()
	s := req.Serialize()
	delete(self.list[req.GetSpiderName()], s)
	self.RWMutex.Unlock()
}