func (this *Spider) pageProcess(req *page.Request) { var p *page.Page //下载页面 for i := 0; i < 3; i++ { p = this.m_downLoader.DownLoad(req) if p.IsSucc() { break } time.Sleep(time.Microsecond * 1000) } if !p.IsSucc() { this.finishForReqProcesser(req.GetUrl()) return } //分析页面内容 this.m_pageProcesser.Process(p) //获取新的链接 if p.CountNewUrls() > 0 { newUrls := p.GetNewUrls() for tmpUrl, tmpUrlTag := range newUrls { this.AddUrl(tmpUrl, "html", tmpUrlTag) } } this.finishForReqProcesser(req.GetUrl()) //输出 for _, tmpOut := range this.m_outputs { tmpOut.Process(p.GetPageItemsList(), p.GetRequest().GetUrl()) } }
func (this *Spider) Run() *Spider { for { var req *page.Request = this.m_scheduler.Poll() if req == nil { if this.countHandlingUrl() == 0 { break } time.Sleep(500 * time.Millisecond) //这里需要判断有没有没有处理完的请求,如果都处理完了可以退出,如果还没有处理完,那么继续等待下一个时间片段 continue } //req.GetUrl this.waitForReqProcesser(req.GetUrl(), req.GetUrlTag()) go func(*page.Request) { //deal the page,may get new pages this.pageProcess(req) }(req) } //运行结束 return this }
func (this *Spider) AddRequest(req *page.Request) *Spider { if req == nil { logs.GetFirstLogger().Error("request is nil") return this } else if req.GetUrl() == "" { logs.GetFirstLogger().Error("request is empty") } this.m_scheduler.Push(req) return this }
func (this *HttpDownLoader) DownLoad(req *page.Request) *page.Page { var p = page.NewPage(req) var respType = req.GetRespType() switch respType { case "html": return this.downloadHtml(p, req) default: logs.GetFirstLogger().Error("error request type : " + respType) } return nil }
//下载文件,并对字符编码做相应的处理 func (this *HttpDownLoader) downloadFile(p *page.Page, req *page.Request) (*page.Page, string) { var err error var httpResp *http.Response var urlStr string var method string urlStr = req.GetUrl() if len(urlStr) == 0 { logs.GetFirstLogger().Error("url is empty") p.SetStatus(true, "url is empty") return p, "" } method = req.GetMethod() if method == "POST" { httpResp, err = http.Post(req.GetUrl(), "application/x-www-form-urlencoded", strings.NewReader(req.GetPostData())) } else { httpResp, err = http.Get(req.GetUrl()) } if err != nil { logs.GetFirstLogger().Error("http visit error :" + err.Error()) p.SetStatus(true, err.Error()) } p.SetHeader(httpResp.Header) p.SetCookies(httpResp.Cookies()) body, _ := ioutil.ReadAll(httpResp.Body) bodyStr := string(body) defer httpResp.Body.Close() return p, bodyStr }
func (this *QueueScheduler) Push(req *page.Request) { //TODO this.lock.Lock() defer this.lock.Unlock() _, ok := this.rmKeys[req.GetUrl()] if ok { return } this.rmKeys[req.GetUrl()] = req.GetUrlTag() this.queue.PushBack(req) }