func (this *Spider) pageProcess(req *page.Request) { var p *page.Page //下载页面 for i := 0; i < 3; i++ { p = this.m_downLoader.DownLoad(req) if p.IsSucc() { break } time.Sleep(time.Microsecond * 1000) } if !p.IsSucc() { this.finishForReqProcesser(req.GetUrl()) return } //分析页面内容 this.m_pageProcesser.Process(p) //获取新的链接 if p.CountNewUrls() > 0 { newUrls := p.GetNewUrls() for tmpUrl, tmpUrlTag := range newUrls { this.AddUrl(tmpUrl, "html", tmpUrlTag) } } this.finishForReqProcesser(req.GetUrl()) //输出 for _, tmpOut := range this.m_outputs { tmpOut.Process(p.GetPageItemsList(), p.GetRequest().GetUrl()) } }
func (this *HttpDownLoader) downloadHtml(p *page.Page, req *page.Request) *page.Page { p, destBody := this.downloadFile(p, req) if !p.IsSucc() { return p } p.SetBody(destBody) return p }
func (this *YouxiduoProcesser) Process(p *page.Page) { if !p.IsSucc() { println(p.GetErrMsg()) return } var body string = p.GetBody() var urlTag string = p.GetRequest().GetUrlTag() p.SetUrlTag(urlTag) //分析这个页面是LIST页面还是内容页面 // <div class="infroList"><ul><li>...</div>===>LIST // <div class="pagebreak">...</div>===>LIST // CONTENT //<div class="article" if urlTag == "list" { // //1.寻找news-brief的content regList, err := regexp.Compile(`<div class=\"infroList\">(\s|.)*<\/ul>(\s|.)*<div class=\"pagebreak\">`) if err != nil { logs.GetFirstLogger().Error("分析页面出错,正则表达式错误了,url = " + p.GetRequest().GetUrl()) } var infroList []string = regList.FindAllString(body, -1) if len(infroList) > 0 { this.parseNewsBreifInfo(infroList[0], p) } else { logs.GetFirstLogger().Info("No more list items") } //先寻找额外的LIST页面 if !p.IsBreak() { regPageBreak, err := regexp.Compile(`<div class=\"pagebreak\">(\s|.)+<li class=\"lastPage\">`) if err != nil { logs.GetFirstLogger().Error("分析页面出错,翻页正则表达式错误,url = " + p.GetRequest().GetUrl()) } var pageBreakList []string = regPageBreak.FindAllString(body, -1) if len(pageBreakList) > 0 { this.parseNewsLinkListInfo(pageBreakList[0], p) } else { logs.GetFirstLogger().Info("No more links") } } } else { //CONTENT this.parseNewsDetail(body, p) } }