// choose http GET/method to download func connectByHttp(p *context.Response, req *context.Request) (*http.Response, error) { client := &http.Client{ CheckRedirect: req.GetRedirectFunc(), } httpreq, err := http.NewRequest(req.GetMethod(), req.GetUrl(), strings.NewReader(req.GetPostdata())) if header := req.GetHeader(); header != nil { httpreq.Header = req.GetHeader() } if cookies := req.GetCookies(); cookies != nil { for i := range cookies { httpreq.AddCookie(cookies[i]) } } var resp *http.Response if resp, err = client.Do(httpreq); err != nil { if e, ok := err.(*url.Error); ok && e.Err != nil && e.Err.Error() == "normal" { // normal } else { reporter.Log.Println(err.Error()) p.SetStatus(true, err.Error()) //fmt.Printf("client do error %v \r\n", err) return nil, err } } return resp, nil }
func (self *HttpDownloader) downloadText(p *context.Response, req *context.Request) *context.Response { p, destbody := self.downloadFile(p, req) if !p.IsSucc() { return p } p.SetBodyStr(destbody).SetStatus(false, "") return p }
func (self *HttpDownloader) downloadHtml(p *context.Response, req *context.Request) *context.Response { var err error p, destbody := self.downloadFile(p, req) //fmt.Printf("Destbody %v \r\n", destbody) if !p.IsSucc() { //fmt.Print("Response error \r\n") return p } bodyReader := bytes.NewReader([]byte(destbody)) var doc *goquery.Document if doc, err = goquery.NewDocumentFromReader(bodyReader); err != nil { reporter.Log.Println(err.Error()) p.SetStatus(true, err.Error()) return p } var body string if body, err = doc.Html(); err != nil { reporter.Log.Println(err.Error()) p.SetStatus(true, err.Error()) return p } p.SetBodyStr(body).SetHtmlParser(doc).SetStatus(false, "") return p }
func (self *HttpDownloader) downloadJson(p *context.Response, req *context.Request) *context.Response { var err error p, destbody := self.downloadFile(p, req) if !p.IsSucc() { return p } var body []byte body = []byte(destbody) mtype := req.GetRespType() if mtype == "jsonp" { tmpstr := util.JsonpToJson(destbody) body = []byte(tmpstr) } var r *simplejson.Json if r, err = simplejson.NewJson(body); err != nil { reporter.Log.Println(string(body) + "\t" + err.Error()) p.SetStatus(true, err.Error()) return p } // json result p.SetBodyStr(string(body)).SetJson(r).SetStatus(false, "") return p }
// Download file and change the charset of response charset. func (self *HttpDownloader) downloadFile(p *context.Response, req *context.Request) (*context.Response, string) { var err error var urlstr string if urlstr = req.GetUrl(); len(urlstr) == 0 { reporter.Log.Println("url is empty") p.SetStatus(true, "url is empty") return p, "" } var resp *http.Response if proxystr := req.GetProxyHost(); len(proxystr) != 0 { //using http proxy //fmt.Print("HttpProxy Enter ",proxystr,"\n") resp, err = connectByHttpProxy(p, req) } else { //normal http download //fmt.Print("Http Normal Enter \n",proxystr,"\n") resp, err = connectByHttp(p, req) } if err != nil { return p, "" } //b, _ := ioutil.ReadAll(resp.Body) //fmt.Printf("Resp body %v \r\n", string(b)) p.SetHeader(resp.Header) p.SetCookies(resp.Cookies()) // get converter to utf-8 bodyStr := self.changeCharsetEncodingAuto(resp.Header.Get("Content-Type"), resp.Body) //fmt.Printf("utf-8 body %v \r\n", bodyStr) defer resp.Body.Close() return p, bodyStr }
// core processer func (self *crawler) Process(req *context.Request) { // 声明response var resp *context.Response defer func() { if err := recover(); err != nil { // do not affect other if strerr, ok := err.(string); ok { reporter.Log.Println(strerr) } else { reporter.Log.Println("Process error:", err) } } }() // reporter.Log.Println("**************断点 1 ***********") // download page for i := 0; i < 3; i++ { self.sleep() resp = self.Downloader.Download(req) if resp.IsSucc() { // if fail retry 3 times break } } // reporter.Log.Println("**************断点 2 ***********") if !resp.IsSucc() { // if fail do not need process return } // reporter.Log.Println("**************断点 3 ***********") // 过程处理,提炼数据 self.Spider.GoRule(resp) // reporter.Log.Println("**************断点 5 ***********") // 该条请求结果存入pipeline datas := resp.GetItems() for i, count := 0, len(datas); i < count; i++ { self.Pipeline.Collect( resp.GetRuleName(), //DataCell.RuleName datas[i], //DataCell.Data resp.GetUrl(), //DataCell.Url resp.GetParent(), //DataCell.ParentUrl time.Now().Format("2006-01-02 15:04:05"), ) } // reporter.Log.Println("**************断点 end ***********") }
// 获取任务规则采集语义字段 func (self *Spider) GetOutFeild(resp *context.Response, index int) string { return self.RuleTree.Nodes[resp.GetRuleName()].OutFeild[index] }
// 用指定规则解析响应流 func (self *Spider) CallRule(ruleName string, resp *context.Response) { resp.SetRuleName(ruleName) self.GoRule(resp) }
// 根据响应流运行指定解析规则 func (self *Spider) GoRule(resp *context.Response) { self.RuleTree.Nodes[resp.GetRuleName()].ParseFunc(self, resp) }