// downloadHtml retrieves the body for req through downloadFile and parses it
// into a goquery document.
//
// If the download did not succeed the page is returned as-is. If parsing or
// re-serialising the document fails, the error is logged and recorded on the
// page via SetStatus(true, msg). On success the serialised HTML and the parsed
// document are stored on the page and SetStatus(false, "") is called.
func (this *HttpDownloader) downloadHtml(p *page.Page, req *request.Request) *page.Page {
	p, raw := this.downloadFile(p, req)
	if !p.IsSucc() {
		return p
	}

	// fail logs the error, records it on the page and hands the page back so
	// both error paths below share a single implementation.
	fail := func(cause error) *page.Page {
		mlog.LogInst().LogError(cause.Error())
		p.SetStatus(true, cause.Error())
		return p
	}

	doc, err := goquery.NewDocumentFromReader(bytes.NewReader([]byte(raw)))
	if err != nil {
		return fail(err)
	}

	html, err := doc.Html()
	if err != nil {
		return fail(err)
	}

	p.SetBodyStr(html).
		SetHtmlParser(doc).
		SetStatus(false, "")
	return p
}
// downloadJson retrieves the body for req through downloadFile and parses it
// with simplejson.
//
// When the request's response type is "jsonp" the body is first passed through
// util.JsonpToJson. If the download did not succeed the page is returned
// as-is. If parsing fails, the payload and the error are logged and the error
// is recorded on the page via SetStatus(true, msg). On success the payload
// and the parsed JSON are stored on the page and SetStatus(false, "") is called.
func (this *HttpDownloader) downloadJson(p *page.Page, req *request.Request) *page.Page {
	p, raw := this.downloadFile(p, req)
	if !p.IsSucc() {
		return p
	}

	payload := []byte(raw)
	if req.GetResponceType() == "jsonp" {
		payload = []byte(util.JsonpToJson(raw))
	}

	parsed, err := simplejson.NewJson(payload)
	if err != nil {
		mlog.LogInst().LogError(string(payload) + "\t" + err.Error())
		p.SetStatus(true, err.Error())
		return p
	}

	// json result
	p.SetBodyStr(string(payload)).
		SetJson(parsed).
		SetStatus(false, "")
	return p
}
// Process extracts follow-up links and the configured fields from a
// downloaded page.
//
// For an unsuccessful page it prints the page's error message and returns.
// Otherwise it:
//  1. selects elements with this.page["rule"], reads either their text or the
//     attribute named by this.page["fun"], prefixes each value with
//     this.page["pre"], and registers the results as new target requests;
//  2. evaluates every selector in this.rule, joining all matches with "|"
//     when this.num[name] is "ALL" and otherwise taking a single trimmed
//     value, then stores the value as a page field;
//  3. marks the page as skipped if any field came out empty;
//  4. prints every collected page item.
func (this *PageProcesserHtml) Process(p *page.Page) {
	if !p.IsSucc() {
		println(p.Errormsg())
		return
	}

	// pick reads a selection's text when fn is "text", otherwise the attribute
	// named fn (empty string when the attribute is absent).
	pick := func(sel *goquery.Selection, fn string) string {
		if fn == "text" {
			return sel.Text()
		}
		val, _ := sel.Attr(fn)
		return val
	}

	result := make(map[string]string, len(this.rule))
	for name := range this.rule {
		result[name] = ""
	}

	doc := p.GetHtmlParser()

	// Collect follow-up URLs.
	var links []string
	doc.Find(this.page["rule"]).Each(func(_ int, sel *goquery.Selection) {
		target := pick(sel, this.page["fun"])
		links = append(links, this.page["pre"]+target)
	})
	p.AddMyTargetRequests(links, this.conf["texttype"], "", this.conf["resqType"], this.conf["postdata"], this.conf["proxy"], this.conf["heardefile"], this.conf["cookie"])

	// Extract each configured field.
	for name, selector := range this.rule {
		fn := this.fun[name]

		switch this.num[name] {
		case "ALL":
			var parts []string
			doc.Find(selector).Each(func(_ int, sel *goquery.Selection) {
				parts = append(parts, pick(sel, fn))
			})
			result[name] = strings.Join(parts, "|")
		default:
			result[name] = strings.Trim(pick(doc.Find(selector), fn), " \t\n")
		}

		value := result[name]
		if value == "" {
			p.SetSkip(true)
		}
		p.AddField(name, value)
	}

	for key, item := range p.GetPageItems().GetAll() {
		println(key, item)
	}
}
// downloadText retrieves the body for req through downloadFile and, when the
// download succeeded, stores it on the page unchanged and calls
// SetStatus(false, ""). A page whose download failed is returned as-is.
func (this *HttpDownloader) downloadText(p *page.Page, req *request.Request) *page.Page {
	fetched, text := this.downloadFile(p, req)
	if fetched.IsSucc() {
		fetched.SetBodyStr(text).SetStatus(false, "")
	}
	return fetched
}
Beispiel #5
0
// pageProcess is the core per-request routine: it downloads the page for req,
// hands it to the page processer, queues any newly discovered requests, and
// finally feeds the collected items to every pipeline.
//
// The download is attempted up to three times, calling this.sleep() before
// each attempt; if the page is still unsuccessful afterwards nothing else is
// done. Pipelines are skipped when the processer marked the page as skipped.
//
// Any panic raised while handling the request is recovered and logged so that
// one bad page does not affect the processing of other requests.
func (this *Spider) pageProcess(req *request.Request) {
	// Total number of download attempts made before giving up on a request.
	const maxDownloadAttempts = 3

	defer func() {
		cause := recover()
		if cause == nil {
			return
		}
		// do not affect other requests: log and swallow the panic.
		switch v := cause.(type) {
		case string:
			mlog.LogInst().LogError(v)
		case error:
			// Runtime panics (nil dereference, index out of range, ...) carry
			// an error value; keep its message instead of discarding it.
			mlog.LogInst().LogError("pageProcess error: " + v.Error())
		default:
			mlog.LogInst().LogError("pageProcess error")
		}
	}()

	// download page, retrying on failure
	var p *page.Page
	for attempt := 0; attempt < maxDownloadAttempts; attempt++ {
		this.sleep()
		p = this.pDownloader.Download(req)
		if p.IsSucc() {
			break
		}
	}

	if !p.IsSucc() { // if fail do not need process
		return
	}

	this.pPageProcesser.Process(p)
	// Distinct name so the req parameter is not shadowed.
	for _, next := range p.GetTargetRequests() {
		this.AddRequest(next)
	}

	// output
	if p.GetSkip() {
		return
	}
	for _, pip := range this.pPiplelines {
		pip.Process(p.GetPageItems(), this)
	}
}