Exemple #1
0
// pageProcess handles a single request end to end: it downloads the page,
// runs the page processor on it, queues any newly discovered URLs, marks the
// request as finished, and finally passes the page items to every output.
//
// The download is attempted up to three times with a one millisecond pause
// after each failed attempt. If no attempt produces a successful page, the
// request is marked as finished and nothing else is done.
func (this *Spider) pageProcess(req *page.Request) {
	const maxAttempts = 3

	// Download the page, retrying on failure.
	var p *page.Page
	for i := 0; i < maxAttempts; i++ {
		p = this.m_downLoader.DownLoad(req)
		// A downloader may return nil (for example for an unsupported
		// response type), so guard before calling methods on the page.
		if p != nil && p.IsSucc() {
			break
		}
		time.Sleep(time.Millisecond)
	}

	if p == nil || !p.IsSucc() {
		this.finishForReqProcesser(req.GetUrl())
		return
	}

	// Analyse the page content.
	this.m_pageProcesser.Process(p)

	// Queue the links discovered while processing the page. They are added
	// before the request is marked finished.
	if p.CountNewUrls() > 0 {
		for newUrl, newUrlTag := range p.GetNewUrls() {
			this.AddUrl(newUrl, "html", newUrlTag)
		}
	}

	this.finishForReqProcesser(req.GetUrl())

	// Hand the extracted items to every configured output.
	for _, out := range this.m_outputs {
		out.Process(p.GetPageItemsList(), p.GetRequest().GetUrl())
	}
}
Exemple #2
0
// Run polls the scheduler in a loop and starts one goroutine per request,
// each of which calls pageProcess.
//
// When the scheduler has no request available, Run exits if
// countHandlingUrl reports zero; otherwise it sleeps for 500ms and polls
// again, since requests still being handled may add new URLs.
//
// Run returns the receiver so calls can be chained.
func (this *Spider) Run() *Spider {
	for {
		var req *page.Request = this.m_scheduler.Poll()

		if req == nil {
			// Nothing queued: stop once no request is being handled any
			// more, otherwise wait for the next time slice and poll again.
			if this.countHandlingUrl() == 0 {
				break
			}

			time.Sleep(500 * time.Millisecond)
			continue
		}

		this.waitForReqProcesser(req.GetUrl(), req.GetUrlTag())

		// Process the page concurrently; this may add new requests. The
		// request is passed as an argument, so it is evaluated here rather
		// than captured by a closure.
		go this.pageProcess(req)
	}

	// All work is done.
	return this
}
Exemple #3
0
// AddRequest pushes req onto the scheduler and returns the receiver so calls
// can be chained.
//
// A nil request or a request with an empty URL is rejected: an error is
// logged and the scheduler is left untouched.
func (this *Spider) AddRequest(req *page.Request) *Spider {
	if req == nil {
		logs.GetFirstLogger().Error("request is nil")
		return this
	}
	if req.GetUrl() == "" {
		logs.GetFirstLogger().Error("request is empty")
		// Do not queue a request that cannot be downloaded.
		return this
	}

	this.m_scheduler.Push(req)
	return this
}
// DownLoad creates a page for req and dispatches on the request's response
// type. Only "html" is supported, which is delegated to downloadHtml.
//
// For any other response type an error is logged and the page is returned
// with its status set through SetStatus(true, msg), the same way
// downloadFile reports its errors, so callers never receive a nil page.
func (this *HttpDownLoader) DownLoad(req *page.Request) *page.Page {
	p := page.NewPage(req)
	respType := req.GetRespType()

	switch respType {
	case "html":
		return this.downloadHtml(p, req)
	default:
		msg := "error request type : " + respType
		logs.GetFirstLogger().Error(msg)
		p.SetStatus(true, msg)
	}

	return p
}
// downloadFile fetches the URL of req over HTTP and returns the page together
// with the response body as a string.
//
// A request whose method is "POST" is sent with http.Post, using the
// request's post data as a form-urlencoded body; every other method is sent
// with http.Get. On success the response headers and cookies are stored on p.
//
// If the URL is empty, the HTTP call fails, or the body cannot be read, the
// error is logged, recorded on p through SetStatus(true, msg), and an empty
// body string is returned.
//
// NOTE(review): the body bytes are converted to a string as-is; no character
// encoding conversion happens in this function.
func (this *HttpDownLoader) downloadFile(p *page.Page, req *page.Request) (*page.Page, string) {
	urlStr := req.GetUrl()
	if len(urlStr) == 0 {
		logs.GetFirstLogger().Error("url is empty")
		p.SetStatus(true, "url is empty")
		return p, ""
	}

	var (
		httpResp *http.Response
		err      error
	)
	if req.GetMethod() == "POST" {
		httpResp, err = http.Post(urlStr, "application/x-www-form-urlencoded", strings.NewReader(req.GetPostData()))
	} else {
		httpResp, err = http.Get(urlStr)
	}
	if err != nil {
		logs.GetFirstLogger().Error("http visit error :" + err.Error())
		p.SetStatus(true, err.Error())
		// httpResp is nil when err is non-nil, so it must not be used.
		return p, ""
	}
	// Register the close right after the error check so the body is released
	// on every return path below.
	defer httpResp.Body.Close()

	p.SetHeader(httpResp.Header)
	p.SetCookies(httpResp.Cookies())

	body, err := ioutil.ReadAll(httpResp.Body)
	if err != nil {
		logs.GetFirstLogger().Error("http read error :" + err.Error())
		p.SetStatus(true, err.Error())
		return p, ""
	}

	return p, string(body)
}
// Push appends req to the back of the queue unless its URL has already been
// recorded in rmKeys, in which case the call does nothing.
//
// The URL is recorded in rmKeys together with the request's URL tag. The
// whole operation runs while holding the scheduler's lock.
func (this *QueueScheduler) Push(req *page.Request) {
	// TODO
	this.lock.Lock()
	defer this.lock.Unlock()

	url := req.GetUrl()
	if _, seen := this.rmKeys[url]; seen {
		return
	}

	this.rmKeys[url] = req.GetUrlTag()
	this.queue.PushBack(req)
}