// AddUrlsEx queues one GET request per entry in urls. Each request is created
// with respType, then passed through AddHeaderFile(headerFile) and
// AddProxyHost(proxyHost) before being handed to AddRequest.
// It returns the receiver.
func (this *Spider) AddUrlsEx(urls []string, respType string, headerFile string, proxyHost string) *Spider {
	for i := range urls {
		base := request.NewRequest(urls[i], respType, "", "GET", "", nil, nil, nil, nil)
		withHeaders := base.AddHeaderFile(headerFile)
		this.AddRequest(withHeaders.AddProxyHost(proxyHost))
	}
	return this
}
// AddUrls queues one GET request with the given respType for every entry in
// urls by calling AddRequest, and returns the receiver.
func (this *Spider) AddUrls(urls []string, respType string) *Spider {
	for i := range urls {
		this.AddRequest(request.NewRequest(urls[i], respType, "", "GET", "", nil, nil, nil, nil))
	}
	return this
}
// TestQueueScheduler exercises Push, Count and Poll on a QueueScheduler:
// first on one built with NewQueueScheduler(false) holding a single request,
// then on one built with NewQueueScheduler(true) where the same request is
// pushed twice alongside a second one and a count of 2 is expected.
func TestQueueScheduler(t *testing.T) {
	first := request.NewRequest("http://baidu.com", "html", "", "GET", "", nil, nil, nil, nil)
	fmt.Printf("%v\n", first)

	sched := scheduler.NewQueueScheduler(false)
	sched.Push(first)

	count := sched.Count()
	if count != 1 {
		t.Error("count error")
	}
	fmt.Println(count)

	polled := sched.Poll()
	if polled == nil {
		t.Error("poll error")
	}
	fmt.Printf("%v\n", polled)

	// Second scenario: scheduler created with the flag set to true; the first
	// request is pushed twice and only two queued entries are expected.
	sched = scheduler.NewQueueScheduler(true)

	second := request.NewRequest("http://qq.com", "html", "", "GET", "", nil, nil, nil, nil)
	for _, req := range []*request.Request{first, second, first} {
		sched.Push(req)
	}
	if count = sched.Count(); count != 2 {
		t.Error("count error")
	}
	fmt.Println(count)

	// Both queued entries must come back out non-nil.
	for i := 0; i < 2; i++ {
		polled = sched.Poll()
		if polled == nil {
			t.Error("poll error")
		}
		fmt.Printf("%v\n", polled)
	}
}
// GetAll queues a GET request for every entry in urls, attaches a collecting
// pipeline, runs the spider, and returns what that pipeline collected.
func (this *Spider) GetAll(urls []string, respType string) []*page_items.PageItems {
	for i := range urls {
		this.AddRequest(request.NewRequest(urls[i], respType, "", "GET", "", nil, nil, nil, nil))
	}

	// The pipeline is registered after the requests are queued and before Run.
	collector := pipeline.NewCollectPipelinePageItems()
	this.AddPipeline(collector)
	this.Run()

	collected := collector.GetCollected()
	return collected
}
Exemple #5
0
// AddMyTargetRequest is a custom helper that builds a request from url,
// respType, urltag, resqType and postdata and appends it to the page's
// targetRequests. It returns the receiver.
//
// resqType is passed in the argument position where the other helpers in this
// file pass "GET". A non-empty heardefile is forwarded to AddHeaderFile. A
// non-empty proxy acts only as a switch: the proxy host comes from
// util.GetIp(), not from the proxy argument itself.
//
// NOTE(review): cookie is only printed when non-empty (same as
// Spider.AddMyUrl); it is never attached to the request here.
func (this *Page) AddMyTargetRequest(url, respType, urltag, resqType, postdata, proxy, heardefile, cookie string) *Page {
	req := request.NewRequest(url, respType, urltag, resqType, postdata, nil, nil, nil, nil)

	// The original check was inverted (cookie == ""), so it could only ever
	// print an empty string; print the cookie when one is actually supplied.
	if cookie != "" {
		print(cookie)
	}
	if heardefile != "" {
		req.AddHeaderFile(heardefile)
	}
	if proxy != "" {
		ip := util.GetIp()
		req.AddProxyHost(ip)
	}
	this.targetRequests = append(this.targetRequests, req)
	return this
}
// AddMyUrl builds a request from url, respType, urltag, resqType and postdata
// and queues it with AddRequest. It returns the receiver.
//
// A non-empty cookie is printed with the builtin print. A non-empty
// heardefile is forwarded to AddHeaderFile. A non-empty proxy acts only as a
// switch: the proxy host comes from util.GetIp(), not from the proxy argument.
func (this *Spider) AddMyUrl(url, respType, urltag, resqType, postdata, proxy, heardefile, cookie string) *Spider {
	newReq := request.NewRequest(url, respType, urltag, resqType, postdata, nil, nil, nil, nil)

	if len(cookie) > 0 {
		print(cookie)
	}
	if len(heardefile) > 0 {
		newReq.AddHeaderFile(heardefile)
	}
	if len(proxy) > 0 {
		newReq.AddProxyHost(util.GetIp())
	}

	this.AddRequest(newReq)
	return this
}
// TestDownloadHtml downloads a live HTML page through the HTTP downloader,
// obtains its goquery document and fails if no "body" element is found.
func TestDownloadHtml(t *testing.T) {
	// Alternative JSON endpoint kept for reference:
	// request.NewRequest("http://live.sina.com.cn/zt/api/l/get/finance/globalnews1/index.htm?format=json&callback=t13975294&id=23521&pagesize=45&dire=f&dpc=1")
	req := request.NewRequest("http://live.sina.com.cn/zt/l/v/finance/globalnews1/", "html", "", "GET", "", nil, nil, nil, nil)

	// Keep the variable typed as the Downloader interface, as before.
	var dl downloader.Downloader = downloader.NewHttpDownloader()
	pg := dl.Download(req)

	doc := pg.GetHtmlParser()
	// Debug helpers, disabled:
	// fmt.Println(doc)
	// body := pg.GetBodyStr()
	// fmt.Println(body)

	if bodySel := doc.Find("body"); bodySel.Length() < 1 {
		t.Error("html parse failed!")
	}

	// Disabled experiment that fetches the same page directly with goquery
	// and net/http:
	//
	//   doc, err := goquery.NewDocument("http://live.sina.com.cn/zt/l/v/finance/globalnews1/")
	//   if err != nil {
	//       fmt.Printf("%v",err)
	//   }
	//   s := doc.Find("meta");
	//   fmt.Println(s.Length())
	//
	//   resp, err := http.Get("http://live.sina.com.cn/zt/l/v/finance/globalnews1/")
	//   if err != nil {
	//       fmt.Printf("%v",err)
	//   }
	//   defer resp.Body.Close()
	//   doc, err = goquery.NewDocumentFromReader(resp.Body)
	//   s = doc.Find("meta");
	//   fmt.Println(s.Length())
}
// TestCharSetChange downloads an HTML page through the HTTP downloader and
// prints the body string returned by the page. It makes no assertions.
func TestCharSetChange(t *testing.T) {
	// Alternative JSONP endpoint kept for reference:
	// request.NewRequest("http://stock.finance.sina.com.cn/usstock/api/jsonp.php/t/US_CategoryService.getList?page=1&num=60", "jsonp")
	req := request.NewRequest("http://soft.chinabyte.com/416/13164916.shtml", "html", "", "GET", "", nil, nil, nil, nil)

	// Keep the variable typed as the Downloader interface, as before.
	var dl downloader.Downloader = downloader.NewHttpDownloader()
	pg := dl.Download(req)

	// Debug helpers, disabled:
	// hp := pg.GetHtmlParser()
	// fmt.Printf("%v", jsonMap)
	// fmt.Println(doc)

	// GetBodyStr is deliberately called twice, exactly as the test always has;
	// the first result is discarded.
	pg.GetBodyStr()
	fmt.Println(pg.GetBodyStr())
}
// TestDownloadJson downloads a JSON endpoint through the HTTP downloader and
// prints the value returned by the page's GetJson. It makes no assertions.
func TestDownloadJson(t *testing.T) {
	req := request.NewRequest("http://live.sina.com.cn/zt/api/l/get/finance/globalnews1/index.htm?format=json&id=23521&pagesize=4&dire=f&dpc=1", "json", "", "GET", "", nil, nil, nil, nil)

	// Keep the variable typed as the Downloader interface, as before.
	var dl downloader.Downloader = downloader.NewHttpDownloader()
	pg := dl.Download(req)

	var parsed interface{} = pg.GetJson()
	fmt.Printf("%v", parsed)

	// Debug helpers, disabled:
	// fmt.Println(doc)
	// body := pg.GetBodyStr()
	// fmt.Println(body)
}
Exemple #10
0
// AddTargetRequest appends a new GET request for url, with the given
// respType, to the page's targetRequests and returns the receiver.
func (this *Page) AddTargetRequest(url string, respType string) *Page {
	req := request.NewRequest(url, respType, "", "GET", "", nil, nil, nil, nil)
	this.targetRequests = append(this.targetRequests, req)
	return this
}
Exemple #11
0
// AddUrl queues a single GET request for url, with the given respType, by
// calling AddRequest, and returns the receiver.
func (this *Spider) AddUrl(url string, respType string) *Spider {
	this.AddRequest(request.NewRequest(url, respType, "", "GET", "", nil, nil, nil, nil))
	return this
}
Exemple #12
0
// Get builds a GET request for url with the given respType and returns the
// result of GetByRequest for it.
func (this *Spider) Get(url string, respType string) *page_items.PageItems {
	return this.GetByRequest(request.NewRequest(url, respType, "", "GET", "", nil, nil, nil, nil))
}