Beispiel #1
0
// NewSpider builds a Spider for the given page processer and task name.
// The Spider coordinates the other modules (downloader, pipelines, scheduler, ...).
// taskname may be the empty string; it is kept on the Spider so a Pipeline can
// record which task a crawled result belongs to.
func NewSpider(pageinst page_processer.PageProcesser, taskname string) *Spider {
	mlog.StraceInst().Open()

	sp := new(Spider)
	sp.taskname = taskname
	sp.pPageProcesser = pageinst

	// File-log setup, followed by the default run settings.
	sp.CloseFileLog()
	sp.exitWhenComplete = true
	sp.sleeptype = "fixed"
	sp.startSleeptime = 0

	// Fall back to a queue scheduler (constructed with false) when none is set.
	if sp.pScheduler == nil {
		sp.SetScheduler(scheduler.NewQueueScheduler(false))
	}

	// Fall back to the HTTP downloader when none is set.
	if sp.pDownloader == nil {
		sp.SetDownloader(downloader.NewHttpDownloader())
	}

	mlog.StraceInst().Println("** start spider **")

	// Start with an empty, non-nil pipeline list.
	sp.pPiplelines = []pipeline.Pipeline{}

	return sp
}
Beispiel #2
0
// main crawls a single Baidu Baike page with the plant page processer and
// passes "result.txt" to the file pipeline that receives the results.
func main() {
	seed := "http://baike.baidu.com/subview/412610/19548276.htm"

	// Build the spider step by step, always continuing from the value each
	// setter returns so the fluent-call semantics are kept.
	base := spider.NewSpider(plant.NewPlantProcesser(), "TaskName")
	scheduled := base.SetScheduler(scheduler.NewQueueScheduler(true))
	sp := scheduled.AddPipeline(pipeline.NewPipelineFile("result.txt"))

	sp.AddUrl(seed, "html")
	sp.Run()
}
// TestQueueScheduler exercises Push, Count and Poll on a QueueScheduler:
// first on a scheduler created with false, then on one created with true,
// where a request pushed twice is expected to be counted only once.
func TestQueueScheduler(t *testing.T) {
	first := request.NewRequest("http://baidu.com", "html", "", "GET", "", nil, nil, nil, nil)
	fmt.Printf("%v\n", first)

	// Scheduler created with false: one push -> count of one, poll returns it.
	plain := scheduler.NewQueueScheduler(false)
	plain.Push(first)

	size := plain.Count()
	if size != 1 {
		t.Error("count error")
	}
	fmt.Println(size)

	polled := plain.Poll()
	if polled == nil {
		t.Error("poll error")
	}
	fmt.Printf("%v\n", polled)

	// Scheduler created with true: the repeated request must not raise the count.
	dedup := scheduler.NewQueueScheduler(true)
	second := request.NewRequest("http://qq.com", "html", "", "GET", "", nil, nil, nil, nil)
	dedup.Push(first)
	dedup.Push(second)
	dedup.Push(first)

	size = dedup.Count()
	if size != 2 {
		t.Error("count error")
	}
	fmt.Println(size)

	// Both distinct requests must come back out of the queue.
	for i := 0; i < 2; i++ {
		polled = dedup.Poll()
		if polled == nil {
			t.Error("poll error")
		}
		fmt.Printf("%v\n", polled)
	}
}
Beispiel #4
0
// TestQueueScheduler is a smoke test: it pushes one request into a fresh
// QueueScheduler, then prints the reported count and the polled request.
// It makes no assertions.
func TestQueueScheduler(t *testing.T) {
	req := request.NewRequest("http://baidu.com", "html")

	queue := scheduler.NewQueueScheduler()
	queue.Push(req)

	size := queue.Count()
	fmt.Println(size)

	polled := queue.Poll()
	fmt.Println(polled)
}
Beispiel #5
0
// close puts the spider back on its defaults: a new queue scheduler
// (constructed with false), a new HTTP downloader, an empty pipeline list,
// and exitWhenComplete set to true.
func (sp *Spider) close() {
	sp.SetScheduler(scheduler.NewQueueScheduler(false))
	sp.SetDownloader(downloader.NewHttpDownloader())
	sp.pPiplelines = []pipeline.Pipeline{}
	sp.exitWhenComplete = true
}