// Spider is scheduler module for all the other modules, like downloader, pipeline, scheduler and etc. // The taskname could be empty string too, or it can be used in Pipeline for record the result crawled by which task; func NewSpider(pageinst page_processer.PageProcesser, taskname string) *Spider { mlog.StraceInst().Open() ap := &Spider{taskname: taskname, pPageProcesser: pageinst} // init filelog. ap.CloseFileLog() ap.exitWhenComplete = true ap.sleeptype = "fixed" ap.startSleeptime = 0 // init spider if ap.pScheduler == nil { ap.SetScheduler(scheduler.NewQueueScheduler(false)) } if ap.pDownloader == nil { ap.SetDownloader(downloader.NewHttpDownloader()) } mlog.StraceInst().Println("** start spider **") ap.pPiplelines = make([]pipeline.Pipeline, 0) return ap }
func main() { var startUrl = "http://baike.baidu.com/subview/412610/19548276.htm" sp := spider.NewSpider(plant.NewPlantProcesser(), "TaskName"). SetScheduler(scheduler.NewQueueScheduler(true)). AddPipeline(pipeline.NewPipelineFile("result.txt")) sp.AddUrl(startUrl, "html") sp.Run() }
func TestQueueScheduler(t *testing.T) { var r *request.Request r = request.NewRequest("http://baidu.com", "html", "", "GET", "", nil, nil, nil, nil) fmt.Printf("%v\n", r) var s *scheduler.QueueScheduler s = scheduler.NewQueueScheduler(false) s.Push(r) var count int = s.Count() if count != 1 { t.Error("count error") } fmt.Println(count) var r1 *request.Request r1 = s.Poll() if r1 == nil { t.Error("poll error") } fmt.Printf("%v\n", r1) // remove duplicate s = scheduler.NewQueueScheduler(true) r2 := request.NewRequest("http://qq.com", "html", "", "GET", "", nil, nil, nil, nil) s.Push(r) s.Push(r2) s.Push(r) count = s.Count() if count != 2 { t.Error("count error") } fmt.Println(count) r1 = s.Poll() if r1 == nil { t.Error("poll error") } fmt.Printf("%v\n", r1) r1 = s.Poll() if r1 == nil { t.Error("poll error") } fmt.Printf("%v\n", r1) }
func TestQueueScheduler(t *testing.T) { var r *request.Request r = request.NewRequest("http://baidu.com", "html") var s *scheduler.QueueScheduler s = scheduler.NewQueueScheduler() s.Push(r) var count int = s.Count() fmt.Println(count) var r1 *request.Request r1 = s.Poll() fmt.Println(r1) }
func (this *Spider) close() { this.SetScheduler(scheduler.NewQueueScheduler(false)) this.SetDownloader(downloader.NewHttpDownloader()) this.pPiplelines = make([]pipeline.Pipeline, 0) this.exitWhenComplete = true }