Пример #1
0
// NewSpider creates a crawler from the given options; every crawl starts
// here (original note dated 2016-01-07).
//
// The options choose which page processor, downloader, scheduler, pipelines
// and resource manager the spider uses. Ready-made implementations of each
// are provided for reference and can be found in their corresponding folders.
//
// Strace output is opened, CloseFileLog is called, the sleep type is set to
// "fixed" with a start sleep time of 0, and an empty task name falls back to
// "robot". A nil scheduler, downloader, page processor or resource manager is
// reported through log.Fatal.
func NewSpider(options SpiderOptions) *Spider {
	// Turn strace output on before anything is logged.
	mlog.StraceInst().Open()

	spider := new(Spider)
	spider.taskname = options.TaskName
	spider.pageProcesser = options.PageProcesser
	spider.downloader = options.Downloader
	spider.scheduler = options.Scheduler
	spider.pipelines = options.Pipelines
	spider.rm = options.ResourceManage

	// Initialise the file log state.
	spider.CloseFileLog()
	spider.sleeptype = "fixed"
	spider.startSleeptime = 0

	if spider.taskname == "" {
		spider.taskname = "robot"
	}

	// require reports a missing mandatory component through log.Fatal.
	require := func(missing bool, message string) {
		if missing {
			log.Fatal(message)
		}
	}
	require(spider.scheduler == nil, "Please choose the need to use the Scheduler.")
	require(spider.downloader == nil, "Please choose the need to use the Downloader")
	require(spider.pageProcesser == nil, "Please choose the need to use the PageProcesser")
	require(spider.rm == nil, "Please choose the need to use the ResourceManage")

	mlog.StraceInst().Println(spider.taskname + " " + "start")
	return spider
}
Пример #2
0
// Run starts the crawl.
//
// A feeder goroutine repeatedly polls the scheduler and hands every request
// it gets to the resource manager as a task that runs pageProcess. When the
// scheduler is empty it sleeps for 500ms and polls again, unless the crawl is
// finished: no request is available, the resource manager reported no work
// and exitWhenComplete is set. In that case the page processor's Finish hook
// runs, the spider is closed and the resource manager is freed.
//
// After launching the feeder, Run calls rm.Start().
// NOTE(review): rm.Start() presumably blocks while the pool executes tasks —
// confirm against the ResourceManage implementation.
func (self *Spider) Run() {
	// Keep feeding tasks into the crawler pool.
	go func() {
		for {
			// rm is not atomic with respect to the scheduler, so sample the
			// pool state BEFORE polling. If Poll ran first, a running task
			// could enqueue new requests and finish between the two calls,
			// making the crawl look complete while work is still queued.
			// NOTE(review): assumes Has() counts tasks the pool still holds
			// (queued or running) — confirm.
			idle := self.rm.Has() == 0
			req := self.scheduler.Poll()

			if req == nil {
				if idle && self.exitWhenComplete {
					self.pageProcesser.Finish()
					mlog.StraceInst().Println("Grab complete !!!")
					break
				}
				// Scheduler is empty for now; wait before polling again.
				time.Sleep(500 * time.Millisecond)
				continue
			}

			self.rm.AddTask(func(req *Request) {
				mlog.StraceInst().Println("start crawl : " + req.GetUrl() + " urls:" + strconv.Itoa(self.scheduler.Count()))
				self.pageProcess(req)
			}, req)
		}
		// Shut the spider down.
		self.Close()
		// Release the crawler pool.
		self.rm.Free()
	}()
	// The crawler pool starts executing tasks.
	self.rm.Start()
}
Пример #3
0
// Close replaces the spider's pipelines with an empty list, sets
// exitWhenComplete to true and writes "stop crawl" to the strace log.
func (self *Spider) Close() {
	// Resetting the scheduler and downloader is intentionally left disabled:
	//self.SetScheduler(NewQueueScheduler(false))
	//self.SetDownloader(NewHttpDownloader())
	self.pipelines = []Pipeline{}
	self.exitWhenComplete = true

	tracer := mlog.StraceInst()
	tracer.Println("stop crawl")
}
Пример #4
0
// CloseStrace closes the strace instance and returns the spider so that
// calls can be chained.
func (self *Spider) CloseStrace() *Spider {
	tracer := mlog.StraceInst()
	tracer.Close()
	return self
}
Пример #5
0
// OpenStrace opens the strace instance, which outputs progress info on the
// screen, and returns the spider so that calls can be chained.
// A Spider has strace opened by default.
func (self *Spider) OpenStrace() *Spider {
	tracer := mlog.StraceInst()
	tracer.Open()
	return self
}