/* 2016-01-07 创建爬虫项目,一切从这个开始,首选需要你添加爬虫的各种选项参数,包括用哪种下载器,哪种调度器,哪种资源管理器,哪种pipeline,及页面处理器 当然,我们也为你准备了一系列写好的类,给你进行参考和使用,你可以到对应的文件夹中去寻找 */ func NewSpider(options SpiderOptions) *Spider { //开启日志 mlog.StraceInst().Open() sp := &Spider{ taskname: options.TaskName, pageProcesser: options.PageProcesser, downloader: options.Downloader, scheduler: options.Scheduler, pipelines: options.Pipelines, rm: options.ResourceManage, } // init filelog. sp.CloseFileLog() sp.sleeptype = "fixed" sp.startSleeptime = 0 if sp.taskname == "" { sp.taskname = "robot" } if sp.scheduler == nil { log.Fatal("Please choose the need to use the Scheduler.") } if sp.downloader == nil { log.Fatal("Please choose the need to use the Downloader") } if sp.pageProcesser == nil { log.Fatal("Please choose the need to use the PageProcesser") } if sp.rm == nil { log.Fatal("Please choose the need to use the ResourceManage") } mlog.StraceInst().Println(sp.taskname + " " + "start") return sp }
func (self *Spider) Run() { //不断向爬虫池添加任务 go func() { for { req := self.scheduler.Poll() // rm is not atomic if self.rm.Has() == 0 && req == nil && self.exitWhenComplete { self.pageProcesser.Finish() mlog.StraceInst().Println("Grab complete !!!") break } else if req == nil { time.Sleep(500 * time.Millisecond) //mlog.StraceInst().Println("scheduler is empty") continue } self.rm.AddTask(func(req *Request) { mlog.StraceInst().Println("start crawl : " + req.GetUrl() + " urls:" + strconv.Itoa(self.scheduler.Count())) self.pageProcess(req) }, req) } //关闭爬虫 self.Close() //释放爬虫池 self.rm.Free() }() //爬虫池开始执行任务 self.rm.Start() }
func (self *Spider) Close() { //self.SetScheduler(NewQueueScheduler(false)) //self.SetDownloader(NewHttpDownloader()) self.pipelines = make([]Pipeline, 0) self.exitWhenComplete = true mlog.StraceInst().Println("stop crawl") }
// The CloseStrace close strace. func (self *Spider) CloseStrace() *Spider { mlog.StraceInst().Close() return self }
// The OpenStrace open strace that output progress info on the screen. // Spider's default strace is opened. func (self *Spider) OpenStrace() *Spider { mlog.StraceInst().Open() return self }