// Spider is the scheduler module that coordinates all the other modules,
// such as the downloader, pipeline, and scheduler.
// The taskname may be an empty string, or it can be used in a Pipeline to
// record which task crawled each result.
func NewSpider(pageinst page_processer.PageProcesser, taskname string) *Spider {
	mlog.StraceInst().Open()

	ap := &Spider{taskname: taskname, pPageProcesser: pageinst}

	// File logging is disabled by default.
	ap.CloseFileLog()

	ap.exitWhenComplete = true
	ap.sleeptype = "fixed"
	ap.startSleeptime = 0

	// Install the default scheduler and downloader if none are set.
	if ap.pScheduler == nil {
		ap.SetScheduler(scheduler.NewQueueScheduler(false))
	}

	if ap.pDownloader == nil {
		ap.SetDownloader(downloader.NewHttpDownloader())
	}

	mlog.StraceInst().Println("** start spider **")
	ap.pPiplelines = make([]pipeline.Pipeline, 0)

	return ap
}
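// A minimal usage sketch. NewMyPageProcesser is a hypothetical constructor
// for a user-defined PageProcesser; AddUrl and SetThreadnum are assumed to be
// the chainable configuration helpers defined elsewhere in this package:
//
//	sp := NewSpider(NewMyPageProcesser(), "my_task")
//	sp.AddUrl("http://example.com", "html").
//		SetThreadnum(3).
//		Run()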
func (this *Spider) Run() {
	if this.threadnum == 0 {
		this.threadnum = 1
	}
	this.mc = resource_manage.NewResourceManageChan(this.threadnum)

	for {
		req := this.pScheduler.Poll()

		// mc is not atomic: the Has/Poll check below can race with workers
		// finishing, so this exit condition is best-effort.
		if this.mc.Has() == 0 && req == nil && this.exitWhenComplete {
			mlog.StraceInst().Println("** executed callback **")
			this.pPageProcesser.Finish()
			mlog.StraceInst().Println("** end spider **")
			break
		} else if req == nil {
			// The scheduler is empty but workers are still busy; wait for
			// them to produce new requests.
			time.Sleep(500 * time.Millisecond)
			continue
		}

		// Acquire a worker slot, then fetch asynchronously; the slot is
		// released when the goroutine finishes.
		this.mc.GetOne()
		go func(req *request.Request) {
			defer this.mc.FreeOne()
			mlog.StraceInst().Println("start crawl : " + req.GetUrl())
			this.pageProcess(req)
		}(req)
	}
	this.close()
}
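// Run uses ResourceManageChan as a counting semaphore: GetOne blocks until a
// worker slot is free, and FreeOne releases the slot when the goroutine
// completes. A minimal sketch of the same pattern with a plain buffered
// channel (sem, reqs, and process below are illustrative names, not part of
// this package):
//
//	sem := make(chan struct{}, threadnum) // one slot per concurrent worker
//	for _, req := range reqs {
//		sem <- struct{}{} // acquire a slot; blocks while all workers are busy
//		go func(r *request.Request) {
//			defer func() { <-sem }() // release the slot on completion
//			process(r)
//		}(req)
//	}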
// CloseStrace closes strace, silencing progress output.
func (this *Spider) CloseStrace() *Spider {
	mlog.StraceInst().Close()
	return this
}
// OpenStrace opens strace, which prints progress info to the screen.
// A Spider's strace is open by default.
func (this *Spider) OpenStrace() *Spider {
	mlog.StraceInst().Open()
	return this
}
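// Since NewSpider opens strace by default and both methods return the
// Spider, a typical use is chaining CloseStrace to silence a long crawl and
// reopening it later (NewMyPageProcesser is hypothetical, as above):
//
//	sp := NewSpider(NewMyPageProcesser(), "quiet_task").CloseStrace()
//	// ... configure the spider ...
//	sp.OpenStrace() // re-enable progress output before Run()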