Beispiel #1
0
// New builds a Spider for the given task name and applies its defaults.
//
// It opens the strace logger, calls CloseFileLog on the new spider, sets
// exitWhenComplete to true, sleeptype to "fixed" and startSleeptime to 0,
// installs a queue scheduler and an HTTP downloader when none is set, and
// starts with an empty (non-nil) pipeline list.
func New(taskname string) *Spider {
	mlog.StraceInst().Open()

	sp := new(Spider)
	sp.taskname = taskname

	// Logging and run-time defaults.
	sp.CloseFileLog()
	sp.exitWhenComplete = true
	sp.sleeptype = "fixed"
	sp.startSleeptime = 0

	// Fall back to the stock scheduler/downloader when nothing is configured.
	if sp.pScheduler == nil {
		sp.SetScheduler(scheduler.NewQueueScheduler(false))
	}
	if sp.pDownloader == nil {
		sp.SetDownloader(downloader.NewHttpDownloader())
	}

	mlog.StraceInst().Println("** start spider **")
	sp.pPiplelines = []pipeline.Pipeline{}

	return sp
}
Beispiel #2
0
// Run executes the crawl loop and blocks until it ends, then calls close.
//
// A threadnum of 0 is treated as 1. Each iteration polls the scheduler for
// the next request:
//   - if no request is available, mc reports nothing in use and
//     exitWhenComplete is set, the page processer's Finish callback is
//     invoked and the loop ends;
//   - if no request is available otherwise, the loop sleeps 500ms and polls
//     again;
//   - otherwise one slot is taken from mc and the request is handled by
//     pageProcess in its own goroutine, which releases the slot when done.
func (this *Spider) Run() {
	if this.threadnum == 0 {
		this.threadnum = 1
	}
	// NOTE(review): mc presumably caps the number of in-flight workers at
	// threadnum — confirm against resource_manage.
	this.mc = resource_manage.NewResourceManageChan(this.threadnum)

	for {
		req := this.pScheduler.Poll()

		// The Poll result and mc.Has() are not read atomically, so this
		// completion check is best-effort (kept from the original note).
		if this.mc.Has() == 0 && req == nil && this.exitWhenComplete {
			mlog.StraceInst().Println("** executed callback **")
			this.pPageProcesser.Finish()
			mlog.StraceInst().Println("** end spider **")
			break
		}
		if req == nil {
			// Nothing queued right now; back off before polling again.
			time.Sleep(500 * time.Millisecond)
			continue
		}

		this.mc.GetOne()

		// Fetch asynchronously; the slot is released when the worker returns.
		go func(req *request.Request) {
			defer this.mc.FreeOne()
			mlog.StraceInst().Println("start crawl : " + req.GetUrl())
			this.pageProcess(req)
		}(req)
	}
	this.close()
}
Beispiel #3
0
// CloseStrace closes the strace logger and returns the spider so that
// configuration calls can be chained.
func (this *Spider) CloseStrace() *Spider {
	strace := mlog.StraceInst()
	strace.Close()
	return this
}
Beispiel #4
0
// OpenStrace opens the strace logger, which outputs progress information on
// the screen, and returns the spider so that configuration calls can be
// chained. The strace logger is open by default for a Spider.
func (this *Spider) OpenStrace() *Spider {
	strace := mlog.StraceInst()
	strace.Open()
	return this
}