Ejemplo n.º 1
0
// AddTasksFromRules imports crawl tasks from the rule store.
//
// It loads the rules returned by taskDao.GetWaitRules, converts each rule into
// a crawl task with taskDao.ConvertRuleToTask, inserts the tasks through
// taskDao.AddNewTasks and then passes the rules together with the insert
// results to taskDao.UpdateRules. Every failure is logged; nothing is returned
// to the caller.
func (this *Scheduler) AddTasksFromRules() {
	crawlRules, err := this.taskDao.GetWaitRules()
	if err != nil {
		// Never act on a rule list that was returned alongside an error.
		log.Errorln("get wait rules error: ", err)
		return
	}

	if len(crawlRules) == 0 {
		log.Infoln("no waiting rules yet.")
		return
	}

	// One task per rule, so the final length is known up front.
	crawlTasks := make([]types.CrawlTask, 0, len(crawlRules))
	for _, rule := range crawlRules {
		crawlTasks = append(crawlTasks, this.taskDao.ConvertRuleToTask(rule))
	}
	log.Infoln("get tasks from rules: ", len(crawlTasks))

	num, results, err := this.taskDao.AddNewTasks(crawlTasks)
	if err != nil {
		// NOTE(review): results may still describe tasks that were inserted
		// before the failure, so the rules are updated anyway, as before —
		// confirm against AddNewTasks.
		log.Errorln("add new tasks error: ", err)
	}
	log.Infoln("add new tasks: ", num)

	affectedRows, err := this.taskDao.UpdateRules(crawlRules, results)
	if err != nil {
		log.Errorln("update rules error: ", err)
	}
	log.Infoln("update rules: ", affectedRows)
}
Ejemplo n.º 2
0
// main parses the command-line flags, loads the configuration file and runs
// the server role selected with -r.
//
// Flags:
//
//	-c  path of the config file (default "./conf/cfg.ini")
//	-r  server role; "scheduler" and "fetcher" are handled, anything else is
//	    reported as unknown
func main() {
	var (
		cfgFile string
		role    string
	)

	flag.StringVar(&cfgFile, "c", "./conf/cfg.ini", "config file")
	flag.StringVar(&role, "r", "", "server role: scheduler, fetcher etc.")
	flag.Parse()

	config, err := utils.ReadConfig(cfgFile)
	if err != nil {
		// Previously a broken config made the process exit without a word.
		log.Errorln("read config ", cfgFile, " error: ", err)
		return
	}

	switch role {
	case "scheduler":
		db, err := sqlx.Connect("mysql", config["scheduler"]["dsn"])
		if err != nil {
			// db must not be used when Connect fails; calling db.Stats() on
			// it would panic.
			log.Errorln("connect db error: ", err)
			return
		}
		log.Infoln("db stats: ", db.Stats())

		// Short local names avoid shadowing the scheduler/fetcher packages.
		s := scheduler.InitScheduler(db, config["scheduler"])
		s.Run()
	case "fetcher":
		f := fetcher.InitFetcher(config["fetcher"])
		f.Run()
	default:
		fmt.Println("unknown role:", role)
	}
}
Ejemplo n.º 3
0
// test is a manual smoke test of the task DAO against the scheduler database.
//
// It connects using config["scheduler"]["dsn"], converts the waiting rules into
// tasks, inserts them, updates the rules with the insert results, lists the
// waiting tasks and finally sets those tasks to dao.TASK_FINISH. Each step is
// logged. The run stops at the first step whose result is needed by the next
// one and could not be obtained.
func test(config map[string]map[string]string) {
	db, err := sqlx.Connect("mysql", config["scheduler"]["dsn"])
	if err != nil {
		// db must not be used when Connect fails; db.Stats() would panic.
		log.Errorln("connect db error: ", err)
		return
	}
	log.Infoln("db stats: ", db.Stats())

	taskDao := dao.InitTaskDao(db)
	crawlRules, err := taskDao.GetWaitRules()
	if err != nil {
		log.Errorln("get wait rules error: ", err)
		return
	}

	crawlTasks := []types.CrawlTask{}
	for _, rule := range crawlRules {
		log.Infoln(rule.Domain, rule.Urlpath)
		crawlTasks = append(crawlTasks, taskDao.ConvertRuleToTask(rule))
	}
	log.Infoln("get task: ", len(crawlTasks))

	// The error of this step is part of the log line below, as before.
	num, results, err := taskDao.AddNewTasks(crawlTasks)
	log.Infoln("add task: ", num, results, err)

	affectedRows, err := taskDao.UpdateRules(crawlRules, results)
	if err != nil {
		log.Errorln("update rules error: ", err)
	}
	log.Infoln("update rules: ", affectedRows)

	waitingTasks, err := taskDao.GetWaitingTasks()
	if err != nil {
		log.Errorln("get waiting tasks error: ", err)
		return
	}
	for _, task := range waitingTasks {
		log.Infoln(task.Domain, task.Urlpath)
	}

	affectedRows, err = taskDao.SetTasksStatus(waitingTasks, dao.TASK_FINISH)
	if err != nil {
		log.Errorln("set task status error: ", err)
	}
	log.Infoln("set task status: ", affectedRows)
}
Ejemplo n.º 4
0
// fetchPage is the loop run by one fetch worker.
//
// For every task pack received from this.taskQueue it downloads
// Domain+Urlpath, converts the body with IconvHtml(…, "utf-8"), stores it via
// pageStore.Save and then reports the outcome to the scheduler's "report" API
// as done=1 (download succeeded) or done=0 (download failed). The loop ends
// when a receive from this.quitChan succeeds; this.wg.Done is called on the
// way out.
func (this *Fetcher) fetchPage(pageStore PageStore) {
	defer this.wg.Done()

	httpClient := lib.HttpClient{}
	for {
		select {
		case taskPack := <-this.taskQueue:
			destUrl := taskPack.Domain + taskPack.Urlpath
			// Announce the fetch before it starts, not after it returned.
			log.Debugln("goto fetch ", destUrl)

			done := 0
			html, err := httpClient.Get(destUrl)
			if err != nil {
				// The failure is reported to the scheduler below (done=0).
				log.Errorln("fetch '"+destUrl+"' failed!", err)
			} else {
				// done reflects the download only; conversion or storage
				// problems are logged but still reported as done=1, as before.
				// NOTE(review): confirm the scheduler should not retry those.
				done = 1
				log.Infoln("fetch '" + destUrl + "' done.")

				html, err = httpClient.IconvHtml(html, "utf-8")
				if err != nil {
					// Do not store the result of a failed conversion.
					log.Errorln("fetcher iconv ", taskPack.Domain, taskPack.Urlpath, " error:", err)
				} else if err = pageStore.Save(taskPack.Domain, taskPack.Urlpath, string(html)); err != nil {
					log.Errorln("fetcher save ", taskPack.Domain, taskPack.Urlpath, " error:", err)
				}
			}

			// Report the task's completion status to the scheduler.
			reportUrl := fmt.Sprintf("http://%s%s?task_id=%d&done=%d", this.scheduler_addr, this.scheduler_api["report"], taskPack.TaskId, done)
			res, err := httpClient.Get(reportUrl)
			if err != nil {
				log.Errorln("report ", reportUrl, " failed!", err)
				continue
			}
			result := types.JsonResult{}
			err = json.Unmarshal(res, &result)
			if err != nil || result.Err != 0 {
				log.Errorln("report ", reportUrl, ", get error response: ", string(res))
			}
		case <-this.quitChan:
			// this.quitChan should be closed somewhere to stop the workers.
			log.Infoln("quit fetch page...")
			return
		}
	}
}
Ejemplo n.º 5
0
// DispatchTasks distributes the waiting tasks among the fetchers.
//
// The tasks returned by this.FetchTasks are sorted with lib.CrawlTaskSorter.
// Then, for each fetcher in this.fetchers, every task that has not been given
// to an earlier fetcher and that this.politeVisitor considers polite for that
// fetcher is packed and POSTed as JSON (form field "tasks") to the fetcher's
// "push_tasks" API. Failures are logged; nothing is returned.
func (this *Scheduler) DispatchTasks() {
	// Load the waiting tasks.
	tasks, err := this.FetchTasks()
	if err != nil {
		log.Errorln("[DispatchTasks] fetch tasks error: ", err)
		return
	}
	if len(tasks) == 0 {
		log.Warnln("[DispatchTasks] no wait tasks yet.")
		return
	}

	// Sort them.
	sorter := lib.CrawlTaskSorter{Now: time.Now().Unix()}
	sorter.Sort(tasks, nil)

	// Post them to the fetchers.
	picked := map[int32]bool{}
	httpClient := lib.HttpClient{}
	for _, fetcher := range this.fetchers {
		taskPacks := []types.TaskPack{}
		// Pick the tasks that are still unassigned and satisfy the politeness
		// rule for this fetcher.
		for _, task := range tasks {
			if picked[task.Id] || !this.politeVisitor.IsPolite(task.Domain, fetcher) {
				continue
			}
			taskPacks = append(taskPacks, types.TaskPack{TaskId: task.Id, Domain: task.Domain, Urlpath: task.Urlpath})
			picked[task.Id] = true
			// Cache the last visit time; it is not exact, but simple to do.
			this.politeVisitor.SetLastVisitTime(task.Domain, fetcher, time.Now().Unix())
		}

		if len(taskPacks) == 0 {
			// Nothing for this fetcher: do not spend a request on an empty list.
			log.Debugln("[DispatchTasks] no tasks for fetcher:", fetcher)
			continue
		}

		jsonBytes, err := json.Marshal(taskPacks)
		if err != nil {
			log.Errorln("make task packs error: ", err)
			continue
		}

		param := url.Values{}
		param.Add("tasks", string(jsonBytes))
		result, err := httpClient.Post("http://"+fetcher+this.fetcherApi["push_tasks"], param)
		if err != nil {
			log.Errorln("post task packs to fetcher:", fetcher, ", error:", err, " data:", string(jsonBytes))
			continue
		}

		jsonResult := types.JsonResult{}
		if err = json.Unmarshal(result, &jsonResult); err != nil {
			log.Errorln("json unmarshal error:", err, " data:", string(result))
			continue
		}
		if jsonResult.Err != 0 {
			// A well-formed response can still carry an application error.
			log.Errorln("push tasks to fetcher:", fetcher, ", get error response: ", string(result))
			continue
		}
		log.Infoln("get push tasks response: ", jsonResult)
	}
}
Ejemplo n.º 6
0
// Run starts the Fetcher and blocks until all fetch workers have returned.
//
// It launches the API server (httpService) in its own goroutine, starts
// this.nWorkers goroutines running fetchPage, and hands
// utils.HandleQuitSignal a callback that closes this.quitChan.
func (this *Fetcher) Run() {
	// Start the API server.
	go this.httpService()

	// Register each worker with the wait group before spawning it.
	for remaining := this.nWorkers; remaining > 0; remaining-- {
		this.wg.Add(1)
		go this.fetchPage(this.pageStore)
	}
	log.Infoln("start ", this.nWorkers, " fetch workers...")

	// Closing quitChan is what tells the workers to leave their loop.
	onQuit := func() {
		close(this.quitChan)
	}
	go utils.HandleQuitSignal(onQuit)

	this.wg.Wait()
}