/* 从规则库导入任务 */ func (this *Scheduler) AddTasksFromRules() { crawlRules, err := this.taskDao.GetWaitRules() nRules := len(crawlRules) if err != nil { log.Errorln("get wait rules error: ", err) } if nRules > 0 { crawlTasks := []types.CrawlTask{} for _, rule := range crawlRules { crawlTasks = append(crawlTasks, this.taskDao.ConvertRuleToTask(rule)) } log.Infoln("get tasks from rules: ", len(crawlTasks)) num, results, _ := this.taskDao.AddNewTasks(crawlTasks) log.Infoln("add new tasks: ", num) affectedRows, _ := this.taskDao.UpdateRules(crawlRules, results) log.Infoln("update rules: ", affectedRows) } else { log.Infoln("no waiting rules yet.") } }
func main() { var ( cfgFile string role string ) flag.StringVar(&cfgFile, "c", "./conf/cfg.ini", "config file") flag.StringVar(&role, "r", "", "server role: scheduler, fetcher etc.") flag.Parse() config, err := utils.ReadConfig(cfgFile) if err == nil { if role == "scheduler" { db, _ := sqlx.Connect("mysql", config["scheduler"]["dsn"]) log.Infoln("db stats: ", db.Stats()) scheduler := scheduler.InitScheduler(db, config["scheduler"]) scheduler.Run() } else if role == "fetcher" { fetcher := fetcher.InitFetcher(config["fetcher"]) fetcher.Run() } else { fmt.Println("unknown role:", role) } } }
func test(config map[string]map[string]string) { db, _ := sqlx.Connect("mysql", config["scheduler"]["dsn"]) log.Infoln("db stats: ", db.Stats()) taskDao := dao.InitTaskDao(db) crawlRules, _ := taskDao.GetWaitRules() crawlTasks := []types.CrawlTask{} for _, rule := range crawlRules { log.Infoln(rule.Domain, rule.Urlpath) crawlTasks = append(crawlTasks, taskDao.ConvertRuleToTask(rule)) } log.Infoln("get task: ", len(crawlTasks)) num, results, err := taskDao.AddNewTasks(crawlTasks) log.Infoln("add task: ", num, results, err) affectedRows, _ := taskDao.UpdateRules(crawlRules, results) log.Infoln("update rules: ", affectedRows) waitingTasks, _ := taskDao.GetWaitingTasks() for _, task := range waitingTasks { log.Infoln(task.Domain, task.Urlpath) } affectedRows, _ = taskDao.SetTasksStatus(waitingTasks, dao.TASK_FINISH) log.Infoln("set task status: ", affectedRows) }
func (this *Fetcher) fetchPage(pageStore PageStore) { defer this.wg.Done() httpClient := lib.HttpClient{} loop: for { select { case taskPack := <-this.taskQueue: destUrl := taskPack.Domain + taskPack.Urlpath html, err := httpClient.Get(destUrl) log.Debugln("goto fetch ", destUrl) done := 0 if err == nil { //report success to scheduler, make a log, save html html, err = httpClient.IconvHtml(html, "utf-8") done = 1 log.Infoln("fetch '" + destUrl + "' done.") err = pageStore.Save(taskPack.Domain, taskPack.Urlpath, string(html)) if err != nil { log.Errorln("fetcher save ", taskPack.Domain, taskPack.Urlpath, " error:", err) } } else { //report fail to scheduler log.Errorln("fetch '"+destUrl+"' failed!", err) } //向scheduler报告任务完成情况 reportUrl := fmt.Sprintf("http://%s%s?task_id=%d&done=%d", this.scheduler_addr, this.scheduler_api["report"], taskPack.TaskId, done) res, err := httpClient.Get(reportUrl) if err != nil { log.Errorln("report ", reportUrl, " failed!") } else { result := types.JsonResult{} err = json.Unmarshal(res, &result) if err != nil || result.Err != 0 { log.Errorln("report ", reportUrl, ", get error response: ", string(res)) } } case <-this.quitChan: //this.quitChan should be closed somewhere log.Infoln("quit fetch page...") break loop } } }
//分发task给Fetcher func (this *Scheduler) DispatchTasks() { //获取等待任务 tasks, err := this.FetchTasks() if err != nil { log.Errorln("[DispatchTasks] fetch tasks error: ", err) return } if len(tasks) == 0 { log.Warnln("[DispatchTasks] no wait tasks yet.") return } //排序 sorter := lib.CrawlTaskSorter{Now: time.Now().Unix()} sorter.Sort(tasks, nil) //post到fetchers picked := map[int32]bool{} httpClient := lib.HttpClient{} for _, fetcher := range this.fetchers { taskPacks := []types.TaskPack{} //挑选未分配的,且符合礼貌原则的任务 for _, task := range tasks { _, ok := picked[task.Id] if !ok && this.politeVisitor.IsPolite(task.Domain, fetcher) { taskPacks = append(taskPacks, types.TaskPack{TaskId: task.Id, Domain: task.Domain, Urlpath: task.Urlpath}) picked[task.Id] = true //缓存最后访问时间,实际有误差,但是实现简单 this.politeVisitor.SetLastVisitTime(task.Domain, fetcher, time.Now().Unix()) } } jsonBytes, err := json.Marshal(taskPacks) if err != nil { log.Errorln("make task packs error: ", err) } else { param := url.Values{} param.Add("tasks", string(jsonBytes)) result, err := httpClient.Post("http://"+fetcher+this.fetcherApi["push_tasks"], param) if err != nil { log.Errorln("post task packs to fetcher:", fetcher, ", error:", err, " data:", string(jsonBytes)) } else { jsonResult := types.JsonResult{} err = json.Unmarshal(result, &jsonResult) if err != nil { log.Errorln("json unmarshal error:", err, " data:", string(result)) } else { log.Infoln("get push tasks response: ", jsonResult) } } } } }
//启动Fetcher func (this *Fetcher) Run() { //启动api server go this.httpService() for i := 0; i < this.nWorkers; i++ { this.wg.Add(1) go this.fetchPage(this.pageStore) } log.Infoln("start ", this.nWorkers, " fetch workers...") go utils.HandleQuitSignal(func() { close(this.quitChan) }) this.wg.Wait() }