//分发task给Fetcher func (this *Scheduler) DispatchTasks() { //获取等待任务 tasks, err := this.FetchTasks() if err != nil { log.Errorln("[DispatchTasks] fetch tasks error: ", err) return } if len(tasks) == 0 { log.Warnln("[DispatchTasks] no wait tasks yet.") return } //排序 sorter := lib.CrawlTaskSorter{Now: time.Now().Unix()} sorter.Sort(tasks, nil) //post到fetchers picked := map[int32]bool{} httpClient := lib.HttpClient{} for _, fetcher := range this.fetchers { taskPacks := []types.TaskPack{} //挑选未分配的,且符合礼貌原则的任务 for _, task := range tasks { _, ok := picked[task.Id] if !ok && this.politeVisitor.IsPolite(task.Domain, fetcher) { taskPacks = append(taskPacks, types.TaskPack{TaskId: task.Id, Domain: task.Domain, Urlpath: task.Urlpath}) picked[task.Id] = true //缓存最后访问时间,实际有误差,但是实现简单 this.politeVisitor.SetLastVisitTime(task.Domain, fetcher, time.Now().Unix()) } } jsonBytes, err := json.Marshal(taskPacks) if err != nil { log.Errorln("make task packs error: ", err) } else { param := url.Values{} param.Add("tasks", string(jsonBytes)) result, err := httpClient.Post("http://"+fetcher+this.fetcherApi["push_tasks"], param) if err != nil { log.Errorln("post task packs to fetcher:", fetcher, ", error:", err, " data:", string(jsonBytes)) } else { jsonResult := types.JsonResult{} err = json.Unmarshal(result, &jsonResult) if err != nil { log.Errorln("json unmarshal error:", err, " data:", string(result)) } else { log.Infoln("get push tasks response: ", jsonResult) } } } } }
func (this *Fetcher) fetchPage(pageStore PageStore) { defer this.wg.Done() httpClient := lib.HttpClient{} loop: for { select { case taskPack := <-this.taskQueue: destUrl := taskPack.Domain + taskPack.Urlpath html, err := httpClient.Get(destUrl) log.Debugln("goto fetch ", destUrl) done := 0 if err == nil { //report success to scheduler, make a log, save html html, err = httpClient.IconvHtml(html, "utf-8") done = 1 log.Infoln("fetch '" + destUrl + "' done.") err = pageStore.Save(taskPack.Domain, taskPack.Urlpath, string(html)) if err != nil { log.Errorln("fetcher save ", taskPack.Domain, taskPack.Urlpath, " error:", err) } } else { //report fail to scheduler log.Errorln("fetch '"+destUrl+"' failed!", err) } //向scheduler报告任务完成情况 reportUrl := fmt.Sprintf("http://%s%s?task_id=%d&done=%d", this.scheduler_addr, this.scheduler_api["report"], taskPack.TaskId, done) res, err := httpClient.Get(reportUrl) if err != nil { log.Errorln("report ", reportUrl, " failed!") } else { result := types.JsonResult{} err = json.Unmarshal(res, &result) if err != nil || result.Err != 0 { log.Errorln("report ", reportUrl, ", get error response: ", string(res)) } } case <-this.quitChan: //this.quitChan should be closed somewhere log.Infoln("quit fetch page...") break loop } } }