func initPageStore(config map[string]string) PageStore { localDir := config["local_dir"] weedfsMaster := config["weedfs_master"] if localDir != "" { return &LocalPageStore{dir: localDir} } else if weedfsMaster != "" { return nil } log.Warnln("does not specify local dir or weedfs master! save pages to ./html_pages/") return &LocalPageStore{dir: "./html_pages"} }
//分发task给Fetcher func (this *Scheduler) DispatchTasks() { //获取等待任务 tasks, err := this.FetchTasks() if err != nil { log.Errorln("[DispatchTasks] fetch tasks error: ", err) return } if len(tasks) == 0 { log.Warnln("[DispatchTasks] no wait tasks yet.") return } //排序 sorter := lib.CrawlTaskSorter{Now: time.Now().Unix()} sorter.Sort(tasks, nil) //post到fetchers picked := map[int32]bool{} httpClient := lib.HttpClient{} for _, fetcher := range this.fetchers { taskPacks := []types.TaskPack{} //挑选未分配的,且符合礼貌原则的任务 for _, task := range tasks { _, ok := picked[task.Id] if !ok && this.politeVisitor.IsPolite(task.Domain, fetcher) { taskPacks = append(taskPacks, types.TaskPack{TaskId: task.Id, Domain: task.Domain, Urlpath: task.Urlpath}) picked[task.Id] = true //缓存最后访问时间,实际有误差,但是实现简单 this.politeVisitor.SetLastVisitTime(task.Domain, fetcher, time.Now().Unix()) } } jsonBytes, err := json.Marshal(taskPacks) if err != nil { log.Errorln("make task packs error: ", err) } else { param := url.Values{} param.Add("tasks", string(jsonBytes)) result, err := httpClient.Post("http://"+fetcher+this.fetcherApi["push_tasks"], param) if err != nil { log.Errorln("post task packs to fetcher:", fetcher, ", error:", err, " data:", string(jsonBytes)) } else { jsonResult := types.JsonResult{} err = json.Unmarshal(result, &jsonResult) if err != nil { log.Errorln("json unmarshal error:", err, " data:", string(result)) } else { log.Infoln("get push tasks response: ", jsonResult) } } } } }
func (this *Fetcher) pushTasksHandler(w http.ResponseWriter, req *http.Request) { log.Debugln("get request: ", req.RemoteAddr, req.URL) req.ParseForm() tasksJson := req.Form.Get("tasks") taskPacks := []types.TaskPack{} var err error = nil var result = types.JsonResult{} if tasksJson != "" { err = json.Unmarshal([]byte(tasksJson), &taskPacks) if err != nil { msg := "Unmarshal task packs error: " + err.Error() log.Errorln(msg) result.Err = ErrDataError result.Msg = msg } else { //添加任务到队列 //最多只允许执行1秒钟 timerChan := time.After(1 * time.Second) cnt := 0 for _, pack := range taskPacks { select { case this.taskQueue <- pack: cnt++ case <-timerChan: break } } result.Err = ErrOk result.Data = taskPacks[:cnt] //将成功进入队列的任务返回 } } else { msg := "missing `tasks` key or has no content in the POST request." log.Warnln(msg) result.Err = ErrInputError result.Msg = msg } utils.OutputJsonResult(w, result) }