コード例 #1
0
ファイル: fetcher.go プロジェクト: zhaozhi406/crawler
// initPageStore selects the PageStore implementation from config.
//
// Keys read from config:
//   - "local_dir": when non-empty, pages are stored in that directory via
//     LocalPageStore. This key takes precedence over "weedfs_master".
//   - "weedfs_master": when non-empty (and "local_dir" is empty), nil is
//     returned.
//
// When neither key is set, a warning is logged and a LocalPageStore rooted at
// "./html_pages" is returned.
//
// NOTE(review): the weedfs branch returns a nil PageStore, so callers must be
// prepared for nil; this looks like a not-yet-implemented backend — confirm.
func initPageStore(config map[string]string) PageStore {
	switch {
	case config["local_dir"] != "":
		return &LocalPageStore{dir: config["local_dir"]}
	case config["weedfs_master"] != "":
		return nil
	default:
		log.Warnln("does not specify local dir or weedfs master! save pages to ./html_pages/")
		return &LocalPageStore{dir: "./html_pages"}
	}
}
コード例 #2
0
ファイル: scheduler.go プロジェクト: zhaozhi406/crawler
// DispatchTasks distributes waiting tasks to the fetchers.
//
// It loads the waiting tasks with FetchTasks, sorts them with
// lib.CrawlTaskSorter, and then, for each fetcher in this.fetchers, collects
// the tasks that have not been assigned yet and that politeVisitor.IsPolite
// accepts for that fetcher. The collected packs are JSON-encoded and POSTed as
// the `tasks` form value to the fetcher's "push_tasks" API. Every failure is
// logged; none is returned to the caller.
func (this *Scheduler) DispatchTasks() {
	// Load the tasks that are waiting to be dispatched.
	tasks, err := this.FetchTasks()
	if err != nil {
		log.Errorln("[DispatchTasks] fetch tasks error: ", err)
		return
	}
	if len(tasks) == 0 {
		log.Warnln("[DispatchTasks] no wait tasks yet.")
		return
	}

	// Order the tasks before handing them out.
	sorter := lib.CrawlTaskSorter{Now: time.Now().Unix()}
	sorter.Sort(tasks, nil)

	// Post the tasks to the fetchers. assigned records the ids of tasks that
	// were already given to some fetcher, so a task is dispatched at most once.
	assigned := map[int32]bool{}
	httpClient := lib.HttpClient{}
	for _, fetcher := range this.fetchers {
		// Pick the tasks that are still unassigned and satisfy the politeness
		// rule for this fetcher.
		taskPacks := []types.TaskPack{}
		for _, task := range tasks {
			if assigned[task.Id] {
				continue
			}
			if !this.politeVisitor.IsPolite(task.Domain, fetcher) {
				continue
			}
			taskPacks = append(taskPacks, types.TaskPack{TaskId: task.Id, Domain: task.Domain, Urlpath: task.Urlpath})
			assigned[task.Id] = true
			// Cache the last visit time now. It is not exactly the real visit
			// time, but it keeps the implementation simple.
			this.politeVisitor.SetLastVisitTime(task.Domain, fetcher, time.Now().Unix())
		}

		payload, marshalErr := json.Marshal(taskPacks)
		if marshalErr != nil {
			log.Errorln("make task packs error: ", marshalErr)
			continue
		}

		form := url.Values{}
		form.Add("tasks", string(payload))
		response, postErr := httpClient.Post("http://"+fetcher+this.fetcherApi["push_tasks"], form)
		if postErr != nil {
			log.Errorln("post task packs to fetcher:", fetcher, ", error:", postErr, " data:", string(payload))
			continue
		}

		jsonResult := types.JsonResult{}
		if decodeErr := json.Unmarshal(response, &jsonResult); decodeErr != nil {
			log.Errorln("json unmarshal error:", decodeErr, " data:", string(response))
			continue
		}
		log.Infoln("get push tasks response: ", jsonResult)
	}
}
コード例 #3
0
ファイル: fetcher.go プロジェクト: zhaozhi406/crawler
// pushTasksHandler is the HTTP handler through which task packs are pushed to
// this fetcher.
//
// It reads the `tasks` form value, decodes it as a JSON array of
// types.TaskPack and sends the packs, in order, to this.taskQueue. Enqueueing
// is bounded to one second in total: once that deadline passes, the remaining
// packs are dropped and the handler responds.
//
// The JSON result written to w contains:
//   - ErrOk and, in Data, the leading packs that were actually enqueued;
//   - ErrDataError and a message when the `tasks` value is not valid JSON;
//   - ErrInputError and a message when `tasks` is missing or empty.
func (this *Fetcher) pushTasksHandler(w http.ResponseWriter, req *http.Request) {
	log.Debugln("get request: ", req.RemoteAddr, req.URL)
	if err := req.ParseForm(); err != nil {
		// Keep going: a failed parse leaves `tasks` empty, which is reported
		// to the client below as an input error.
		log.Warnln("parse form error: ", err)
	}

	var result = types.JsonResult{}

	tasksJson := req.Form.Get("tasks")
	if tasksJson == "" {
		msg := "missing `tasks` key or has no content in the POST request."
		log.Warnln(msg)
		result.Err = ErrInputError
		result.Msg = msg
		utils.OutputJsonResult(w, result)
		return
	}

	taskPacks := []types.TaskPack{}
	if err := json.Unmarshal([]byte(tasksJson), &taskPacks); err != nil {
		msg := "Unmarshal task packs error: " + err.Error()
		log.Errorln(msg)
		result.Err = ErrDataError
		result.Msg = msg
		utils.OutputJsonResult(w, result)
		return
	}

	// Add the tasks to the queue, spending at most one second overall.
	deadline := time.After(1 * time.Second)
	enqueued := 0
enqueue:
	for _, pack := range taskPacks {
		select {
		case this.taskQueue <- pack:
			enqueued++
		case <-deadline:
			// A bare `break` would only leave the select. The timer channel
			// delivers a single value, so continuing the loop could block on
			// the queue forever and would make taskPacks[:enqueued] no longer
			// match the packs that were really enqueued.
			break enqueue
		}
	}

	result.Err = ErrOk
	result.Data = taskPacks[:enqueued] // report the packs that made it into the queue
	utils.OutputJsonResult(w, result)
}