/* 设置任务状态 */ func (this *TaskDao) SetTasksStatus(tasks []types.CrawlTask, status TaskStatus) (int64, error) { nTasks := len(tasks) var affectedRows int64 = 0 var err = ErrNoTasks if nTasks > 0 { taskIds := []int32{} var result sql.Result var args []interface{} var sqlStr string var now = time.Now() for _, task := range tasks { taskIds = append(taskIds, task.Id) } if status == TASK_FINISH { sqlStr = fmt.Sprintf("update %s set status=%d, last_crawl_time=%d, crawl_times=crawl_times+1, update_time='%s' where id in (?)", TaskTable, status, now.Unix(), now.Format("2006-01-02 15:04:05")) } else { sqlStr = fmt.Sprintf("update %s set status=%d, update_time='%s' where id in (?)", TaskTable, status, now.Format("2006-01-02 15:04:05")) } sqlStr, args, err = sqlx.In(sqlStr, taskIds) if err != nil { log.Errorln("build sql to set task status failed! sql is ", sqlStr) } else { result, err = this.db.Exec(sqlStr, args...) if err != nil { log.Errorln("update tasks status error: ", err) } } affectedRows, _ = result.RowsAffected() } return affectedRows, err }
//分发task给Fetcher func (this *Scheduler) DispatchTasks() { //获取等待任务 tasks, err := this.FetchTasks() if err != nil { log.Errorln("[DispatchTasks] fetch tasks error: ", err) return } if len(tasks) == 0 { log.Warnln("[DispatchTasks] no wait tasks yet.") return } //排序 sorter := lib.CrawlTaskSorter{Now: time.Now().Unix()} sorter.Sort(tasks, nil) //post到fetchers picked := map[int32]bool{} httpClient := lib.HttpClient{} for _, fetcher := range this.fetchers { taskPacks := []types.TaskPack{} //挑选未分配的,且符合礼貌原则的任务 for _, task := range tasks { _, ok := picked[task.Id] if !ok && this.politeVisitor.IsPolite(task.Domain, fetcher) { taskPacks = append(taskPacks, types.TaskPack{TaskId: task.Id, Domain: task.Domain, Urlpath: task.Urlpath}) picked[task.Id] = true //缓存最后访问时间,实际有误差,但是实现简单 this.politeVisitor.SetLastVisitTime(task.Domain, fetcher, time.Now().Unix()) } } jsonBytes, err := json.Marshal(taskPacks) if err != nil { log.Errorln("make task packs error: ", err) } else { param := url.Values{} param.Add("tasks", string(jsonBytes)) result, err := httpClient.Post("http://"+fetcher+this.fetcherApi["push_tasks"], param) if err != nil { log.Errorln("post task packs to fetcher:", fetcher, ", error:", err, " data:", string(jsonBytes)) } else { jsonResult := types.JsonResult{} err = json.Unmarshal(result, &jsonResult) if err != nil { log.Errorln("json unmarshal error:", err, " data:", string(result)) } else { log.Infoln("get push tasks response: ", jsonResult) } } } } }
/* 从规则库导入任务 */ func (this *Scheduler) AddTasksFromRules() { crawlRules, err := this.taskDao.GetWaitRules() nRules := len(crawlRules) if err != nil { log.Errorln("get wait rules error: ", err) } if nRules > 0 { crawlTasks := []types.CrawlTask{} for _, rule := range crawlRules { crawlTasks = append(crawlTasks, this.taskDao.ConvertRuleToTask(rule)) } log.Infoln("get tasks from rules: ", len(crawlTasks)) num, results, _ := this.taskDao.AddNewTasks(crawlTasks) log.Infoln("add new tasks: ", num) affectedRows, _ := this.taskDao.UpdateRules(crawlRules, results) log.Infoln("update rules: ", affectedRows) } else { log.Infoln("no waiting rules yet.") } }
func InitLocalPageStore(dir string) *LocalPageStore { err := os.MkdirAll(dir, os.ModePerm) if err != nil { log.Errorln("LocalPageStore mkdir ", dir, " error: ", err) return nil } return &LocalPageStore{dir: dir} }
/* 获取等待调度的任务 */ func (this *Scheduler) FetchTasks() ([]types.CrawlTask, error) { waitingTasks, err := this.fetchTaskFromDb() if err != nil { log.Errorln("fetch tasks error: ", err) } return waitingTasks, err }
/* 从规则库读取下一批需调度的规则 */ func (this *TaskDao) GetWaitRules() ([]types.CrawlRule, error) { crawlRules := []types.CrawlRule{} sqlStr := fmt.Sprintf("select * from %s where status = 0", RuleTable) err := this.db.Select(&crawlRules, sqlStr) if err != nil { log.Errorln(err) } return crawlRules, err }
func (this *Fetcher) fetchPage(pageStore PageStore) { defer this.wg.Done() httpClient := lib.HttpClient{} loop: for { select { case taskPack := <-this.taskQueue: destUrl := taskPack.Domain + taskPack.Urlpath html, err := httpClient.Get(destUrl) log.Debugln("goto fetch ", destUrl) done := 0 if err == nil { //report success to scheduler, make a log, save html html, err = httpClient.IconvHtml(html, "utf-8") done = 1 log.Infoln("fetch '" + destUrl + "' done.") err = pageStore.Save(taskPack.Domain, taskPack.Urlpath, string(html)) if err != nil { log.Errorln("fetcher save ", taskPack.Domain, taskPack.Urlpath, " error:", err) } } else { //report fail to scheduler log.Errorln("fetch '"+destUrl+"' failed!", err) } //向scheduler报告任务完成情况 reportUrl := fmt.Sprintf("http://%s%s?task_id=%d&done=%d", this.scheduler_addr, this.scheduler_api["report"], taskPack.TaskId, done) res, err := httpClient.Get(reportUrl) if err != nil { log.Errorln("report ", reportUrl, " failed!") } else { result := types.JsonResult{} err = json.Unmarshal(res, &result) if err != nil || result.Err != 0 { log.Errorln("report ", reportUrl, ", get error response: ", string(res)) } } case <-this.quitChan: //this.quitChan should be closed somewhere log.Infoln("quit fetch page...") break loop } } }
func (this *LocalPageStore) Save(domain string, urlpath string, page string) error { md5Bytes := md5.Sum([]byte(urlpath)) domain = this.canonicalDomain(domain) destDir := filepath.Join(this.dir, domain, fmt.Sprintf("%x", md5Bytes)) log.Debugln("destDir: ", destDir) err := os.MkdirAll(destDir, os.ModePerm) if err != nil { log.Errorln("mkdir for '"+domain+"/"+urlpath+"' error: ", err) } else { now := time.Now().Unix() fname := fmt.Sprintf("%s/%d", destDir, now) err = ioutil.WriteFile(fname, []byte(page), 0666) if err != nil { log.Errorln("save page: "+domain+"/"+urlpath+" error:", err) } } return err }
//hset hostname domain ts func (this *PoliteVisitor) SetLastVisitTime(domain string, hostname string, ts int64) error { client, err := this.pool.Get() defer this.pool.Put(client) if err != nil { log.Errorln("get redis client error: ", err) } else { host := this.canonicalHostname(hostname) dm := this.canonicalDomain(domain) key := this.makeRedisKey(host) var n int64 resp := client.Cmd("hset", key, dm, ts) n, err = resp.Int64() if err != nil { log.Errorln("hset ", key, " ", dm, " ", ts, " error: ", err) } else { log.Debugln("hset ", key, " ", dm, " ", ts, " updated: ", n) } } return err }
/* 选取status为0, 或status=2且调度时间已到的任务 */ func (this *TaskDao) GetWaitingTasks() ([]types.CrawlTask, error) { crawlTasks := []types.CrawlTask{} sqlStr := fmt.Sprintf("select * from %s where status=%d or (status=%d and cycle+last_crawl_time <= %d)", TaskTable, TASK_WAITING, TASK_FINISH, time.Now().Unix()) err := this.db.Select(&crawlTasks, sqlStr) if err != nil { log.Errorln(err) } return crawlTasks, err }
/* 根据任务添加结果,修改rule的状态 */ func (this *TaskDao) UpdateRules(rules []types.CrawlRule, taskAddedResults []sql.Result) (int64, error) { if len(taskAddedResults) == 0 { log.Errorln("no tasks added so no need to update rules!") return 0, ErrNoTasks } if len(rules) != len(taskAddedResults) { log.Errorln("rules number not equal to added tasks number!") return 0, ErrNotEqual } ruleIds := []int32{} var insertId int64 for i, result := range taskAddedResults { insertId, _ = result.LastInsertId() if insertId > 0 { ruleIds = append(ruleIds, rules[i].Id) } } sqlTmp := fmt.Sprintf("update %s set status=%d, update_time='%s' where id in (?)", RuleTable, RULE_ADDED, time.Now().Format("2006-01-02 15:04:05")) sqlStr, args, err := sqlx.In(sqlTmp, ruleIds) if err != nil { log.Errorln("make in sql error: ", err) return 0, err } var result sql.Result var affectedRows int64 result, err = this.db.Exec(sqlStr, args...) if err != nil { log.Errorln("update rules error: ", err) return 0, err } affectedRows, _ = result.RowsAffected() return affectedRows, nil }
func (this *TaskDao) AddNewTasks(tasks []types.CrawlTask) (int64, []sql.Result, error) { tx, err := this.db.Beginx() if err != nil { log.Errorln("begin transaction error:", err) return 0, nil, err } results := make([]sql.Result, len(tasks)) var affectedRows int64 = 0 sqlStr := fmt.Sprintf("insert into %s (domain, urlpath, priority, cycle, status, last_crawl_time, crawl_times, create_time, update_time) values (:domain, :urlpath, :priority, :cycle, :status, :last_crawl_time, :crawl_times, :create_time, :update_time) on duplicate key update priority=values(priority), cycle=values(cycle), update_time=values(update_time) ", TaskTable) for i, task := range tasks { result, err1 := tx.NamedExec(sqlStr, task) results[i] = result if err1 != nil { err = err1 log.Errorln("transaction error:", err, " data:", task) } else { affectedRows++ } } err = tx.Commit() return affectedRows, results, err }
func (this *Scheduler) reportTaskHandler(w http.ResponseWriter, req *http.Request) { requiredParams := map[string]string{"task_id": "int", "done": "int"} _, err := utils.CheckHttpParams(req, requiredParams) result := types.JsonResult{} if err != nil { log.Errorln(err) result.Err = ErrInputError result.Msg = err.Error() utils.OutputJsonResult(w, result) return } taskId := req.Form.Get("task_id") intTaskId, _ := strconv.Atoi(taskId) task := types.CrawlTask{Id: int32(intTaskId)} tasks := []types.CrawlTask{task} done := req.Form.Get("done") var status dao.TaskStatus if done == "1" { status = dao.TASK_FINISH } else { status = dao.TASK_FAILED } _, err = this.taskDao.SetTasksStatus(tasks, status) if err != nil { msg := fmt.Sprintf("set task %s status to %d, error: %v", taskId, status, err) log.Errorln(msg) result.Err = ErrDbError result.Msg = msg } else { log.Errorln("set task ", taskId, " status to ", status, " finished.") result.Err = ErrOk } utils.OutputJsonResult(w, result) }
func (this *Fetcher) pushTasksHandler(w http.ResponseWriter, req *http.Request) { log.Debugln("get request: ", req.RemoteAddr, req.URL) req.ParseForm() tasksJson := req.Form.Get("tasks") taskPacks := []types.TaskPack{} var err error = nil var result = types.JsonResult{} if tasksJson != "" { err = json.Unmarshal([]byte(tasksJson), &taskPacks) if err != nil { msg := "Unmarshal task packs error: " + err.Error() log.Errorln(msg) result.Err = ErrDataError result.Msg = msg } else { //添加任务到队列 //最多只允许执行1秒钟 timerChan := time.After(1 * time.Second) cnt := 0 for _, pack := range taskPacks { select { case this.taskQueue <- pack: cnt++ case <-timerChan: break } } result.Err = ErrOk result.Data = taskPacks[:cnt] //将成功进入队列的任务返回 } } else { msg := "missing `tasks` key or has no content in the POST request." log.Warnln(msg) result.Err = ErrInputError result.Msg = msg } utils.OutputJsonResult(w, result) }
func InitScheduler(db *sqlx.DB, config map[string]string) *Scheduler { taskDao := dao.InitTaskDao(db) seconds, _ := strconv.Atoi(config["fetch_rules_period"]) fetchRulesPeriod := time.Duration(seconds) * time.Second seconds, _ = strconv.Atoi(config["fetch_tasks_period"]) fetchTasksPeriod := time.Duration(seconds) * time.Second listenAddr := config["listen_addr"] fetchers := strings.Split(strings.Replace(config["fetchers"], " ", "", -1), ",") fetcherApi := map[string]string{} json.Unmarshal([]byte(config["fetcher_api"]), &fetcherApi) minHostVisitInterval, _ := strconv.Atoi(config["min_host_visit_interval"]) redisAddr := config["redis_addr"] redisPoolSize, _ := strconv.Atoi(config["redis_pool_size"]) redisHeartbeat, _ := strconv.Atoi(config["redis_heartbeat"]) pool, err := pool.New("tcp", redisAddr, redisPoolSize) if err != nil { log.Errorln("init redis pool error: ", err) return nil } politeVisitor := InitPoliteVisitor(pool, int64(minHostVisitInterval)) quitChan := make(chan bool, 1) return &Scheduler{ fetchRulesPeriod: fetchRulesPeriod, fetchTasksPeriod: fetchTasksPeriod, listenAddr: listenAddr, db: db, taskDao: taskDao, fetchers: fetchers, fetcherApi: fetcherApi, politeVisitor: politeVisitor, redisPool: pool, redisPoolSize: redisPoolSize, redisHeartbeat: redisHeartbeat, quitChan: quitChan} }
//hget hostname domain func (this *PoliteVisitor) GetLastVisitTime(domain string, hostname string) int64 { client, err := this.pool.Get() defer this.pool.Put(client) var ret int64 = -1 if err != nil { log.Errorln("get redis client error: ", err) } else { host := this.canonicalHostname(hostname) dm := this.canonicalDomain(domain) key := this.makeRedisKey(host) resp := client.Cmd("hget", key, dm) if !resp.IsType(redis.Nil) { ts, err := resp.Int64() if err != nil { log.Debugln("convert redis response to int64 error: ", err, " resp:", resp) } else { ret = ts } } else { log.Debugln("hget ", key, "->", dm, " return nil") } } return ret }