示例#1
0
// SetTasksStatus sets the status column of every task in tasks to status and
// refreshes update_time. When status is TASK_FINISH it additionally records
// the current Unix time as last_crawl_time and increments crawl_times.
//
// It returns the number of rows affected. ErrNoTasks is returned when tasks is
// empty; otherwise any error from building or executing the statement is
// returned and the row count is 0.
func (this *TaskDao) SetTasksStatus(tasks []types.CrawlTask, status TaskStatus) (int64, error) {
	if len(tasks) == 0 {
		return 0, ErrNoTasks
	}

	taskIds := make([]int32, 0, len(tasks))
	for _, task := range tasks {
		taskIds = append(taskIds, task.Id)
	}

	now := time.Now()
	var sqlTmpl string
	if status == TASK_FINISH {
		sqlTmpl = fmt.Sprintf("update %s set status=%d, last_crawl_time=%d, crawl_times=crawl_times+1, update_time='%s' where id in (?)", TaskTable, status, now.Unix(), now.Format("2006-01-02 15:04:05"))
	} else {
		sqlTmpl = fmt.Sprintf("update %s set status=%d, update_time='%s' where id in (?)", TaskTable, status, now.Format("2006-01-02 15:04:05"))
	}

	// Expand the single "(?)" placeholder into one placeholder per task id.
	sqlStr, args, err := sqlx.In(sqlTmpl, taskIds)
	if err != nil {
		log.Errorln("build sql to set task status failed! sql is ", sqlTmpl, ", error: ", err)
		return 0, err
	}

	result, err := this.db.Exec(sqlStr, args...)
	if err != nil {
		// result is not usable here; returning early avoids calling
		// RowsAffected on a nil sql.Result.
		log.Errorln("update tasks status error: ", err)
		return 0, err
	}

	// The update itself succeeded; a driver that cannot report the row count
	// must not turn that into a failure, so this error is deliberately ignored.
	affectedRows, _ := result.RowsAffected()
	return affectedRows, nil
}
示例#2
0
// DispatchTasks distributes waiting tasks to the configured fetchers.
//
// It loads the waiting tasks, sorts them, and then for each fetcher picks the
// not-yet-assigned tasks that the polite visitor allows for that fetcher. The
// picked tasks are JSON-encoded and POSTed to the fetcher's "push_tasks" API.
// Fetchers that end up with no task are skipped instead of being sent an
// empty list. Failures are logged; nothing is returned.
func (this *Scheduler) DispatchTasks() {
	// Load the waiting tasks.
	tasks, err := this.FetchTasks()
	if err != nil {
		log.Errorln("[DispatchTasks] fetch tasks error: ", err)
		return
	}
	if len(tasks) == 0 {
		log.Warnln("[DispatchTasks] no wait tasks yet.")
		return
	}

	// Sort the tasks.
	sorter := lib.CrawlTaskSorter{Now: time.Now().Unix()}
	sorter.Sort(tasks, nil)

	// POST the tasks to the fetchers.
	picked := make(map[int32]bool, len(tasks))
	httpClient := lib.HttpClient{}
	for _, fetcher := range this.fetchers {
		taskPacks := []types.TaskPack{}
		// Pick tasks that are still unassigned and satisfy the politeness rule.
		for _, task := range tasks {
			if picked[task.Id] || !this.politeVisitor.IsPolite(task.Domain, fetcher) {
				continue
			}
			taskPacks = append(taskPacks, types.TaskPack{TaskId: task.Id, Domain: task.Domain, Urlpath: task.Urlpath})
			picked[task.Id] = true
			// Cache the last visit time now. This is not the exact fetch
			// time, but it keeps the implementation simple. The returned
			// error is ignored here, as before; SetLastVisitTime logs it.
			this.politeVisitor.SetLastVisitTime(task.Domain, fetcher, time.Now().Unix())
		}
		if len(taskPacks) == 0 {
			// Nothing was assigned to this fetcher; skip the request.
			log.Debugln("[DispatchTasks] no tasks for fetcher: ", fetcher)
			continue
		}

		jsonBytes, err := json.Marshal(taskPacks)
		if err != nil {
			log.Errorln("make task packs error: ", err)
			continue
		}

		param := url.Values{}
		param.Add("tasks", string(jsonBytes))
		result, err := httpClient.Post("http://"+fetcher+this.fetcherApi["push_tasks"], param)
		if err != nil {
			log.Errorln("post task packs to fetcher:", fetcher, ", error:", err, " data:", string(jsonBytes))
			continue
		}

		jsonResult := types.JsonResult{}
		if err = json.Unmarshal(result, &jsonResult); err != nil {
			log.Errorln("json unmarshal error:", err, " data:", string(result))
			continue
		}
		log.Infoln("get push tasks response: ", jsonResult)
	}
}
示例#3
0
// AddTasksFromRules imports crawl tasks from the rule table.
//
// It reads the waiting rules, converts each rule to a task, inserts the tasks
// and then updates the rules according to the insert results. The function
// stops at the first failing step, so rules are not updated when the tasks
// could not be stored. Errors are logged; nothing is returned.
func (this *Scheduler) AddTasksFromRules() {
	crawlRules, err := this.taskDao.GetWaitRules()
	if err != nil {
		log.Errorln("get wait rules error: ", err)
		return
	}
	if len(crawlRules) == 0 {
		log.Infoln("no waiting rules yet.")
		return
	}

	crawlTasks := make([]types.CrawlTask, 0, len(crawlRules))
	for _, rule := range crawlRules {
		crawlTasks = append(crawlTasks, this.taskDao.ConvertRuleToTask(rule))
	}
	log.Infoln("get tasks from rules: ", len(crawlTasks))

	num, results, err := this.taskDao.AddNewTasks(crawlTasks)
	if err != nil {
		// The tasks were not stored, so the rules must stay in the waiting
		// state and be retried on the next run.
		log.Errorln("add new tasks error: ", err)
		return
	}
	log.Infoln("add new tasks: ", num)

	affectedRows, err := this.taskDao.UpdateRules(crawlRules, results)
	if err != nil {
		log.Errorln("update rules error: ", err)
		return
	}
	log.Infoln("update rules: ", affectedRows)
}
示例#4
0
// InitLocalPageStore creates dir (including missing parents) and returns a
// LocalPageStore rooted at it. It returns nil, after logging the error, when
// the directory cannot be created.
func InitLocalPageStore(dir string) *LocalPageStore {
	if mkErr := os.MkdirAll(dir, os.ModePerm); mkErr != nil {
		log.Errorln("LocalPageStore mkdir ", dir, " error: ", mkErr)
		return nil
	}

	store := new(LocalPageStore)
	store.dir = dir
	return store
}
示例#5
0
// FetchTasks returns the tasks that are waiting to be scheduled, as loaded by
// fetchTaskFromDb. A load error is logged and passed back to the caller
// together with whatever task slice the loader returned.
func (this *Scheduler) FetchTasks() ([]types.CrawlTask, error) {
	pending, loadErr := this.fetchTaskFromDb()
	if loadErr != nil {
		log.Errorln("fetch tasks error: ", loadErr)
	}

	return pending, loadErr
}
示例#6
0
// GetWaitRules reads the next batch of rules to schedule: every row of the
// rule table whose status is 0. A query error is logged and returned together
// with the slice as populated so far.
func (this *TaskDao) GetWaitRules() ([]types.CrawlRule, error) {
	query := fmt.Sprintf("select * from %s where status = 0", RuleTable)

	rules := []types.CrawlRule{}
	if selErr := this.db.Select(&rules, query); selErr != nil {
		log.Errorln(selErr)
		return rules, selErr
	}
	return rules, nil
}
示例#7
0
// fetchPage is a worker loop: it takes task packs from this.taskQueue,
// downloads the page at Domain+Urlpath, stores it through pageStore and then
// reports the outcome to the scheduler's "report" API (done=1 when the
// download succeeded, done=0 otherwise). The loop ends when a receive on
// this.quitChan succeeds; this.wg.Done is called on exit.
func (this *Fetcher) fetchPage(pageStore PageStore) {
	defer this.wg.Done()

	httpClient := lib.HttpClient{}
	for {
		select {
		case taskPack := <-this.taskQueue:
			destUrl := taskPack.Domain + taskPack.Urlpath
			log.Debugln("goto fetch ", destUrl)
			html, err := httpClient.Get(destUrl)
			done := 0
			if err == nil {
				// The download succeeded: log it, convert and save the page.
				done = 1
				log.Infoln("fetch '" + destUrl + "' done.")
				converted, convErr := httpClient.IconvHtml(html, "utf-8")
				if convErr != nil {
					// Keep the bytes as downloaded rather than storing a
					// possibly unusable conversion result.
					log.Errorln("convert '"+destUrl+"' to utf-8 failed, saving raw page: ", convErr)
				} else {
					html = converted
				}
				err = pageStore.Save(taskPack.Domain, taskPack.Urlpath, string(html))
				if err != nil {
					log.Errorln("fetcher save ", taskPack.Domain, taskPack.Urlpath, " error:", err)
				}
			} else {
				// The download failed; done stays 0 for the report below.
				log.Errorln("fetch '"+destUrl+"' failed!", err)
			}

			// Report the task outcome to the scheduler.
			reportUrl := fmt.Sprintf("http://%s%s?task_id=%d&done=%d", this.scheduler_addr, this.scheduler_api["report"], taskPack.TaskId, done)
			res, err := httpClient.Get(reportUrl)
			if err != nil {
				log.Errorln("report ", reportUrl, " failed! error: ", err)
				continue
			}
			result := types.JsonResult{}
			err = json.Unmarshal(res, &result)
			if err != nil || result.Err != 0 {
				log.Errorln("report ", reportUrl, ", get error response: ", string(res))
			}
		case <-this.quitChan:
			// this.quitChan should be closed somewhere
			log.Infoln("quit fetch page...")
			return
		}
	}
}
示例#8
0
// Save writes page to a new file under
// <store dir>/<canonical domain>/<hex md5 of urlpath>/, creating the directory
// if needed. The file is named after the current Unix timestamp in seconds.
// Directory-creation and write errors are logged and returned.
func (this *LocalPageStore) Save(domain string, urlpath string, page string) error {
	pathSum := md5.Sum([]byte(urlpath))
	domain = this.canonicalDomain(domain)

	destDir := filepath.Join(this.dir, domain, fmt.Sprintf("%x", pathSum))
	log.Debugln("destDir: ", destDir)

	if mkErr := os.MkdirAll(destDir, os.ModePerm); mkErr != nil {
		log.Errorln("mkdir for '"+domain+"/"+urlpath+"' error: ", mkErr)
		return mkErr
	}

	fname := fmt.Sprintf("%s/%d", destDir, time.Now().Unix())
	writeErr := ioutil.WriteFile(fname, []byte(page), 0666)
	if writeErr != nil {
		log.Errorln("save page: "+domain+"/"+urlpath+" error:", writeErr)
	}
	return writeErr
}
示例#9
0
// SetLastVisitTime records ts as the last time hostname visited domain.
//
// It issues "hset <key> <domain> <ts>", where the key is built from the
// canonical hostname and the field is the canonical domain. Errors from
// obtaining a redis client or from the command are logged and returned.
func (this *PoliteVisitor) SetLastVisitTime(domain string, hostname string, ts int64) error {
	client, err := this.pool.Get()
	if err != nil {
		log.Errorln("get redis client error: ", err)
		return err
	}
	// Register the Put only after Get succeeded, so a client that was never
	// obtained is not handed back to the pool.
	defer this.pool.Put(client)

	host := this.canonicalHostname(hostname)
	dm := this.canonicalDomain(domain)
	key := this.makeRedisKey(host)

	n, err := client.Cmd("hset", key, dm, ts).Int64()
	if err != nil {
		log.Errorln("hset ", key, " ", dm, " ", ts, " error: ", err)
		return err
	}
	log.Debugln("hset ", key, " ", dm, " ", ts, " updated: ", n)
	return nil
}
示例#10
0
// GetWaitingTasks selects the tasks that are due for scheduling: those whose
// status is TASK_WAITING, plus those whose status is TASK_FINISH and whose
// cycle+last_crawl_time is not later than the current Unix time. A query error
// is logged and returned together with the slice as populated so far.
func (this *TaskDao) GetWaitingTasks() ([]types.CrawlTask, error) {
	nowUnix := time.Now().Unix()
	query := fmt.Sprintf("select * from %s where status=%d or (status=%d and cycle+last_crawl_time <= %d)", TaskTable, TASK_WAITING, TASK_FINISH, nowUnix)

	due := []types.CrawlTask{}
	if selErr := this.db.Select(&due, query); selErr != nil {
		log.Errorln(selErr)
		return due, selErr
	}
	return due, nil
}
示例#11
0
// UpdateRules marks rules as RULE_ADDED according to the task insert results.
//
// rules and taskAddedResults must have the same length; entry i of
// taskAddedResults is the insert result for the task derived from rules[i].
// A rule is updated only when its result is non-nil and reports a positive
// last insert id. Nil entries are skipped.
//
// It returns the number of rule rows affected. ErrNoTasks is returned when
// there are no results or when no rule qualifies, ErrNotEqual when the two
// slices differ in length, and any SQL build or exec error otherwise.
func (this *TaskDao) UpdateRules(rules []types.CrawlRule, taskAddedResults []sql.Result) (int64, error) {
	if len(taskAddedResults) == 0 {
		log.Errorln("no tasks added so no need to update rules!")
		return 0, ErrNoTasks
	}
	if len(rules) != len(taskAddedResults) {
		log.Errorln("rules number not equal to added tasks number!")
		return 0, ErrNotEqual
	}

	ruleIds := make([]int32, 0, len(rules))
	for i, result := range taskAddedResults {
		if result == nil {
			// No result was recorded for this task; leave its rule untouched.
			continue
		}
		insertId, err := result.LastInsertId()
		if err != nil || insertId <= 0 {
			continue
		}
		ruleIds = append(ruleIds, rules[i].Id)
	}
	if len(ruleIds) == 0 {
		// An empty id list cannot be expanded into an "in (?)" clause.
		log.Errorln("no tasks added so no need to update rules!")
		return 0, ErrNoTasks
	}

	sqlTmp := fmt.Sprintf("update %s set status=%d, update_time='%s' where id in (?)", RuleTable, RULE_ADDED, time.Now().Format("2006-01-02 15:04:05"))
	sqlStr, args, err := sqlx.In(sqlTmp, ruleIds)
	if err != nil {
		log.Errorln("make in sql error: ", err)
		return 0, err
	}

	result, err := this.db.Exec(sqlStr, args...)
	if err != nil {
		log.Errorln("update rules error: ", err)
		return 0, err
	}

	// The update succeeded; a failure to report the row count is not treated
	// as an update failure.
	affectedRows, _ := result.RowsAffected()
	return affectedRows, nil
}
示例#12
0
// AddNewTasks inserts tasks into the task table inside one transaction. A row
// that collides with an existing key has its priority, cycle and update_time
// updated instead.
//
// Inserts are best-effort per row: a failing row is logged and skipped, and
// the remaining rows are still attempted. The returned slice has one entry per
// input task, in the same order; the entry is nil for a row whose statement
// failed. The returned count is the number of statements that succeeded.
//
// When the transaction cannot be started or committed, the error is returned
// with a zero count and a nil result slice.
func (this *TaskDao) AddNewTasks(tasks []types.CrawlTask) (int64, []sql.Result, error) {
	tx, err := this.db.Beginx()
	if err != nil {
		log.Errorln("begin transaction error:", err)
		return 0, nil, err
	}

	results := make([]sql.Result, len(tasks))
	var affectedRows int64
	sqlStr := fmt.Sprintf("insert into %s (domain, urlpath, priority, cycle, status, last_crawl_time, crawl_times, create_time, update_time) values (:domain, :urlpath, :priority, :cycle, :status, :last_crawl_time, :crawl_times, :create_time, :update_time) on duplicate key update priority=values(priority), cycle=values(cycle), update_time=values(update_time) ", TaskTable)
	for i, task := range tasks {
		result, execErr := tx.NamedExec(sqlStr, task)
		if execErr != nil {
			// results[i] stays nil to mark this row as failed.
			log.Errorln("transaction error:", execErr, " data:", task)
			continue
		}
		results[i] = result
		affectedRows++
	}

	if err = tx.Commit(); err != nil {
		// The per-row results must not be reported as stored when the commit
		// itself failed.
		log.Errorln("commit transaction error:", err)
		return 0, nil, err
	}
	return affectedRows, results, nil
}
示例#13
0
// reportTaskHandler handles a fetcher's task report.
//
// It requires the integer parameters "task_id" and "done". The task is set to
// TASK_FINISH when done is "1" and to TASK_FAILED otherwise. The response is a
// JSON result whose Err is ErrInputError for invalid parameters, ErrDbError
// when the status update fails, and ErrOk on success.
func (this *Scheduler) reportTaskHandler(w http.ResponseWriter, req *http.Request) {
	requiredParams := map[string]string{"task_id": "int", "done": "int"}
	_, err := utils.CheckHttpParams(req, requiredParams)
	result := types.JsonResult{}
	if err != nil {
		log.Errorln(err)
		result.Err = ErrInputError
		result.Msg = err.Error()
		utils.OutputJsonResult(w, result)
		return
	}

	taskId := req.Form.Get("task_id")
	// Parse with a 32-bit limit so an oversized id is rejected instead of
	// being silently truncated by the int32 conversion below.
	intTaskId, err := strconv.ParseInt(taskId, 10, 32)
	if err != nil {
		log.Errorln(err)
		result.Err = ErrInputError
		result.Msg = err.Error()
		utils.OutputJsonResult(w, result)
		return
	}
	tasks := []types.CrawlTask{{Id: int32(intTaskId)}}

	status := dao.TASK_FAILED
	if req.Form.Get("done") == "1" {
		status = dao.TASK_FINISH
	}

	if _, err = this.taskDao.SetTasksStatus(tasks, status); err != nil {
		msg := fmt.Sprintf("set task %s status to %d, error: %v", taskId, status, err)
		log.Errorln(msg)
		result.Err = ErrDbError
		result.Msg = msg
	} else {
		// Success is informational, not an error.
		log.Infoln("set task ", taskId, " status to ", status, " finished.")
		result.Err = ErrOk
	}
	utils.OutputJsonResult(w, result)
}
示例#14
0
// pushTasksHandler accepts task packs pushed by the scheduler.
//
// The request must carry a "tasks" form value holding a JSON array of task
// packs. The packs are enqueued on this.taskQueue in order for at most one
// second in total. Enqueueing stops at the first pack that cannot be queued
// in time, and the packs queued so far are returned in the result's Data.
//
// The JSON result's Err is ErrInputError when the form cannot be parsed or
// "tasks" is missing or empty, ErrDataError when the JSON is invalid, and
// ErrOk otherwise.
func (this *Fetcher) pushTasksHandler(w http.ResponseWriter, req *http.Request) {
	log.Debugln("get request: ", req.RemoteAddr, req.URL)

	result := types.JsonResult{}
	if err := req.ParseForm(); err != nil {
		msg := "parse form error: " + err.Error()
		log.Errorln(msg)
		result.Err = ErrInputError
		result.Msg = msg
		utils.OutputJsonResult(w, result)
		return
	}

	tasksJson := req.Form.Get("tasks")
	if tasksJson == "" {
		msg := "missing `tasks` key or has no content in the POST request."
		log.Warnln(msg)
		result.Err = ErrInputError
		result.Msg = msg
		utils.OutputJsonResult(w, result)
		return
	}

	taskPacks := []types.TaskPack{}
	if err := json.Unmarshal([]byte(tasksJson), &taskPacks); err != nil {
		msg := "Unmarshal task packs error: " + err.Error()
		log.Errorln(msg)
		result.Err = ErrDataError
		result.Msg = msg
		utils.OutputJsonResult(w, result)
		return
	}

	// Add the tasks to the queue, spending at most one second overall.
	timerChan := time.After(1 * time.Second)
	cnt := 0
enqueue:
	for _, pack := range taskPacks {
		select {
		case this.taskQueue <- pack:
			cnt++
		case <-timerChan:
			// A bare break would only leave the select. The timer channel
			// fires once, so the loop must be left here or the next send
			// could block with no timeout.
			break enqueue
		}
	}
	result.Err = ErrOk
	result.Data = taskPacks[:cnt] // return the tasks that were queued
	utils.OutputJsonResult(w, result)
}
示例#15
0
// InitScheduler builds a Scheduler from db and the string settings in config.
//
// Settings read: fetch_rules_period and fetch_tasks_period (seconds),
// listen_addr, fetchers (comma-separated, spaces ignored, empty entries
// dropped), fetcher_api (JSON object of string to string),
// min_host_visit_interval, redis_addr, redis_pool_size and redis_heartbeat.
//
// A setting that fails to parse is logged and the value produced by the
// parser is still used, so such a failure does not abort initialization.
// InitScheduler returns nil only when the redis pool cannot be created.
func InitScheduler(db *sqlx.DB, config map[string]string) *Scheduler {
	// intSetting parses an integer setting and logs, rather than hides, a
	// parse failure.
	intSetting := func(key string) int {
		n, err := strconv.Atoi(config[key])
		if err != nil {
			log.Errorln("invalid integer config ", key, ": ", err)
		}
		return n
	}

	taskDao := dao.InitTaskDao(db)
	fetchRulesPeriod := time.Duration(intSetting("fetch_rules_period")) * time.Second
	fetchTasksPeriod := time.Duration(intSetting("fetch_tasks_period")) * time.Second
	listenAddr := config["listen_addr"]

	// Drop empty names so an empty or trailing-comma setting does not produce
	// a fetcher with an empty address.
	rawFetchers := strings.Split(strings.Replace(config["fetchers"], " ", "", -1), ",")
	fetchers := make([]string, 0, len(rawFetchers))
	for _, name := range rawFetchers {
		if name != "" {
			fetchers = append(fetchers, name)
		}
	}

	fetcherApi := map[string]string{}
	if err := json.Unmarshal([]byte(config["fetcher_api"]), &fetcherApi); err != nil {
		log.Errorln("invalid config fetcher_api: ", err)
	}

	minHostVisitInterval := intSetting("min_host_visit_interval")
	redisAddr := config["redis_addr"]
	redisPoolSize := intSetting("redis_pool_size")
	redisHeartbeat := intSetting("redis_heartbeat")

	// Named redisPool so the local variable does not shadow the pool package.
	redisPool, err := pool.New("tcp", redisAddr, redisPoolSize)
	if err != nil {
		log.Errorln("init redis pool error: ", err)
		return nil
	}
	politeVisitor := InitPoliteVisitor(redisPool, int64(minHostVisitInterval))

	quitChan := make(chan bool, 1)

	return &Scheduler{
		fetchRulesPeriod: fetchRulesPeriod,
		fetchTasksPeriod: fetchTasksPeriod,
		listenAddr:       listenAddr,
		db:               db,
		taskDao:          taskDao,
		fetchers:         fetchers,
		fetcherApi:       fetcherApi,
		politeVisitor:    politeVisitor,
		redisPool:        redisPool,
		redisPoolSize:    redisPoolSize,
		redisHeartbeat:   redisHeartbeat,
		quitChan:         quitChan}
}
示例#16
0
// GetLastVisitTime returns the timestamp stored for hostname's last visit to
// domain.
//
// It issues "hget <key> <domain>", where the key is built from the canonical
// hostname and the field is the canonical domain. It returns -1 when a redis
// client cannot be obtained, when the reply is nil, or when the reply cannot
// be converted to an int64. Each of those cases is logged.
func (this *PoliteVisitor) GetLastVisitTime(domain string, hostname string) int64 {
	client, err := this.pool.Get()
	if err != nil {
		log.Errorln("get redis client error: ", err)
		return -1
	}
	// Register the Put only after Get succeeded, so a client that was never
	// obtained is not handed back to the pool.
	defer this.pool.Put(client)

	host := this.canonicalHostname(hostname)
	dm := this.canonicalDomain(domain)
	key := this.makeRedisKey(host)

	resp := client.Cmd("hget", key, dm)
	if resp.IsType(redis.Nil) {
		log.Debugln("hget ", key, "->", dm, " return nil")
		return -1
	}

	ts, err := resp.Int64()
	if err != nil {
		log.Debugln("convert redis response to int64 error: ", err, " resp:", resp)
		return -1
	}
	return ts
}