// Run is the worker's main loop: it drains the local queue first, falls back to
// the shared queue, and hands each task to its taskset. After retryMax
// consecutive empty polls the worker gives up and exits.
func (worker *Worker) Run() {
	worker.validate()
	util.Info("'%s' started\n", worker.project)
	for {
		task := worker.PopLocal()
		if task == nil {
			task = worker.Pop()
		}
		if task != nil {
			worker.retryCount = 0
			taskset, ok := worker.tasksets[task.TaskSet]
			if !ok {
				util.Error("can't find taskset %s\n", task.TaskSet)
				continue
			}
			data := taskset.Fetch(task)
			if data != nil {
				taskset.Parse(task, data)
			}
		} else {
			worker.retryCount++
			if worker.retryCount >= worker.retryMax {
				break
			}
			util.Info("sleep 1 second, ( %d | %d )\n", worker.retryCount, worker.retryMax)
			time.Sleep(time.Second)
		}
	}
	util.Info("'%s' exit\n", worker.project)
}
// TumblrParser decodes one page of the Tumblr posts API, stores each post and
// its photos in the database, and pushes a follow-up task while the blog still
// has older posts to fetch.
func TumblrParser(taskset *nandu.TaskSet, task *common.Task, bs []byte) {
	resp := new(TumblrResponse)
	err := json.Unmarshal(bs, resp)
	if err != nil {
		util.Error("failed to parse response %s\n", err.Error())
		return
	}
	d := TaskTumblrData{}
	task.GetData(&d)
	if d.Sleep != 0 {
		time.Sleep(time.Duration(d.Sleep) * time.Millisecond)
	}
	if d.Bid == 0 {
		blog := getTumblrBlog(d.Name, taskset.GetDB())
		d.Bid = int64(blog.ID)
	}
	util.Info("fetching %s\n", task.Url)
	// Translate this page's position into absolute offsets counted back from
	// the blog's total post count, then let the task data clamp the interval.
	begin := int64(resp.Data.Blog.Posts) - d.Offset
	end := begin - int64(len(resp.Data.Posts)) + 1
	ibegin, iend := d.Update(begin, end)
	for i := ibegin; i < iend; i++ {
		post := resp.Data.Posts[i]
		post.TumblrBlogID = uint(d.Bid)
		post.Offset = uint(begin - i)
		for j := range post.TumblrPhotos {
			post.TumblrPhotos[j].Fill()
			url := post.TumblrPhotos[j].Orig.Url
			if fn, err := getFileName(url); err == nil {
				util.Info("yield %s %s (%d | %d)\n", url, fn, resp.Data.Blog.Posts, begin-i)
			}
		}
		taskset.GetDB().Create(&post)
	}
	if d.HasMore() {
		// Queue the next page of this blog.
		newTask := new(common.Task)
		newTask.Project = task.Project
		newTask.TaskSet = task.TaskSet
		d.Offset = int64(resp.Data.Blog.Posts) - d.Current + 1
		newTask.SetData(d)
		newTask.Url = genUrlFromInterval(&d)
		taskset.GetWorker().Push(newTask)
	}
}
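// getFileName is called above but defined elsewhere; getFileNameSketch below is
// only a hypothetical stand-in (its name and behaviour are assumptions, not the
// project's implementation): it returns the last path segment of a photo URL as
// the file name, which is enough to produce the "yield" log line.
func getFileNameSketch(rawurl string) (string, error) {
	u, err := url.Parse(rawurl) // net/url
	if err != nil {
		return "", err
	}
	name := path.Base(u.Path) // "path": ".../tumblr_abc_1280.jpg" -> "tumblr_abc_1280.jpg"
	if name == "." || name == "/" || name == "" {
		return "", fmt.Errorf("no file name in %s", rawurl)
	}
	return name, nil
}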
// AddUserTasks revalidates every stored user and logs the ones whose group
// still qualifies them for the queue.
func AddUserTasks(db *gorm.DB) {
	t := time.Now()
	users := make([]AwUser, 0)
	db.Find(&users)
	for i := range users {
		user := users[i]
		user.Validate(t, db)
		if user.Group >= kAwGroupFree {
			util.Info("pushing user %s (%d) into queue\n", user.Name, user.Group)
		}
	}
}
// main seeds the work queue: one crawl task per configured blog, resumed at the
// last stored offset, plus one download task for every photo in the configured
// range that has no stored file yet.
func main() {
	util.SetDebug(util.DebugInfo)
	worker := nandu.NewWorker()
	info, err := NewTaskPushInfo(kTaskPushInfoFile)
	if err != nil {
		util.Fatal("%s\n", err.Error())
	}
	for i := range info.Blogs {
		task := &info.Blogs[i]
		d := TaskTumblrData{}
		task.GetData(&d)
		d.Min = getStop(d.Name, worker.GetDB(kDatabaseName))
		task.SetData(d)
		util.Info("%s", task.PushLog())
		worker.Push(task)
	}
	photos := getPhotos(info.FileDataRange.Start, info.FileDataRange.Stop, worker.GetDB(kDatabaseName))
	for i := range photos {
		if photos[i].FileDataID != 0 {
			// Already downloaded and stored; skip.
			continue
		}
		task := common.Task{}
		task.Url = photos[i].Url
		task.TaskSet = kDownloadTaskSetName
		task.SetData(DownloadData{photos[i].ID})
		util.Info("push %s\n", task.Url)
		worker.Push(&task)
	}
}
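// getPhotos is called above but lives elsewhere; getPhotosSketch is only a
// hypothetical version of it (TumblrPhoto is assumed to be the gorm model with
// ID, Url and FileDataID fields, and start/stop are assumed to be primary-key
// bounds): it loads every photo whose ID falls inside [start, stop).
func getPhotosSketch(start, stop uint, db *gorm.DB) []TumblrPhoto {
	photos := make([]TumblrPhoto, 0)
	if err := db.Where("id >= ? AND id < ?", start, stop).Find(&photos).Error; err != nil {
		util.Error("can't load photos %s\n", err.Error())
		return nil
	}
	return photos
}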
// DownloadParser hashes the downloaded photo, reuses an existing FileData
// record when an identical file is already stored, and otherwise encrypts the
// bytes, uploads them to the bucket, and links the new record to the photo.
func DownloadParser(taskset *nandu.TaskSet, task *common.Task, bs []byte) {
	if gDownloadInfo == nil {
		var err error
		gDownloadInfo, err = NewDownloadInfo(kDownloadInfoFile)
		if err != nil {
			util.Fatal("%s\n", err.Error())
		}
	}
	h := md5.New()
	h.Write(bs)
	hash := fmt.Sprintf("%x", h.Sum(nil))
	data := DownloadData{}
	task.GetData(&data)
	if dup, fid := duplicate(taskset.GetDB(), hash); dup {
		// Same content already uploaded; just point the photo at it.
		setFileId(taskset.GetDB(), data.Pid, fid)
		return
	}
	iv, bs := encrypt(bs, gDownloadInfo.EncryptKey)
	fileData := FileData{}
	// Object name: first 4 IV bytes (hex) followed by 24 hex chars of the md5.
	fileData.FileName = fmt.Sprintf("%x%s", iv[:4], hash[:24])
	fileData.Hash = hash
	err := gDownloadInfo.Bucket.PutObject(fileData.FileName, bytes.NewReader(bs))
	if err != nil {
		util.Error("can't create file %s\n", err.Error())
	} else {
		util.Info("uploaded %s\n", task.Url)
		taskset.GetDB().Create(&fileData)
		setFileId(taskset.GetDB(), data.Pid, fileData.ID)
	}
}
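// encrypt is used above but defined elsewhere; encryptSketch below is only a
// hypothetical illustration (the real cipher and key handling are unknown): it
// returns a random IV together with the AES-CFB ciphertext, which matches how
// DownloadParser consumes the (iv, bs) pair.
func encryptSketch(plain, key []byte) ([]byte, []byte) {
	block, err := aes.NewCipher(key) // crypto/aes; key must be 16, 24 or 32 bytes
	if err != nil {
		util.Fatal("%s\n", err.Error())
	}
	iv := make([]byte, aes.BlockSize)
	if _, err := rand.Read(iv); err != nil { // crypto/rand
		util.Fatal("%s\n", err.Error())
	}
	out := make([]byte, len(plain))
	cipher.NewCFBEncrypter(block, iv).XORKeyStream(out, plain) // crypto/cipher
	return iv, out
}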