func TumblrParser(taskset *nandu.TaskSet, task *common.Task, bytes []byte) { resp := new(TumblrResponse) err := json.Unmarshal(bytes, resp) if err != nil { util.Error("failed to parse response %s\n", err.Error()) return } d := TaskTumblrData{} task.GetData(&d) if d.Sleep != 0 { time.Sleep(time.Duration(d.Sleep) * time.Millisecond) } if d.Bid == 0 { blog := getTumblrBlog(d.Name, taskset.GetDB()) d.Bid = int64(blog.ID) } util.Info("fetching %s\n", task.Url) begin := int64(resp.Data.Blog.Posts) - d.Offset end := begin - int64(len(resp.Data.Posts)) + 1 ibegin, iend := d.Update(begin, end) for i := ibegin; i < iend; i++ { post := resp.Data.Posts[i] post.TumblrBlogID = uint(d.Bid) post.Offset = uint(begin - i) for j := range post.TumblrPhotos { post.TumblrPhotos[j].Fill() url := post.TumblrPhotos[j].Orig.Url if fn, err := getFileName(url); err == nil { util.Info("yield %s %s (%d | %d)\n", url, fn, resp.Data.Blog.Posts, begin-i) } } taskset.GetDB().Create(&post) } if d.HasMore() { new_task := new(common.Task) new_task.Project = task.Project new_task.TaskSet = task.TaskSet d.Offset = int64(resp.Data.Blog.Posts) - d.Current + 1 new_task.SetData(d) new_task.Url = genUrlFromInterval(&d) taskset.GetWorker().Push(new_task) } }
func DownloadParser(taskset *nandu.TaskSet, task *common.Task, bs []byte) { if gDownloadInfo == nil { var err error gDownloadInfo, err = NewDownloadInfo(kDownloadInfoFile) if err != nil { util.Fatal("%s\n", err.Error()) } } h := md5.New() h.Write(bs) hash := fmt.Sprintf("%x", h.Sum(nil)) data := DownloadData{} task.GetData(&data) if dup, fid := duplicate(taskset.GetDB(), hash); dup { setFileId(taskset.GetDB(), data.Pid, fid) return } iv, bs := encrypt(bs, gDownloadInfo.EncryptKey) fileData := FileData{} fileData.FileName = fmt.Sprintf("%x%s", iv[:4], hash[:24]) fileData.Hash = hash err := gDownloadInfo.Bucket.PutObject(fileData.FileName, bytes.NewReader(bs)) if err != nil { util.Error("can't create file %s\n", err.Error()) } else { util.Info("downloading %s\n", task.Url) taskset.GetDB().Create(&fileData) setFileId(taskset.GetDB(), data.Pid, fileData.ID) } }