func retryTask(c context.Context, ds appwrap.Datastore, taskIntf TaskInterface, jobKey *datastore.Key, taskKey *datastore.Key) error {
	var job JobInfo
	if j, err := getJob(ds, jobKey); err != nil {
		return fmt.Errorf("getting job: %s", err)
	} else {
		job = j
	}

	time.Sleep(time.Duration(job.RetryCount) * 5 * time.Second)

	if err := backoff.Retry(func() error {
		var task JobTask
		if err := ds.Get(taskKey, &task); err != nil {
			return fmt.Errorf("getting task: %s", err)
		}

		task.Status = TaskStatusPending
		if _, err := ds.Put(taskKey, &task); err != nil {
			return fmt.Errorf("putting task: %s", err)
		} else if err := taskIntf.PostTask(c, task.Url, job.JsonParameters); err != nil {
			return fmt.Errorf("enqueuing task: %s", err)
		}

		logInfo(c, "retrying task %d/%d", task.Retries, job.RetryCount)
		return nil
	}, mrBackOff()); err != nil {
		logInfo(c, "retryTask() failed after backoff attempts")
		return err
	}

	return nil
}
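// mrBackOff() is defined elsewhere in this package and supplies the backoff policy
// used by the backoff.Retry calls above and below. Purely as an illustration, and
// assuming the cenkalti/backoff package (which matches the Retry signature used here),
// a minimal sketch of such a helper could look like this; the MaxElapsedTime value is
// an assumption, not the package's actual setting:
//
//	func mrBackOff() backoff.BackOff {
//		b := backoff.NewExponentialBackOff()
//		b.MaxElapsedTime = 2 * time.Minute // assumed cap on total retry time
//		return b
//	}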
func createTasks(ds appwrap.Datastore, jobKey *datastore.Key, taskKeys []*datastore.Key, tasks []JobTask, newStage JobStage) error {
	now := time.Now()
	firstId := taskKeys[0].IntID()
	for i := range tasks {
		tasks[i].StartTime = now
		tasks[i].Job = jobKey

		if taskKeys[i].IntID() < firstId {
			firstId = taskKeys[i].IntID()
		}
	}

	putSize := 64

	i := 0
	for i < len(tasks) {
		if err := backoff.Retry(func() error {
			last := i + putSize
			if last > len(tasks) {
				last = len(tasks)
			}

			if _, err := ds.PutMulti(taskKeys[i:last], tasks[i:last]); err != nil {
				// shrink the batch size on failure before the next retry
				if putSize > 5 {
					putSize /= 2
				}
				return err
			}

			i = last
			return nil
		}, mrBackOff()); err != nil {
			return err
		}
	}

	return runInTransaction(ds, func(ds appwrap.Datastore) error {
		var job JobInfo
		if err := ds.Get(jobKey, &job); err != nil {
			return err
		}

		job.TaskCount = len(tasks)
		job.FirstTaskId = firstId
		job.Stage = newStage

		_, err := ds.Put(jobKey, &job)

		return err
	})
}
func updateTask(ds appwrap.Datastore, taskKey *datastore.Key, status TaskStatus, tryIncrement int, info string, result interface{}) (JobTask, error) {
	var task JobTask

	newCount := -1

	err := backoff.Retry(func() error {
		if err := ds.Get(taskKey, &task); err != nil {
			return err
		}

		task.UpdatedAt = time.Now()
		task.Info = info

		// this prevents double incrementing if the Put times out but has actually
		// written the value
		if newCount == -1 {
			newCount = task.Retries + tryIncrement
		}
		task.Retries = newCount

		if status != "" {
			task.Status = status
			if status == TaskStatusDone || status == TaskStatusFailed {
				task.Done = task.Job
			}
		}

		if result != nil {
			resultBytes, err := json.Marshal(result)
			if err != nil {
				return err
			}

			task.Result = string(resultBytes)
		}

		_, err := ds.Put(taskKey, &task)

		return err
	}, mrBackOff())

	return task, err
}
func markJobFailed(c context.Context, ds appwrap.Datastore, jobKey *datastore.Key) (prev JobInfo, finalErr error) {
	finalErr = runInTransaction(ds, func(ds appwrap.Datastore) error {
		prev = JobInfo{}
		if err := ds.Get(jobKey, &prev); err != nil {
			return err
		}

		job := prev
		job.Stage = StageFailed

		_, err := ds.Put(jobKey, &job)

		return err
	})

	if finalErr != nil {
		logCritical(c, "marking job failed for key %s failed: %s", jobKey, finalErr)
	}

	return
}
func createJob(ds appwrap.Datastore, urlPrefix string, writerNames []string, onCompleteUrl string, separateReduceItems bool, jsonParameters string, retryCount int) (*datastore.Key, error) {
	if retryCount == 0 {
		// default
		retryCount = 3
	}

	key := ds.NewKey(JobEntity, "", 0, nil)
	job := JobInfo{
		UrlPrefix:           urlPrefix,
		Stage:               StageFormation,
		UpdatedAt:           time.Now(),
		StartTime:           time.Now(),
		OnCompleteUrl:       onCompleteUrl,
		SeparateReduceItems: separateReduceItems,
		WriterNames:         writerNames,
		RetryCount:          retryCount,
		JsonParameters:      jsonParameters,
	}

	return ds.Put(key, &job)
}
// check if the specified job has completed. it should currently be at expectedStage, and if it's been completed
// we advance it to nextStage. if it's already at nextStage another process has beaten us to it so we're done
//
// caller needs to check the stage in the final job; if stageChanged is true it will be either nextStage or StageFailed.
// If StageFailed then at least one of the underlying tasks failed and the reason will appear as a taskError{} in err
func jobStageComplete(c context.Context, ds appwrap.Datastore, jobKey *datastore.Key, taskKeys []*datastore.Key, expectedStage, nextStage JobStage) (stageChanged bool, job JobInfo, finalErr error) {
	last := len(taskKeys)
	tasks := make([]JobTask, 100)
	for last > 0 {
		first := last - 100
		if first < 0 {
			first = 0
		}

		taskCount := last - first

		if err := ds.GetMulti(taskKeys[first:last], tasks[0:taskCount]); err != nil {
			finalErr = err
			return
		} else {
			for i := 0; i < taskCount; i++ {
				if tasks[i].Status == TaskStatusFailed {
					logInfo(c, "failed tasks found")
					nextStage = StageFailed
					last = -1
					finalErr = taskError{tasks[i].Info}
					break
				} else if tasks[i].Status != TaskStatusDone {
					// at least one task is still in flight, so this stage isn't complete yet
					return
				}
			}

			if last >= 0 {
				last = first
			}
		}
	}

	// running this in a transaction ensures only one process advances the stage
	if transErr := runInTransaction(ds, func(ds appwrap.Datastore) error {
		job = JobInfo{}
		if err := ds.Get(jobKey, &job); err != nil {
			return err
		}

		if job.Stage != expectedStage {
			// we're not where we expected, so advancing this isn't our responsibility
			stageChanged = false
			return errMonitorJobConflict
		}

		job.Stage = nextStage
		job.UpdatedAt = time.Now()
		_, err := ds.Put(jobKey, &job)
		stageChanged = (err == nil)

		return err
	}); transErr != nil {
		finalErr = transErr
	}

	if finalErr != nil {
		logCritical(c, "taskComplete failed: %s", finalErr)
	} else {
		logInfo(c, "task is complete")
	}

	return
}
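// Illustrative usage (not part of the original file): a hypothetical monitor step showing
// how a caller might consume jobStageComplete per the contract documented above. The stage
// names StageMapping and StageReducing are assumptions for the example, not values defined here:
//
//	changed, job, err := jobStageComplete(c, ds, jobKey, taskKeys, StageMapping, StageReducing)
//	if !changed {
//		return err // tasks still running, or another process owns the stage transition
//	} else if job.Stage == StageFailed {
//		return err // at least one task failed; err carries the taskError explaining why
//	}
//	// we won the transaction, so this process is responsible for kicking off the next stage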