func uwuRenew(attempt coordinate.Attempt, uwuOptions UpdateWorkUnitOptions) error { leaseTime := uwuOptions.LeaseTime if leaseTime < 1 { leaseTime = 300 } extendDuration := time.Duration(leaseTime) * time.Second return attempt.Renew(extendDuration, uwuOptions.Data) }
func getWorkTuple(attempt coordinate.Attempt) (cborrpc.PythonTuple, error) { data, err := attempt.Data() if err != nil { return cborrpc.PythonTuple{}, err } workUnit := attempt.WorkUnit() return cborrpc.PythonTuple{Items: []interface{}{ workUnit.WorkSpec().Name(), []byte(workUnit.Name()), data, }}, nil }
// attemptMap turns a single attempt into the map returned by // GetChildWorkUnits(). func attemptMap(attempt coordinate.Attempt) (map[string]interface{}, error) { // First try to swap out attempt for its work unit's actual // active attempt. workUnit := attempt.WorkUnit() activeAttempt, err := workUnit.ActiveAttempt() if err != nil { return nil, err } if activeAttempt != nil { attempt = activeAttempt } // Collect extra data we need and build the result data, err := attempt.Data() if err != nil { return nil, err } expires, err := attempt.ExpirationTime() if err != nil { return nil, err } result := map[string]interface{}{ "work_spec_name": workUnit.WorkSpec().Name(), "work_unit_key": []byte(workUnit.Name()), "work_unit_data": data, "worker_id": attempt.Worker().Name(), "expires": expires.Unix(), } return result, nil }
// UpdateWorkUnit causes some state change in a work unit. If the // work unit is pending, this is the principal interface to complete // or renew it; if it is already complete this can cause it to be // retried. func (jobs *JobServer) UpdateWorkUnit( workSpecName string, workUnitKey string, options map[string]interface{}, ) (bool, string, error) { // Note that in several corner cases, the behavior of this as // written disagrees with Python coordinated's: // // * If neither "lease_time" nor "status" is specified, // Python coordinated immediately returns False without // checking if workUnitKey is valid // // * Python coordinated allows arbitrary status changes, // including AVAILABLE -> FINISHED // // * This openly ignores "worker_id", as distinct from Python // coordinated, which logs an obscure warning and changes it, // but only on a renew var ( attempt coordinate.Attempt changed bool err error status coordinate.AttemptStatus uwuOptions UpdateWorkUnitOptions workSpec coordinate.WorkSpec workUnit coordinate.WorkUnit ) err = decode(&uwuOptions, options) if err == nil { workSpec, err = jobs.Namespace.WorkSpec(workSpecName) } if err == nil { workUnit, err = workSpec.WorkUnit(workUnitKey) } if err == nil { if workUnit == nil { return false, fmt.Sprintf("no such work unit key=%v", workUnitKey), nil } } if err == nil { attempt, err = workUnit.ActiveAttempt() } if err == nil && attempt != nil { status, err = attempt.Status() } if err == nil && attempt != nil { if status == coordinate.Expired || status == coordinate.Retryable { // The Python Coordinate API sees both of these // statuses as "available", and we want to fall // into the next block. attempt = nil } } if err == nil && attempt == nil { // Caller is trying to manipulate an AVAILABLE work // unit. Cowardly refuse to start a new attempt on // their behalf, or to update the persistent work unit // data this way. (In theory there's no reason we // *couldn't* do either, though I'm not aware of any // callers that do; add_work_unit will replace // existing work units and is the more typical way to // refresh data.) err = errors.New("update_work_unit will not adjust an available work unit") } if err == nil { switch status { case coordinate.Pending: changed = true // or there's an error switch uwuOptions.Status { case 0, Pending: err = uwuRenew(attempt, uwuOptions) case Available: err = attempt.Expire(uwuOptions.Data) case Finished: err = attempt.Finish(uwuOptions.Data) case Failed: err = attempt.Fail(uwuOptions.Data) default: err = errors.New("update_work_unit invalid status") } case coordinate.Expired: err = errors.New("update_work_unit logic error, trying to refresh expired unit") case coordinate.Finished: switch uwuOptions.Status { case 0, Finished: changed = false // no-op case Available: err = workUnit.ClearActiveAttempt() changed = true case Failed: changed = false // see below default: err = errors.New("update_work_unit cannot change finished unit") } case coordinate.Failed: switch uwuOptions.Status { case 0, Failed: changed = false // no-op case Available: // "retry" err = workUnit.ClearActiveAttempt() changed = true case Finished: // The Python worker, with two separate // processes, has a race wherein there // could be 15 seconds to go, the parent // kills off the child, and the child // finishes successfully, all at the same // time. In that case the successful // finish should win. err = attempt.Finish(nil) changed = true default: err = errors.New("update_work_unit cannot change failed unit") } case coordinate.Retryable: err = errors.New("update_work_unit logic error, trying to refresh retryable unit") default: err = fmt.Errorf("update_work_unit invalid attempt status %+v", status) } } return changed && err == nil, "", err }
// TestAttemptLifetime validates a basic attempt lifetime. func (s *Suite) TestAttemptLifetime(c *check.C) { var ( err error data map[string]interface{} attempt, attempt2 coordinate.Attempt aStatus coordinate.AttemptStatus spec coordinate.WorkSpec unit coordinate.WorkUnit worker coordinate.Worker uStatus coordinate.WorkUnitStatus ) spec, worker = s.makeWorkSpecAndWorker(c) // Create a work unit unit, err = spec.AddWorkUnit("a", map[string]interface{}{}, 0.0) c.Assert(err, check.IsNil) // The work unit should be "available" uStatus, err = unit.Status() c.Assert(err, check.IsNil) c.Check(uStatus, check.Equals, coordinate.AvailableUnit) // The work unit data should be defined but empty data, err = unit.Data() c.Assert(err, check.IsNil) c.Check(data, check.HasLen, 0) // Get an attempt for it attempts, err := worker.RequestAttempts(coordinate.AttemptRequest{}) c.Assert(err, check.IsNil) c.Assert(attempts, check.HasLen, 1) attempt = attempts[0] // The work unit should be "pending" uStatus, err = unit.Status() c.Assert(err, check.IsNil) c.Check(uStatus, check.Equals, coordinate.PendingUnit) // The attempt should be "pending" too aStatus, err = attempt.Status() c.Assert(err, check.IsNil) c.Check(aStatus, check.Equals, coordinate.Pending) // The active attempt for the unit should match this attempt2, err = unit.ActiveAttempt() c.Assert(err, check.IsNil) c.Check(attempt2, AttemptMatches, attempt) // There should be one active attempt for the worker and it should // also match attempts, err = worker.ActiveAttempts() c.Assert(err, check.IsNil) c.Check(attempts, check.HasLen, 1) if len(attempts) > 0 { c.Check(attempts[0], AttemptMatches, attempt) } // The work unit data should (still) be defined but empty data, err = unit.Data() c.Assert(err, check.IsNil) c.Check(data, check.HasLen, 0) // Now finish the attempt with some updated data err = attempt.Finish(map[string]interface{}{ "outputs": []string{"yes"}, }) c.Assert(err, check.IsNil) // The unit should report "finished" uStatus, err = unit.Status() c.Assert(err, check.IsNil) c.Check(uStatus, check.Equals, coordinate.FinishedUnit) // The attempt should report "finished" aStatus, err = attempt.Status() c.Assert(err, check.IsNil) c.Check(aStatus, check.Equals, coordinate.Finished) // The attempt should still be the active attempt for the unit attempt2, err = unit.ActiveAttempt() c.Assert(err, check.IsNil) c.Check(attempt2, AttemptMatches, attempt) // The attempt should not be in the active attempt list for the worker attempts, err = worker.ActiveAttempts() c.Assert(err, check.IsNil) c.Check(attempts, check.HasLen, 0) // Both the unit and the worker should have one archived attempt attempts, err = unit.Attempts() c.Assert(err, check.IsNil) c.Check(attempts, check.HasLen, 1) if len(attempts) > 0 { c.Check(attempts[0], AttemptMatches, attempt) } attempts, err = worker.AllAttempts() c.Assert(err, check.IsNil) c.Check(attempts, check.HasLen, 1) if len(attempts) > 0 { c.Check(attempts[0], AttemptMatches, attempt) } // This should have updated the visible work unit data too data, err = unit.Data() c.Assert(err, check.IsNil) c.Check(data, check.HasLen, 1) c.Check(data["outputs"], check.HasLen, 1) c.Check(reflect.ValueOf(data["outputs"]).Index(0).Interface(), check.Equals, "yes") // For bonus points, force-clear the active attempt err = unit.ClearActiveAttempt() c.Assert(err, check.IsNil) // This should have pushed the unit back to available uStatus, err = unit.Status() c.Assert(err, check.IsNil) c.Check(uStatus, check.Equals, coordinate.AvailableUnit) // This also should have reset the work unit data data, err = unit.Data() c.Assert(err, check.IsNil) c.Check(data, check.HasLen, 0) // But, this should not have reset the historical attempts attempts, err = unit.Attempts() c.Assert(err, check.IsNil) c.Check(attempts, check.HasLen, 1) if len(attempts) > 0 { c.Check(attempts[0], AttemptMatches, attempt) } attempts, err = worker.AllAttempts() c.Assert(err, check.IsNil) c.Check(attempts, check.HasLen, 1) if len(attempts) > 0 { c.Check(attempts[0], AttemptMatches, attempt) } }