// workUnitStatus extracts a summary of the status of a single work // unit. This produces its external coordinate status and the active // attempt (if any) on success. func workUnitStatus(workUnit coordinate.WorkUnit) (status WorkUnitStatus, attempt coordinate.Attempt, err error) { var attemptStatus coordinate.AttemptStatus attempt, err = workUnit.ActiveAttempt() if err == nil && attempt == nil { status = Available return } if err == nil { attemptStatus, err = attempt.Status() } if err == nil { switch attemptStatus { case coordinate.Pending: status = Pending case coordinate.Expired: status = Available attempt = nil case coordinate.Finished: status = Finished case coordinate.Failed: status = Failed case coordinate.Retryable: status = Available attempt = nil default: err = errors.New("unexpected attempt status") } } return }
// TestChainingExpiry tests that, if an attempt finishes but is no // longer the active attempt, then its successor work units will not // be created. func (s *Suite) TestChainingExpiry(c *check.C) { var ( one, two coordinate.WorkSpec err error worker coordinate.Worker unit coordinate.WorkUnit attempts []coordinate.Attempt ) one, err = s.Namespace.SetWorkSpec(map[string]interface{}{ "name": "one", "then": "two", }) c.Assert(err, check.IsNil) two, err = s.Namespace.SetWorkSpec(map[string]interface{}{ "name": "two", "disabled": true, }) c.Assert(err, check.IsNil) worker, err = s.Namespace.Worker("worker") c.Assert(err, check.IsNil) // Create and perform a work unit, with no output unit, err = one.AddWorkUnit("a", map[string]interface{}{}, 0.0) c.Assert(err, check.IsNil) attempts, err = worker.RequestAttempts(coordinate.AttemptRequest{}) c.Assert(err, check.IsNil) c.Assert(attempts, check.HasLen, 1) attempt := attempts[0] // But wait! We got preempted err = unit.ClearActiveAttempt() c.Assert(err, check.IsNil) attempts, err = worker.RequestAttempts(coordinate.AttemptRequest{}) c.Assert(err, check.IsNil) c.Assert(attempts, check.HasLen, 1) // Now, let the original attempt finish, trying to generate // more outputs err = attempt.Finish(map[string]interface{}{ "output": []string{"unit"}, }) c.Assert(err, check.IsNil) // Since attempt is no longer active, this shouldn't generate // new outputs units, err := two.WorkUnits(coordinate.WorkUnitQuery{}) c.Assert(err, check.IsNil) c.Check(units, check.HasLen, 0) }
// TestWorkUnitData validates that the system can store and update // data. func (s *Suite) TestWorkUnitData(c *check.C) { var ( data map[string]interface{} unit coordinate.WorkUnit ) spec, err := s.Namespace.SetWorkSpec(map[string]interface{}{ "name": "spec", "min_gb": 1, }) c.Assert(err, check.IsNil) _, err = spec.AddWorkUnit("a", map[string]interface{}{ "name": "a", "value": 1, }, 0.0) c.Assert(err, check.IsNil) _, err = spec.AddWorkUnit("b", map[string]interface{}{ "name": "b", "value": 2, }, 0.0) c.Assert(err, check.IsNil) unit, err = spec.WorkUnit("a") c.Assert(err, check.IsNil) data, err = unit.Data() c.Assert(err, check.IsNil) c.Check(data, check.HasLen, 2) c.Check(data["name"], check.Equals, "a") c.Check(data["value"], Like, 1) unit, err = spec.WorkUnit("b") c.Assert(err, check.IsNil) data, err = unit.Data() c.Assert(err, check.IsNil) c.Check(data, check.HasLen, 2) c.Check(data["name"], check.Equals, "b") c.Check(data["value"], Like, 2) }
// TestTrivialWorkUnitFlow tests work unit creation, deletion, and existence. func (s *Suite) TestTrivialWorkUnitFlow(c *check.C) { var ( count int err error spec coordinate.WorkSpec unit coordinate.WorkUnit units map[string]coordinate.WorkUnit ) spec, err = s.Namespace.SetWorkSpec(map[string]interface{}{ "name": "spec", "min_gb": 1, }) c.Assert(err, check.IsNil) unit, err = spec.AddWorkUnit("unit", map[string]interface{}{}, 0) c.Assert(err, check.IsNil) c.Check(unit.Name(), check.Equals, "unit") c.Check(unit.WorkSpec().Name(), check.Equals, "spec") unit, err = spec.WorkUnit("unit") c.Assert(err, check.IsNil) c.Check(unit.Name(), check.Equals, "unit") c.Check(unit.WorkSpec().Name(), check.Equals, "spec") units, err = spec.WorkUnits(coordinate.WorkUnitQuery{}) c.Assert(err, check.IsNil) c.Check(units, check.HasLen, 1) c.Check(units["unit"], check.NotNil) c.Check(units["unit"].Name(), check.Equals, "unit") c.Check(units["unit"].WorkSpec().Name(), check.Equals, "spec") count, err = spec.DeleteWorkUnits(coordinate.WorkUnitQuery{}) c.Assert(err, check.IsNil) c.Check(count, check.Equals, 1) unit, err = spec.WorkUnit("unit") c.Assert(err, check.IsNil) c.Check(unit, check.IsNil) units, err = spec.WorkUnits(coordinate.WorkUnitQuery{}) c.Assert(err, check.IsNil) c.Check(units, check.HasLen, 0) }
// UpdateWorkUnit causes some state change in a work unit. If the // work unit is pending, this is the principal interface to complete // or renew it; if it is already complete this can cause it to be // retried. func (jobs *JobServer) UpdateWorkUnit( workSpecName string, workUnitKey string, options map[string]interface{}, ) (bool, string, error) { // Note that in several corner cases, the behavior of this as // written disagrees with Python coordinated's: // // * If neither "lease_time" nor "status" is specified, // Python coordinated immediately returns False without // checking if workUnitKey is valid // // * Python coordinated allows arbitrary status changes, // including AVAILABLE -> FINISHED // // * This openly ignores "worker_id", as distinct from Python // coordinated, which logs an obscure warning and changes it, // but only on a renew var ( attempt coordinate.Attempt changed bool err error status coordinate.AttemptStatus uwuOptions UpdateWorkUnitOptions workSpec coordinate.WorkSpec workUnit coordinate.WorkUnit ) err = decode(&uwuOptions, options) if err == nil { workSpec, err = jobs.Namespace.WorkSpec(workSpecName) } if err == nil { workUnit, err = workSpec.WorkUnit(workUnitKey) } if err == nil { if workUnit == nil { return false, fmt.Sprintf("no such work unit key=%v", workUnitKey), nil } } if err == nil { attempt, err = workUnit.ActiveAttempt() } if err == nil && attempt != nil { status, err = attempt.Status() } if err == nil && attempt != nil { if status == coordinate.Expired || status == coordinate.Retryable { // The Python Coordinate API sees both of these // statuses as "available", and we want to fall // into the next block. attempt = nil } } if err == nil && attempt == nil { // Caller is trying to manipulate an AVAILABLE work // unit. Cowardly refuse to start a new attempt on // their behalf, or to update the persistent work unit // data this way. (In theory there's no reason we // *couldn't* do either, though I'm not aware of any // callers that do; add_work_unit will replace // existing work units and is the more typical way to // refresh data.) err = errors.New("update_work_unit will not adjust an available work unit") } if err == nil { switch status { case coordinate.Pending: changed = true // or there's an error switch uwuOptions.Status { case 0, Pending: err = uwuRenew(attempt, uwuOptions) case Available: err = attempt.Expire(uwuOptions.Data) case Finished: err = attempt.Finish(uwuOptions.Data) case Failed: err = attempt.Fail(uwuOptions.Data) default: err = errors.New("update_work_unit invalid status") } case coordinate.Expired: err = errors.New("update_work_unit logic error, trying to refresh expired unit") case coordinate.Finished: switch uwuOptions.Status { case 0, Finished: changed = false // no-op case Available: err = workUnit.ClearActiveAttempt() changed = true case Failed: changed = false // see below default: err = errors.New("update_work_unit cannot change finished unit") } case coordinate.Failed: switch uwuOptions.Status { case 0, Failed: changed = false // no-op case Available: // "retry" err = workUnit.ClearActiveAttempt() changed = true case Finished: // The Python worker, with two separate // processes, has a race wherein there // could be 15 seconds to go, the parent // kills off the child, and the child // finishes successfully, all at the same // time. In that case the successful // finish should win. err = attempt.Finish(nil) changed = true default: err = errors.New("update_work_unit cannot change failed unit") } case coordinate.Retryable: err = errors.New("update_work_unit logic error, trying to refresh retryable unit") default: err = fmt.Errorf("update_work_unit invalid attempt status %+v", status) } } return changed && err == nil, "", err }
// TestWorkUnitPrioritySet tests two different ways of setting work unit // priority. func (s *Suite) TestWorkUnitPrioritySet(c *check.C) { var ( err error priority float64 unit coordinate.WorkUnit ) spec, worker := s.makeWorkSpecAndWorker(c) unit, err = spec.AddWorkUnit("a", map[string]interface{}{}, 0.0) c.Assert(err, check.IsNil) priority, err = unit.Priority() c.Assert(err, check.IsNil) c.Check(priority, check.Equals, 0.0) unit, err = spec.AddWorkUnit("b", map[string]interface{}{}, 0.0) c.Assert(err, check.IsNil) err = unit.SetPriority(10.0) c.Assert(err, check.IsNil) priority, err = unit.Priority() c.Assert(err, check.IsNil) c.Check(priority, check.Equals, 10.0) unit, err = spec.AddWorkUnit("c", map[string]interface{}{}, 0.0) c.Assert(err, check.IsNil) err = spec.SetWorkUnitPriorities(coordinate.WorkUnitQuery{ Names: []string{"c"}, }, 20.0) c.Assert(err, check.IsNil) priority, err = unit.Priority() c.Assert(err, check.IsNil) c.Check(priority, check.Equals, 20.0) unit, err = spec.AddWorkUnit("d", map[string]interface{}{}, 0.0) c.Assert(err, check.IsNil) err = spec.AdjustWorkUnitPriorities(coordinate.WorkUnitQuery{ Names: []string{"d"}, }, 20.0) priority, err = unit.Priority() c.Assert(err, check.IsNil) c.Check(priority, check.Equals, 20.0) c.Assert(err, check.IsNil) err = spec.AdjustWorkUnitPriorities(coordinate.WorkUnitQuery{ Names: []string{"d"}, }, 10.0) c.Assert(err, check.IsNil) priority, err = unit.Priority() c.Assert(err, check.IsNil) c.Check(priority, check.Equals, 30.0) unit, err = spec.WorkUnit("b") c.Assert(err, check.IsNil) priority, err = unit.Priority() c.Assert(err, check.IsNil) c.Check(priority, check.Equals, 10.0) checkWorkUnitOrder(c, worker, spec, "d", "c", "b", "a") }
// TestChainingTwoStep separately renews an attempt to insert an output // key, then finishes the work unit; it should still chain. func (s *Suite) TestChainingTwoStep(c *check.C) { var ( one, two coordinate.WorkSpec worker coordinate.Worker attempts []coordinate.Attempt units map[string]coordinate.WorkUnit unit coordinate.WorkUnit data map[string]interface{} priority float64 ok bool err error ) one, err = s.Namespace.SetWorkSpec(map[string]interface{}{ "name": "one", "then": "two", }) c.Assert(err, check.IsNil) two, err = s.Namespace.SetWorkSpec(map[string]interface{}{ "name": "two", }) c.Assert(err, check.IsNil) worker, err = s.Namespace.Worker("worker") c.Assert(err, check.IsNil) _, err = one.AddWorkUnit("a", map[string]interface{}{}, 0.0) c.Assert(err, check.IsNil) attempts, err = worker.RequestAttempts(coordinate.AttemptRequest{}) c.Assert(err, check.IsNil) c.Assert(attempts, check.HasLen, 1) err = attempts[0].Renew(time.Duration(900)*time.Second, map[string]interface{}{ "output": []interface{}{ []byte{1, 2, 3, 4}, cborrpc.PythonTuple{Items: []interface{}{ []byte{1, 2, 3, 4}, map[interface{}]interface{}{}, map[interface{}]interface{}{ "priority": 0, }, }}, }, }) c.Assert(err, check.IsNil) err = attempts[0].Finish(nil) units, err = two.WorkUnits(coordinate.WorkUnitQuery{}) c.Assert(err, check.IsNil) c.Check(units, HasKeys, []string{"\x01\x02\x03\x04"}) if unit, ok = units["\x01\x02\x03\x04"]; ok { data, err = unit.Data() c.Assert(err, check.IsNil) c.Check(data, check.DeepEquals, map[string]interface{}{}) priority, err = unit.Priority() c.Assert(err, check.IsNil) c.Check(priority, check.Equals, 0.0) } }
// TestChainingMixed uses a combination of strings and tuples in its // "output" data. func (s *Suite) TestChainingMixed(c *check.C) { var ( one, two coordinate.WorkSpec worker coordinate.Worker attempts []coordinate.Attempt units map[string]coordinate.WorkUnit unit coordinate.WorkUnit data map[string]interface{} priority float64 ok bool err error ) one, err = s.Namespace.SetWorkSpec(map[string]interface{}{ "name": "one", "then": "two", }) c.Assert(err, check.IsNil) two, err = s.Namespace.SetWorkSpec(map[string]interface{}{ "name": "two", }) c.Assert(err, check.IsNil) worker, err = s.Namespace.Worker("worker") c.Assert(err, check.IsNil) _, err = one.AddWorkUnit("a", map[string]interface{}{}, 0.0) c.Assert(err, check.IsNil) attempts, err = worker.RequestAttempts(coordinate.AttemptRequest{}) c.Assert(err, check.IsNil) c.Assert(attempts, check.HasLen, 1) err = attempts[0].Finish(map[string]interface{}{ "output": []interface{}{ "key", cborrpc.PythonTuple{Items: []interface{}{ "key", map[string]interface{}{ "data": "x", }, map[string]interface{}{ "priority": 10.0, }, }}, }, }) c.Assert(err, check.IsNil) units, err = two.WorkUnits(coordinate.WorkUnitQuery{}) c.Assert(err, check.IsNil) c.Check(units, HasKeys, []string{"key"}) if unit, ok = units["key"]; ok { data, err = unit.Data() c.Assert(err, check.IsNil) c.Check(data, check.DeepEquals, map[string]interface{}{"data": "x"}) priority, err = unit.Priority() c.Assert(err, check.IsNil) c.Check(priority, check.Equals, 10.0) } }
// TestAttemptLifetime validates a basic attempt lifetime. func (s *Suite) TestAttemptLifetime(c *check.C) { var ( err error data map[string]interface{} attempt, attempt2 coordinate.Attempt aStatus coordinate.AttemptStatus spec coordinate.WorkSpec unit coordinate.WorkUnit worker coordinate.Worker uStatus coordinate.WorkUnitStatus ) spec, worker = s.makeWorkSpecAndWorker(c) // Create a work unit unit, err = spec.AddWorkUnit("a", map[string]interface{}{}, 0.0) c.Assert(err, check.IsNil) // The work unit should be "available" uStatus, err = unit.Status() c.Assert(err, check.IsNil) c.Check(uStatus, check.Equals, coordinate.AvailableUnit) // The work unit data should be defined but empty data, err = unit.Data() c.Assert(err, check.IsNil) c.Check(data, check.HasLen, 0) // Get an attempt for it attempts, err := worker.RequestAttempts(coordinate.AttemptRequest{}) c.Assert(err, check.IsNil) c.Assert(attempts, check.HasLen, 1) attempt = attempts[0] // The work unit should be "pending" uStatus, err = unit.Status() c.Assert(err, check.IsNil) c.Check(uStatus, check.Equals, coordinate.PendingUnit) // The attempt should be "pending" too aStatus, err = attempt.Status() c.Assert(err, check.IsNil) c.Check(aStatus, check.Equals, coordinate.Pending) // The active attempt for the unit should match this attempt2, err = unit.ActiveAttempt() c.Assert(err, check.IsNil) c.Check(attempt2, AttemptMatches, attempt) // There should be one active attempt for the worker and it should // also match attempts, err = worker.ActiveAttempts() c.Assert(err, check.IsNil) c.Check(attempts, check.HasLen, 1) if len(attempts) > 0 { c.Check(attempts[0], AttemptMatches, attempt) } // The work unit data should (still) be defined but empty data, err = unit.Data() c.Assert(err, check.IsNil) c.Check(data, check.HasLen, 0) // Now finish the attempt with some updated data err = attempt.Finish(map[string]interface{}{ "outputs": []string{"yes"}, }) c.Assert(err, check.IsNil) // The unit should report "finished" uStatus, err = unit.Status() c.Assert(err, check.IsNil) c.Check(uStatus, check.Equals, coordinate.FinishedUnit) // The attempt should report "finished" aStatus, err = attempt.Status() c.Assert(err, check.IsNil) c.Check(aStatus, check.Equals, coordinate.Finished) // The attempt should still be the active attempt for the unit attempt2, err = unit.ActiveAttempt() c.Assert(err, check.IsNil) c.Check(attempt2, AttemptMatches, attempt) // The attempt should not be in the active attempt list for the worker attempts, err = worker.ActiveAttempts() c.Assert(err, check.IsNil) c.Check(attempts, check.HasLen, 0) // Both the unit and the worker should have one archived attempt attempts, err = unit.Attempts() c.Assert(err, check.IsNil) c.Check(attempts, check.HasLen, 1) if len(attempts) > 0 { c.Check(attempts[0], AttemptMatches, attempt) } attempts, err = worker.AllAttempts() c.Assert(err, check.IsNil) c.Check(attempts, check.HasLen, 1) if len(attempts) > 0 { c.Check(attempts[0], AttemptMatches, attempt) } // This should have updated the visible work unit data too data, err = unit.Data() c.Assert(err, check.IsNil) c.Check(data, check.HasLen, 1) c.Check(data["outputs"], check.HasLen, 1) c.Check(reflect.ValueOf(data["outputs"]).Index(0).Interface(), check.Equals, "yes") // For bonus points, force-clear the active attempt err = unit.ClearActiveAttempt() c.Assert(err, check.IsNil) // This should have pushed the unit back to available uStatus, err = unit.Status() c.Assert(err, check.IsNil) c.Check(uStatus, check.Equals, coordinate.AvailableUnit) // This also should have reset the work unit data data, err = unit.Data() c.Assert(err, check.IsNil) c.Check(data, check.HasLen, 0) // But, this should not have reset the historical attempts attempts, err = unit.Attempts() c.Assert(err, check.IsNil) c.Check(attempts, check.HasLen, 1) if len(attempts) > 0 { c.Check(attempts[0], AttemptMatches, attempt) } attempts, err = worker.AllAttempts() c.Assert(err, check.IsNil) c.Check(attempts, check.HasLen, 1) if len(attempts) > 0 { c.Check(attempts[0], AttemptMatches, attempt) } }