// getRunnableAndDeadSlots returns two maps of slots. The first contains slots // that have at least one task with a desired state above NEW and lesser or // equal to RUNNING. The second is for slots that only contain tasks with a // desired state above RUNNING. func getRunnableAndDeadSlots(s *store.MemoryStore, serviceID string) (map[uint64]slot, map[uint64]slot, error) { var ( tasks []*api.Task err error ) s.View(func(tx store.ReadTx) { tasks, err = store.FindTasks(tx, store.ByServiceID(serviceID)) }) if err != nil { return nil, nil, err } runningSlots := make(map[uint64]slot) for _, t := range tasks { if t.DesiredState <= api.TaskStateRunning { runningSlots[t.Slot] = append(runningSlots[t.Slot], t) } } deadSlots := make(map[uint64]slot) for _, t := range tasks { if _, exists := runningSlots[t.Slot]; !exists { deadSlots[t.Slot] = append(deadSlots[t.Slot], t) } } return runningSlots, deadSlots, nil }
func deleteServiceTasks(ctx context.Context, s *store.MemoryStore, service *api.Service) { var ( tasks []*api.Task err error ) s.View(func(tx store.ReadTx) { tasks, err = store.FindTasks(tx, store.ByServiceID(service.ID)) }) if err != nil { log.G(ctx).WithError(err).Errorf("failed to list tasks") return } _, err = s.Batch(func(batch *store.Batch) error { for _, t := range tasks { err := batch.Update(func(tx store.Tx) error { if err := store.DeleteTask(tx, t.ID); err != nil { log.G(ctx).WithError(err).Errorf("failed to delete task") } return nil }) if err != nil { return err } } return nil }) if err != nil { log.G(ctx).WithError(err).Errorf("task search transaction failed") } }
func (g *GlobalOrchestrator) reconcileOneService(ctx context.Context, service *api.Service) { var ( tasks []*api.Task err error ) g.store.View(func(tx store.ReadTx) { tasks, err = store.FindTasks(tx, store.ByServiceID(service.ID)) }) if err != nil { log.G(ctx).WithError(err).Errorf("global orchestrator: reconcileOneService failed finding tasks") return } // a node may have completed this service nodeCompleted := make(map[string]struct{}) // nodeID -> task list nodeTasks := make(map[string][]*api.Task) for _, t := range tasks { if isTaskRunning(t) { // Collect all running instances of this service nodeTasks[t.NodeID] = append(nodeTasks[t.NodeID], t) } else { // for finished tasks, check restartPolicy if isTaskCompleted(t, restartCondition(t)) { nodeCompleted[t.NodeID] = struct{}{} } } } _, err = g.store.Batch(func(batch *store.Batch) error { var updateTasks []*api.Task for nodeID := range g.nodes { ntasks := nodeTasks[nodeID] // if restart policy considers this node has finished its task // it should remove all running tasks if _, exists := nodeCompleted[nodeID]; exists { g.removeTasks(ctx, batch, service, ntasks) return nil } // this node needs to run 1 copy of the task if len(ntasks) == 0 { g.addTask(ctx, batch, service, nodeID) } else { updateTasks = append(updateTasks, ntasks[0]) g.removeTasks(ctx, batch, service, ntasks[1:]) } } if len(updateTasks) > 0 { g.updater.Update(ctx, service, updateTasks) } return nil }) if err != nil { log.G(ctx).WithError(err).Errorf("global orchestrator: reconcileOneService transaction failed") } }
func (r *ReplicatedOrchestrator) reconcile(ctx context.Context, service *api.Service) { var ( tasks []*api.Task err error ) r.store.View(func(tx store.ReadTx) { tasks, err = store.FindTasks(tx, store.ByServiceID(service.ID)) }) if err != nil { log.G(ctx).WithError(err).Errorf("reconcile failed finding tasks") return } runningTasks := make([]*api.Task, 0, len(tasks)) runningInstances := make(map[uint64]struct{}) // this could be a bitfield... for _, t := range tasks { // Technically the check below could just be // t.DesiredState <= api.TaskStateRunning, but ignoring tasks // with DesiredState == NEW simplifies the drainer unit tests. if t.DesiredState > api.TaskStateNew && t.DesiredState <= api.TaskStateRunning { runningTasks = append(runningTasks, t) runningInstances[t.Slot] = struct{}{} } } numTasks := len(runningTasks) deploy := service.Spec.GetMode().(*api.ServiceSpec_Replicated) specifiedInstances := int(deploy.Replicated.Replicas) // TODO(aaronl): Add support for restart delays. _, err = r.store.Batch(func(batch *store.Batch) error { switch { case specifiedInstances > numTasks: log.G(ctx).Debugf("Service %s was scaled up from %d to %d instances", service.ID, numTasks, specifiedInstances) // Update all current tasks then add missing tasks r.updater.Update(ctx, service, runningTasks) r.addTasks(ctx, batch, service, runningInstances, specifiedInstances-numTasks) case specifiedInstances < numTasks: // Update up to N tasks then remove the extra log.G(ctx).Debugf("Service %s was scaled down from %d to %d instances", service.ID, numTasks, specifiedInstances) r.updater.Update(ctx, service, runningTasks[:specifiedInstances]) r.removeTasks(ctx, batch, service, runningTasks[specifiedInstances:]) case specifiedInstances == numTasks: // Simple update, no scaling - update all tasks. r.updater.Update(ctx, service, runningTasks) } return nil }) if err != nil { log.G(ctx).WithError(err).Errorf("reconcile batch failed") } }
func getRunnableServiceTasks(t *testing.T, s *store.MemoryStore, service *api.Service) []*api.Task { var ( err error tasks []*api.Task ) s.View(func(tx store.ReadTx) { tasks, err = store.FindTasks(tx, store.ByServiceID(service.ID)) }) assert.NoError(t, err) runnable := []*api.Task{} for _, task := range tasks { if task.DesiredState == api.TaskStateRunning { runnable = append(runnable, task) } } return runnable }
func (s *subscription) match() { s.mu.Lock() defer s.mu.Unlock() add := func(t *api.Task) { if t.NodeID == "" { s.pendingTasks[t.ID] = struct{}{} return } if _, ok := s.nodes[t.NodeID]; !ok { s.nodes[t.NodeID] = struct{}{} s.wg.Add(1) } } s.store.View(func(tx store.ReadTx) { for _, nid := range s.message.Selector.NodeIDs { s.nodes[nid] = struct{}{} } for _, tid := range s.message.Selector.TaskIDs { if task := store.GetTask(tx, tid); task != nil { add(task) } } for _, sid := range s.message.Selector.ServiceIDs { tasks, err := store.FindTasks(tx, store.ByServiceID(sid)) if err != nil { log.L.Warning(err) continue } for _, task := range tasks { add(task) } } }) }
// getRunnableSlots returns a map of slots that have at least one task with // a desired state above NEW and lesser or equal to RUNNING. func getRunnableSlots(s *store.MemoryStore, serviceID string) (map[uint64]slot, error) { var ( tasks []*api.Task err error ) s.View(func(tx store.ReadTx) { tasks, err = store.FindTasks(tx, store.ByServiceID(serviceID)) }) if err != nil { return nil, err } runningSlots := make(map[uint64]slot) for _, t := range tasks { // Technically the check below could just be // t.DesiredState <= api.TaskStateRunning, but ignoring tasks // with DesiredState == NEW simplifies the drainer unit tests. if t.DesiredState > api.TaskStateNew && t.DesiredState <= api.TaskStateRunning { runningSlots[t.Slot] = append(runningSlots[t.Slot], t) } } return runningSlots, nil }
func (g *Orchestrator) reconcileServices(ctx context.Context, serviceIDs []string) { nodeCompleted := make(map[string]map[string]struct{}) nodeTasks := make(map[string]map[string][]*api.Task) g.store.View(func(tx store.ReadTx) { for _, serviceID := range serviceIDs { tasks, err := store.FindTasks(tx, store.ByServiceID(serviceID)) if err != nil { log.G(ctx).WithError(err).Errorf("global orchestrator: reconcileServices failed finding tasks for service %s", serviceID) continue } // a node may have completed this service nodeCompleted[serviceID] = make(map[string]struct{}) // nodeID -> task list nodeTasks[serviceID] = make(map[string][]*api.Task) for _, t := range tasks { if isTaskRunning(t) { // Collect all running instances of this service nodeTasks[serviceID][t.NodeID] = append(nodeTasks[serviceID][t.NodeID], t) } else { // for finished tasks, check restartPolicy if isTaskCompleted(t, orchestrator.RestartCondition(t)) { nodeCompleted[serviceID][t.NodeID] = struct{}{} } } } } }) _, err := g.store.Batch(func(batch *store.Batch) error { var updateTasks []orchestrator.Slot for _, serviceID := range serviceIDs { if _, exists := nodeTasks[serviceID]; !exists { continue } service := g.globalServices[serviceID] for nodeID, node := range g.nodes { meetsConstraints := constraint.NodeMatches(service.constraints, node) ntasks := nodeTasks[serviceID][nodeID] delete(nodeTasks[serviceID], nodeID) // if restart policy considers this node has finished its task // it should remove all running tasks if _, exists := nodeCompleted[serviceID][nodeID]; exists || !meetsConstraints { g.removeTasks(ctx, batch, ntasks) continue } if node.Spec.Availability == api.NodeAvailabilityPause { // the node is paused, so we won't add or update // any tasks continue } // this node needs to run 1 copy of the task if len(ntasks) == 0 { g.addTask(ctx, batch, service.Service, nodeID) } else { updateTasks = append(updateTasks, ntasks) } } if len(updateTasks) > 0 { g.updater.Update(ctx, g.cluster, service.Service, updateTasks) } // Remove any tasks assigned to nodes not found in g.nodes. // These must be associated with nodes that are drained, or // nodes that no longer exist. for _, ntasks := range nodeTasks[serviceID] { g.removeTasks(ctx, batch, ntasks) } } return nil }) if err != nil { log.G(ctx).WithError(err).Errorf("global orchestrator: reconcileServices transaction failed") } }
func TestReplicatedOrchestrator(t *testing.T) { ctx := context.Background() s := store.NewMemoryStore(nil) assert.NotNil(t, s) defer s.Close() orchestrator := NewReplicatedOrchestrator(s) defer orchestrator.Stop() watch, cancel := state.Watch(s.WatchQueue() /*state.EventCreateTask{}, state.EventUpdateTask{}*/) defer cancel() // Create a service with two instances specified before the orchestrator is // started. This should result in two tasks when the orchestrator // starts up. err := s.Update(func(tx store.Tx) error { s1 := &api.Service{ ID: "id1", Spec: api.ServiceSpec{ Annotations: api.Annotations{ Name: "name1", }, Task: api.TaskSpec{ Runtime: &api.TaskSpec_Container{ Container: &api.ContainerSpec{}, }, }, Mode: &api.ServiceSpec_Replicated{ Replicated: &api.ReplicatedService{ Replicas: 2, }, }, }, } assert.NoError(t, store.CreateService(tx, s1)) return nil }) assert.NoError(t, err) // Start the orchestrator. go func() { assert.NoError(t, orchestrator.Run(ctx)) }() observedTask1 := watchTaskCreate(t, watch) assert.Equal(t, observedTask1.Status.State, api.TaskStateNew) assert.Equal(t, observedTask1.ServiceAnnotations.Name, "name1") observedTask2 := watchTaskCreate(t, watch) assert.Equal(t, observedTask2.Status.State, api.TaskStateNew) assert.Equal(t, observedTask2.ServiceAnnotations.Name, "name1") // Create a second service. err = s.Update(func(tx store.Tx) error { s2 := &api.Service{ ID: "id2", Spec: api.ServiceSpec{ Annotations: api.Annotations{ Name: "name2", }, Task: api.TaskSpec{ Runtime: &api.TaskSpec_Container{ Container: &api.ContainerSpec{}, }, }, Mode: &api.ServiceSpec_Replicated{ Replicated: &api.ReplicatedService{ Replicas: 1, }, }, }, } assert.NoError(t, store.CreateService(tx, s2)) return nil }) assert.NoError(t, err) observedTask3 := watchTaskCreate(t, watch) assert.Equal(t, observedTask3.Status.State, api.TaskStateNew) assert.Equal(t, observedTask3.ServiceAnnotations.Name, "name2") // Update a service to scale it out to 3 instances err = s.Update(func(tx store.Tx) error { s2 := &api.Service{ ID: "id2", Spec: api.ServiceSpec{ Annotations: api.Annotations{ Name: "name2", }, Task: api.TaskSpec{ Runtime: &api.TaskSpec_Container{ Container: &api.ContainerSpec{}, }, }, Mode: &api.ServiceSpec_Replicated{ Replicated: &api.ReplicatedService{ Replicas: 3, }, }, }, } assert.NoError(t, store.UpdateService(tx, s2)) return nil }) assert.NoError(t, err) observedTask4 := watchTaskCreate(t, watch) assert.Equal(t, observedTask4.Status.State, api.TaskStateNew) assert.Equal(t, observedTask4.ServiceAnnotations.Name, "name2") observedTask5 := watchTaskCreate(t, watch) assert.Equal(t, observedTask5.Status.State, api.TaskStateNew) assert.Equal(t, observedTask5.ServiceAnnotations.Name, "name2") // Now scale it back down to 1 instance err = s.Update(func(tx store.Tx) error { s2 := &api.Service{ ID: "id2", Spec: api.ServiceSpec{ Annotations: api.Annotations{ Name: "name2", }, Task: api.TaskSpec{ Runtime: &api.TaskSpec_Container{ Container: &api.ContainerSpec{}, }, }, Mode: &api.ServiceSpec_Replicated{ Replicated: &api.ReplicatedService{ Replicas: 1, }, }, }, } assert.NoError(t, store.UpdateService(tx, s2)) return nil }) assert.NoError(t, err) observedDeletion1 := watchShutdownTask(t, watch) assert.Equal(t, observedDeletion1.Status.State, api.TaskStateNew) assert.Equal(t, observedDeletion1.ServiceAnnotations.Name, "name2") observedDeletion2 := watchShutdownTask(t, watch) assert.Equal(t, observedDeletion2.Status.State, api.TaskStateNew) assert.Equal(t, observedDeletion2.ServiceAnnotations.Name, "name2") // There should be one remaining task attached to service id2/name2. var liveTasks []*api.Task s.View(func(readTx store.ReadTx) { var tasks []*api.Task tasks, err = store.FindTasks(readTx, store.ByServiceID("id2")) for _, t := range tasks { if t.DesiredState == api.TaskStateRunning { liveTasks = append(liveTasks, t) } } }) assert.NoError(t, err) assert.Len(t, liveTasks, 1) // Delete the remaining task directly. It should be recreated by the // orchestrator. err = s.Update(func(tx store.Tx) error { assert.NoError(t, store.DeleteTask(tx, liveTasks[0].ID)) return nil }) assert.NoError(t, err) observedTask6 := watchTaskCreate(t, watch) assert.Equal(t, observedTask6.Status.State, api.TaskStateNew) assert.Equal(t, observedTask6.ServiceAnnotations.Name, "name2") // Delete the service. Its remaining task should go away. err = s.Update(func(tx store.Tx) error { assert.NoError(t, store.DeleteService(tx, "id2")) return nil }) assert.NoError(t, err) deletedTask := watchTaskDelete(t, watch) assert.Equal(t, deletedTask.Status.State, api.TaskStateNew) assert.Equal(t, deletedTask.ServiceAnnotations.Name, "name2") }
func (r *ReplicatedOrchestrator) reconcile(ctx context.Context, service *api.Service) { var ( tasks []*api.Task err error ) r.store.View(func(tx store.ReadTx) { tasks, err = store.FindTasks(tx, store.ByServiceID(service.ID)) }) if err != nil { log.G(ctx).WithError(err).Errorf("reconcile failed finding tasks") return } runningTasks := make([]*api.Task, 0, len(tasks)) runningInstances := make(map[uint64]struct{}) // this could be a bitfield... for _, t := range tasks { // Technically the check below could just be // t.DesiredState <= api.TaskStateRunning, but ignoring tasks // with DesiredState == NEW simplifies the drainer unit tests. if t.DesiredState > api.TaskStateNew && t.DesiredState <= api.TaskStateRunning { runningTasks = append(runningTasks, t) runningInstances[t.Slot] = struct{}{} } } numTasks := len(runningTasks) deploy := service.Spec.GetMode().(*api.ServiceSpec_Replicated) specifiedInstances := int(deploy.Replicated.Replicas) switch { case specifiedInstances > numTasks: log.G(ctx).Debugf("Service %s was scaled up from %d to %d instances", service.ID, numTasks, specifiedInstances) // Update all current tasks then add missing tasks r.updater.Update(ctx, r.cluster, service, runningTasks) _, err = r.store.Batch(func(batch *store.Batch) error { r.addTasks(ctx, batch, service, runningInstances, specifiedInstances-numTasks) return nil }) if err != nil { log.G(ctx).WithError(err).Errorf("reconcile batch failed") } case specifiedInstances < numTasks: // Update up to N tasks then remove the extra log.G(ctx).Debugf("Service %s was scaled down from %d to %d instances", service.ID, numTasks, specifiedInstances) // Preferentially remove tasks on the nodes that have the most // copies of this service, to leave a more balanced result. // First sort tasks such that tasks which are currently running // (in terms of observed state) appear before non-running tasks. // This will cause us to prefer to remove non-running tasks, all // other things being equal in terms of node balance. sort.Sort(tasksByRunningState(runningTasks)) // Assign each task an index that counts it as the nth copy of // of the service on its node (1, 2, 3, ...), and sort the // tasks by this counter value. instancesByNode := make(map[string]int) tasksWithIndices := make(tasksByIndex, 0, numTasks) for _, t := range runningTasks { if t.NodeID != "" { instancesByNode[t.NodeID]++ tasksWithIndices = append(tasksWithIndices, taskWithIndex{task: t, index: instancesByNode[t.NodeID]}) } else { tasksWithIndices = append(tasksWithIndices, taskWithIndex{task: t, index: -1}) } } sort.Sort(tasksWithIndices) sortedTasks := make([]*api.Task, 0, numTasks) for _, t := range tasksWithIndices { sortedTasks = append(sortedTasks, t.task) } r.updater.Update(ctx, r.cluster, service, sortedTasks[:specifiedInstances]) _, err = r.store.Batch(func(batch *store.Batch) error { r.removeTasks(ctx, batch, service, sortedTasks[specifiedInstances:]) return nil }) if err != nil { log.G(ctx).WithError(err).Errorf("reconcile batch failed") } case specifiedInstances == numTasks: // Simple update, no scaling - update all tasks. r.updater.Update(ctx, r.cluster, service, runningTasks) } }