func (g *Orchestrator) reconcileServices(ctx context.Context, serviceIDs []string) {
	nodeCompleted := make(map[string]map[string]struct{})
	nodeTasks := make(map[string]map[string][]*api.Task)

	g.store.View(func(tx store.ReadTx) {
		for _, serviceID := range serviceIDs {
			tasks, err := store.FindTasks(tx, store.ByServiceID(serviceID))
			if err != nil {
				log.G(ctx).WithError(err).Errorf("global orchestrator: reconcileServices failed finding tasks for service %s", serviceID)
				continue
			}

			// a node may have completed this service
			nodeCompleted[serviceID] = make(map[string]struct{})
			// nodeID -> task list
			nodeTasks[serviceID] = make(map[string][]*api.Task)

			for _, t := range tasks {
				if isTaskRunning(t) {
					// collect all running instances of this service
					nodeTasks[serviceID][t.NodeID] = append(nodeTasks[serviceID][t.NodeID], t)
				} else if isTaskCompleted(t, orchestrator.RestartCondition(t)) {
					// for finished tasks, check the restart policy
					nodeCompleted[serviceID][t.NodeID] = struct{}{}
				}
			}
		}
	})

	_, err := g.store.Batch(func(batch *store.Batch) error {
		for _, serviceID := range serviceIDs {
			if _, exists := nodeTasks[serviceID]; !exists {
				continue
			}

			service := g.globalServices[serviceID]

			// updateTasks is scoped to one service so that slots from one
			// service are never passed to the updater for another
			var updateTasks []orchestrator.Slot

			for nodeID, node := range g.nodes {
				meetsConstraints := constraint.NodeMatches(service.constraints, node)
				ntasks := nodeTasks[serviceID][nodeID]
				delete(nodeTasks[serviceID], nodeID)

				// if the restart policy considers this node to have finished
				// its task, all of its running tasks should be removed
				if _, exists := nodeCompleted[serviceID][nodeID]; exists || !meetsConstraints {
					g.removeTasks(ctx, batch, ntasks)
					continue
				}

				if node.Spec.Availability == api.NodeAvailabilityPause {
					// the node is paused, so we won't add or update
					// any tasks
					continue
				}

				// this node needs to run exactly one copy of the task
				if len(ntasks) == 0 {
					g.addTask(ctx, batch, service.Service, nodeID)
				} else {
					updateTasks = append(updateTasks, ntasks)
				}
			}

			if len(updateTasks) > 0 {
				g.updater.Update(ctx, g.cluster, service.Service, updateTasks)
			}

			// Remove any tasks assigned to nodes not found in g.nodes.
			// These must be associated with nodes that are drained, or
			// nodes that no longer exist.
			for _, ntasks := range nodeTasks[serviceID] {
				g.removeTasks(ctx, batch, ntasks)
			}
		}
		return nil
	})
	if err != nil {
		log.G(ctx).WithError(err).Errorf("global orchestrator: reconcileServices transaction failed")
	}
}
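// The per-node decision order in reconcileServices can be hard to see through
// the store/batch plumbing. The sketch below is an illustrative, stand-alone
// restatement of just that decision order, not part of the swarmkit API;
// nodeAction, desiredAction, and their parameters are hypothetical names.
//
//	package main
//
//	import "fmt"
//
//	type nodeAction int
//
//	const (
//		actionNone nodeAction = iota
//		actionAddTask
//		actionUpdateTasks
//		actionRemoveTasks
//	)
//
//	// desiredAction mirrors the branch order above: remove when the node has
//	// completed the task or fails constraints, do nothing when paused, add a
//	// task when none is running, otherwise hand running tasks to the updater.
//	func desiredAction(completed, meetsConstraints, paused bool, runningTasks int) nodeAction {
//		switch {
//		case completed || !meetsConstraints:
//			return actionRemoveTasks
//		case paused:
//			return actionNone
//		case runningTasks == 0:
//			return actionAddTask
//		default:
//			return actionUpdateTasks
//		}
//	}
//
//	func main() {
//		// a node that no longer matches placement constraints loses its task
//		fmt.Println(desiredAction(false, false, false, 1) == actionRemoveTasks) // true
//	}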
// reconcileServicesOneNode checks the specified services on one node
func (g *Orchestrator) reconcileServicesOneNode(ctx context.Context, serviceIDs []string, nodeID string) {
	node, exists := g.nodes[nodeID]
	if !exists {
		return
	}

	// whether each service has completed on the node
	completed := make(map[string]bool)
	// tasks by service
	tasks := make(map[string][]*api.Task)

	var (
		tasksOnNode []*api.Task
		err         error
	)

	g.store.View(func(tx store.ReadTx) {
		tasksOnNode, err = store.FindTasks(tx, store.ByNodeID(nodeID))
	})
	if err != nil {
		log.G(ctx).WithError(err).Errorf("global orchestrator: reconcile failed finding tasks on node %s", nodeID)
		return
	}

	for _, serviceID := range serviceIDs {
		for _, t := range tasksOnNode {
			if t.ServiceID != serviceID {
				continue
			}
			if isTaskRunning(t) {
				tasks[serviceID] = append(tasks[serviceID], t)
			} else if isTaskCompleted(t, orchestrator.RestartCondition(t)) {
				completed[serviceID] = true
			}
		}
	}

	_, err = g.store.Batch(func(batch *store.Batch) error {
		for _, serviceID := range serviceIDs {
			service, exists := g.globalServices[serviceID]
			if !exists {
				continue
			}

			if !constraint.NodeMatches(service.constraints, node) {
				continue
			}

			// if the restart policy considers this node to have finished
			// its task, all of its running tasks should be removed
			if completed[serviceID] {
				g.removeTasks(ctx, batch, tasks[serviceID])
				continue
			}

			if node.Spec.Availability == api.NodeAvailabilityPause {
				// the node is paused, so we won't add or update tasks
				continue
			}

			if len(tasks[serviceID]) == 0 {
				g.addTask(ctx, batch, service.Service, nodeID)
			} else {
				// If a task is out of date, update it. This can happen
				// on node reconciliation if, for example, we pause a
				// node, update the service, and then activate the node
				// later.

				// We don't use g.updater here for two reasons:
				// - This is not a rolling update. Since it was not
				//   triggered directly by updating the service, it
				//   should not observe the rolling update parameters
				//   or show status in UpdateStatus.
				// - Calling Update cancels any current rolling updates
				//   for the service, such as one triggered by service
				//   reconciliation.

				var (
					dirtyTasks []*api.Task
					cleanTasks []*api.Task
				)

				for _, t := range tasks[serviceID] {
					if orchestrator.IsTaskDirty(service.Service, t) {
						dirtyTasks = append(dirtyTasks, t)
					} else {
						cleanTasks = append(cleanTasks, t)
					}
				}

				if len(cleanTasks) == 0 {
					g.addTask(ctx, batch, service.Service, nodeID)
				} else {
					// keep one up-to-date task and remove any duplicates
					dirtyTasks = append(dirtyTasks, cleanTasks[1:]...)
				}
				g.removeTasks(ctx, batch, dirtyTasks)
			}
		}
		return nil
	})
	if err != nil {
		log.G(ctx).WithError(err).Errorf("global orchestrator: reconcileServicesOneNode batch failed")
	}
}
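// The node-reconciliation branch above keeps exactly one up-to-date task per
// service on the node and deletes everything else. The sketch below is an
// illustrative, stand-alone restatement of only that selection step, assuming
// "dirty" means the task no longer matches the service spec; Task, isDirty
// fields, and pickTasksToRemove are hypothetical names, not swarmkit APIs.
//
//	package main
//
//	import "fmt"
//
//	type Task struct {
//		ID    string
//		Dirty bool // true if the task's spec is out of date relative to the service
//	}
//
//	// pickTasksToRemove returns (needNewTask, tasksToRemove): if no clean task
//	// exists, a new one must be created and all existing tasks are removed;
//	// otherwise one clean task is kept and every other task is removed.
//	func pickTasksToRemove(tasks []Task) (bool, []Task) {
//		var dirty, clean []Task
//		for _, t := range tasks {
//			if t.Dirty {
//				dirty = append(dirty, t)
//			} else {
//				clean = append(clean, t)
//			}
//		}
//		if len(clean) == 0 {
//			return true, dirty
//		}
//		// keep clean[0], remove duplicate clean tasks along with the dirty ones
//		return false, append(dirty, clean[1:]...)
//	}
//
//	func main() {
//		needNew, remove := pickTasksToRemove([]Task{
//			{ID: "t1", Dirty: true},
//			{ID: "t2", Dirty: false},
//			{ID: "t3", Dirty: false},
//		})
//		fmt.Println(needNew, len(remove)) // false 2 -> keep t2, remove t1 and t3
//	}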
func (r *Supervisor) shouldRestart(ctx context.Context, t *api.Task, service *api.Service) bool {
	// TODO(aluzzardi): This function should not depend on `service`.
	condition := orchestrator.RestartCondition(t)

	if condition != api.RestartOnAny &&
		(condition != api.RestartOnFailure || t.Status.State == api.TaskStateCompleted) {
		return false
	}

	if t.Spec.Restart == nil || t.Spec.Restart.MaxAttempts == 0 {
		return true
	}

	instanceTuple := instanceTuple{
		instance:  t.Slot,
		serviceID: t.ServiceID,
	}

	// Instance is not meaningful for "global" tasks, so they need to be
	// indexed by NodeID.
	if orchestrator.IsGlobalService(service) {
		instanceTuple.nodeID = t.NodeID
	}

	r.mu.Lock()
	defer r.mu.Unlock()

	restartInfo := r.history[instanceTuple]
	if restartInfo == nil {
		return true
	}

	if t.Spec.Restart.Window == nil || (t.Spec.Restart.Window.Seconds == 0 && t.Spec.Restart.Window.Nanos == 0) {
		return restartInfo.totalRestarts < t.Spec.Restart.MaxAttempts
	}

	if restartInfo.restartedInstances == nil {
		return true
	}

	window, err := ptypes.Duration(t.Spec.Restart.Window)
	if err != nil {
		log.G(ctx).WithError(err).Error("invalid restart lookback window")
		return restartInfo.totalRestarts < t.Spec.Restart.MaxAttempts
	}
	lookback := time.Now().Add(-window)

	var next *list.Element
	for e := restartInfo.restartedInstances.Front(); e != nil; e = next {
		next = e.Next()

		if e.Value.(restartedInstance).timestamp.After(lookback) {
			break
		}

		restartInfo.restartedInstances.Remove(e)
	}

	numRestarts := uint64(restartInfo.restartedInstances.Len())
	if numRestarts == 0 {
		restartInfo.restartedInstances = nil
	}

	return numRestarts < t.Spec.Restart.MaxAttempts
}
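// shouldRestart enforces MaxAttempts over an optional sliding window by
// pruning restart timestamps older than now-window before counting them. The
// sketch below is an illustrative, stand-alone version of only that
// windowed-counting step; withinRestartBudget and its parameters are
// hypothetical names, not part of the swarmkit restart supervisor.
//
//	package main
//
//	import (
//		"fmt"
//		"time"
//	)
//
//	// withinRestartBudget drops restart timestamps that fall outside the
//	// lookback window and reports whether another restart is still allowed.
//	// It returns the pruned history so the caller can keep it, mirroring how
//	// shouldRestart removes expired entries from restartedInstances in place.
//	func withinRestartBudget(history []time.Time, window time.Duration, maxAttempts uint64, now time.Time) ([]time.Time, bool) {
//		lookback := now.Add(-window)
//		kept := history[:0]
//		for _, ts := range history {
//			if ts.After(lookback) {
//				kept = append(kept, ts)
//			}
//		}
//		return kept, uint64(len(kept)) < maxAttempts
//	}
//
//	func main() {
//		now := time.Now()
//		history := []time.Time{
//			now.Add(-10 * time.Minute), // outside a 5-minute window, pruned
//			now.Add(-2 * time.Minute),
//			now.Add(-1 * time.Minute),
//		}
//		_, ok := withinRestartBudget(history, 5*time.Minute, 3, now)
//		fmt.Println(ok) // true: only 2 restarts fall inside the window
//	}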