func (r *Orchestrator) handleTaskChange(ctx context.Context, t *api.Task) { // If we already set the desired state past TaskStateRunning, there is no // further action necessary. if t.DesiredState > api.TaskStateRunning { return } var ( n *api.Node service *api.Service ) r.store.View(func(tx store.ReadTx) { if t.NodeID != "" { n = store.GetNode(tx, t.NodeID) } if t.ServiceID != "" { service = store.GetService(tx, t.ServiceID) } }) if !orchestrator.IsReplicatedService(service) { return } if t.Status.State > api.TaskStateRunning || (t.NodeID != "" && invalidNode(n)) { r.restartTasks[t.ID] = struct{}{} } }
func (r *Orchestrator) handleServiceEvent(ctx context.Context, event events.Event) { switch v := event.(type) { case state.EventDeleteService: if !orchestrator.IsReplicatedService(v.Service) { return } orchestrator.DeleteServiceTasks(ctx, r.store, v.Service) r.restarts.ClearServiceHistory(v.Service.ID) case state.EventCreateService: if !orchestrator.IsReplicatedService(v.Service) { return } r.reconcileServices[v.Service.ID] = v.Service case state.EventUpdateService: if !orchestrator.IsReplicatedService(v.Service) { return } r.reconcileServices[v.Service.ID] = v.Service } }
func (r *Orchestrator) initServices(readTx store.ReadTx) error { services, err := store.FindServices(readTx, store.All) if err != nil { return err } for _, s := range services { if orchestrator.IsReplicatedService(s) { r.reconcileServices[s.ID] = s } } return nil }
func (r *Orchestrator) tickTasks(ctx context.Context) { if len(r.restartTasks) > 0 { _, err := r.store.Batch(func(batch *store.Batch) error { for taskID := range r.restartTasks { err := batch.Update(func(tx store.Tx) error { // TODO(aaronl): optimistic update? t := store.GetTask(tx, taskID) if t != nil { if t.DesiredState > api.TaskStateRunning { return nil } service := store.GetService(tx, t.ServiceID) if !orchestrator.IsReplicatedService(service) { return nil } // Restart task if applicable if err := r.restarts.Restart(ctx, tx, r.cluster, service, *t); err != nil { return err } } return nil }) if err != nil { log.G(ctx).WithError(err).Errorf("Orchestrator task reaping transaction failed") } } return nil }) if err != nil { log.G(ctx).WithError(err).Errorf("orchestrator task removal batch failed") } r.restartTasks = make(map[string]struct{}) } }
func (r *Orchestrator) restartTasksByNodeID(ctx context.Context, nodeID string) { var err error r.store.View(func(tx store.ReadTx) { var tasks []*api.Task tasks, err = store.FindTasks(tx, store.ByNodeID(nodeID)) if err != nil { return } for _, t := range tasks { if t.DesiredState > api.TaskStateRunning { continue } service := store.GetService(tx, t.ServiceID) if orchestrator.IsReplicatedService(service) { r.restartTasks[t.ID] = struct{}{} } } }) if err != nil { log.G(ctx).WithError(err).Errorf("failed to list tasks to remove") } }
func (r *Orchestrator) handleTaskEvent(ctx context.Context, event events.Event) { switch v := event.(type) { case state.EventDeleteNode: r.restartTasksByNodeID(ctx, v.Node.ID) case state.EventCreateNode: r.handleNodeChange(ctx, v.Node) case state.EventUpdateNode: r.handleNodeChange(ctx, v.Node) case state.EventDeleteTask: if v.Task.DesiredState <= api.TaskStateRunning { service := r.resolveService(ctx, v.Task) if !orchestrator.IsReplicatedService(service) { return } r.reconcileServices[service.ID] = service } r.restarts.Cancel(v.Task.ID) case state.EventUpdateTask: r.handleTaskChange(ctx, v.Task) case state.EventCreateTask: r.handleTaskChange(ctx, v.Task) } }
// initTasks bootstraps task state at orchestrator startup. It makes two
// passes over all tasks in the store:
//
//  1. Tasks assigned to an invalid node that have not yet progressed past
//     RUNNING are queued for restart (handled later by tickTasks).
//  2. In a batch transaction: tasks whose service was deleted are removed;
//     tasks parked in the READY desired state (mid restart-delay when the
//     previous leader/process stopped) have their delay re-armed or are
//     started immediately if the delay already elapsed.
func (r *Orchestrator) initTasks(ctx context.Context, readTx store.ReadTx) error {
	tasks, err := store.FindTasks(readTx, store.All)
	if err != nil {
		return err
	}
	for _, t := range tasks {
		if t.NodeID != "" {
			n := store.GetNode(readTx, t.NodeID)
			// Only queue a restart if the task is still supposed to be
			// (and actually could be) running.
			if invalidNode(n) && t.Status.State <= api.TaskStateRunning && t.DesiredState <= api.TaskStateRunning {
				r.restartTasks[t.ID] = struct{}{}
			}
		}
	}
	_, err = r.store.Batch(func(batch *store.Batch) error {
		for _, t := range tasks {
			if t.ServiceID == "" {
				continue
			}
			// TODO(aluzzardi): We should NOT retrieve the service here.
			service := store.GetService(readTx, t.ServiceID)
			if service == nil {
				// Service was deleted
				err := batch.Update(func(tx store.Tx) error {
					return store.DeleteTask(tx, t.ID)
				})
				if err != nil {
					log.G(ctx).WithError(err).Error("failed to set task desired state to dead")
				}
				continue
			}
			// TODO(aluzzardi): This is shady. We should have a more generic condition.
			if t.DesiredState != api.TaskStateReady || !orchestrator.IsReplicatedService(service) {
				continue
			}
			// Determine the configured restart delay, falling back to the
			// default on a missing or malformed spec value.
			restartDelay := orchestrator.DefaultRestartDelay
			if t.Spec.Restart != nil && t.Spec.Restart.Delay != nil {
				var err error
				restartDelay, err = gogotypes.DurationFromProto(t.Spec.Restart.Delay)
				if err != nil {
					log.G(ctx).WithError(err).Error("invalid restart delay")
					restartDelay = orchestrator.DefaultRestartDelay
				}
			}
			if restartDelay != 0 {
				timestamp, err := gogotypes.TimestampFromProto(t.Status.Timestamp)
				if err == nil {
					// Shrink the delay by however long has already elapsed
					// since the task's last status timestamp, so a restart
					// pending across a leader change isn't delayed twice.
					restartTime := timestamp.Add(restartDelay)
					calculatedRestartDelay := restartTime.Sub(time.Now())
					if calculatedRestartDelay < restartDelay {
						restartDelay = calculatedRestartDelay
					}
					if restartDelay > 0 {
						// Batch error deliberately ignored: a failed delay
						// re-arm is best-effort at init time.
						_ = batch.Update(func(tx store.Tx) error {
							// Re-read inside the transaction; the task may
							// have changed since the read-only snapshot.
							t := store.GetTask(tx, t.ID)
							// TODO(aluzzardi): This is shady as well. We should have a more generic condition.
							if t == nil || t.DesiredState != api.TaskStateReady {
								return nil
							}
							r.restarts.DelayStart(ctx, tx, nil, t.ID, restartDelay, true)
							return nil
						})
						continue
					}
				} else {
					log.G(ctx).WithError(err).Error("invalid status timestamp")
				}
			}
			// Start now
			err := batch.Update(func(tx store.Tx) error {
				return r.restarts.StartNow(tx, t.ID)
			})
			if err != nil {
				log.G(ctx).WithError(err).WithField("task.id", t.ID).Error("moving task out of delayed state failed")
			}
		}
		return nil
	})
	return err
}
// Restart initiates a new task to replace t if appropriate under the service's
// restart policy. It marks the old task's desired state as SHUTDOWN, and —
// when the policy allows — creates a replacement task in the READY desired
// state whose actual start is scheduled via DelayStart. Note that t is passed
// by value: mutations here only reach the store through UpdateTask.
func (r *Supervisor) Restart(ctx context.Context, tx store.Tx, cluster *api.Cluster, service *api.Service, t api.Task) error {
	// TODO(aluzzardi): This function should not depend on `service`.

	// Is the old task still in the process of restarting? If so, wait for
	// its restart delay to elapse, to avoid tight restart loops (for
	// example, when the image doesn't exist).
	r.mu.Lock()
	oldDelay, ok := r.delays[t.ID]
	if ok {
		if !oldDelay.waiter {
			// Spawn at most one waiter goroutine per pending delay.
			oldDelay.waiter = true
			go r.waitRestart(ctx, oldDelay, cluster, t.ID)
		}
		r.mu.Unlock()
		return nil
	}
	r.mu.Unlock()

	// Sanity check: was the task shut down already by a separate call to
	// Restart? If so, we must avoid restarting it, because this will create
	// an extra task. This should never happen unless there is a bug.
	if t.DesiredState > api.TaskStateRunning {
		return errors.New("Restart called on task that was already shut down")
	}

	t.DesiredState = api.TaskStateShutdown
	err := store.UpdateTask(tx, &t)
	if err != nil {
		log.G(ctx).WithError(err).Errorf("failed to set task desired state to dead")
		return err
	}

	// Restart policy (condition / max attempts) may veto the replacement.
	if !r.shouldRestart(ctx, &t, service) {
		return nil
	}

	var restartTask *api.Task

	// Replicated tasks keep their slot; global tasks keep their node.
	if orchestrator.IsReplicatedService(service) {
		restartTask = orchestrator.NewTask(cluster, service, t.Slot, "")
	} else if orchestrator.IsGlobalService(service) {
		restartTask = orchestrator.NewTask(cluster, service, 0, t.NodeID)
	} else {
		log.G(ctx).Error("service not supported by restart supervisor")
		return nil
	}

	n := store.GetNode(tx, t.NodeID)

	// READY parks the new task until DelayStart promotes it.
	restartTask.DesiredState = api.TaskStateReady

	var restartDelay time.Duration
	// Restart delay is not applied to drained nodes
	if n == nil || n.Spec.Availability != api.NodeAvailabilityDrain {
		if t.Spec.Restart != nil && t.Spec.Restart.Delay != nil {
			var err error
			restartDelay, err = ptypes.Duration(t.Spec.Restart.Delay)
			if err != nil {
				log.G(ctx).WithError(err).Error("invalid restart delay; using default")
				restartDelay = orchestrator.DefaultRestartDelay
			}
		} else {
			restartDelay = orchestrator.DefaultRestartDelay
		}
	}

	waitStop := true

	// Normally we wait for the old task to stop running, but we skip this
	// if the old task is already dead or the node it's assigned to is down.
	if (n != nil && n.Status.State == api.NodeStatus_DOWN) || t.Status.State > api.TaskStateRunning {
		waitStop = false
	}

	if err := store.CreateTask(tx, restartTask); err != nil {
		log.G(ctx).WithError(err).WithField("task.id", restartTask.ID).Error("task create failed")
		return err
	}

	r.recordRestartHistory(restartTask)

	r.DelayStart(ctx, tx, &t, restartTask.ID, restartDelay, waitStop)
	return nil
}