func restartPolicyFromGRPC(p *swarmapi.RestartPolicy) *types.RestartPolicy {
	var rp *types.RestartPolicy
	if p != nil {
		rp = &types.RestartPolicy{}

		switch p.Condition {
		case swarmapi.RestartOnNone:
			rp.Condition = types.RestartPolicyConditionNone
		case swarmapi.RestartOnFailure:
			rp.Condition = types.RestartPolicyConditionOnFailure
		case swarmapi.RestartOnAny:
			rp.Condition = types.RestartPolicyConditionAny
		default:
			rp.Condition = types.RestartPolicyConditionAny
		}

		if p.Delay != nil {
			delay, _ := ptypes.Duration(p.Delay)
			rp.Delay = &delay
		}
		if p.Window != nil {
			window, _ := ptypes.Duration(p.Window)
			rp.Window = &window
		}

		rp.MaxAttempts = &p.MaxAttempts
	}
	return rp
}

func validateRestartPolicy(rp *api.RestartPolicy) error {
	if rp == nil {
		return nil
	}

	if rp.Delay != nil {
		delay, err := ptypes.Duration(rp.Delay)
		if err != nil {
			return err
		}
		if delay < 0 {
			return grpc.Errorf(codes.InvalidArgument, "TaskSpec: restart-delay cannot be negative")
		}
	}

	if rp.Window != nil {
		win, err := ptypes.Duration(rp.Window)
		if err != nil {
			return err
		}
		if win < 0 {
			return grpc.Errorf(codes.InvalidArgument, "TaskSpec: restart-window cannot be negative")
		}
	}

	return nil
}

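// Hedged usage sketch (not part of the original sources): exercising
// validateRestartPolicy with a negative delay. It assumes the vendored ptypes
// package also provides DurationProto as the inverse of ptypes.Duration, as
// the standard protobuf ptypes helpers do; if it does not, construct the
// proto Duration literal directly instead. Relies on the same imports as the
// surrounding code (api, ptypes, time).
func exampleValidateNegativeRestartDelay() error {
	rp := &api.RestartPolicy{
		Delay: ptypes.DurationProto(-5 * time.Second), // assumed helper
	}
	// Expected to fail with an InvalidArgument error:
	// "TaskSpec: restart-delay cannot be negative".
	return validateRestartPolicy(rp)
}
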
func printClusterSummary(cluster *api.Cluster) {
	w := tabwriter.NewWriter(os.Stdout, 8, 8, 8, ' ', 0)
	defer w.Flush()

	common.FprintfIfNotEmpty(w, "ID\t: %s\n", cluster.ID)
	common.FprintfIfNotEmpty(w, "Name\t: %s\n", cluster.Spec.Annotations.Name)
	fmt.Fprintf(w, "Orchestration settings:\n")
	fmt.Fprintf(w, " Task history entries: %d\n", cluster.Spec.Orchestration.TaskHistoryRetentionLimit)

	heartbeatPeriod, err := ptypes.Duration(cluster.Spec.Dispatcher.HeartbeatPeriod)
	if err == nil {
		fmt.Fprintf(w, "Dispatcher settings:\n")
		fmt.Fprintf(w, " Dispatcher heartbeat period: %s\n", heartbeatPeriod.String())
	}

	fmt.Fprintf(w, "Certificate Authority settings:\n")
	if cluster.Spec.CAConfig.NodeCertExpiry != nil {
		clusterDuration, err := ptypes.Duration(cluster.Spec.CAConfig.NodeCertExpiry)
		if err != nil {
			fmt.Fprintf(w, " Certificate Validity Duration: [ERROR PARSING DURATION]\n")
		} else {
			fmt.Fprintf(w, " Certificate Validity Duration: %s\n", clusterDuration.String())
		}
	}
	if len(cluster.Spec.CAConfig.ExternalCAs) > 0 {
		fmt.Fprintf(w, " External CAs:\n")
		for _, ca := range cluster.Spec.CAConfig.ExternalCAs {
			fmt.Fprintf(w, " %s: %s\n", ca.Protocol, ca.URL)
		}
	}

	fmt.Fprintln(w, " Join Tokens:")
	fmt.Fprintln(w, " Worker:", cluster.RootCA.JoinTokens.Worker)
	fmt.Fprintln(w, " Manager:", cluster.RootCA.JoinTokens.Manager)

	if cluster.Spec.TaskDefaults.LogDriver != nil {
		fmt.Fprintf(w, "Default Log Driver\t: %s\n", cluster.Spec.TaskDefaults.LogDriver.Name)
		var keys []string

		if len(cluster.Spec.TaskDefaults.LogDriver.Options) != 0 {
			for k := range cluster.Spec.TaskDefaults.LogDriver.Options {
				keys = append(keys, k)
			}
			sort.Strings(keys)

			for _, k := range keys {
				v := cluster.Spec.TaskDefaults.LogDriver.Options[k]
				if v != "" {
					fmt.Fprintf(w, " %s\t: %s\n", k, v)
				} else {
					fmt.Fprintf(w, " %s\t\n", k)
				}
			}
		}
	}
}

func healthConfigFromGRPC(h *swarmapi.HealthConfig) *container.HealthConfig {
	interval, _ := ptypes.Duration(h.Interval)
	timeout, _ := ptypes.Duration(h.Timeout)
	return &container.HealthConfig{
		Test:     h.Test,
		Interval: interval,
		Timeout:  timeout,
		Retries:  int(h.Retries),
	}
}

// SwarmFromGRPC converts a grpc Cluster to a Swarm.
func SwarmFromGRPC(c swarmapi.Cluster) types.Swarm {
	swarm := types.Swarm{
		ID: c.ID,
		Spec: types.Spec{
			Orchestration: types.OrchestrationConfig{
				TaskHistoryRetentionLimit: c.Spec.Orchestration.TaskHistoryRetentionLimit,
			},
			Raft: types.RaftConfig{
				SnapshotInterval:           c.Spec.Raft.SnapshotInterval,
				KeepOldSnapshots:           c.Spec.Raft.KeepOldSnapshots,
				LogEntriesForSlowFollowers: c.Spec.Raft.LogEntriesForSlowFollowers,
				HeartbeatTick:              c.Spec.Raft.HeartbeatTick,
				ElectionTick:               c.Spec.Raft.ElectionTick,
			},
		},
	}

	heartbeatPeriod, _ := ptypes.Duration(c.Spec.Dispatcher.HeartbeatPeriod)
	swarm.Spec.Dispatcher.HeartbeatPeriod = uint64(heartbeatPeriod)

	swarm.Spec.CAConfig.NodeCertExpiry, _ = ptypes.Duration(c.Spec.CAConfig.NodeCertExpiry)

	for _, ca := range c.Spec.CAConfig.ExternalCAs {
		swarm.Spec.CAConfig.ExternalCAs = append(swarm.Spec.CAConfig.ExternalCAs, &types.ExternalCA{
			Protocol: types.ExternalCAProtocol(strings.ToLower(ca.Protocol.String())),
			URL:      ca.URL,
			Options:  ca.Options,
		})
	}

	// Meta
	swarm.Version.Index = c.Meta.Version.Index
	swarm.CreatedAt, _ = ptypes.Timestamp(c.Meta.CreatedAt)
	swarm.UpdatedAt, _ = ptypes.Timestamp(c.Meta.UpdatedAt)

	// Annotations
	swarm.Spec.Name = c.Spec.Annotations.Name
	swarm.Spec.Labels = c.Spec.Annotations.Labels

	for _, policy := range c.Spec.AcceptancePolicy.Policies {
		p := types.Policy{
			Role:       types.NodeRole(strings.ToLower(policy.Role.String())),
			Autoaccept: policy.Autoaccept,
		}
		if policy.Secret != nil {
			secret := string(policy.Secret.Data)
			p.Secret = &secret
		}
		swarm.Spec.AcceptancePolicy.Policies = append(swarm.Spec.AcceptancePolicy.Policies, p)
	}

	return swarm
}

func (c *containerConfig) healthcheck() *enginecontainer.HealthConfig {
	hcSpec := c.spec().Healthcheck
	if hcSpec == nil {
		return nil
	}
	interval, _ := ptypes.Duration(hcSpec.Interval)
	timeout, _ := ptypes.Duration(hcSpec.Timeout)
	return &enginecontainer.HealthConfig{
		Test:     hcSpec.Test,
		Interval: interval,
		Timeout:  timeout,
		Retries:  int(hcSpec.Retries),
	}
}

// SwarmFromGRPC converts a grpc Cluster to a Swarm.
func SwarmFromGRPC(c swarmapi.Cluster) types.Swarm {
	swarm := types.Swarm{
		ClusterInfo: types.ClusterInfo{
			ID: c.ID,
			Spec: types.Spec{
				Orchestration: types.OrchestrationConfig{
					TaskHistoryRetentionLimit: &c.Spec.Orchestration.TaskHistoryRetentionLimit,
				},
				Raft: types.RaftConfig{
					SnapshotInterval:           c.Spec.Raft.SnapshotInterval,
					KeepOldSnapshots:           &c.Spec.Raft.KeepOldSnapshots,
					LogEntriesForSlowFollowers: c.Spec.Raft.LogEntriesForSlowFollowers,
					HeartbeatTick:              int(c.Spec.Raft.HeartbeatTick),
					ElectionTick:               int(c.Spec.Raft.ElectionTick),
				},
				EncryptionConfig: types.EncryptionConfig{
					AutoLockManagers: c.Spec.EncryptionConfig.AutoLockManagers,
				},
			},
		},
		JoinTokens: types.JoinTokens{
			Worker:  c.RootCA.JoinTokens.Worker,
			Manager: c.RootCA.JoinTokens.Manager,
		},
	}

	heartbeatPeriod, _ := ptypes.Duration(c.Spec.Dispatcher.HeartbeatPeriod)
	swarm.Spec.Dispatcher.HeartbeatPeriod = heartbeatPeriod

	swarm.Spec.CAConfig.NodeCertExpiry, _ = ptypes.Duration(c.Spec.CAConfig.NodeCertExpiry)

	for _, ca := range c.Spec.CAConfig.ExternalCAs {
		swarm.Spec.CAConfig.ExternalCAs = append(swarm.Spec.CAConfig.ExternalCAs, &types.ExternalCA{
			Protocol: types.ExternalCAProtocol(strings.ToLower(ca.Protocol.String())),
			URL:      ca.URL,
			Options:  ca.Options,
		})
	}

	// Meta
	swarm.Version.Index = c.Meta.Version.Index
	swarm.CreatedAt, _ = ptypes.Timestamp(c.Meta.CreatedAt)
	swarm.UpdatedAt, _ = ptypes.Timestamp(c.Meta.UpdatedAt)

	// Annotations
	swarm.Spec.Name = c.Spec.Annotations.Name
	swarm.Spec.Labels = c.Spec.Annotations.Labels

	return swarm
}

func validateClusterSpec(spec *api.ClusterSpec) error {
	if spec == nil {
		return grpc.Errorf(codes.InvalidArgument, errInvalidArgument.Error())
	}

	// Validate that the duration being provided is valid, and over our minimum
	if spec.CAConfig.NodeCertExpiry != nil {
		expiry, err := ptypes.Duration(spec.CAConfig.NodeCertExpiry)
		if err != nil {
			return grpc.Errorf(codes.InvalidArgument, errInvalidArgument.Error())
		}
		if expiry < ca.MinNodeCertExpiration {
			return grpc.Errorf(codes.InvalidArgument, "minimum certificate expiry time is: %s", ca.MinNodeCertExpiration)
		}
	}

	// Validate that AcceptancePolicies only include Secrets that are bcrypted
	// TODO(diogo): Add a global list of acceptance algorithms. We only support bcrypt for now.
	if len(spec.AcceptancePolicy.Policies) > 0 {
		for _, policy := range spec.AcceptancePolicy.Policies {
			if policy.Secret != nil && strings.ToLower(policy.Secret.Alg) != "bcrypt" {
				return grpc.Errorf(codes.InvalidArgument, "hashing algorithm is not supported: %s", policy.Secret.Alg)
			}
		}
	}

	return nil
}

func printClusterSummary(cluster *api.Cluster) {
	w := tabwriter.NewWriter(os.Stdout, 8, 8, 8, ' ', 0)
	defer w.Flush()

	common.FprintfIfNotEmpty(w, "ID\t: %s\n", cluster.ID)
	common.FprintfIfNotEmpty(w, "Name\t: %s\n", cluster.Spec.Annotations.Name)
	if len(cluster.Spec.AcceptancePolicy.Policies) > 0 {
		fmt.Fprintf(w, "Acceptance Policies:\n")
		for _, policy := range cluster.Spec.AcceptancePolicy.Policies {
			fmt.Fprintf(w, " Role\t: %v\n", policy.Role)
			fmt.Fprintf(w, " Autoaccept\t: %v\n", policy.Autoaccept)
			if policy.Secret != nil {
				fmt.Fprintf(w, " Secret\t: %v\n", string(policy.Secret.Data))
			}
		}
	}
	fmt.Fprintf(w, "Orchestration settings:\n")
	fmt.Fprintf(w, " Task history entries: %d\n", cluster.Spec.Orchestration.TaskHistoryRetentionLimit)
	fmt.Fprintf(w, "Dispatcher settings:\n")
	fmt.Fprintf(w, " Dispatcher heartbeat period: %d\n", cluster.Spec.Dispatcher.HeartbeatPeriod)
	if cluster.Spec.CAConfig.NodeCertExpiry != nil {
		fmt.Fprintf(w, "Certificate Authority settings:\n")
		clusterDuration, err := ptypes.Duration(cluster.Spec.CAConfig.NodeCertExpiry)
		if err != nil {
			fmt.Fprintf(w, " Certificate Validity Duration: [ERROR PARSING DURATION]\n")
		} else {
			fmt.Fprintf(w, " Certificate Validity Duration: %s\n", clusterDuration.String())
		}
	}
}

// updateCluster is called when there are cluster changes, and it ensures that the local RootCA is
// always aware of changes in clusterExpiry and the Root CA key material
func (s *Server) updateCluster(ctx context.Context, cluster *api.Cluster) {
	s.mu.Lock()
	s.joinTokens = cluster.RootCA.JoinTokens.Copy()
	s.mu.Unlock()
	var err error

	// If the cluster has a RootCA, let's try to update our SecurityConfig to reflect the latest values
	rCA := cluster.RootCA
	if len(rCA.CACert) != 0 && len(rCA.CAKey) != 0 {
		expiry := DefaultNodeCertExpiration
		if cluster.Spec.CAConfig.NodeCertExpiry != nil {
			// NodeCertExpiry exists, let's try to parse the duration out of it
			clusterExpiry, err := ptypes.Duration(cluster.Spec.CAConfig.NodeCertExpiry)
			if err != nil {
				log.G(ctx).WithFields(logrus.Fields{
					"cluster.id": cluster.ID,
					"method":     "(*Server).updateCluster",
				}).WithError(err).Warn("failed to parse certificate expiration, using default")
			} else {
				// We were able to successfully parse the expiration out of the cluster.
				expiry = clusterExpiry
			}
		} else {
			// NodeCertExpiry is nil, fall back to the default expiration
			log.G(ctx).WithFields(logrus.Fields{
				"cluster.id": cluster.ID,
				"method":     "(*Server).updateCluster",
			}).Warn("no certificate expiration specified, using default")
		}
		// Attempt to update our local RootCA with the new parameters
		err = s.securityConfig.UpdateRootCA(rCA.CACert, rCA.CAKey, expiry)
		if err != nil {
			log.G(ctx).WithFields(logrus.Fields{
				"cluster.id": cluster.ID,
				"method":     "(*Server).updateCluster",
			}).WithError(err).Error("updating Root CA failed")
		} else {
			log.G(ctx).WithFields(logrus.Fields{
				"cluster.id": cluster.ID,
				"method":     "(*Server).updateCluster",
			}).Debugf("Root CA updated successfully")
		}
	}

	// Update our security config with the list of External CA URLs
	// from the new cluster state.

	// TODO(aaronl): In the future, this will be abstracted with an
	// ExternalCA interface that has different implementations for
	// different CA types. At the moment, only CFSSL is supported.
	var cfsslURLs []string
	for _, ca := range cluster.Spec.CAConfig.ExternalCAs {
		if ca.Protocol == api.ExternalCA_CAProtocolCFSSL {
			cfsslURLs = append(cfsslURLs, ca.URL)
		}
	}
	s.securityConfig.externalCA.UpdateURLs(cfsslURLs...)
}

func (s *session) heartbeat(ctx context.Context) error {
	log.G(ctx).Debugf("(*session).heartbeat")
	client := api.NewDispatcherClient(s.conn)
	heartbeat := time.NewTimer(1) // send out a heartbeat right away
	defer heartbeat.Stop()
	for {
		select {
		case <-heartbeat.C:
			heartbeatCtx, cancel := context.WithTimeout(ctx, dispatcherRPCTimeout)
			resp, err := client.Heartbeat(heartbeatCtx, &api.HeartbeatRequest{
				SessionID: s.sessionID,
			})
			cancel()
			if err != nil {
				if grpc.Code(err) == codes.NotFound {
					err = errNodeNotRegistered
				}
				return err
			}

			period, err := ptypes.Duration(&resp.Period)
			if err != nil {
				return err
			}

			heartbeat.Reset(period)
		case <-s.closed:
			return errSessionClosed
		case <-ctx.Done():
			return ctx.Err()
		}
	}
}

func (u *Updater) worker(ctx context.Context, queue <-chan *api.Task) {
	for t := range queue {
		updated := newTask(u.cluster, u.newService, t.Slot)
		updated.DesiredState = api.TaskStateReady
		if isGlobalService(u.newService) {
			updated.NodeID = t.NodeID
		}

		if err := u.updateTask(ctx, t, updated); err != nil {
			log.G(ctx).WithError(err).WithField("task.id", t.ID).Error("update failed")
		}

		if u.newService.Spec.Update != nil && (u.newService.Spec.Update.Delay.Seconds != 0 || u.newService.Spec.Update.Delay.Nanos != 0) {
			delay, err := ptypes.Duration(&u.newService.Spec.Update.Delay)
			if err != nil {
				log.G(ctx).WithError(err).Error("invalid update delay")
				continue
			}
			select {
			case <-time.After(delay):
			case <-u.stopChan:
				return
			}
		}
	}
}

func restartPolicyFromGRPC(p *swarmapi.RestartPolicy) *types.RestartPolicy {
	var rp *types.RestartPolicy
	if p != nil {
		rp = &types.RestartPolicy{}
		rp.Condition = types.RestartPolicyCondition(strings.ToLower(p.Condition.String()))

		if p.Delay != nil {
			delay, _ := ptypes.Duration(p.Delay)
			rp.Delay = &delay
		}
		if p.Window != nil {
			window, _ := ptypes.Duration(p.Window)
			rp.Window = &window
		}

		rp.MaxAttempts = &p.MaxAttempts
	}
	return rp
}

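// Hedged sketch of the inverse mapping (engine RestartPolicy back to the gRPC
// type), added for illustration only; it is not part of the original sources.
// It reuses the condition constants seen above and assumes ptypes.DurationProto
// exists as the inverse of ptypes.Duration, mirroring the standard protobuf
// helpers; adjust the helper name if the vendored package differs.
func restartPolicyToGRPCSketch(p *types.RestartPolicy) *swarmapi.RestartPolicy {
	if p == nil {
		return nil
	}
	rp := &swarmapi.RestartPolicy{}

	switch p.Condition {
	case types.RestartPolicyConditionNone:
		rp.Condition = swarmapi.RestartOnNone
	case types.RestartPolicyConditionOnFailure:
		rp.Condition = swarmapi.RestartOnFailure
	default:
		rp.Condition = swarmapi.RestartOnAny
	}

	if p.Delay != nil {
		rp.Delay = ptypes.DurationProto(*p.Delay) // assumed helper
	}
	if p.Window != nil {
		rp.Window = ptypes.DurationProto(*p.Window) // assumed helper
	}
	if p.MaxAttempts != nil {
		rp.MaxAttempts = *p.MaxAttempts
	}
	return rp
}
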
func (c *containerAdapter) shutdown(ctx context.Context) error {
	// Default stop grace period to 10s.
	stopgrace := 10 * time.Second
	spec := c.container.spec()
	if spec.StopGracePeriod != nil {
		stopgrace, _ = ptypes.Duration(spec.StopGracePeriod)
	}
	return c.client.ContainerStop(ctx, c.container.name(), &stopgrace)
}

// ServiceFromGRPC converts a grpc Service to a Service.
func ServiceFromGRPC(s swarmapi.Service) types.Service {
	spec := s.Spec
	containerConfig := spec.Task.Runtime.(*swarmapi.TaskSpec_Container).Container

	networks := make([]types.NetworkAttachmentConfig, 0, len(spec.Networks))
	for _, n := range spec.Networks {
		networks = append(networks, types.NetworkAttachmentConfig{Target: n.Target, Aliases: n.Aliases})
	}

	service := types.Service{
		ID: s.ID,

		Spec: types.ServiceSpec{
			TaskTemplate: types.TaskSpec{
				ContainerSpec: containerSpecFromGRPC(containerConfig),
				Resources:     resourcesFromGRPC(s.Spec.Task.Resources),
				RestartPolicy: restartPolicyFromGRPC(s.Spec.Task.Restart),
				Placement:     placementFromGRPC(s.Spec.Task.Placement),
				LogDriver:     driverFromGRPC(s.Spec.Task.LogDriver),
			},

			Networks:     networks,
			EndpointSpec: endpointSpecFromGRPC(s.Spec.Endpoint),
		},
		Endpoint: endpointFromGRPC(s.Endpoint),
	}

	// Meta
	service.Version.Index = s.Meta.Version.Index
	service.CreatedAt, _ = ptypes.Timestamp(s.Meta.CreatedAt)
	service.UpdatedAt, _ = ptypes.Timestamp(s.Meta.UpdatedAt)

	// Annotations
	service.Spec.Name = s.Spec.Annotations.Name
	service.Spec.Labels = s.Spec.Annotations.Labels

	// UpdateConfig
	if s.Spec.Update != nil {
		service.Spec.UpdateConfig = &types.UpdateConfig{
			Parallelism: s.Spec.Update.Parallelism,
		}

		service.Spec.UpdateConfig.Delay, _ = ptypes.Duration(&s.Spec.Update.Delay)
	}

	// Mode
	switch t := s.Spec.GetMode().(type) {
	case *swarmapi.ServiceSpec_Global:
		service.Spec.Mode.Global = &types.GlobalService{}
	case *swarmapi.ServiceSpec_Replicated:
		service.Spec.Mode.Replicated = &types.ReplicatedService{
			Replicas: &t.Replicated.Replicas,
		}
	}

	return service
}

func (u *Updater) worker(ctx context.Context, queue <-chan slot) {
	for slot := range queue {
		// Do we have a task with the new spec in desired state = RUNNING?
		// If so, all we have to do to complete the update is remove the
		// other tasks. Or if we have a task with the new spec that has
		// desired state < RUNNING, advance it to running and remove the
		// other tasks.
		var (
			runningTask *api.Task
			cleanTask   *api.Task
		)
		for _, t := range slot {
			if !u.isTaskDirty(t) {
				if t.DesiredState == api.TaskStateRunning {
					runningTask = t
					break
				}
				if t.DesiredState < api.TaskStateRunning {
					cleanTask = t
				}
			}
		}

		if runningTask != nil {
			if err := u.useExistingTask(ctx, slot, runningTask); err != nil {
				log.G(ctx).WithError(err).Error("update failed")
			}
		} else if cleanTask != nil {
			if err := u.useExistingTask(ctx, slot, cleanTask); err != nil {
				log.G(ctx).WithError(err).Error("update failed")
			}
		} else {
			updated := newTask(u.cluster, u.newService, slot[0].Slot)
			updated.DesiredState = api.TaskStateReady
			if isGlobalService(u.newService) {
				updated.NodeID = slot[0].NodeID
			}

			if err := u.updateTask(ctx, slot, updated); err != nil {
				log.G(ctx).WithError(err).WithField("task.id", updated.ID).Error("update failed")
			}
		}

		if u.newService.Spec.Update != nil && (u.newService.Spec.Update.Delay.Seconds != 0 || u.newService.Spec.Update.Delay.Nanos != 0) {
			delay, err := ptypes.Duration(&u.newService.Spec.Update.Delay)
			if err != nil {
				log.G(ctx).WithError(err).Error("invalid update delay")
				continue
			}
			select {
			case <-time.After(delay):
			case <-u.stopChan:
				return
			}
		}
	}
}

func containerSpecFromGRPC(c *swarmapi.ContainerSpec) types.ContainerSpec {
	containerSpec := types.ContainerSpec{
		Image:    c.Image,
		Labels:   c.Labels,
		Command:  c.Command,
		Args:     c.Args,
		Hostname: c.Hostname,
		Env:      c.Env,
		Dir:      c.Dir,
		User:     c.User,
		Groups:   c.Groups,
		TTY:      c.TTY,
	}

	// Mounts
	for _, m := range c.Mounts {
		mount := mounttypes.Mount{
			Target:   m.Target,
			Source:   m.Source,
			Type:     mounttypes.Type(strings.ToLower(swarmapi.Mount_MountType_name[int32(m.Type)])),
			ReadOnly: m.ReadOnly,
		}

		if m.BindOptions != nil {
			mount.BindOptions = &mounttypes.BindOptions{
				Propagation: mounttypes.Propagation(strings.ToLower(swarmapi.Mount_BindOptions_MountPropagation_name[int32(m.BindOptions.Propagation)])),
			}
		}

		if m.VolumeOptions != nil {
			mount.VolumeOptions = &mounttypes.VolumeOptions{
				NoCopy: m.VolumeOptions.NoCopy,
				Labels: m.VolumeOptions.Labels,
			}
			if m.VolumeOptions.DriverConfig != nil {
				mount.VolumeOptions.DriverConfig = &mounttypes.Driver{
					Name:    m.VolumeOptions.DriverConfig.Name,
					Options: m.VolumeOptions.DriverConfig.Options,
				}
			}
		}
		containerSpec.Mounts = append(containerSpec.Mounts, mount)
	}

	if c.StopGracePeriod != nil {
		grace, _ := ptypes.Duration(c.StopGracePeriod)
		containerSpec.StopGracePeriod = &grace
	}

	if c.Healthcheck != nil {
		containerSpec.Healthcheck = healthConfigFromGRPC(c.Healthcheck)
	}

	return containerSpec
}

func validateUpdate(uc *api.UpdateConfig) error {
	if uc == nil {
		return nil
	}

	delay, err := ptypes.Duration(&uc.Delay)
	if err != nil {
		return err
	}

	if delay < 0 {
		return grpc.Errorf(codes.InvalidArgument, "TaskSpec: update-delay cannot be negative")
	}

	return nil
}

func containerSpecFromGRPC(c *swarmapi.ContainerSpec) types.ContainerSpec {
	containerSpec := types.ContainerSpec{
		Image:   c.Image,
		Labels:  c.Labels,
		Command: c.Command,
		Args:    c.Args,
		Env:     c.Env,
		Dir:     c.Dir,
		User:    c.User,
	}

	// Mounts
	for _, m := range c.Mounts {
		mount := types.Mount{
			Target:   m.Target,
			Source:   m.Source,
			Type:     types.MountType(strings.ToLower(swarmapi.Mount_MountType_name[int32(m.Type)])),
			Writable: m.Writable,
		}

		if m.BindOptions != nil {
			mount.BindOptions = &types.BindOptions{
				Propagation: types.MountPropagation(strings.ToLower(swarmapi.Mount_BindOptions_MountPropagation_name[int32(m.BindOptions.Propagation)])),
			}
		}

		if m.VolumeOptions != nil {
			mount.VolumeOptions = &types.VolumeOptions{
				Populate: m.VolumeOptions.Populate,
				Labels:   m.VolumeOptions.Labels,
			}
			if m.VolumeOptions.DriverConfig != nil {
				mount.VolumeOptions.DriverConfig = &types.Driver{
					Name:    m.VolumeOptions.DriverConfig.Name,
					Options: m.VolumeOptions.DriverConfig.Options,
				}
			}
		}
		containerSpec.Mounts = append(containerSpec.Mounts, mount)
	}

	if c.StopGracePeriod != nil {
		grace, _ := ptypes.Duration(c.StopGracePeriod)
		containerSpec.StopGracePeriod = &grace
	}

	return containerSpec
}

// updateCluster is called when there are cluster changes, and it ensures that the local RootCA is
// always aware of changes in clusterExpiry and the Root CA key material
func (s *Server) updateCluster(ctx context.Context, cluster *api.Cluster) {
	s.mu.Lock()
	s.acceptancePolicy = cluster.Spec.AcceptancePolicy.Copy()
	s.mu.Unlock()
	var err error

	// If the cluster has a RootCA, let's try to update our SecurityConfig to reflect the latest values
	rCA := cluster.RootCA
	if len(rCA.CACert) != 0 && len(rCA.CAKey) != 0 {
		expiry := DefaultNodeCertExpiration
		if cluster.Spec.CAConfig.NodeCertExpiry != nil {
			// NodeCertExpiry exists, let's try to parse the duration out of it
			clusterExpiry, err := ptypes.Duration(cluster.Spec.CAConfig.NodeCertExpiry)
			if err != nil {
				log.G(ctx).WithFields(logrus.Fields{
					"cluster.id": cluster.ID,
					"method":     "(*Server).updateCluster",
				}).WithError(err).Warn("failed to parse certificate expiration, using default")
			} else {
				// We were able to successfully parse the expiration out of the cluster.
				expiry = clusterExpiry
			}
		} else {
			// NodeCertExpiry is nil, fall back to the default expiration
			log.G(ctx).WithFields(logrus.Fields{
				"cluster.id": cluster.ID,
				"method":     "(*Server).updateCluster",
			}).Warn("no certificate expiration specified, using default")
		}
		// Attempt to update our local RootCA with the new parameters
		err = s.securityConfig.UpdateRootCA(rCA.CACert, rCA.CAKey, expiry)
		if err != nil {
			log.G(ctx).WithFields(logrus.Fields{
				"cluster.id": cluster.ID,
				"method":     "(*Server).updateCluster",
			}).WithError(err).Error("updating Root CA failed")
		} else {
			log.G(ctx).WithFields(logrus.Fields{
				"cluster.id": cluster.ID,
				"method":     "(*Server).updateCluster",
			}).Debugf("Root CA updated successfully")
		}
	}
}

func serviceSpecFromGRPC(spec *swarmapi.ServiceSpec) *types.ServiceSpec {
	if spec == nil {
		return nil
	}

	serviceNetworks := make([]types.NetworkAttachmentConfig, 0, len(spec.Networks))
	for _, n := range spec.Networks {
		serviceNetworks = append(serviceNetworks, types.NetworkAttachmentConfig{Target: n.Target, Aliases: n.Aliases})
	}

	taskNetworks := make([]types.NetworkAttachmentConfig, 0, len(spec.Task.Networks))
	for _, n := range spec.Task.Networks {
		taskNetworks = append(taskNetworks, types.NetworkAttachmentConfig{Target: n.Target, Aliases: n.Aliases})
	}

	containerConfig := spec.Task.Runtime.(*swarmapi.TaskSpec_Container).Container
	convertedSpec := &types.ServiceSpec{
		Annotations: types.Annotations{
			Name:   spec.Annotations.Name,
			Labels: spec.Annotations.Labels,
		},

		TaskTemplate: types.TaskSpec{
			ContainerSpec: containerSpecFromGRPC(containerConfig),
			Resources:     resourcesFromGRPC(spec.Task.Resources),
			RestartPolicy: restartPolicyFromGRPC(spec.Task.Restart),
			Placement:     placementFromGRPC(spec.Task.Placement),
			LogDriver:     driverFromGRPC(spec.Task.LogDriver),
			Networks:      taskNetworks,
			ForceUpdate:   spec.Task.ForceUpdate,
		},

		Networks:     serviceNetworks,
		EndpointSpec: endpointSpecFromGRPC(spec.Endpoint),
	}

	// UpdateConfig
	if spec.Update != nil {
		convertedSpec.UpdateConfig = &types.UpdateConfig{
			Parallelism:     spec.Update.Parallelism,
			MaxFailureRatio: spec.Update.MaxFailureRatio,
		}

		convertedSpec.UpdateConfig.Delay, _ = ptypes.Duration(&spec.Update.Delay)
		if spec.Update.Monitor != nil {
			convertedSpec.UpdateConfig.Monitor, _ = ptypes.Duration(spec.Update.Monitor)
		}

		switch spec.Update.FailureAction {
		case swarmapi.UpdateConfig_PAUSE:
			convertedSpec.UpdateConfig.FailureAction = types.UpdateFailureActionPause
		case swarmapi.UpdateConfig_CONTINUE:
			convertedSpec.UpdateConfig.FailureAction = types.UpdateFailureActionContinue
		}
	}

	// Mode
	switch t := spec.GetMode().(type) {
	case *swarmapi.ServiceSpec_Global:
		convertedSpec.Mode.Global = &types.GlobalService{}
	case *swarmapi.ServiceSpec_Replicated:
		convertedSpec.Mode.Replicated = &types.ReplicatedService{
			Replicas: &t.Replicated.Replicas,
		}
	}

	return convertedSpec
}

// ServiceFromGRPC converts a grpc Service to a Service.
func ServiceFromGRPC(s swarmapi.Service) types.Service {
	spec := s.Spec
	containerConfig := spec.Task.Runtime.(*swarmapi.TaskSpec_Container).Container

	serviceNetworks := make([]types.NetworkAttachmentConfig, 0, len(spec.Networks))
	for _, n := range spec.Networks {
		serviceNetworks = append(serviceNetworks, types.NetworkAttachmentConfig{Target: n.Target, Aliases: n.Aliases})
	}

	taskNetworks := make([]types.NetworkAttachmentConfig, 0, len(spec.Task.Networks))
	for _, n := range spec.Task.Networks {
		taskNetworks = append(taskNetworks, types.NetworkAttachmentConfig{Target: n.Target, Aliases: n.Aliases})
	}

	service := types.Service{
		ID: s.ID,

		Spec: types.ServiceSpec{
			TaskTemplate: types.TaskSpec{
				ContainerSpec: containerSpecFromGRPC(containerConfig),
				Resources:     resourcesFromGRPC(s.Spec.Task.Resources),
				RestartPolicy: restartPolicyFromGRPC(s.Spec.Task.Restart),
				Placement:     placementFromGRPC(s.Spec.Task.Placement),
				LogDriver:     driverFromGRPC(s.Spec.Task.LogDriver),
				Networks:      taskNetworks,
			},

			Networks:     serviceNetworks,
			EndpointSpec: endpointSpecFromGRPC(s.Spec.Endpoint),
		},
		Endpoint: endpointFromGRPC(s.Endpoint),
	}

	// Meta
	service.Version.Index = s.Meta.Version.Index
	service.CreatedAt, _ = ptypes.Timestamp(s.Meta.CreatedAt)
	service.UpdatedAt, _ = ptypes.Timestamp(s.Meta.UpdatedAt)

	// Annotations
	service.Spec.Name = s.Spec.Annotations.Name
	service.Spec.Labels = s.Spec.Annotations.Labels

	// UpdateConfig
	if s.Spec.Update != nil {
		service.Spec.UpdateConfig = &types.UpdateConfig{
			Parallelism: s.Spec.Update.Parallelism,
		}

		service.Spec.UpdateConfig.Delay, _ = ptypes.Duration(&s.Spec.Update.Delay)

		switch s.Spec.Update.FailureAction {
		case swarmapi.UpdateConfig_PAUSE:
			service.Spec.UpdateConfig.FailureAction = types.UpdateFailureActionPause
		case swarmapi.UpdateConfig_CONTINUE:
			service.Spec.UpdateConfig.FailureAction = types.UpdateFailureActionContinue
		}
	}

	// Mode
	switch t := s.Spec.GetMode().(type) {
	case *swarmapi.ServiceSpec_Global:
		service.Spec.Mode.Global = &types.GlobalService{}
	case *swarmapi.ServiceSpec_Replicated:
		service.Spec.Mode.Replicated = &types.ReplicatedService{
			Replicas: &t.Replicated.Replicas,
		}
	}

	// UpdateStatus
	service.UpdateStatus = types.UpdateStatus{}
	if s.UpdateStatus != nil {
		switch s.UpdateStatus.State {
		case swarmapi.UpdateStatus_UPDATING:
			service.UpdateStatus.State = types.UpdateStateUpdating
		case swarmapi.UpdateStatus_PAUSED:
			service.UpdateStatus.State = types.UpdateStatePaused
		case swarmapi.UpdateStatus_COMPLETED:
			service.UpdateStatus.State = types.UpdateStateCompleted
		}

		service.UpdateStatus.StartedAt, _ = ptypes.Timestamp(s.UpdateStatus.StartedAt)
		service.UpdateStatus.CompletedAt, _ = ptypes.Timestamp(s.UpdateStatus.CompletedAt)
		service.UpdateStatus.Message = s.UpdateStatus.Message
	}

	return service
}

// Run starts the update and returns only once it's complete or cancelled.
func (u *Updater) Run(ctx context.Context, slots []slot) {
	defer close(u.doneChan)

	service := u.newService

	// If the update is in a PAUSED state, we should not do anything.
	if service.UpdateStatus != nil &&
		(service.UpdateStatus.State == api.UpdateStatus_PAUSED ||
			service.UpdateStatus.State == api.UpdateStatus_ROLLBACK_PAUSED) {
		return
	}

	var dirtySlots []slot
	for _, slot := range slots {
		if u.isSlotDirty(slot) {
			dirtySlots = append(dirtySlots, slot)
		}
	}
	// Abort immediately if all tasks are clean.
	if len(dirtySlots) == 0 {
		if service.UpdateStatus != nil &&
			(service.UpdateStatus.State == api.UpdateStatus_UPDATING ||
				service.UpdateStatus.State == api.UpdateStatus_ROLLBACK_STARTED) {
			u.completeUpdate(ctx, service.ID)
		}
		return
	}

	// If there's no update in progress, we are starting one.
	if service.UpdateStatus == nil {
		u.startUpdate(ctx, service.ID)
	}

	parallelism := 0
	if service.Spec.Update != nil {
		parallelism = int(service.Spec.Update.Parallelism)
	}
	if parallelism == 0 {
		// TODO(aluzzardi): We could try to optimize unlimited parallelism by performing updates in a single
		// goroutine using a batch transaction.
		parallelism = len(dirtySlots)
	}

	// Start the workers.
	slotQueue := make(chan slot)
	wg := sync.WaitGroup{}
	wg.Add(parallelism)
	for i := 0; i < parallelism; i++ {
		go func() {
			u.worker(ctx, slotQueue)
			wg.Done()
		}()
	}

	failureAction := api.UpdateConfig_PAUSE
	allowedFailureFraction := float32(0)
	monitoringPeriod := defaultMonitor

	if service.Spec.Update != nil {
		failureAction = service.Spec.Update.FailureAction
		allowedFailureFraction = service.Spec.Update.AllowedFailureFraction

		if service.Spec.Update.Monitor != nil {
			var err error
			monitoringPeriod, err = ptypes.Duration(service.Spec.Update.Monitor)
			if err != nil {
				monitoringPeriod = defaultMonitor
			}
		}
	}

	var failedTaskWatch chan events.Event

	if failureAction != api.UpdateConfig_CONTINUE {
		var cancelWatch func()
		failedTaskWatch, cancelWatch = state.Watch(
			u.store.WatchQueue(),
			state.EventUpdateTask{
				Task:   &api.Task{ServiceID: service.ID, Status: api.TaskStatus{State: api.TaskStateRunning}},
				Checks: []state.TaskCheckFunc{state.TaskCheckServiceID, state.TaskCheckStateGreaterThan},
			},
		)
		defer cancelWatch()
	}

	stopped := false
	failedTasks := make(map[string]struct{})
	totalFailures := 0

	failureTriggersAction := func(failedTask *api.Task) bool {
		// Ignore tasks we have already seen as failures.
		if _, found := failedTasks[failedTask.ID]; found {
			return false
		}

		// If this failed/completed task is one that we
		// created as part of this update, we should
		// follow the failure action.
		u.updatedTasksMu.Lock()
		startedAt, found := u.updatedTasks[failedTask.ID]
		u.updatedTasksMu.Unlock()

		if found && (startedAt.IsZero() || time.Since(startedAt) <= monitoringPeriod) {
			failedTasks[failedTask.ID] = struct{}{}
			totalFailures++
			if float32(totalFailures)/float32(len(dirtySlots)) > allowedFailureFraction {
				switch failureAction {
				case api.UpdateConfig_PAUSE:
					stopped = true
					message := fmt.Sprintf("update paused due to failure or early termination of task %s", failedTask.ID)
					u.pauseUpdate(ctx, service.ID, message)
					return true
				case api.UpdateConfig_ROLLBACK:
					// Never roll back a rollback
					if service.UpdateStatus != nil && service.UpdateStatus.State == api.UpdateStatus_ROLLBACK_STARTED {
						message := fmt.Sprintf("rollback paused due to failure or early termination of task %s", failedTask.ID)
						u.pauseUpdate(ctx, service.ID, message)
						return true
					}
					stopped = true
					message := fmt.Sprintf("update rolled back due to failure or early termination of task %s", failedTask.ID)
					u.rollbackUpdate(ctx, service.ID, message)
					return true
				}
			}
		}

		return false
	}

slotsLoop:
	for _, slot := range dirtySlots {
	retryLoop:
		for {
			// Wait for a worker to pick up the task or abort the update, whichever comes first.
			select {
			case <-u.stopChan:
				stopped = true
				break slotsLoop
			case ev := <-failedTaskWatch:
				if failureTriggersAction(ev.(state.EventUpdateTask).Task) {
					break slotsLoop
				}
			case slotQueue <- slot:
				break retryLoop
			}
		}
	}

	close(slotQueue)
	wg.Wait()

	if !stopped {
		// Keep watching for task failures for one more monitoringPeriod,
		// before declaring the update complete.
		doneMonitoring := time.After(monitoringPeriod)
	monitorLoop:
		for {
			select {
			case <-u.stopChan:
				stopped = true
				break monitorLoop
			case <-doneMonitoring:
				break monitorLoop
			case ev := <-failedTaskWatch:
				if failureTriggersAction(ev.(state.EventUpdateTask).Task) {
					break monitorLoop
				}
			}
		}
	}

	// TODO(aaronl): Potentially roll back the service if not enough tasks
	// have reached RUNNING by this point.

	if !stopped {
		u.completeUpdate(ctx, service.ID)
	}
}

// Restart initiates a new task to replace t if appropriate under the service's
// restart policy.
func (r *RestartSupervisor) Restart(ctx context.Context, tx store.Tx, service *api.Service, t api.Task) error {
	// TODO(aluzzardi): This function should not depend on `service`.

	t.DesiredState = api.TaskStateShutdown
	err := store.UpdateTask(tx, &t)
	if err != nil {
		log.G(ctx).WithError(err).Errorf("failed to set task desired state to dead")
		return err
	}

	if !r.shouldRestart(ctx, &t, service) {
		return nil
	}

	var restartTask *api.Task

	if isReplicatedService(service) {
		restartTask = newTask(service, t.Slot)
	} else if isGlobalService(service) {
		restartTask = newTask(service, 0)
		restartTask.NodeID = t.NodeID
	} else {
		log.G(ctx).Error("service not supported by restart supervisor")
		return nil
	}

	n := store.GetNode(tx, t.NodeID)

	restartTask.DesiredState = api.TaskStateAccepted

	var restartDelay time.Duration
	// Restart delay is not applied to drained nodes
	if n == nil || n.Spec.Availability != api.NodeAvailabilityDrain {
		if t.Spec.Restart != nil && t.Spec.Restart.Delay != nil {
			var err error
			restartDelay, err = ptypes.Duration(t.Spec.Restart.Delay)
			if err != nil {
				log.G(ctx).WithError(err).Error("invalid restart delay; using default")
				restartDelay = defaultRestartDelay
			}
		} else {
			restartDelay = defaultRestartDelay
		}
	}

	waitStop := true

	// Normally we wait for the old task to stop running, but we skip this
	// if the old task is already dead or the node it's assigned to is down.
	if (n != nil && n.Status.State == api.NodeStatus_DOWN) || t.Status.State > api.TaskStateRunning {
		waitStop = false
	}

	if err := store.CreateTask(tx, restartTask); err != nil {
		log.G(ctx).WithError(err).WithField("task.id", restartTask.ID).Error("task create failed")
		return err
	}

	r.recordRestartHistory(restartTask)

	r.DelayStart(ctx, tx, &t, restartTask.ID, restartDelay, waitStop)
	return nil
}

// Run runs dispatcher tasks which should be run on leader dispatcher.
// Dispatcher can be stopped by cancelling ctx or calling Stop().
func (d *Dispatcher) Run(ctx context.Context) error {
	d.mu.Lock()
	if d.isRunning() {
		d.mu.Unlock()
		return fmt.Errorf("dispatcher is already running")
	}
	logger := log.G(ctx).WithField("module", "dispatcher")
	ctx = log.WithLogger(ctx, logger)
	if err := d.markNodesUnknown(ctx); err != nil {
		logger.Errorf(`failed to move all nodes to "unknown" state: %v`, err)
	}
	configWatcher, cancel, err := store.ViewAndWatch(
		d.store,
		func(readTx store.ReadTx) error {
			clusters, err := store.FindClusters(readTx, store.ByName(store.DefaultClusterName))
			if err != nil {
				return err
			}
			if err == nil && len(clusters) == 1 {
				heartbeatPeriod, err := ptypes.Duration(clusters[0].Spec.Dispatcher.HeartbeatPeriod)
				if err == nil && heartbeatPeriod > 0 {
					d.config.HeartbeatPeriod = heartbeatPeriod
				}
				if clusters[0].NetworkBootstrapKeys != nil {
					d.networkBootstrapKeys = clusters[0].NetworkBootstrapKeys
				}
			}
			return nil
		},
		state.EventUpdateCluster{},
	)
	if err != nil {
		d.mu.Unlock()
		return err
	}
	defer cancel()
	d.ctx, d.cancel = context.WithCancel(ctx)
	d.mu.Unlock()

	publishManagers := func() {
		mgrs := getWeightedPeers(d.cluster)
		sort.Sort(weightedPeerByNodeID(mgrs))
		d.mu.Lock()
		if reflect.DeepEqual(mgrs, d.lastSeenManagers) {
			d.mu.Unlock()
			return
		}
		d.lastSeenManagers = mgrs
		d.mu.Unlock()
		d.mgrQueue.Publish(mgrs)
	}

	publishManagers()
	publishTicker := time.NewTicker(1 * time.Second)
	defer publishTicker.Stop()

	batchTimer := time.NewTimer(maxBatchInterval)
	defer batchTimer.Stop()

	for {
		select {
		case <-publishTicker.C:
			publishManagers()
		case <-d.processTaskUpdatesTrigger:
			d.processTaskUpdates()
			batchTimer.Reset(maxBatchInterval)
		case <-batchTimer.C:
			d.processTaskUpdates()
			batchTimer.Reset(maxBatchInterval)
		case v := <-configWatcher:
			cluster := v.(state.EventUpdateCluster)
			d.mu.Lock()
			if cluster.Cluster.Spec.Dispatcher.HeartbeatPeriod != nil {
				// ignore error, since Spec has passed validation before
				heartbeatPeriod, _ := ptypes.Duration(cluster.Cluster.Spec.Dispatcher.HeartbeatPeriod)
				if heartbeatPeriod != d.config.HeartbeatPeriod {
					// only call d.nodes.updatePeriod when heartbeatPeriod changes
					d.config.HeartbeatPeriod = heartbeatPeriod
					d.nodes.updatePeriod(d.config.HeartbeatPeriod, d.config.HeartbeatEpsilon, d.config.GracePeriodMultiplier)
				}
			}
			d.networkBootstrapKeys = cluster.Cluster.NetworkBootstrapKeys
			d.mu.Unlock()
			d.keyMgrQueue.Publish(struct{}{})
		case <-d.ctx.Done():
			return nil
		}
	}
}

// Restart initiates a new task to replace t if appropriate under the service's
// restart policy.
func (r *Supervisor) Restart(ctx context.Context, tx store.Tx, cluster *api.Cluster, service *api.Service, t api.Task) error {
	// TODO(aluzzardi): This function should not depend on `service`.

	// Is the old task still in the process of restarting? If so, wait for
	// its restart delay to elapse, to avoid tight restart loops (for
	// example, when the image doesn't exist).
	r.mu.Lock()
	oldDelay, ok := r.delays[t.ID]
	if ok {
		if !oldDelay.waiter {
			oldDelay.waiter = true
			go r.waitRestart(ctx, oldDelay, cluster, t.ID)
		}
		r.mu.Unlock()
		return nil
	}
	r.mu.Unlock()

	// Sanity check: was the task shut down already by a separate call to
	// Restart? If so, we must avoid restarting it, because this will create
	// an extra task. This should never happen unless there is a bug.
	if t.DesiredState > api.TaskStateRunning {
		return errors.New("Restart called on task that was already shut down")
	}

	t.DesiredState = api.TaskStateShutdown
	err := store.UpdateTask(tx, &t)
	if err != nil {
		log.G(ctx).WithError(err).Errorf("failed to set task desired state to dead")
		return err
	}

	if !r.shouldRestart(ctx, &t, service) {
		return nil
	}

	var restartTask *api.Task

	if orchestrator.IsReplicatedService(service) {
		restartTask = orchestrator.NewTask(cluster, service, t.Slot, "")
	} else if orchestrator.IsGlobalService(service) {
		restartTask = orchestrator.NewTask(cluster, service, 0, t.NodeID)
	} else {
		log.G(ctx).Error("service not supported by restart supervisor")
		return nil
	}

	n := store.GetNode(tx, t.NodeID)

	restartTask.DesiredState = api.TaskStateReady

	var restartDelay time.Duration
	// Restart delay is not applied to drained nodes
	if n == nil || n.Spec.Availability != api.NodeAvailabilityDrain {
		if t.Spec.Restart != nil && t.Spec.Restart.Delay != nil {
			var err error
			restartDelay, err = ptypes.Duration(t.Spec.Restart.Delay)
			if err != nil {
				log.G(ctx).WithError(err).Error("invalid restart delay; using default")
				restartDelay = orchestrator.DefaultRestartDelay
			}
		} else {
			restartDelay = orchestrator.DefaultRestartDelay
		}
	}

	waitStop := true

	// Normally we wait for the old task to stop running, but we skip this
	// if the old task is already dead or the node it's assigned to is down.
	if (n != nil && n.Status.State == api.NodeStatus_DOWN) || t.Status.State > api.TaskStateRunning {
		waitStop = false
	}

	if err := store.CreateTask(tx, restartTask); err != nil {
		log.G(ctx).WithError(err).WithField("task.id", restartTask.ID).Error("task create failed")
		return err
	}

	r.recordRestartHistory(restartTask)

	r.DelayStart(ctx, tx, &t, restartTask.ID, restartDelay, waitStop)
	return nil
}

func (r *Supervisor) shouldRestart(ctx context.Context, t *api.Task, service *api.Service) bool {
	// TODO(aluzzardi): This function should not depend on `service`.
	condition := orchestrator.RestartCondition(t)

	if condition != api.RestartOnAny &&
		(condition != api.RestartOnFailure || t.Status.State == api.TaskStateCompleted) {
		return false
	}

	if t.Spec.Restart == nil || t.Spec.Restart.MaxAttempts == 0 {
		return true
	}

	instanceTuple := instanceTuple{
		instance:  t.Slot,
		serviceID: t.ServiceID,
	}

	// Instance is not meaningful for "global" tasks, so they need to be
	// indexed by NodeID.
	if orchestrator.IsGlobalService(service) {
		instanceTuple.nodeID = t.NodeID
	}

	r.mu.Lock()
	defer r.mu.Unlock()

	restartInfo := r.history[instanceTuple]
	if restartInfo == nil {
		return true
	}

	if t.Spec.Restart.Window == nil || (t.Spec.Restart.Window.Seconds == 0 && t.Spec.Restart.Window.Nanos == 0) {
		return restartInfo.totalRestarts < t.Spec.Restart.MaxAttempts
	}

	if restartInfo.restartedInstances == nil {
		return true
	}

	window, err := ptypes.Duration(t.Spec.Restart.Window)
	if err != nil {
		log.G(ctx).WithError(err).Error("invalid restart lookback window")
		return restartInfo.totalRestarts < t.Spec.Restart.MaxAttempts
	}
	lookback := time.Now().Add(-window)

	var next *list.Element
	for e := restartInfo.restartedInstances.Front(); e != nil; e = next {
		next = e.Next()

		if e.Value.(restartedInstance).timestamp.After(lookback) {
			break
		}
		restartInfo.restartedInstances.Remove(e)
	}

	numRestarts := uint64(restartInfo.restartedInstances.Len())

	if numRestarts == 0 {
		restartInfo.restartedInstances = nil
	}

	return numRestarts < t.Spec.Restart.MaxAttempts
}

// Run runs dispatcher tasks which should be run on leader dispatcher.
// Dispatcher can be stopped by cancelling ctx or calling Stop().
func (d *Dispatcher) Run(ctx context.Context) error {
	d.mu.Lock()
	if d.isRunning() {
		d.mu.Unlock()
		return fmt.Errorf("dispatcher is already running")
	}
	ctx = log.WithModule(ctx, "dispatcher")
	if err := d.markNodesUnknown(ctx); err != nil {
		log.G(ctx).Errorf(`failed to move all nodes to "unknown" state: %v`, err)
	}
	configWatcher, cancel, err := store.ViewAndWatch(
		d.store,
		func(readTx store.ReadTx) error {
			clusters, err := store.FindClusters(readTx, store.ByName(store.DefaultClusterName))
			if err != nil {
				return err
			}
			if err == nil && len(clusters) == 1 {
				heartbeatPeriod, err := ptypes.Duration(clusters[0].Spec.Dispatcher.HeartbeatPeriod)
				if err == nil && heartbeatPeriod > 0 {
					d.config.HeartbeatPeriod = heartbeatPeriod
				}
				if clusters[0].NetworkBootstrapKeys != nil {
					d.networkBootstrapKeys = clusters[0].NetworkBootstrapKeys
				}
			}
			return nil
		},
		state.EventUpdateCluster{},
	)
	if err != nil {
		d.mu.Unlock()
		return err
	}

	peerWatcher, peerCancel := d.cluster.SubscribePeers()
	defer peerCancel()
	d.lastSeenManagers = getWeightedPeers(d.cluster)

	defer cancel()
	d.ctx, d.cancel = context.WithCancel(ctx)
	d.mu.Unlock()

	publishManagers := func(peers []*api.Peer) {
		var mgrs []*api.WeightedPeer
		for _, p := range peers {
			mgrs = append(mgrs, &api.WeightedPeer{
				Peer:   p,
				Weight: remotes.DefaultObservationWeight,
			})
		}
		d.mu.Lock()
		d.lastSeenManagers = mgrs
		d.mu.Unlock()
		d.mgrQueue.Publish(mgrs)
	}

	batchTimer := time.NewTimer(maxBatchInterval)
	defer batchTimer.Stop()

	for {
		select {
		case ev := <-peerWatcher:
			publishManagers(ev.([]*api.Peer))
		case <-d.processUpdatesTrigger:
			d.processUpdates()
			batchTimer.Reset(maxBatchInterval)
		case <-batchTimer.C:
			d.processUpdates()
			batchTimer.Reset(maxBatchInterval)
		case v := <-configWatcher:
			cluster := v.(state.EventUpdateCluster)
			d.mu.Lock()
			if cluster.Cluster.Spec.Dispatcher.HeartbeatPeriod != nil {
				// ignore error, since Spec has passed validation before
				heartbeatPeriod, _ := ptypes.Duration(cluster.Cluster.Spec.Dispatcher.HeartbeatPeriod)
				if heartbeatPeriod != d.config.HeartbeatPeriod {
					// only call d.nodes.updatePeriod when heartbeatPeriod changes
					d.config.HeartbeatPeriod = heartbeatPeriod
					d.nodes.updatePeriod(d.config.HeartbeatPeriod, d.config.HeartbeatEpsilon, d.config.GracePeriodMultiplier)
				}
			}
			d.networkBootstrapKeys = cluster.Cluster.NetworkBootstrapKeys
			d.mu.Unlock()
			d.keyMgrQueue.Publish(cluster.Cluster.NetworkBootstrapKeys)
		case <-d.ctx.Done():
			return nil
		}
	}
}

func (r *Orchestrator) initTasks(ctx context.Context, readTx store.ReadTx) error {
	tasks, err := store.FindTasks(readTx, store.All)
	if err != nil {
		return err
	}
	for _, t := range tasks {
		if t.NodeID != "" {
			n := store.GetNode(readTx, t.NodeID)
			if invalidNode(n) && t.Status.State <= api.TaskStateRunning && t.DesiredState <= api.TaskStateRunning {
				r.restartTasks[t.ID] = struct{}{}
			}
		}
	}

	_, err = r.store.Batch(func(batch *store.Batch) error {
		for _, t := range tasks {
			if t.ServiceID == "" {
				continue
			}

			// TODO(aluzzardi): We should NOT retrieve the service here.
			service := store.GetService(readTx, t.ServiceID)
			if service == nil {
				// Service was deleted
				err := batch.Update(func(tx store.Tx) error {
					return store.DeleteTask(tx, t.ID)
				})
				if err != nil {
					log.G(ctx).WithError(err).Error("failed to delete task")
				}
				continue
			}
			// TODO(aluzzardi): This is shady. We should have a more generic condition.
			if t.DesiredState != api.TaskStateReady || !orchestrator.IsReplicatedService(service) {
				continue
			}
			restartDelay := orchestrator.DefaultRestartDelay
			if t.Spec.Restart != nil && t.Spec.Restart.Delay != nil {
				var err error
				restartDelay, err = ptypes.Duration(t.Spec.Restart.Delay)
				if err != nil {
					log.G(ctx).WithError(err).Error("invalid restart delay")
					restartDelay = orchestrator.DefaultRestartDelay
				}
			}
			if restartDelay != 0 {
				timestamp, err := ptypes.Timestamp(t.Status.Timestamp)
				if err == nil {
					restartTime := timestamp.Add(restartDelay)
					calculatedRestartDelay := restartTime.Sub(time.Now())
					if calculatedRestartDelay < restartDelay {
						restartDelay = calculatedRestartDelay
					}
					if restartDelay > 0 {
						_ = batch.Update(func(tx store.Tx) error {
							t := store.GetTask(tx, t.ID)
							// TODO(aluzzardi): This is shady as well. We should have a more generic condition.
							if t == nil || t.DesiredState != api.TaskStateReady {
								return nil
							}
							r.restarts.DelayStart(ctx, tx, nil, t.ID, restartDelay, true)
							return nil
						})
						continue
					}
				} else {
					log.G(ctx).WithError(err).Error("invalid status timestamp")
				}
			}

			// Start now
			err := batch.Update(func(tx store.Tx) error {
				return r.restarts.StartNow(tx, t.ID)
			})
			if err != nil {
				log.G(ctx).WithError(err).WithField("task.id", t.ID).Error("moving task out of delayed state failed")
			}
		}
		return nil
	})

	return err
}