func TestHealthcheck(t *testing.T) {
    c := containerConfig{
        task: &api.Task{
            Spec: api.TaskSpec{Runtime: &api.TaskSpec_Container{
                Container: &api.ContainerSpec{
                    Healthcheck: &api.HealthConfig{
                        Test:     []string{"a", "b", "c"},
                        Interval: ptypes.DurationProto(time.Second),
                        Timeout:  ptypes.DurationProto(time.Minute),
                        Retries:  10,
                    },
                },
            }},
        },
    }

    config := c.config()
    expected := &enginecontainer.HealthConfig{
        Test:     []string{"a", "b", "c"},
        Interval: time.Second,
        Timeout:  time.Minute,
        Retries:  10,
    }
    if !reflect.DeepEqual(config.Healthcheck, expected) {
        t.Fatalf("expected %#v, got %#v", expected, config.Healthcheck)
    }
}
func restartPolicyToGRPC(p *types.RestartPolicy) (*swarmapi.RestartPolicy, error) {
    var rp *swarmapi.RestartPolicy
    if p != nil {
        rp = &swarmapi.RestartPolicy{}
        switch p.Condition {
        case types.RestartPolicyConditionNone:
            rp.Condition = swarmapi.RestartOnNone
        case types.RestartPolicyConditionOnFailure:
            rp.Condition = swarmapi.RestartOnFailure
        case types.RestartPolicyConditionAny:
            rp.Condition = swarmapi.RestartOnAny
        default:
            if string(p.Condition) != "" {
                return nil, fmt.Errorf("invalid RestartCondition: %q", p.Condition)
            }
            rp.Condition = swarmapi.RestartOnAny
        }
        if p.Delay != nil {
            rp.Delay = ptypes.DurationProto(*p.Delay)
        }
        if p.Window != nil {
            rp.Window = ptypes.DurationProto(*p.Window)
        }
        if p.MaxAttempts != nil {
            rp.MaxAttempts = *p.MaxAttempts
        }
    }
    return rp, nil
}
func restartPolicyToGRPC(p *types.RestartPolicy) (*swarmapi.RestartPolicy, error) {
    var rp *swarmapi.RestartPolicy
    if p != nil {
        rp = &swarmapi.RestartPolicy{}
        sanitizedCondition := strings.ToUpper(strings.Replace(string(p.Condition), "-", "_", -1))
        if condition, ok := swarmapi.RestartPolicy_RestartCondition_value[sanitizedCondition]; ok {
            rp.Condition = swarmapi.RestartPolicy_RestartCondition(condition)
        } else if string(p.Condition) == "" {
            rp.Condition = swarmapi.RestartOnAny
        } else {
            return nil, fmt.Errorf("invalid RestartCondition: %q", p.Condition)
        }
        if p.Delay != nil {
            rp.Delay = ptypes.DurationProto(*p.Delay)
        }
        if p.Window != nil {
            rp.Window = ptypes.DurationProto(*p.Window)
        }
        if p.MaxAttempts != nil {
            rp.MaxAttempts = *p.MaxAttempts
        }
    }
    return rp, nil
}
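// The following is an illustrative sketch, not part of the original source. It
// assumes the engine-api swarm types package (aliased as types) and shows how
// the sanitization above maps the user-facing "on-failure" spelling onto the
// generated RestartPolicy_RestartCondition enum entry "ON_FAILURE", while the
// proto Delay field becomes a Duration message rather than a time.Duration.
func exampleRestartPolicyConversion() {
    delay := 5 * time.Second
    p := &types.RestartPolicy{
        Condition: types.RestartPolicyConditionOnFailure, // "on-failure" -> "ON_FAILURE"
        Delay:     &delay,
    }
    rp, err := restartPolicyToGRPC(p)
    if err != nil {
        panic(err)
    }
    // rp.Condition now holds swarmapi.RestartOnFailure; rp.Delay is a
    // *Duration proto message encoding five seconds.
    fmt.Println(rp.Condition, rp.Delay)
}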
// SwarmSpecToGRPCandMerge converts a Spec to a grpc ClusterSpec and merges the
// AcceptancePolicy from an existing grpc ClusterSpec if provided.
func SwarmSpecToGRPCandMerge(s types.Spec, existingSpec *swarmapi.ClusterSpec) (swarmapi.ClusterSpec, error) {
    spec := swarmapi.ClusterSpec{
        Annotations: swarmapi.Annotations{
            Name:   s.Name,
            Labels: s.Labels,
        },
        Orchestration: swarmapi.OrchestrationConfig{
            TaskHistoryRetentionLimit: s.Orchestration.TaskHistoryRetentionLimit,
        },
        Raft: swarmapi.RaftConfig{
            SnapshotInterval:           s.Raft.SnapshotInterval,
            KeepOldSnapshots:           s.Raft.KeepOldSnapshots,
            LogEntriesForSlowFollowers: s.Raft.LogEntriesForSlowFollowers,
            HeartbeatTick:              s.Raft.HeartbeatTick,
            ElectionTick:               s.Raft.ElectionTick,
        },
        Dispatcher: swarmapi.DispatcherConfig{
            HeartbeatPeriod: ptypes.DurationProto(time.Duration(s.Dispatcher.HeartbeatPeriod)),
        },
        CAConfig: swarmapi.CAConfig{
            NodeCertExpiry: ptypes.DurationProto(s.CAConfig.NodeCertExpiry),
        },
    }

    if err := SwarmSpecUpdateAcceptancePolicy(&spec, s.AcceptancePolicy, existingSpec); err != nil {
        return swarmapi.ClusterSpec{}, err
    }

    return spec, nil
}
func healthConfigToGRPC(h *container.HealthConfig) *swarmapi.HealthConfig {
    return &swarmapi.HealthConfig{
        Test:     h.Test,
        Interval: ptypes.DurationProto(h.Interval),
        Timeout:  ptypes.DurationProto(h.Timeout),
        Retries:  int32(h.Retries),
    }
}
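// Illustrative sketch, not part of the original source: healthConfigToGRPC
// expects a non-nil *container.HealthConfig (callers such as containerToGRPC
// nil-check first) and narrows Retries from int to the proto's int32, while
// the durations become proto Duration messages.
func exampleHealthConfigConversion() {
    hc := &container.HealthConfig{
        Test:     []string{"CMD-SHELL", "curl -f http://localhost/ || exit 1"},
        Interval: 30 * time.Second,
        Timeout:  5 * time.Second,
        Retries:  3,
    }
    grpcHC := healthConfigToGRPC(hc)
    fmt.Println(grpcHC.Retries) // 3, now an int32
}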
// MergeSwarmSpecToGRPC merges a Spec with an initial grpc ClusterSpec.
func MergeSwarmSpecToGRPC(s types.Spec, spec swarmapi.ClusterSpec) (swarmapi.ClusterSpec, error) {
    // We take the initSpec (either created from scratch, or returned by swarmkit),
    // and will only change the value if the one taken from types.Spec is not nil or 0.
    // In other words, if the value taken from types.Spec is nil or 0, we will maintain the status quo.
    if s.Annotations.Name != "" {
        spec.Annotations.Name = s.Annotations.Name
    }
    if len(s.Annotations.Labels) != 0 {
        spec.Annotations.Labels = s.Annotations.Labels
    }

    if s.Orchestration.TaskHistoryRetentionLimit != nil {
        spec.Orchestration.TaskHistoryRetentionLimit = *s.Orchestration.TaskHistoryRetentionLimit
    }
    if s.Raft.SnapshotInterval != 0 {
        spec.Raft.SnapshotInterval = s.Raft.SnapshotInterval
    }
    if s.Raft.KeepOldSnapshots != nil {
        spec.Raft.KeepOldSnapshots = *s.Raft.KeepOldSnapshots
    }
    if s.Raft.LogEntriesForSlowFollowers != 0 {
        spec.Raft.LogEntriesForSlowFollowers = s.Raft.LogEntriesForSlowFollowers
    }
    if s.Raft.HeartbeatTick != 0 {
        spec.Raft.HeartbeatTick = uint32(s.Raft.HeartbeatTick)
    }
    if s.Raft.ElectionTick != 0 {
        spec.Raft.ElectionTick = uint32(s.Raft.ElectionTick)
    }
    if s.Dispatcher.HeartbeatPeriod != 0 {
        spec.Dispatcher.HeartbeatPeriod = ptypes.DurationProto(time.Duration(s.Dispatcher.HeartbeatPeriod))
    }
    if s.CAConfig.NodeCertExpiry != 0 {
        spec.CAConfig.NodeCertExpiry = ptypes.DurationProto(s.CAConfig.NodeCertExpiry)
    }

    for _, ca := range s.CAConfig.ExternalCAs {
        protocol, ok := swarmapi.ExternalCA_CAProtocol_value[strings.ToUpper(string(ca.Protocol))]
        if !ok {
            return swarmapi.ClusterSpec{}, fmt.Errorf("invalid protocol: %q", ca.Protocol)
        }
        spec.CAConfig.ExternalCAs = append(spec.CAConfig.ExternalCAs, &swarmapi.ExternalCA{
            Protocol: swarmapi.ExternalCA_CAProtocol(protocol),
            URL:      ca.URL,
            Options:  ca.Options,
        })
    }

    spec.EncryptionConfig.AutoLockManagers = s.EncryptionConfig.AutoLockManagers

    return spec, nil
}
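// Illustrative sketch, not part of the original source: it demonstrates the
// status-quo rule described in the comment above, where zero or nil fields in
// the incoming types.Spec leave the initial ClusterSpec untouched.
func exampleMergeKeepsStatusQuo() {
    initial := swarmapi.ClusterSpec{
        Raft: swarmapi.RaftConfig{SnapshotInterval: 10000},
    }
    merged, err := MergeSwarmSpecToGRPC(types.Spec{}, initial)
    if err != nil {
        panic(err)
    }
    fmt.Println(merged.Raft.SnapshotInterval) // 10000: the zero value did not overwrite it
}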
// defaultClusterObject creates a default cluster.
func defaultClusterObject(clusterID string, initialCAConfig api.CAConfig, raftCfg api.RaftConfig, rootCA *ca.RootCA) *api.Cluster {
    return &api.Cluster{
        ID: clusterID,
        Spec: api.ClusterSpec{
            Annotations: api.Annotations{
                Name: store.DefaultClusterName,
            },
            Orchestration: api.OrchestrationConfig{
                TaskHistoryRetentionLimit: defaultTaskHistoryRetentionLimit,
            },
            Dispatcher: api.DispatcherConfig{
                HeartbeatPeriod: ptypes.DurationProto(dispatcher.DefaultHeartBeatPeriod),
            },
            Raft:     raftCfg,
            CAConfig: initialCAConfig,
        },
        RootCA: api.RootCA{
            CAKey:      rootCA.Key,
            CACert:     rootCA.Cert,
            CACertHash: rootCA.Digest.String(),
            JoinTokens: api.JoinTokens{
                Worker:  ca.GenerateJoinToken(rootCA),
                Manager: ca.GenerateJoinToken(rootCA),
            },
        },
    }
}
func TestValidateClusterSpec(t *testing.T) {
    type BadClusterSpec struct {
        spec *api.ClusterSpec
        c    codes.Code
    }

    for _, bad := range []BadClusterSpec{
        {
            spec: nil,
            c:    codes.InvalidArgument,
        },
        {
            spec: &api.ClusterSpec{
                Annotations: api.Annotations{
                    Name: "name",
                },
                CAConfig: api.CAConfig{
                    NodeCertExpiry: ptypes.DurationProto(29 * time.Minute),
                },
            },
            c: codes.InvalidArgument,
        },
        {
            spec: &api.ClusterSpec{
                Annotations: api.Annotations{
                    Name: "name",
                },
                Dispatcher: api.DispatcherConfig{
                    HeartbeatPeriod: ptypes.DurationProto(-29 * time.Minute),
                },
            },
            c: codes.InvalidArgument,
        },
    } {
        err := validateClusterSpec(bad.spec)
        assert.Error(t, err)
        assert.Equal(t, bad.c, grpc.Code(err))
    }

    for _, good := range []*api.ClusterSpec{
        createClusterSpec("name"),
    } {
        err := validateClusterSpec(good)
        assert.NoError(t, err)
    }
}
func containerToGRPC(c types.ContainerSpec) (*swarmapi.ContainerSpec, error) {
    containerSpec := &swarmapi.ContainerSpec{
        Image:   c.Image,
        Labels:  c.Labels,
        Command: c.Command,
        Args:    c.Args,
        Env:     c.Env,
        Dir:     c.Dir,
        User:    c.User,
        Groups:  c.Groups,
    }

    if c.StopGracePeriod != nil {
        containerSpec.StopGracePeriod = ptypes.DurationProto(*c.StopGracePeriod)
    }

    // Mounts
    for _, m := range c.Mounts {
        mount := swarmapi.Mount{
            Target:   m.Target,
            Source:   m.Source,
            ReadOnly: m.ReadOnly,
        }

        if mountType, ok := swarmapi.Mount_MountType_value[strings.ToUpper(string(m.Type))]; ok {
            mount.Type = swarmapi.Mount_MountType(mountType)
        } else if string(m.Type) != "" {
            return nil, fmt.Errorf("invalid MountType: %q", m.Type)
        }

        if m.BindOptions != nil {
            if mountPropagation, ok := swarmapi.Mount_BindOptions_MountPropagation_value[strings.ToUpper(string(m.BindOptions.Propagation))]; ok {
                mount.BindOptions = &swarmapi.Mount_BindOptions{Propagation: swarmapi.Mount_BindOptions_MountPropagation(mountPropagation)}
            } else if string(m.BindOptions.Propagation) != "" {
                return nil, fmt.Errorf("invalid MountPropagation: %q", m.BindOptions.Propagation)
            }
        }

        if m.VolumeOptions != nil {
            mount.VolumeOptions = &swarmapi.Mount_VolumeOptions{
                NoCopy: m.VolumeOptions.NoCopy,
                Labels: m.VolumeOptions.Labels,
            }
            if m.VolumeOptions.DriverConfig != nil {
                mount.VolumeOptions.DriverConfig = &swarmapi.Driver{
                    Name:    m.VolumeOptions.DriverConfig.Name,
                    Options: m.VolumeOptions.DriverConfig.Options,
                }
            }
        }

        containerSpec.Mounts = append(containerSpec.Mounts, mount)
    }

    return containerSpec, nil
}
func createClusterSpec(name string) *api.ClusterSpec {
    return &api.ClusterSpec{
        Annotations: api.Annotations{
            Name: name,
        },
        CAConfig: api.CAConfig{
            NodeCertExpiry: ptypes.DurationProto(ca.DefaultNodeCertExpiration),
        },
    }
}
// SwarmSpecToGRPCandMerge converts a Spec to a grpc ClusterSpec and merges the
// AcceptancePolicy from an existing grpc ClusterSpec if provided.
func SwarmSpecToGRPCandMerge(s types.Spec, existingSpec *swarmapi.ClusterSpec) (swarmapi.ClusterSpec, error) {
    spec := swarmapi.ClusterSpec{
        Annotations: swarmapi.Annotations{
            Name:   s.Name,
            Labels: s.Labels,
        },
        Orchestration: swarmapi.OrchestrationConfig{
            TaskHistoryRetentionLimit: s.Orchestration.TaskHistoryRetentionLimit,
        },
        Raft: swarmapi.RaftConfig{
            SnapshotInterval:           s.Raft.SnapshotInterval,
            KeepOldSnapshots:           s.Raft.KeepOldSnapshots,
            LogEntriesForSlowFollowers: s.Raft.LogEntriesForSlowFollowers,
            HeartbeatTick:              s.Raft.HeartbeatTick,
            ElectionTick:               s.Raft.ElectionTick,
        },
        Dispatcher: swarmapi.DispatcherConfig{
            HeartbeatPeriod: ptypes.DurationProto(time.Duration(s.Dispatcher.HeartbeatPeriod)),
        },
        CAConfig: swarmapi.CAConfig{
            NodeCertExpiry: ptypes.DurationProto(s.CAConfig.NodeCertExpiry),
        },
    }

    for _, ca := range s.CAConfig.ExternalCAs {
        protocol, ok := swarmapi.ExternalCA_CAProtocol_value[strings.ToUpper(string(ca.Protocol))]
        if !ok {
            return swarmapi.ClusterSpec{}, fmt.Errorf("invalid protocol: %q", ca.Protocol)
        }
        spec.CAConfig.ExternalCAs = append(spec.CAConfig.ExternalCAs, &swarmapi.ExternalCA{
            Protocol: swarmapi.ExternalCA_CAProtocol(protocol),
            URL:      ca.URL,
            Options:  ca.Options,
        })
    }

    if err := SwarmSpecUpdateAcceptancePolicy(&spec, s.AcceptancePolicy, existingSpec); err != nil {
        return swarmapi.ClusterSpec{}, err
    }

    return spec, nil
}
func TestValidateRestartPolicy(t *testing.T) {
    bad := []*api.RestartPolicy{
        {
            Delay:  ptypes.DurationProto(time.Duration(-1 * time.Second)),
            Window: ptypes.DurationProto(time.Duration(-1 * time.Second)),
        },
        {
            Delay:  ptypes.DurationProto(time.Duration(20 * time.Second)),
            Window: ptypes.DurationProto(time.Duration(-4 * time.Second)),
        },
    }

    good := []*api.RestartPolicy{
        {
            Delay:  ptypes.DurationProto(time.Duration(10 * time.Second)),
            Window: ptypes.DurationProto(time.Duration(1 * time.Second)),
        },
    }

    for _, b := range bad {
        err := validateRestartPolicy(b)
        assert.Error(t, err)
        assert.Equal(t, codes.InvalidArgument, grpc.Code(err))
    }

    for _, g := range good {
        assert.NoError(t, validateRestartPolicy(g))
    }
}
func genTask(t *testing.T) *api.Task {
    const (
        nodeID    = "dockerexec-test-node-id"
        serviceID = "dockerexec-test-service"
        reference = "stevvooe/foo:latest"
    )

    return &api.Task{
        ID:        identity.NewID(),
        ServiceID: serviceID,
        NodeID:    nodeID,
        Spec: api.TaskSpec{
            Runtime: &api.TaskSpec_Container{
                Container: &api.ContainerSpec{
                    Image:           reference,
                    StopGracePeriod: ptypes.DurationProto(10 * time.Second),
                },
            },
        },
    }
}
// Run starts all manager sub-systems and the gRPC server at the configured
// address.
// The call never returns unless an error occurs or `Stop()` is called.
//
// TODO(aluzzardi): /!\ This function is *way* too complex. /!\
// It needs to be split into smaller manageable functions.
func (m *Manager) Run(parent context.Context) error {
    ctx, ctxCancel := context.WithCancel(parent)
    defer ctxCancel()

    // Harakiri.
    go func() {
        select {
        case <-ctx.Done():
        case <-m.stopped:
            ctxCancel()
        }
    }()

    leadershipCh, cancel := m.RaftNode.SubscribeLeadership()
    defer cancel()

    go func() {
        for leadershipEvent := range leadershipCh {
            // Read out and discard all of the messages when we've stopped.
            // Don't acquire the mutex yet: if stopped is closed, we don't need
            // it, and this keeps the loop from starving Run()'s attempt to Lock.
            select {
            case <-m.stopped:
                continue
            default:
                // do nothing, we're not stopped
            }
            // we're not stopping so NOW acquire the mutex
            m.mu.Lock()
            newState := leadershipEvent.(raft.LeadershipState)

            if newState == raft.IsLeader {
                s := m.RaftNode.MemoryStore()

                rootCA := m.config.SecurityConfig.RootCA()
                nodeID := m.config.SecurityConfig.ClientTLSCreds.NodeID()

                raftCfg := raft.DefaultRaftConfig()
                raftCfg.ElectionTick = uint32(m.RaftNode.Config.ElectionTick)
                raftCfg.HeartbeatTick = uint32(m.RaftNode.Config.HeartbeatTick)

                clusterID := m.config.SecurityConfig.ClientTLSCreds.Organization()

                initialCAConfig := ca.DefaultCAConfig()
                initialCAConfig.ExternalCAs = m.config.ExternalCAs

                s.Update(func(tx store.Tx) error {
                    // Add a default cluster object to the
                    // store. Don't check the error because
                    // we expect this to fail unless this
                    // is a brand new cluster.
                    store.CreateCluster(tx, &api.Cluster{
                        ID: clusterID,
                        Spec: api.ClusterSpec{
                            Annotations: api.Annotations{
                                Name: store.DefaultClusterName,
                            },
                            Orchestration: api.OrchestrationConfig{
                                TaskHistoryRetentionLimit: defaultTaskHistoryRetentionLimit,
                            },
                            Dispatcher: api.DispatcherConfig{
                                HeartbeatPeriod: ptypes.DurationProto(dispatcher.DefaultHeartBeatPeriod),
                            },
                            Raft:     raftCfg,
                            CAConfig: initialCAConfig,
                        },
                        RootCA: api.RootCA{
                            CAKey:      rootCA.Key,
                            CACert:     rootCA.Cert,
                            CACertHash: rootCA.Digest.String(),
                            JoinTokens: api.JoinTokens{
                                Worker:  ca.GenerateJoinToken(rootCA),
                                Manager: ca.GenerateJoinToken(rootCA),
                            },
                        },
                    })
                    // Add Node entry for ourself, if one
                    // doesn't exist already.
                    store.CreateNode(tx, &api.Node{
                        ID: nodeID,
                        Certificate: api.Certificate{
                            CN:   nodeID,
                            Role: api.NodeRoleManager,
                            Status: api.IssuanceStatus{
                                State: api.IssuanceStateIssued,
                            },
                        },
                        Spec: api.NodeSpec{
                            Role:       api.NodeRoleManager,
                            Membership: api.NodeMembershipAccepted,
                        },
                    })
                    return nil
                })

                // Attempt to rotate the key-encrypting-key of the root CA key-material
                err := m.rotateRootCAKEK(ctx, clusterID)
                if err != nil {
                    log.G(ctx).WithError(err).Error("root key-encrypting-key rotation failed")
                }

                m.replicatedOrchestrator = orchestrator.NewReplicatedOrchestrator(s)
                m.globalOrchestrator = orchestrator.NewGlobalOrchestrator(s)
                m.taskReaper = orchestrator.NewTaskReaper(s)
                m.scheduler = scheduler.New(s)
                m.keyManager = keymanager.New(m.RaftNode.MemoryStore(), keymanager.DefaultConfig())

                // TODO(stevvooe): Allocate a context that can be used to
                // shutdown underlying manager processes when leadership is
                // lost.

                m.allocator, err = allocator.New(s)
                if err != nil {
                    log.G(ctx).WithError(err).Error("failed to create allocator")
                    // TODO(stevvooe): It doesn't seem correct here to fail
                    // creating the allocator but then use it anyway.
                }

                if m.keyManager != nil {
                    go func(keyManager *keymanager.KeyManager) {
                        if err := keyManager.Run(ctx); err != nil {
                            log.G(ctx).WithError(err).Error("keymanager failed with an error")
                        }
                    }(m.keyManager)
                }

                go func(d *dispatcher.Dispatcher) {
                    if err := d.Run(ctx); err != nil {
                        log.G(ctx).WithError(err).Error("Dispatcher exited with an error")
                    }
                }(m.Dispatcher)

                go func(server *ca.Server) {
                    if err := server.Run(ctx); err != nil {
                        log.G(ctx).WithError(err).Error("CA signer exited with an error")
                    }
                }(m.caserver)

                // Start all sub-components in separate goroutines.
                // TODO(aluzzardi): This should have some kind of error handling so that
                // any component that goes down would bring the entire manager down.
                if m.allocator != nil {
                    go func(allocator *allocator.Allocator) {
                        if err := allocator.Run(ctx); err != nil {
                            log.G(ctx).WithError(err).Error("allocator exited with an error")
                        }
                    }(m.allocator)
                }

                go func(scheduler *scheduler.Scheduler) {
                    if err := scheduler.Run(ctx); err != nil {
                        log.G(ctx).WithError(err).Error("scheduler exited with an error")
                    }
                }(m.scheduler)

                go func(taskReaper *orchestrator.TaskReaper) {
                    taskReaper.Run()
                }(m.taskReaper)

                go func(orchestrator *orchestrator.ReplicatedOrchestrator) {
                    if err := orchestrator.Run(ctx); err != nil {
                        log.G(ctx).WithError(err).Error("replicated orchestrator exited with an error")
                    }
                }(m.replicatedOrchestrator)

                go func(globalOrchestrator *orchestrator.GlobalOrchestrator) {
                    if err := globalOrchestrator.Run(ctx); err != nil {
                        log.G(ctx).WithError(err).Error("global orchestrator exited with an error")
                    }
                }(m.globalOrchestrator)

            } else if newState == raft.IsFollower {
                m.Dispatcher.Stop()
                m.caserver.Stop()

                if m.allocator != nil {
                    m.allocator.Stop()
                    m.allocator = nil
                }

                m.replicatedOrchestrator.Stop()
                m.replicatedOrchestrator = nil

                m.globalOrchestrator.Stop()
                m.globalOrchestrator = nil

                m.taskReaper.Stop()
                m.taskReaper = nil

                m.scheduler.Stop()
                m.scheduler = nil

                if m.keyManager != nil {
                    m.keyManager.Stop()
                    m.keyManager = nil
                }
            }
            m.mu.Unlock()
        }
    }()

    proxyOpts := []grpc.DialOption{
        grpc.WithTimeout(5 * time.Second),
        grpc.WithTransportCredentials(m.config.SecurityConfig.ClientTLSCreds),
    }

    cs := raftpicker.NewConnSelector(m.RaftNode, proxyOpts...)
    m.connSelector = cs

    // We need a special connSelector for controlapi because it provides automatic
    // leader tracking.
    // Other APIs are using connSelector, which errors out on leader change but
    // allows reacting quickly to reelections.
    controlAPIProxyOpts := []grpc.DialOption{
        grpc.WithBackoffMaxDelay(time.Second),
        grpc.WithTransportCredentials(m.config.SecurityConfig.ClientTLSCreds),
    }
    controlAPIConnSelector := hackpicker.NewConnSelector(m.RaftNode, controlAPIProxyOpts...)

    authorize := func(ctx context.Context, roles []string) error {
        // Authorize the remote roles, ensure they can only be forwarded by managers
        _, err := ca.AuthorizeForwardedRoleAndOrg(ctx, roles, []string{ca.ManagerRole}, m.config.SecurityConfig.ClientTLSCreds.Organization())
        return err
    }

    baseControlAPI := controlapi.NewServer(m.RaftNode.MemoryStore(), m.RaftNode, m.config.SecurityConfig.RootCA())
    healthServer := health.NewHealthServer()

    authenticatedControlAPI := api.NewAuthenticatedWrapperControlServer(baseControlAPI, authorize)
    authenticatedDispatcherAPI := api.NewAuthenticatedWrapperDispatcherServer(m.Dispatcher, authorize)
    authenticatedCAAPI := api.NewAuthenticatedWrapperCAServer(m.caserver, authorize)
    authenticatedNodeCAAPI := api.NewAuthenticatedWrapperNodeCAServer(m.caserver, authorize)
    authenticatedRaftAPI := api.NewAuthenticatedWrapperRaftServer(m.RaftNode, authorize)
    authenticatedHealthAPI := api.NewAuthenticatedWrapperHealthServer(healthServer, authorize)
    authenticatedRaftMembershipAPI := api.NewAuthenticatedWrapperRaftMembershipServer(m.RaftNode, authorize)

    proxyDispatcherAPI := api.NewRaftProxyDispatcherServer(authenticatedDispatcherAPI, cs, m.RaftNode, ca.WithMetadataForwardTLSInfo)
    proxyCAAPI := api.NewRaftProxyCAServer(authenticatedCAAPI, cs, m.RaftNode, ca.WithMetadataForwardTLSInfo)
    proxyNodeCAAPI := api.NewRaftProxyNodeCAServer(authenticatedNodeCAAPI, cs, m.RaftNode, ca.WithMetadataForwardTLSInfo)
    proxyRaftMembershipAPI := api.NewRaftProxyRaftMembershipServer(authenticatedRaftMembershipAPI, cs, m.RaftNode, ca.WithMetadataForwardTLSInfo)

    // localProxyControlAPI is a special kind of proxy. It is only wired up
    // to receive requests from a trusted local socket, and these requests
    // don't use TLS, therefore the requests it handles locally should
    // bypass authorization. When it proxies, it sends them as requests from
    // this manager rather than forwarded requests (it has no TLS
    // information to put in the metadata map).
    forwardAsOwnRequest := func(ctx context.Context) (context.Context, error) { return ctx, nil }
    localProxyControlAPI := api.NewRaftProxyControlServer(baseControlAPI, controlAPIConnSelector, m.RaftNode, forwardAsOwnRequest)

    // Everything registered on m.server should be an authenticated
    // wrapper, or a proxy wrapping an authenticated wrapper!
    api.RegisterCAServer(m.server, proxyCAAPI)
    api.RegisterNodeCAServer(m.server, proxyNodeCAAPI)
    api.RegisterRaftServer(m.server, authenticatedRaftAPI)
    api.RegisterHealthServer(m.server, authenticatedHealthAPI)
    api.RegisterRaftMembershipServer(m.server, proxyRaftMembershipAPI)
    api.RegisterControlServer(m.localserver, localProxyControlAPI)
    api.RegisterControlServer(m.server, authenticatedControlAPI)
    api.RegisterDispatcherServer(m.server, proxyDispatcherAPI)

    errServe := make(chan error, 2)
    for proto, l := range m.listeners {
        go func(proto string, lis net.Listener) {
            ctx := log.WithLogger(ctx, log.G(ctx).WithFields(
                logrus.Fields{
                    "proto": lis.Addr().Network(),
                    "addr":  lis.Addr().String()}))
            if proto == "unix" {
                log.G(ctx).Info("Listening for local connections")
                // we need to disallow double closes because UnixListener.Close
                // can delete unix-socket file of newer listener. grpc calls
                // Close twice indeed: in Serve and in Stop.
                errServe <- m.localserver.Serve(&closeOnceListener{Listener: lis})
            } else {
                log.G(ctx).Info("Listening for connections")
                errServe <- m.server.Serve(lis)
            }
        }(proto, l)
    }

    // Set the raft server as serving for the health server
    healthServer.SetServingStatus("Raft", api.HealthCheckResponse_SERVING)

    if err := m.RaftNode.JoinAndStart(); err != nil {
        for _, lis := range m.listeners {
            lis.Close()
        }
        return fmt.Errorf("can't initialize raft node: %v", err)
    }

    close(m.started)

    go func() {
        err := m.RaftNode.Run(ctx)
        if err != nil {
            log.G(ctx).Error(err)
            m.Stop(ctx)
        }
    }()

    if err := raft.WaitForLeader(ctx, m.RaftNode); err != nil {
        m.server.Stop()
        return err
    }

    c, err := raft.WaitForCluster(ctx, m.RaftNode)
    if err != nil {
        m.server.Stop()
        return err
    }
    raftConfig := c.Spec.Raft

    if int(raftConfig.ElectionTick) != m.RaftNode.Config.ElectionTick {
        log.G(ctx).Warningf("election tick value (%ds) is different from the one defined in the cluster config (%vs), the cluster may be unstable", m.RaftNode.Config.ElectionTick, raftConfig.ElectionTick)
    }
    if int(raftConfig.HeartbeatTick) != m.RaftNode.Config.HeartbeatTick {
        log.G(ctx).Warningf("heartbeat tick value (%ds) is different from the one defined in the cluster config (%vs), the cluster may be unstable", m.RaftNode.Config.HeartbeatTick, raftConfig.HeartbeatTick)
    }

    // wait for an error in serving.
    err = <-errServe
    select {
    // check to see if stopped was posted to. if so, we're in the process of
    // stopping, or done and that's why we got the error. if stopping is
    // deliberate, stopped will ALWAYS be closed before the error is triggered,
    // so this path will ALWAYS be taken if the stop was deliberate
    case <-m.stopped:
        // shutdown was requested, do not return an error
        // but first, we wait to acquire a mutex to guarantee that stopping is
        // finished. as long as we acquire the mutex BEFORE we return, we know
        // that stopping is stopped.
        m.mu.Lock()
        m.mu.Unlock()
        return nil

    // otherwise, we'll get something from errServe, which indicates that an
    // error in serving has actually occurred and this isn't a planned shutdown
    default:
        return err
    }
}
func TestUpdaterRollback(t *testing.T) {
    ctx := context.Background()
    s := store.NewMemoryStore(nil)
    assert.NotNil(t, s)
    defer s.Close()

    orchestrator := NewReplicatedOrchestrator(s)
    defer orchestrator.Stop()

    var (
        failImage1 uint32
        failImage2 uint32
    )

    watchCreate, cancelCreate := state.Watch(s.WatchQueue(), state.EventCreateTask{})
    defer cancelCreate()

    watchServiceUpdate, cancelServiceUpdate := state.Watch(s.WatchQueue(), state.EventUpdateService{})
    defer cancelServiceUpdate()

    // Fail new tasks the updater tries to run
    watchUpdate, cancelUpdate := state.Watch(s.WatchQueue(), state.EventUpdateTask{})
    defer cancelUpdate()
    go func() {
        failedLast := false
        for {
            select {
            case e := <-watchUpdate:
                task := e.(state.EventUpdateTask).Task
                if task.DesiredState == task.Status.State {
                    continue
                }
                if task.DesiredState == api.TaskStateRunning && task.Status.State != api.TaskStateFailed && task.Status.State != api.TaskStateRunning {
                    err := s.Update(func(tx store.Tx) error {
                        task = store.GetTask(tx, task.ID)
                        // Never fail two image2 tasks in a row, so there's a mix of
                        // failed and successful tasks for the rollback.
                        if task.Spec.GetContainer().Image == "image1" && atomic.LoadUint32(&failImage1) == 1 {
                            task.Status.State = api.TaskStateFailed
                            failedLast = true
                        } else if task.Spec.GetContainer().Image == "image2" && atomic.LoadUint32(&failImage2) == 1 && !failedLast {
                            task.Status.State = api.TaskStateFailed
                            failedLast = true
                        } else {
                            task.Status.State = task.DesiredState
                            failedLast = false
                        }
                        return store.UpdateTask(tx, task)
                    })
                    assert.NoError(t, err)
                } else if task.DesiredState > api.TaskStateRunning {
                    err := s.Update(func(tx store.Tx) error {
                        task = store.GetTask(tx, task.ID)
                        task.Status.State = task.DesiredState
                        return store.UpdateTask(tx, task)
                    })
                    assert.NoError(t, err)
                }
            }
        }
    }()

    // Create a service with four replicas specified before the orchestrator
    // is started. This should result in four tasks when the orchestrator
    // starts up.
    err := s.Update(func(tx store.Tx) error {
        s1 := &api.Service{
            ID: "id1",
            Spec: api.ServiceSpec{
                Annotations: api.Annotations{
                    Name: "name1",
                },
                Task: api.TaskSpec{
                    Runtime: &api.TaskSpec_Container{
                        Container: &api.ContainerSpec{
                            Image: "image1",
                        },
                    },
                    Restart: &api.RestartPolicy{
                        Condition: api.RestartOnNone,
                    },
                },
                Mode: &api.ServiceSpec_Replicated{
                    Replicated: &api.ReplicatedService{
                        Replicas: 4,
                    },
                },
                Update: &api.UpdateConfig{
                    FailureAction:   api.UpdateConfig_ROLLBACK,
                    Parallelism:     1,
                    Delay:           *ptypes.DurationProto(10 * time.Millisecond),
                    Monitor:         ptypes.DurationProto(500 * time.Millisecond),
                    MaxFailureRatio: 0.4,
                },
            },
        }
        assert.NoError(t, store.CreateService(tx, s1))
        return nil
    })
    assert.NoError(t, err)

    // Start the orchestrator.
    go func() {
        assert.NoError(t, orchestrator.Run(ctx))
    }()

    observedTask := testutils.WatchTaskCreate(t, watchCreate)
    assert.Equal(t, observedTask.Status.State, api.TaskStateNew)
    assert.Equal(t, observedTask.Spec.GetContainer().Image, "image1")

    observedTask = testutils.WatchTaskCreate(t, watchCreate)
    assert.Equal(t, observedTask.Status.State, api.TaskStateNew)
    assert.Equal(t, observedTask.Spec.GetContainer().Image, "image1")

    observedTask = testutils.WatchTaskCreate(t, watchCreate)
    assert.Equal(t, observedTask.Status.State, api.TaskStateNew)
    assert.Equal(t, observedTask.Spec.GetContainer().Image, "image1")

    observedTask = testutils.WatchTaskCreate(t, watchCreate)
    assert.Equal(t, observedTask.Status.State, api.TaskStateNew)
    assert.Equal(t, observedTask.Spec.GetContainer().Image, "image1")

    atomic.StoreUint32(&failImage2, 1)

    // Start a rolling update
    err = s.Update(func(tx store.Tx) error {
        s1 := store.GetService(tx, "id1")
        require.NotNil(t, s1)
        s1.PreviousSpec = s1.Spec.Copy()
        s1.UpdateStatus = nil
        s1.Spec.Task.GetContainer().Image = "image2"
        assert.NoError(t, store.UpdateService(tx, s1))
        return nil
    })
    assert.NoError(t, err)

    // Should see three tasks started, then a rollback

    observedTask = testutils.WatchTaskCreate(t, watchCreate)
    assert.Equal(t, observedTask.Status.State, api.TaskStateNew)
    assert.Equal(t, observedTask.Spec.GetContainer().Image, "image2")

    observedTask = testutils.WatchTaskCreate(t, watchCreate)
    assert.Equal(t, observedTask.Status.State, api.TaskStateNew)
    assert.Equal(t, observedTask.Spec.GetContainer().Image, "image2")

    observedTask = testutils.WatchTaskCreate(t, watchCreate)
    assert.Equal(t, observedTask.Status.State, api.TaskStateNew)
    assert.Equal(t, observedTask.Spec.GetContainer().Image, "image2")

    // Should get to the ROLLBACK_STARTED state
    for {
        e := <-watchServiceUpdate
        if e.(state.EventUpdateService).Service.UpdateStatus == nil {
            continue
        }
        if e.(state.EventUpdateService).Service.UpdateStatus.State == api.UpdateStatus_ROLLBACK_STARTED {
            break
        }
    }

    observedTask = testutils.WatchTaskCreate(t, watchCreate)
    assert.Equal(t, observedTask.Status.State, api.TaskStateNew)
    assert.Equal(t, observedTask.Spec.GetContainer().Image, "image1")

    observedTask = testutils.WatchTaskCreate(t, watchCreate)
    assert.Equal(t, observedTask.Status.State, api.TaskStateNew)
    assert.Equal(t, observedTask.Spec.GetContainer().Image, "image1")

    observedTask = testutils.WatchTaskCreate(t, watchCreate)
    assert.Equal(t, observedTask.Status.State, api.TaskStateNew)
    assert.Equal(t, observedTask.Spec.GetContainer().Image, "image1")

    // Should end up in ROLLBACK_COMPLETED state
    for {
        e := <-watchServiceUpdate
        if e.(state.EventUpdateService).Service.UpdateStatus.State == api.UpdateStatus_ROLLBACK_COMPLETED {
            break
        }
    }

    atomic.StoreUint32(&failImage1, 1)

    // Repeat the rolling update but this time fail the tasks that the
    // rollback creates. It should end up in ROLLBACK_PAUSED.
    err = s.Update(func(tx store.Tx) error {
        s1 := store.GetService(tx, "id1")
        require.NotNil(t, s1)
        s1.PreviousSpec = s1.Spec.Copy()
        s1.UpdateStatus = nil
        s1.Spec.Task.GetContainer().Image = "image2"
        assert.NoError(t, store.UpdateService(tx, s1))
        return nil
    })
    assert.NoError(t, err)

    // Should see three tasks started, then a rollback

    observedTask = testutils.WatchTaskCreate(t, watchCreate)
    assert.Equal(t, observedTask.Status.State, api.TaskStateNew)
    assert.Equal(t, observedTask.Spec.GetContainer().Image, "image2")

    observedTask = testutils.WatchTaskCreate(t, watchCreate)
    assert.Equal(t, observedTask.Status.State, api.TaskStateNew)
    assert.Equal(t, observedTask.Spec.GetContainer().Image, "image2")

    observedTask = testutils.WatchTaskCreate(t, watchCreate)
    assert.Equal(t, observedTask.Status.State, api.TaskStateNew)
    assert.Equal(t, observedTask.Spec.GetContainer().Image, "image2")

    // Should get to the ROLLBACK_STARTED state
    for {
        e := <-watchServiceUpdate
        if e.(state.EventUpdateService).Service.UpdateStatus == nil {
            continue
        }
        if e.(state.EventUpdateService).Service.UpdateStatus.State == api.UpdateStatus_ROLLBACK_STARTED {
            break
        }
    }

    observedTask = testutils.WatchTaskCreate(t, watchCreate)
    assert.Equal(t, observedTask.Status.State, api.TaskStateNew)
    assert.Equal(t, observedTask.Spec.GetContainer().Image, "image1")

    observedTask = testutils.WatchTaskCreate(t, watchCreate)
    assert.Equal(t, observedTask.Status.State, api.TaskStateNew)
    assert.Equal(t, observedTask.Spec.GetContainer().Image, "image1")

    observedTask = testutils.WatchTaskCreate(t, watchCreate)
    assert.Equal(t, observedTask.Status.State, api.TaskStateNew)
    assert.Equal(t, observedTask.Spec.GetContainer().Image, "image1")

    // Should end up in ROLLBACK_PAUSED state
    for {
        e := <-watchServiceUpdate
        if e.(state.EventUpdateService).Service.UpdateStatus.State == api.UpdateStatus_ROLLBACK_PAUSED {
            break
        }
    }
}
// DefaultCAConfig returns the default CA Config, with a default expiration.
func DefaultCAConfig() api.CAConfig {
    return api.CAConfig{
        NodeCertExpiry: ptypes.DurationProto(DefaultNodeCertExpiration),
    }
}
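// Illustrative sketch, not part of the original source: the swarmkit ptypes
// package pairs DurationProto with a Duration helper that converts a proto
// Duration message back to a time.Duration, returning (time.Duration, error).
// This round-trips the default expiry; the helper's availability here is an
// assumption about the surrounding package's imports.
func exampleDefaultExpiryRoundTrip() {
    cfg := DefaultCAConfig()
    expiry, err := ptypes.Duration(cfg.NodeCertExpiry)
    if err != nil {
        panic(err)
    }
    fmt.Println(expiry == DefaultNodeCertExpiration) // true
}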
func TestUpdaterFailureAction(t *testing.T) {
    ctx := context.Background()
    s := store.NewMemoryStore(nil)
    assert.NotNil(t, s)
    defer s.Close()

    // Fail new tasks the updater tries to run
    watch, cancel := state.Watch(s.WatchQueue(), state.EventUpdateTask{})
    defer cancel()
    go func() {
        for {
            select {
            case e := <-watch:
                task := e.(state.EventUpdateTask).Task
                if task.DesiredState == api.TaskStateRunning && task.Status.State != api.TaskStateFailed {
                    err := s.Update(func(tx store.Tx) error {
                        task = store.GetTask(tx, task.ID)
                        task.Status.State = api.TaskStateFailed
                        return store.UpdateTask(tx, task)
                    })
                    assert.NoError(t, err)
                } else if task.DesiredState > api.TaskStateRunning {
                    err := s.Update(func(tx store.Tx) error {
                        task = store.GetTask(tx, task.ID)
                        task.Status.State = task.DesiredState
                        return store.UpdateTask(tx, task)
                    })
                    assert.NoError(t, err)
                }
            }
        }
    }()

    instances := 3
    cluster := &api.Cluster{
        Spec: api.ClusterSpec{
            Annotations: api.Annotations{
                Name: "default",
            },
        },
    }

    service := &api.Service{
        ID: "id1",
        Spec: api.ServiceSpec{
            Annotations: api.Annotations{
                Name: "name1",
            },
            Mode: &api.ServiceSpec_Replicated{
                Replicated: &api.ReplicatedService{
                    Replicas: uint64(instances),
                },
            },
            Task: api.TaskSpec{
                Runtime: &api.TaskSpec_Container{
                    Container: &api.ContainerSpec{
                        Image: "v:1",
                        // This won't apply in this test because we set the old tasks to DEAD.
                        StopGracePeriod: ptypes.DurationProto(time.Hour),
                    },
                },
            },
            Update: &api.UpdateConfig{
                FailureAction: api.UpdateConfig_PAUSE,
                Parallelism:   1,
                Delay:         *ptypes.DurationProto(500 * time.Millisecond),
            },
        },
    }

    err := s.Update(func(tx store.Tx) error {
        assert.NoError(t, store.CreateCluster(tx, cluster))
        assert.NoError(t, store.CreateService(tx, service))
        for i := 0; i < instances; i++ {
            assert.NoError(t, store.CreateTask(tx, newTask(cluster, service, uint64(i))))
        }
        return nil
    })
    assert.NoError(t, err)

    originalTasks := getRunnableSlotSlice(t, s, service)
    for _, slot := range originalTasks {
        for _, task := range slot {
            assert.Equal(t, "v:1", task.Spec.GetContainer().Image)
        }
    }

    service.Spec.Task.GetContainer().Image = "v:2"
    updater := NewUpdater(s, NewRestartSupervisor(s), cluster, service)
    updater.Run(ctx, getRunnableSlotSlice(t, s, service))
    updatedTasks := getRunnableSlotSlice(t, s, service)
    v1Counter := 0
    v2Counter := 0
    for _, slot := range updatedTasks {
        for _, task := range slot {
            if task.Spec.GetContainer().Image == "v:1" {
                v1Counter++
            } else if task.Spec.GetContainer().Image == "v:2" {
                v2Counter++
            }
        }
    }
    assert.Equal(t, instances-1, v1Counter)
    assert.Equal(t, 1, v2Counter)

    s.View(func(tx store.ReadTx) {
        service = store.GetService(tx, service.ID)
    })
    assert.Equal(t, api.UpdateStatus_PAUSED, service.UpdateStatus.State)

    // Updating again should do nothing while the update is PAUSED
    updater = NewUpdater(s, NewRestartSupervisor(s), cluster, service)
    updater.Run(ctx, getRunnableSlotSlice(t, s, service))
    updatedTasks = getRunnableSlotSlice(t, s, service)
    v1Counter = 0
    v2Counter = 0
    for _, slot := range updatedTasks {
        for _, task := range slot {
            if task.Spec.GetContainer().Image == "v:1" {
                v1Counter++
            } else if task.Spec.GetContainer().Image == "v:2" {
                v2Counter++
            }
        }
    }
    assert.Equal(t, instances-1, v1Counter)
    assert.Equal(t, 1, v2Counter)

    // Switch to a service with FailureAction: CONTINUE
    err = s.Update(func(tx store.Tx) error {
        service = store.GetService(tx, service.ID)
        service.Spec.Update.FailureAction = api.UpdateConfig_CONTINUE
        service.UpdateStatus = nil
        assert.NoError(t, store.UpdateService(tx, service))
        return nil
    })
    assert.NoError(t, err)

    service.Spec.Task.GetContainer().Image = "v:3"
    updater = NewUpdater(s, NewRestartSupervisor(s), cluster, service)
    updater.Run(ctx, getRunnableSlotSlice(t, s, service))
    updatedTasks = getRunnableSlotSlice(t, s, service)
    v2Counter = 0
    v3Counter := 0
    for _, slot := range updatedTasks {
        for _, task := range slot {
            if task.Spec.GetContainer().Image == "v:2" {
                v2Counter++
            } else if task.Spec.GetContainer().Image == "v:3" {
                v3Counter++
            }
        }
    }
    assert.Equal(t, 0, v2Counter)
    assert.Equal(t, instances, v3Counter)
}
func TestOrchestratorRestartOnAny(t *testing.T) {
    ctx := context.Background()
    s := store.NewMemoryStore(nil)
    assert.NotNil(t, s)

    orchestrator := NewReplicatedOrchestrator(s)
    defer orchestrator.Stop()

    watch, cancel := state.Watch(s.WatchQueue() /*state.EventCreateTask{}, state.EventUpdateTask{}*/)
    defer cancel()

    // Create a service with two instances specified before the orchestrator is
    // started. This should result in two tasks when the orchestrator
    // starts up.
    err := s.Update(func(tx store.Tx) error {
        j1 := &api.Service{
            ID: "id1",
            Spec: api.ServiceSpec{
                Annotations: api.Annotations{
                    Name: "name1",
                },
                Task: api.TaskSpec{
                    Runtime: &api.TaskSpec_Container{
                        Container: &api.ContainerSpec{},
                    },
                    Restart: &api.RestartPolicy{
                        Condition: api.RestartOnAny,
                        Delay:     ptypes.DurationProto(0),
                    },
                },
                Mode: &api.ServiceSpec_Replicated{
                    Replicated: &api.ReplicatedService{
                        Replicas: 2,
                    },
                },
            },
        }
        assert.NoError(t, store.CreateService(tx, j1))
        return nil
    })
    assert.NoError(t, err)

    // Start the orchestrator.
    go func() {
        assert.NoError(t, orchestrator.Run(ctx))
    }()

    observedTask1 := watchTaskCreate(t, watch)
    assert.Equal(t, observedTask1.Status.State, api.TaskStateNew)
    assert.Equal(t, observedTask1.ServiceAnnotations.Name, "name1")

    observedTask2 := watchTaskCreate(t, watch)
    assert.Equal(t, observedTask2.Status.State, api.TaskStateNew)
    assert.Equal(t, observedTask2.ServiceAnnotations.Name, "name1")

    // Fail the first task. Confirm that it gets restarted.
    updatedTask1 := observedTask1.Copy()
    updatedTask1.Status = api.TaskStatus{State: api.TaskStateFailed}
    err = s.Update(func(tx store.Tx) error {
        assert.NoError(t, store.UpdateTask(tx, updatedTask1))
        return nil
    })
    assert.NoError(t, err)

    expectCommit(t, watch)
    expectTaskUpdate(t, watch)
    expectCommit(t, watch)
    expectTaskUpdate(t, watch)

    observedTask3 := watchTaskCreate(t, watch)
    assert.Equal(t, observedTask3.Status.State, api.TaskStateNew)
    assert.Equal(t, observedTask3.ServiceAnnotations.Name, "name1")

    expectCommit(t, watch)

    observedTask4 := watchTaskUpdate(t, watch)
    assert.Equal(t, observedTask4.DesiredState, api.TaskStateRunning)
    assert.Equal(t, observedTask4.ServiceAnnotations.Name, "name1")

    // Mark the second task as completed. Confirm that it gets restarted.
    updatedTask2 := observedTask2.Copy()
    updatedTask2.Status = api.TaskStatus{State: api.TaskStateCompleted}
    err = s.Update(func(tx store.Tx) error {
        assert.NoError(t, store.UpdateTask(tx, updatedTask2))
        return nil
    })
    assert.NoError(t, err)

    expectCommit(t, watch)
    expectTaskUpdate(t, watch)
    expectCommit(t, watch)
    expectTaskUpdate(t, watch)

    observedTask5 := watchTaskCreate(t, watch)
    assert.Equal(t, observedTask5.Status.State, api.TaskStateNew)
    assert.Equal(t, observedTask5.ServiceAnnotations.Name, "name1")

    expectCommit(t, watch)

    observedTask6 := watchTaskUpdate(t, watch)
    assert.Equal(t, observedTask6.DesiredState, api.TaskStateRunning)
    assert.Equal(t, observedTask6.ServiceAnnotations.Name, "name1")
}
        if err != nil {
            return err
        }
        for _, policy := range spec.AcceptancePolicy.Policies {
            policy.Secret = &api.AcceptancePolicy_RoleAdmissionPolicy_HashedSecret{
                Data: hashedSecret,
                Alg:  "bcrypt",
            }
        }
    }

    if flags.Changed("certexpiry") {
        cePeriod, err := flags.GetDuration("certexpiry")
        if err != nil {
            return err
        }
        ceProtoPeriod := ptypes.DurationProto(cePeriod)
        spec.CAConfig.NodeCertExpiry = ceProtoPeriod
    }

    if flags.Changed("taskhistory") {
        taskHistory, err := flags.GetInt64("taskhistory")
        if err != nil {
            return err
        }
        spec.Orchestration.TaskHistoryRetentionLimit = taskHistory
    }

    if flags.Changed("heartbeatperiod") {
        hbPeriod, err := flags.GetDuration("heartbeatperiod")
        if err != nil {
            return err
        }
        spec.Dispatcher.HeartbeatPeriod = ptypes.DurationProto(hbPeriod)
func TestUpdater(t *testing.T) {
    ctx := context.Background()
    s := store.NewMemoryStore(nil)
    assert.NotNil(t, s)

    // Move tasks to their desired state.
    watch, cancel := state.Watch(s.WatchQueue(), state.EventUpdateTask{})
    defer cancel()
    go func() {
        for {
            select {
            case e := <-watch:
                task := e.(state.EventUpdateTask).Task
                if task.Status.State == task.DesiredState {
                    continue
                }
                err := s.Update(func(tx store.Tx) error {
                    task = store.GetTask(tx, task.ID)
                    task.Status.State = task.DesiredState
                    return store.UpdateTask(tx, task)
                })
                assert.NoError(t, err)
            }
        }
    }()

    instances := 3
    cluster := &api.Cluster{
        // test cluster configuration propagation to task creation.
        Spec: api.ClusterSpec{
            Annotations: api.Annotations{
                Name: "default",
            },
        },
    }

    service := &api.Service{
        ID: "id1",
        Spec: api.ServiceSpec{
            Annotations: api.Annotations{
                Name: "name1",
            },
            Mode: &api.ServiceSpec_Replicated{
                Replicated: &api.ReplicatedService{
                    Replicas: uint64(instances),
                },
            },
            Task: api.TaskSpec{
                Runtime: &api.TaskSpec_Container{
                    Container: &api.ContainerSpec{
                        Image: "v:1",
                        // This won't apply in this test because we set the old tasks to DEAD.
                        StopGracePeriod: ptypes.DurationProto(time.Hour),
                    },
                },
            },
        },
    }

    err := s.Update(func(tx store.Tx) error {
        assert.NoError(t, store.CreateCluster(tx, cluster))
        assert.NoError(t, store.CreateService(tx, service))
        for i := 0; i < instances; i++ {
            assert.NoError(t, store.CreateTask(tx, newTask(cluster, service, uint64(i))))
        }
        return nil
    })
    assert.NoError(t, err)

    originalTasks := getRunnableServiceTasks(t, s, service)
    for _, task := range originalTasks {
        assert.Equal(t, "v:1", task.Spec.GetContainer().Image)
        assert.Nil(t, task.LogDriver) // should be left alone
    }

    service.Spec.Task.GetContainer().Image = "v:2"
    service.Spec.Task.LogDriver = &api.Driver{Name: "tasklogdriver"}
    updater := NewUpdater(s, NewRestartSupervisor(s))
    updater.Run(ctx, cluster, service, getRunnableServiceTasks(t, s, service))
    updatedTasks := getRunnableServiceTasks(t, s, service)
    for _, task := range updatedTasks {
        assert.Equal(t, "v:2", task.Spec.GetContainer().Image)
        assert.Equal(t, service.Spec.Task.LogDriver, task.LogDriver) // pick up from task
    }

    service.Spec.Task.GetContainer().Image = "v:3"
    cluster.Spec.DefaultLogDriver = &api.Driver{Name: "clusterlogdriver"} // make cluster default logdriver.
    service.Spec.Update = &api.UpdateConfig{
        Parallelism: 1,
    }
    updater = NewUpdater(s, NewRestartSupervisor(s))
    updater.Run(ctx, cluster, service, getRunnableServiceTasks(t, s, service))
    updatedTasks = getRunnableServiceTasks(t, s, service)
    for _, task := range updatedTasks {
        assert.Equal(t, "v:3", task.Spec.GetContainer().Image)
        assert.Equal(t, service.Spec.Task.LogDriver, task.LogDriver) // still pick up from task
    }

    service.Spec.Task.GetContainer().Image = "v:4"
    service.Spec.Task.LogDriver = nil // use cluster default now.
    service.Spec.Update = &api.UpdateConfig{
        Parallelism: 1,
        Delay:       *ptypes.DurationProto(10 * time.Millisecond),
    }
    updater = NewUpdater(s, NewRestartSupervisor(s))
    updater.Run(ctx, cluster, service, getRunnableServiceTasks(t, s, service))
    updatedTasks = getRunnableServiceTasks(t, s, service)
    for _, task := range updatedTasks {
        assert.Equal(t, "v:4", task.Spec.GetContainer().Image)
        assert.Equal(t, cluster.Spec.DefaultLogDriver, task.LogDriver) // pick up from cluster
    }
}
func TestUpdaterStopGracePeriod(t *testing.T) {
    ctx := context.Background()
    s := store.NewMemoryStore(nil)
    assert.NotNil(t, s)

    // Move tasks to their desired state.
    watch, cancel := state.Watch(s.WatchQueue(), state.EventUpdateTask{})
    defer cancel()
    go func() {
        for {
            select {
            case e := <-watch:
                task := e.(state.EventUpdateTask).Task
                err := s.Update(func(tx store.Tx) error {
                    task = store.GetTask(tx, task.ID)
                    // Explicitly do not set task state to
                    // DEAD to trigger StopGracePeriod
                    if task.DesiredState == api.TaskStateRunning && task.Status.State != api.TaskStateRunning {
                        task.Status.State = api.TaskStateRunning
                        return store.UpdateTask(tx, task)
                    }
                    return nil
                })
                assert.NoError(t, err)
            }
        }
    }()

    var instances uint64 = 3
    service := &api.Service{
        ID: "id1",
        Spec: api.ServiceSpec{
            Annotations: api.Annotations{
                Name: "name1",
            },
            Task: api.TaskSpec{
                Runtime: &api.TaskSpec_Container{
                    Container: &api.ContainerSpec{
                        Image:           "v:1",
                        StopGracePeriod: ptypes.DurationProto(100 * time.Millisecond),
                    },
                },
            },
            Mode: &api.ServiceSpec_Replicated{
                Replicated: &api.ReplicatedService{
                    Replicas: instances,
                },
            },
        },
    }

    err := s.Update(func(tx store.Tx) error {
        assert.NoError(t, store.CreateService(tx, service))
        for i := uint64(0); i < instances; i++ {
            task := newTask(nil, service, uint64(i))
            task.Status.State = api.TaskStateRunning
            assert.NoError(t, store.CreateTask(tx, task))
        }
        return nil
    })
    assert.NoError(t, err)

    originalTasks := getRunnableServiceTasks(t, s, service)
    for _, task := range originalTasks {
        assert.Equal(t, "v:1", task.Spec.GetContainer().Image)
    }

    before := time.Now()

    service.Spec.Task.GetContainer().Image = "v:2"
    updater := NewUpdater(s, NewRestartSupervisor(s))
    // Override the default (1 minute) to speed up the test.
    updater.restarts.taskTimeout = 100 * time.Millisecond
    updater.Run(ctx, nil, service, getRunnableServiceTasks(t, s, service))
    updatedTasks := getRunnableServiceTasks(t, s, service)
    for _, task := range updatedTasks {
        assert.Equal(t, "v:2", task.Spec.GetContainer().Image)
    }

    after := time.Now()

    // At least 100 ms should have elapsed. Only check the lower bound,
    // because the system may be slow and it could have taken longer.
    if after.Sub(before) < 100*time.Millisecond {
        t.Fatal("stop timeout should have elapsed")
    }
}
func containerToGRPC(c types.ContainerSpec) (*swarmapi.ContainerSpec, error) {
    containerSpec := &swarmapi.ContainerSpec{
        Image:     c.Image,
        Labels:    c.Labels,
        Command:   c.Command,
        Args:      c.Args,
        Hostname:  c.Hostname,
        Env:       c.Env,
        Dir:       c.Dir,
        User:      c.User,
        Groups:    c.Groups,
        TTY:       c.TTY,
        OpenStdin: c.OpenStdin,
        Hosts:     c.Hosts,
        Secrets:   secretReferencesToGRPC(c.Secrets),
    }

    if c.DNSConfig != nil {
        containerSpec.DNSConfig = &swarmapi.ContainerSpec_DNSConfig{
            Nameservers: c.DNSConfig.Nameservers,
            Search:      c.DNSConfig.Search,
            Options:     c.DNSConfig.Options,
        }
    }

    if c.StopGracePeriod != nil {
        containerSpec.StopGracePeriod = ptypes.DurationProto(*c.StopGracePeriod)
    }

    // Mounts
    for _, m := range c.Mounts {
        mount := swarmapi.Mount{
            Target:   m.Target,
            Source:   m.Source,
            ReadOnly: m.ReadOnly,
        }

        if mountType, ok := swarmapi.Mount_MountType_value[strings.ToUpper(string(m.Type))]; ok {
            mount.Type = swarmapi.Mount_MountType(mountType)
        } else if string(m.Type) != "" {
            return nil, fmt.Errorf("invalid MountType: %q", m.Type)
        }

        if m.BindOptions != nil {
            if mountPropagation, ok := swarmapi.Mount_BindOptions_MountPropagation_value[strings.ToUpper(string(m.BindOptions.Propagation))]; ok {
                mount.BindOptions = &swarmapi.Mount_BindOptions{Propagation: swarmapi.Mount_BindOptions_MountPropagation(mountPropagation)}
            } else if string(m.BindOptions.Propagation) != "" {
                return nil, fmt.Errorf("invalid MountPropagation: %q", m.BindOptions.Propagation)
            }
        }

        if m.VolumeOptions != nil {
            mount.VolumeOptions = &swarmapi.Mount_VolumeOptions{
                NoCopy: m.VolumeOptions.NoCopy,
                Labels: m.VolumeOptions.Labels,
            }
            if m.VolumeOptions.DriverConfig != nil {
                mount.VolumeOptions.DriverConfig = &swarmapi.Driver{
                    Name:    m.VolumeOptions.DriverConfig.Name,
                    Options: m.VolumeOptions.DriverConfig.Options,
                }
            }
        }

        if m.TmpfsOptions != nil {
            mount.TmpfsOptions = &swarmapi.Mount_TmpfsOptions{
                SizeBytes: m.TmpfsOptions.SizeBytes,
                Mode:      m.TmpfsOptions.Mode,
            }
        }

        containerSpec.Mounts = append(containerSpec.Mounts, mount)
    }

    if c.Healthcheck != nil {
        containerSpec.Healthcheck = healthConfigToGRPC(c.Healthcheck)
    }

    return containerSpec, nil
}
func TestOrchestratorRestartWindow(t *testing.T) {
    ctx := context.Background()
    s := store.NewMemoryStore(nil)
    assert.NotNil(t, s)

    orchestrator := NewReplicatedOrchestrator(s)
    defer orchestrator.Stop()

    watch, cancel := state.Watch(s.WatchQueue() /*state.EventCreateTask{}, state.EventUpdateTask{}*/)
    defer cancel()

    // Create a service with two instances specified before the orchestrator is
    // started. This should result in two tasks when the orchestrator
    // starts up.
    err := s.Update(func(tx store.Tx) error {
        j1 := &api.Service{
            ID: "id1",
            Spec: api.ServiceSpec{
                Annotations: api.Annotations{
                    Name: "name1",
                },
                Mode: &api.ServiceSpec_Replicated{
                    Replicated: &api.ReplicatedService{
                        Replicas: 2,
                    },
                },
                Task: api.TaskSpec{
                    Restart: &api.RestartPolicy{
                        Condition:   api.RestartOnAny,
                        Delay:       ptypes.DurationProto(100 * time.Millisecond),
                        MaxAttempts: 1,
                        Window:      ptypes.DurationProto(500 * time.Millisecond),
                    },
                },
            },
        }
        assert.NoError(t, store.CreateService(tx, j1))
        return nil
    })
    assert.NoError(t, err)

    // Start the orchestrator.
    go func() {
        assert.NoError(t, orchestrator.Run(ctx))
    }()

    observedTask1 := watchTaskCreate(t, watch)
    assert.Equal(t, observedTask1.Status.State, api.TaskStateNew)
    assert.Equal(t, observedTask1.ServiceAnnotations.Name, "name1")

    observedTask2 := watchTaskCreate(t, watch)
    assert.Equal(t, observedTask2.Status.State, api.TaskStateNew)
    assert.Equal(t, observedTask2.ServiceAnnotations.Name, "name1")

    // Fail the first task. Confirm that it gets restarted.
    updatedTask1 := observedTask1.Copy()
    updatedTask1.Status = api.TaskStatus{State: api.TaskStateFailed}
    before := time.Now()
    err = s.Update(func(tx store.Tx) error {
        assert.NoError(t, store.UpdateTask(tx, updatedTask1))
        return nil
    })
    assert.NoError(t, err)

    expectCommit(t, watch)
    expectTaskUpdate(t, watch)
    expectCommit(t, watch)
    expectTaskUpdate(t, watch)
    observedTask3 := watchTaskCreate(t, watch)
    expectCommit(t, watch)
    assert.Equal(t, observedTask3.Status.State, api.TaskStateNew)
    assert.Equal(t, observedTask3.DesiredState, api.TaskStateReady)
    assert.Equal(t, observedTask3.ServiceAnnotations.Name, "name1")

    observedTask4 := watchTaskUpdate(t, watch)
    after := time.Now()

    // At least 100 ms should have elapsed. Only check the lower bound,
    // because the system may be slow and it could have taken longer.
    if after.Sub(before) < 100*time.Millisecond {
        t.Fatal("restart delay should have elapsed")
    }

    assert.Equal(t, observedTask4.Status.State, api.TaskStateNew)
    assert.Equal(t, observedTask4.DesiredState, api.TaskStateRunning)
    assert.Equal(t, observedTask4.ServiceAnnotations.Name, "name1")

    // Fail the second task. Confirm that it gets restarted.
    updatedTask2 := observedTask2.Copy()
    updatedTask2.Status = api.TaskStatus{State: api.TaskStateFailed}
    before = time.Now()
    err = s.Update(func(tx store.Tx) error {
        assert.NoError(t, store.UpdateTask(tx, updatedTask2))
        return nil
    })
    assert.NoError(t, err)

    expectCommit(t, watch)
    expectTaskUpdate(t, watch)
    expectCommit(t, watch)
    expectTaskUpdate(t, watch)
    observedTask5 := watchTaskCreate(t, watch)
    expectCommit(t, watch)
    assert.Equal(t, observedTask5.Status.State, api.TaskStateNew)
    assert.Equal(t, observedTask5.DesiredState, api.TaskStateReady)
    assert.Equal(t, observedTask5.ServiceAnnotations.Name, "name1")

    observedTask6 := watchTaskUpdate(t, watch) // task gets started after a delay
    expectCommit(t, watch)
    assert.Equal(t, observedTask6.Status.State, api.TaskStateNew)
    assert.Equal(t, observedTask6.DesiredState, api.TaskStateRunning)
    assert.Equal(t, observedTask6.ServiceAnnotations.Name, "name1")

    // Fail the first instance again. It should not be restarted.
    updatedTask1 = observedTask3.Copy()
    updatedTask1.Status = api.TaskStatus{State: api.TaskStateFailed}
    before = time.Now()
    err = s.Update(func(tx store.Tx) error {
        assert.NoError(t, store.UpdateTask(tx, updatedTask1))
        return nil
    })
    assert.NoError(t, err)

    expectTaskUpdate(t, watch)
    expectCommit(t, watch)
    expectTaskUpdate(t, watch)
    expectCommit(t, watch)

    select {
    case <-watch:
        t.Fatal("got unexpected event")
    case <-time.After(200 * time.Millisecond):
    }

    time.Sleep(time.Second)

    // Fail the second instance again. It should get restarted because
    // enough time has elapsed since the last restarts.
    updatedTask2 = observedTask5.Copy()
    updatedTask2.Status = api.TaskStatus{State: api.TaskStateFailed}
    before = time.Now()
    err = s.Update(func(tx store.Tx) error {
        assert.NoError(t, store.UpdateTask(tx, updatedTask2))
        return nil
    })
    assert.NoError(t, err)

    expectTaskUpdate(t, watch)
    expectCommit(t, watch)
    expectTaskUpdate(t, watch)
    observedTask7 := watchTaskCreate(t, watch)
    expectCommit(t, watch)
    assert.Equal(t, observedTask7.Status.State, api.TaskStateNew)
    assert.Equal(t, observedTask7.DesiredState, api.TaskStateReady)

    observedTask8 := watchTaskUpdate(t, watch)
    after = time.Now()

    // At least 100 ms should have elapsed. Only check the lower bound,
    // because the system may be slow and it could have taken longer.
    if after.Sub(before) < 100*time.Millisecond {
        t.Fatal("restart delay should have elapsed")
    }

    assert.Equal(t, observedTask8.Status.State, api.TaskStateNew)
    assert.Equal(t, observedTask8.DesiredState, api.TaskStateRunning)
    assert.Equal(t, observedTask8.ServiceAnnotations.Name, "name1")
}
func TestOrchestratorRestartDelay(t *testing.T) {
    ctx := context.Background()
    s := store.NewMemoryStore(nil)
    assert.NotNil(t, s)

    orchestrator := NewReplicatedOrchestrator(s)
    defer orchestrator.Stop()

    watch, cancel := state.Watch(s.WatchQueue() /*state.EventCreateTask{}, state.EventUpdateTask{}*/)
    defer cancel()

    // Create a service with two instances specified before the orchestrator is
    // started. This should result in two tasks when the orchestrator
    // starts up.
    err := s.Update(func(tx store.Tx) error {
        j1 := &api.Service{
            ID: "id1",
            Spec: api.ServiceSpec{
                Annotations: api.Annotations{
                    Name: "name1",
                },
                Task: api.TaskSpec{
                    Runtime: &api.TaskSpec_Container{
                        Container: &api.ContainerSpec{},
                    },
                    Restart: &api.RestartPolicy{
                        Condition: api.RestartOnAny,
                        Delay:     ptypes.DurationProto(100 * time.Millisecond),
                    },
                },
                Mode: &api.ServiceSpec_Replicated{
                    Replicated: &api.ReplicatedService{
                        Replicas: 2,
                    },
                },
            },
        }
        assert.NoError(t, store.CreateService(tx, j1))
        return nil
    })
    assert.NoError(t, err)

    // Start the orchestrator.
    go func() {
        assert.NoError(t, orchestrator.Run(ctx))
    }()

    observedTask1 := watchTaskCreate(t, watch)
    assert.Equal(t, observedTask1.Status.State, api.TaskStateNew)
    assert.Equal(t, observedTask1.ServiceAnnotations.Name, "name1")

    observedTask2 := watchTaskCreate(t, watch)
    assert.Equal(t, observedTask2.Status.State, api.TaskStateNew)
    assert.Equal(t, observedTask2.ServiceAnnotations.Name, "name1")

    // Fail the first task. Confirm that it gets restarted.
    updatedTask1 := observedTask1.Copy()
    updatedTask1.Status = api.TaskStatus{State: api.TaskStateFailed}
    before := time.Now()
    err = s.Update(func(tx store.Tx) error {
        assert.NoError(t, store.UpdateTask(tx, updatedTask1))
        return nil
    })
    assert.NoError(t, err)

    expectCommit(t, watch)
    expectTaskUpdate(t, watch)
    expectCommit(t, watch)
    expectTaskUpdate(t, watch)
    observedTask3 := watchTaskCreate(t, watch)
    expectCommit(t, watch)
    assert.Equal(t, observedTask3.Status.State, api.TaskStateNew)
    assert.Equal(t, observedTask3.DesiredState, api.TaskStateReady)
    assert.Equal(t, observedTask3.ServiceAnnotations.Name, "name1")

    observedTask4 := watchTaskUpdate(t, watch)
    after := time.Now()

    // At least 100 ms should have elapsed. Only check the lower bound,
    // because the system may be slow and it could have taken longer.
    if after.Sub(before) < 100*time.Millisecond {
        t.Fatalf("restart delay should have elapsed. Got: %v", after.Sub(before))
    }

    assert.Equal(t, observedTask4.Status.State, api.TaskStateNew)
    assert.Equal(t, observedTask4.DesiredState, api.TaskStateRunning)
    assert.Equal(t, observedTask4.ServiceAnnotations.Name, "name1")
}
// ServiceSpecToGRPC converts a ServiceSpec to a grpc ServiceSpec.
func ServiceSpecToGRPC(s types.ServiceSpec) (swarmapi.ServiceSpec, error) {
    name := s.Name
    if name == "" {
        name = namesgenerator.GetRandomName(0)
    }

    serviceNetworks := make([]*swarmapi.NetworkAttachmentConfig, 0, len(s.Networks))
    for _, n := range s.Networks {
        serviceNetworks = append(serviceNetworks, &swarmapi.NetworkAttachmentConfig{Target: n.Target, Aliases: n.Aliases})
    }

    taskNetworks := make([]*swarmapi.NetworkAttachmentConfig, 0, len(s.TaskTemplate.Networks))
    for _, n := range s.TaskTemplate.Networks {
        taskNetworks = append(taskNetworks, &swarmapi.NetworkAttachmentConfig{Target: n.Target, Aliases: n.Aliases})
    }

    spec := swarmapi.ServiceSpec{
        Annotations: swarmapi.Annotations{
            Name:   name,
            Labels: s.Labels,
        },
        Task: swarmapi.TaskSpec{
            Resources:   resourcesToGRPC(s.TaskTemplate.Resources),
            LogDriver:   driverToGRPC(s.TaskTemplate.LogDriver),
            Networks:    taskNetworks,
            ForceUpdate: s.TaskTemplate.ForceUpdate,
        },
        Networks: serviceNetworks,
    }

    containerSpec, err := containerToGRPC(s.TaskTemplate.ContainerSpec)
    if err != nil {
        return swarmapi.ServiceSpec{}, err
    }
    spec.Task.Runtime = &swarmapi.TaskSpec_Container{Container: containerSpec}

    restartPolicy, err := restartPolicyToGRPC(s.TaskTemplate.RestartPolicy)
    if err != nil {
        return swarmapi.ServiceSpec{}, err
    }
    spec.Task.Restart = restartPolicy

    if s.TaskTemplate.Placement != nil {
        spec.Task.Placement = &swarmapi.Placement{
            Constraints: s.TaskTemplate.Placement.Constraints,
        }
    }

    if s.UpdateConfig != nil {
        var failureAction swarmapi.UpdateConfig_FailureAction
        switch s.UpdateConfig.FailureAction {
        case types.UpdateFailureActionPause, "":
            failureAction = swarmapi.UpdateConfig_PAUSE
        case types.UpdateFailureActionContinue:
            failureAction = swarmapi.UpdateConfig_CONTINUE
        default:
            return swarmapi.ServiceSpec{}, fmt.Errorf("unrecognized update failure action %s", s.UpdateConfig.FailureAction)
        }
        spec.Update = &swarmapi.UpdateConfig{
            Parallelism:     s.UpdateConfig.Parallelism,
            Delay:           *ptypes.DurationProto(s.UpdateConfig.Delay),
            FailureAction:   failureAction,
            MaxFailureRatio: s.UpdateConfig.MaxFailureRatio,
        }
        if s.UpdateConfig.Monitor != 0 {
            spec.Update.Monitor = ptypes.DurationProto(s.UpdateConfig.Monitor)
        }
    }

    if s.EndpointSpec != nil {
        if s.EndpointSpec.Mode != "" &&
            s.EndpointSpec.Mode != types.ResolutionModeVIP &&
            s.EndpointSpec.Mode != types.ResolutionModeDNSRR {
            return swarmapi.ServiceSpec{}, fmt.Errorf("invalid resolution mode: %q", s.EndpointSpec.Mode)
        }

        spec.Endpoint = &swarmapi.EndpointSpec{}

        spec.Endpoint.Mode = swarmapi.EndpointSpec_ResolutionMode(swarmapi.EndpointSpec_ResolutionMode_value[strings.ToUpper(string(s.EndpointSpec.Mode))])

        for _, portConfig := range s.EndpointSpec.Ports {
            spec.Endpoint.Ports = append(spec.Endpoint.Ports, &swarmapi.PortConfig{
                Name:          portConfig.Name,
                Protocol:      swarmapi.PortConfig_Protocol(swarmapi.PortConfig_Protocol_value[strings.ToUpper(string(portConfig.Protocol))]),
                PublishMode:   swarmapi.PortConfig_PublishMode(swarmapi.PortConfig_PublishMode_value[strings.ToUpper(string(portConfig.PublishMode))]),
                TargetPort:    portConfig.TargetPort,
                PublishedPort: portConfig.PublishedPort,
            })
        }
    }

    // Mode
    if s.Mode.Global != nil && s.Mode.Replicated != nil {
        return swarmapi.ServiceSpec{}, fmt.Errorf("cannot specify both replicated mode and global mode")
    }

    if s.Mode.Global != nil {
        spec.Mode = &swarmapi.ServiceSpec_Global{
            Global: &swarmapi.GlobalService{},
        }
    } else if s.Mode.Replicated != nil && s.Mode.Replicated.Replicas != nil {
        spec.Mode = &swarmapi.ServiceSpec_Replicated{
            Replicated: &swarmapi.ReplicatedService{Replicas: *s.Mode.Replicated.Replicas},
        }
    } else {
        spec.Mode = &swarmapi.ServiceSpec_Replicated{
            Replicated: &swarmapi.ReplicatedService{Replicas: 1},
        }
    }

    return spec, nil
}
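// Illustrative sketch, not part of the original source, assuming the
// engine-api swarm types: it shows the required/optional split in the
// UpdateConfig conversion above. Delay is always converted (note the
// dereferenced DurationProto), while Monitor is set only when non-zero, so a
// zero Monitor stays nil on the wire; an unset Mode falls back to one replica.
func exampleServiceSpecConversion() {
    s := types.ServiceSpec{
        TaskTemplate: types.TaskSpec{
            ContainerSpec: types.ContainerSpec{Image: "nginx:latest"},
        },
        UpdateConfig: &types.UpdateConfig{
            Parallelism: 2,
            Delay:       10 * time.Second,
        },
    }
    grpcSpec, err := ServiceSpecToGRPC(s)
    if err != nil {
        panic(err)
    }
    fmt.Println(grpcSpec.Update.Delay)          // proto Duration for 10s
    fmt.Println(grpcSpec.Update.Monitor == nil) // true: Monitor was zero
}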
func TestTaskHistory(t *testing.T) {
    ctx := context.Background()
    s := store.NewMemoryStore(nil)
    assert.NotNil(t, s)

    assert.NoError(t, s.Update(func(tx store.Tx) error {
        store.CreateCluster(tx, &api.Cluster{
            ID: identity.NewID(),
            Spec: api.ClusterSpec{
                Annotations: api.Annotations{
                    Name: store.DefaultClusterName,
                },
                Orchestration: api.OrchestrationConfig{
                    TaskHistoryRetentionLimit: 2,
                },
            },
        })
        return nil
    }))

    taskReaper := NewTaskReaper(s)
    defer taskReaper.Stop()
    orchestrator := NewReplicatedOrchestrator(s)
    defer orchestrator.Stop()

    watch, cancel := state.Watch(s.WatchQueue() /*state.EventCreateTask{}, state.EventUpdateTask{}*/)
    defer cancel()

    // Create a service with two instances specified before the orchestrator is
    // started. This should result in two tasks when the orchestrator
    // starts up.
    err := s.Update(func(tx store.Tx) error {
        j1 := &api.Service{
            ID: "id1",
            Spec: api.ServiceSpec{
                Annotations: api.Annotations{
                    Name: "name1",
                },
                Mode: &api.ServiceSpec_Replicated{
                    Replicated: &api.ReplicatedService{
                        Replicas: 2,
                    },
                },
                Task: api.TaskSpec{
                    Restart: &api.RestartPolicy{
                        Condition: api.RestartOnAny,
                        Delay:     ptypes.DurationProto(0),
                    },
                },
            },
        }
        assert.NoError(t, store.CreateService(tx, j1))
        return nil
    })
    assert.NoError(t, err)

    // Start the orchestrator.
    go func() {
        assert.NoError(t, orchestrator.Run(ctx))
    }()
    go taskReaper.Run()

    observedTask1 := watchTaskCreate(t, watch)
    assert.Equal(t, observedTask1.Status.State, api.TaskStateNew)
    assert.Equal(t, observedTask1.ServiceAnnotations.Name, "name1")

    observedTask2 := watchTaskCreate(t, watch)
    assert.Equal(t, observedTask2.Status.State, api.TaskStateNew)
    assert.Equal(t, observedTask2.ServiceAnnotations.Name, "name1")

    // Fail both tasks. They should both get restarted.
    updatedTask1 := observedTask1.Copy()
    updatedTask1.Status.State = api.TaskStateFailed
    updatedTask1.ServiceAnnotations = api.Annotations{Name: "original"}
    updatedTask2 := observedTask2.Copy()
    updatedTask2.Status.State = api.TaskStateFailed
    updatedTask2.ServiceAnnotations = api.Annotations{Name: "original"}
    err = s.Update(func(tx store.Tx) error {
        assert.NoError(t, store.UpdateTask(tx, updatedTask1))
        assert.NoError(t, store.UpdateTask(tx, updatedTask2))
        return nil
    })

    expectCommit(t, watch)
    expectTaskUpdate(t, watch)
    expectTaskUpdate(t, watch)
    expectCommit(t, watch)
    expectTaskUpdate(t, watch)

    observedTask3 := watchTaskCreate(t, watch)
    assert.Equal(t, observedTask3.Status.State, api.TaskStateNew)
    assert.Equal(t, observedTask3.ServiceAnnotations.Name, "name1")

    expectTaskUpdate(t, watch)

    observedTask4 := watchTaskCreate(t, watch)
    assert.Equal(t, observedTask4.Status.State, api.TaskStateNew)
    assert.Equal(t, observedTask4.ServiceAnnotations.Name, "name1")

    // Fail these replacement tasks. Since TaskHistory is set to 2, this
    // should cause the oldest tasks for each instance to get deleted.
    updatedTask3 := observedTask3.Copy()
    updatedTask3.Status.State = api.TaskStateFailed
    updatedTask4 := observedTask4.Copy()
    updatedTask4.Status.State = api.TaskStateFailed
    err = s.Update(func(tx store.Tx) error {
        assert.NoError(t, store.UpdateTask(tx, updatedTask3))
        assert.NoError(t, store.UpdateTask(tx, updatedTask4))
        return nil
    })

    deletedTask1 := watchTaskDelete(t, watch)
    deletedTask2 := watchTaskDelete(t, watch)

    assert.Equal(t, api.TaskStateFailed, deletedTask1.Status.State)
    assert.Equal(t, "original", deletedTask1.ServiceAnnotations.Name)
    assert.Equal(t, api.TaskStateFailed, deletedTask2.Status.State)
    assert.Equal(t, "original", deletedTask2.ServiceAnnotations.Name)

    var foundTasks []*api.Task
    s.View(func(tx store.ReadTx) {
        foundTasks, err = store.FindTasks(tx, store.All)
    })
    assert.NoError(t, err)
    assert.Len(t, foundTasks, 4)
}
func parseRestart(flags *pflag.FlagSet, spec *api.ServiceSpec) error {
    if flags.Changed("restart-condition") {
        condition, err := flags.GetString("restart-condition")
        if err != nil {
            return err
        }
        if spec.Task.Restart == nil {
            spec.Task.Restart = &api.RestartPolicy{}
        }
        switch condition {
        case "none":
            spec.Task.Restart.Condition = api.RestartOnNone
        case "failure":
            spec.Task.Restart.Condition = api.RestartOnFailure
        case "any":
            spec.Task.Restart.Condition = api.RestartOnAny
        default:
            return fmt.Errorf("invalid restart condition: %s", condition)
        }
    }

    if flags.Changed("restart-delay") {
        delay, err := flags.GetString("restart-delay")
        if err != nil {
            return err
        }
        delayDuration, err := time.ParseDuration(delay)
        if err != nil {
            return err
        }
        if spec.Task.Restart == nil {
            spec.Task.Restart = &api.RestartPolicy{}
        }
        spec.Task.Restart.Delay = ptypes.DurationProto(delayDuration)
    }

    if flags.Changed("restart-max-attempts") {
        attempts, err := flags.GetUint64("restart-max-attempts")
        if err != nil {
            return err
        }
        if spec.Task.Restart == nil {
            spec.Task.Restart = &api.RestartPolicy{}
        }
        spec.Task.Restart.MaxAttempts = attempts
    }

    if flags.Changed("restart-window") {
        window, err := flags.GetString("restart-window")
        if err != nil {
            return err
        }
        windowDelay, err := time.ParseDuration(window)
        if err != nil {
            return err
        }
        if spec.Task.Restart == nil {
            spec.Task.Restart = &api.RestartPolicy{}
        }
        spec.Task.Restart.Window = ptypes.DurationProto(windowDelay)
    }

    return nil
}
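// Illustrative usage sketch, not part of the original source, assuming the
// spf13 pflag package: parseRestart only touches fields whose flags were
// explicitly set (flags.Changed), so an unset --restart-window leaves Window
// nil. The flag definitions below are a minimal stand-in for the real CLI's.
func exampleParseRestart() {
    flags := pflag.NewFlagSet("service", pflag.ContinueOnError)
    flags.String("restart-condition", "", "restart condition")
    flags.String("restart-delay", "", "restart delay")
    flags.Uint64("restart-max-attempts", 0, "max restart attempts")
    flags.String("restart-window", "", "restart window")
    if err := flags.Parse([]string{"--restart-condition=failure", "--restart-delay=5s"}); err != nil {
        panic(err)
    }

    spec := &api.ServiceSpec{}
    if err := parseRestart(flags, spec); err != nil {
        panic(err)
    }
    fmt.Println(spec.Task.Restart.Condition)     // api.RestartOnFailure
    fmt.Println(spec.Task.Restart.Window == nil) // true: flag was never set
}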