func genTestControllerEnv(t *testing.T, task *api.Task) (context.Context, *MockAPIClient, exec.Controller, *containerConfig, func(t *testing.T)) {
	mocks := gomock.NewController(t)
	client := NewMockAPIClient(mocks)

	ctlr, err := newController(client, task)
	assert.NoError(t, err)

	config, err := newContainerConfig(task)
	assert.NoError(t, err)
	assert.NotNil(t, config)

	ctx := context.Background()

	// Put test name into log messages. Awesome!
	pc, _, _, ok := runtime.Caller(1)
	if ok {
		fn := runtime.FuncForPC(pc)
		ctx = log.WithLogger(ctx, log.L.WithField("test", fn.Name()))
	}

	ctx, cancel := context.WithCancel(ctx)

	return ctx, client, ctlr, config, func(t *testing.T) {
		cancel()
		mocks.Finish()
	}
}
func (s *session) run(ctx context.Context, delay time.Duration, description *api.NodeDescription) {
	timer := time.NewTimer(delay) // delay before registering.
	defer timer.Stop()
	select {
	case <-timer.C:
	case <-ctx.Done():
		return
	}

	if err := s.start(ctx, description); err != nil {
		select {
		case s.errs <- err:
		case <-s.closed:
		case <-ctx.Done():
		}
		return
	}

	ctx = log.WithLogger(ctx, log.G(ctx).WithField("session.id", s.sessionID))

	go runctx(ctx, s.closed, s.errs, s.heartbeat)
	go runctx(ctx, s.closed, s.errs, s.watch)
	go runctx(ctx, s.closed, s.errs, s.listen)
	go runctx(ctx, s.closed, s.errs, s.logSubscriptions)

	close(s.registered)
}
// Init prepares the worker for assignments.
func (w *worker) Init(ctx context.Context) error {
	w.mu.Lock()
	defer w.mu.Unlock()

	ctx = log.WithLogger(ctx, log.G(ctx).WithField("module", "worker"))

	// TODO(stevvooe): Start task cleanup process.

	// read the tasks from the database and start any task managers that may be needed.
	return w.db.Update(func(tx *bolt.Tx) error {
		return WalkTasks(tx, func(task *api.Task) error {
			if !TaskAssigned(tx, task.ID) {
				// NOTE(stevvooe): If tasks can survive worker restart, we need
				// to startup the controller and ensure they are removed. For
				// now, we can simply remove them from the database.
				if err := DeleteTask(tx, task.ID); err != nil {
					log.G(ctx).WithError(err).Errorf("error removing task %v", task.ID)
				}
				return nil
			}

			status, err := GetTaskStatus(tx, task.ID)
			if err != nil {
				log.G(ctx).WithError(err).Error("unable to read task status")
				return nil
			}

			task.Status = *status // merges the status into the task, ensuring we start at the right point.
			return w.startTask(ctx, tx, task)
		})
	})
}
// serveListener serves a listener for local and non-local connections.
func (m *Manager) serveListener(ctx context.Context, errServe chan error, proto string, lis net.Listener) {
	ctx = log.WithLogger(ctx, log.G(ctx).WithFields(
		logrus.Fields{
			"proto": lis.Addr().Network(),
			"addr":  lis.Addr().String(),
		}))
	if proto == "unix" {
		log.G(ctx).Info("Listening for local connections")
		// we need to disallow double closes because UnixListener.Close
		// can delete the unix-socket file of a newer listener. grpc calls
		// Close twice indeed: in Serve and in Stop.
		errServe <- m.localserver.Serve(&closeOnceListener{Listener: lis})
	} else {
		log.G(ctx).Info("Listening for connections")
		errServe <- m.server.Serve(lis)
	}
}
// AddManager adds a node with the Manager role (both agent and manager).
func (c *testCluster) AddManager() error {
	// first node
	var n *testNode
	if len(c.nodes) == 0 {
		node, err := newTestNode("", "")
		if err != nil {
			return err
		}
		n = node
	} else {
		joinAddr, err := c.RandomManager().node.RemoteAPIAddr()
		if err != nil {
			return err
		}
		clusterInfo, err := c.api.ListClusters(context.Background(), &api.ListClustersRequest{})
		if err != nil {
			return err
		}
		if len(clusterInfo.Clusters) == 0 {
			return fmt.Errorf("joining manager: there is no cluster created in storage")
		}
		node, err := newTestNode(joinAddr, clusterInfo.Clusters[0].RootCA.JoinTokens.Manager)
		if err != nil {
			return err
		}
		n = node
	}

	c.counter++
	ctx := log.WithLogger(c.ctx, log.L.WithField("testnode", c.counter))
	c.wg.Add(1)
	go func() {
		c.errs <- n.node.Start(ctx)
		c.wg.Done()
	}()

	select {
	case <-n.node.Ready():
	case <-time.After(opsTimeout):
		return fmt.Errorf("node did not become ready in time")
	}

	c.nodes[n.node.NodeID()] = n
	c.nodesOrder[n.node.NodeID()] = c.counter
	return nil
}
func buildTestEnv(t *testing.T, task *api.Task) (context.Context, *MockController, func()) {
	var (
		ctx, cancel = context.WithCancel(context.Background())
		mocks       = gomock.NewController(t)
		ctlr        = NewMockController(mocks)
	)

	// Put test name into log messages. Awesome!
	pc, _, _, ok := runtime.Caller(1)
	if ok {
		fn := runtime.FuncForPC(pc)
		ctx = log.WithLogger(ctx, log.L.WithField("test", fn.Name()))
	}

	return ctx, ctlr, func() {
		cancel()
		mocks.Finish()
	}
}
func (s *session) run(ctx context.Context, delay time.Duration) {
	time.Sleep(delay) // delay before registering.

	if err := s.start(ctx); err != nil {
		select {
		case s.errs <- err:
		case <-s.closed:
		case <-ctx.Done():
		}
		return
	}

	ctx = log.WithLogger(ctx, log.G(ctx).WithField("session.id", s.sessionID))

	go runctx(ctx, s.closed, s.errs, s.heartbeat)
	go runctx(ctx, s.closed, s.errs, s.watch)
	go runctx(ctx, s.closed, s.errs, s.listen)

	close(s.registered)
}
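// runctx is referenced by the session code above but not defined in these
// snippets. Below is a minimal sketch of what such a helper could look like;
// the exact signature and behavior are assumptions, not confirmed by the
// surrounding code: run fn with the session context and forward a non-nil
// error unless the session is already closed or the context is done.
func runctx(ctx context.Context, closed chan struct{}, errs chan error, fn func(ctx context.Context) error) {
	if err := fn(ctx); err != nil {
		select {
		case errs <- err: // hand the error to the session's error channel
		case <-closed: // session already closed; drop the error
		case <-ctx.Done(): // context cancelled; drop the error
		}
	}
}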
// StartNode starts a node from a stopped state.
func (c *testCluster) StartNode(id string) error {
	n, ok := c.nodes[id]
	if !ok {
		return fmt.Errorf("set node role: node %s not found", id)
	}

	ctx := log.WithLogger(c.ctx, log.L.WithField("testnode", c.nodesOrder[id]))
	errCtx, cancel := context.WithCancel(context.Background())
	done := make(chan error)
	defer cancel()
	defer close(done)

	c.wg.Add(2)
	go func() {
		c.errs <- n.node.Start(ctx)
		c.wg.Done()
	}()
	go func(n *node.Node) {
		err := n.Err(errCtx)
		select {
		case <-errCtx.Done():
		default:
			done <- err
		}
		c.wg.Done()
	}(n.node)

	select {
	case <-n.node.Ready():
	case err := <-done:
		return err
	case <-time.After(opsTimeout):
		return fmt.Errorf("node did not become ready in time")
	}
	if n.node.NodeID() != id {
		return fmt.Errorf("restarted node does not have the same ID")
	}
	return nil
}
func (w *worker) newTaskManager(ctx context.Context, tx *bolt.Tx, task *api.Task) (*taskManager, error) {
	ctx = log.WithLogger(ctx, log.G(ctx).WithField("task.id", task.ID))

	ctlr, status, err := exec.Resolve(ctx, task, w.executor)
	if err := w.updateTaskStatus(ctx, tx, task.ID, status); err != nil {
		log.G(ctx).WithError(err).Error("error updating task status after controller resolution")
	}

	if err != nil {
		log.G(ctx).Error("controller resolution failed")
		return nil, err
	}

	return newTaskManager(ctx, task, ctlr, statusReporterFunc(func(ctx context.Context, taskID string, status *api.TaskStatus) error {
		w.mu.RLock()
		defer w.mu.RUnlock()

		return w.db.Update(func(tx *bolt.Tx) error {
			return w.updateTaskStatus(ctx, tx, taskID, status)
		})
	})), nil
}
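// statusReporterFunc is used above to adapt a closure into a status reporter.
// A minimal sketch in the style of http.HandlerFunc follows; it assumes the
// reporter interface exposes a single UpdateTaskStatus method (an assumption,
// since the interface itself is not shown in these snippets).
type statusReporterFunc func(ctx context.Context, taskID string, status *api.TaskStatus) error

// UpdateTaskStatus calls the wrapped function, letting statusReporterFunc
// satisfy the reporter interface.
func (fn statusReporterFunc) UpdateTaskStatus(ctx context.Context, taskID string, status *api.TaskStatus) error {
	return fn(ctx, taskID, status)
}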
// serveListener serves a listener for local and non-local connections.
func (m *Manager) serveListener(ctx context.Context, lCh <-chan net.Listener) {
	var l net.Listener
	select {
	case l = <-lCh:
	case <-ctx.Done():
		return
	}
	ctx = log.WithLogger(ctx, log.G(ctx).WithFields(
		logrus.Fields{
			"proto": l.Addr().Network(),
			"addr":  l.Addr().String(),
		}))
	if _, ok := l.(*net.TCPListener); !ok {
		log.G(ctx).Info("Listening for local connections")
		// we need to disallow double closes because UnixListener.Close
		// can delete the unix-socket file of a newer listener. grpc calls
		// Close twice indeed: in Serve and in Stop.
		m.errServe <- m.localserver.Serve(&closeOnceListener{Listener: l})
	} else {
		log.G(ctx).Info("Listening for connections")
		m.errServe <- m.server.Serve(l)
	}
}
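// closeOnceListener is used above to guard against double closes of the unix
// listener. A minimal sketch, assuming it simply wraps net.Listener with a
// sync.Once so only the first Close call reaches the underlying listener
// (the real definition is not shown in these snippets).
type closeOnceListener struct {
	net.Listener
	once sync.Once
}

// Close closes the wrapped listener exactly once; later calls return nil.
func (l *closeOnceListener) Close() error {
	var err error
	l.once.Do(func() {
		err = l.Listener.Close()
	})
	return err
}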
// StartNode starts a node from a stopped state.
func (c *testCluster) StartNode(id string) error {
	n, ok := c.nodes[id]
	if !ok {
		return fmt.Errorf("set node role: node %s not found", id)
	}
	ctx := log.WithLogger(c.ctx, log.L.WithField("testnode", c.nodesOrder[id]))
	c.wg.Add(1)
	go func() {
		c.errs <- n.node.Start(ctx)
		c.wg.Done()
	}()
	select {
	case <-n.node.Ready():
	case <-time.After(opsTimeout):
		return fmt.Errorf("node did not become ready in time")
	}
	if n.node.NodeID() != id {
		return fmt.Errorf("restarted node does not have the same ID")
	}
	return nil
}
func (tm *taskManager) run(ctx context.Context) {
	ctx, cancelAll := context.WithCancel(ctx)
	defer cancelAll() // cancel all child operations on exit.

	ctx = log.WithLogger(ctx, log.G(ctx).WithField("module", "taskmanager"))

	var (
		opctx    context.Context
		cancel   context.CancelFunc
		run      = make(chan struct{}, 1)
		statusq  = make(chan *api.TaskStatus)
		errs     = make(chan error)
		shutdown = tm.shutdown
		updated  bool // true if the task was updated.
	)

	defer func() {
		// closure picks up current value of cancel.
		if cancel != nil {
			cancel()
		}
	}()

	run <- struct{}{} // prime the pump
	for {
		select {
		case <-run:
			// always check for shutdown before running.
			select {
			case <-tm.shutdown:
				continue // ignore run request and handle shutdown
			case <-tm.closed:
				continue
			default:
			}

			opctx, cancel = context.WithCancel(ctx)

			// Several variables need to be snapshotted for the closure below.
			opcancel := cancel        // fork for the closure
			running := tm.task.Copy() // clone the task before dispatch
			statusqLocal := statusq
			updatedLocal := updated // capture state of update for goroutine
			updated = false
			go runctx(ctx, tm.closed, errs, func(ctx context.Context) error {
				defer opcancel()

				if updatedLocal {
					// before we do anything, update the task for the controller.
					// always update the controller before running.
					if err := tm.ctlr.Update(opctx, running); err != nil {
						log.G(ctx).WithError(err).Error("updating task controller failed")
						return err
					}
				}

				status, err := exec.Do(opctx, running, tm.ctlr)
				if status != nil {
					// always report the status if we get one back. This
					// returns to the manager loop, then reports the status
					// upstream.
					select {
					case statusqLocal <- status:
					case <-ctx.Done(): // not opctx, since that may have been cancelled.
					}

					if err := tm.reporter.UpdateTaskStatus(ctx, running.ID, status); err != nil {
						log.G(ctx).WithError(err).Error("failed reporting status to agent")
					}
				}

				return err
			})
		case err := <-errs:
			// This branch is always executed when an operation completes. The
			// goal is to decide whether or not we re-dispatch the operation.
			cancel = nil

			select {
			case <-tm.shutdown:
				shutdown = tm.shutdown // re-enable the shutdown branch
				continue               // no dispatch if we are in shutdown.
			default:
			}

			switch err {
			case exec.ErrTaskNoop:
				if !updated {
					continue // wait till getting pumped via update.
				}
			case exec.ErrTaskRetry:
				// TODO(stevvooe): Add exponential backoff with random jitter
				// here. For now, this backoff is enough to keep the task
				// manager from running away with the CPU.
				time.AfterFunc(time.Second, func() {
					errs <- nil // repump this branch, with no err
				})
				continue
			case nil, context.Canceled, context.DeadlineExceeded:
				// no log in this case
			default:
				log.G(ctx).WithError(err).Error("task operation failed")
			}

			select {
			case run <- struct{}{}:
			default:
			}
		case status := <-statusq:
			tm.task.Status = *status
		case task := <-tm.updateq:
			if equality.TasksEqualStable(task, tm.task) {
				continue // ignore the update
			}

			if task.ID != tm.task.ID {
				log.G(ctx).WithField("task.update.id", task.ID).Error("received update for incorrect task")
				continue
			}

			if task.DesiredState < tm.task.DesiredState {
				log.G(ctx).WithField("task.update.desiredstate", task.DesiredState).
					Error("ignoring task update with invalid desired state")
				continue
			}

			task = task.Copy()
			task.Status = tm.task.Status // overwrite our status, as it is canonical.
			tm.task = task
			updated = true // we have accepted the task update

			if cancel != nil {
				cancel() // cancel outstanding if necessary.
			} else {
				// If this channel op fails, it means there is already a
				// message in the run queue.
				select {
				case run <- struct{}{}:
				default:
				}
			}
		case <-shutdown:
			if cancel != nil {
				// cancel outstanding operation.
				cancel()

				// subtle: after a cancellation, we want to avoid busy wait
				// here. this gets re-enabled in the errs branch and we'll come
				// back around and try shutdown again.
				shutdown = nil // turn off this branch until op proceeds
				continue       // wait until operation actually exits.
			}

			// TODO(stevvooe): This should be left for the reaper.

			// make an attempt at removing. this is best effort. any errors will be
			// retried by the reaper later.
			if err := tm.ctlr.Remove(ctx); err != nil {
				log.G(ctx).WithError(err).WithField("task.id", tm.task.ID).Error("remove task failed")
			}

			if err := tm.ctlr.Close(); err != nil {
				log.G(ctx).WithError(err).Error("error closing controller")
			}

			// disable everything, and prepare for closing.
			statusq = nil
			errs = nil
			shutdown = nil
			close(tm.closed)
		case <-tm.closed:
			return
		case <-ctx.Done():
			return
		}
	}
}
func reconcileTaskState(ctx context.Context, w *worker, assignments []*api.AssignmentChange, fullSnapshot bool) error {
	var (
		updatedTasks []*api.Task
		removedTasks []*api.Task
	)
	for _, a := range assignments {
		if t := a.Assignment.GetTask(); t != nil {
			switch a.Action {
			case api.AssignmentChange_AssignmentActionUpdate:
				updatedTasks = append(updatedTasks, t)
			case api.AssignmentChange_AssignmentActionRemove:
				removedTasks = append(removedTasks, t)
			}
		}
	}

	log.G(ctx).WithFields(logrus.Fields{
		"len(updatedTasks)": len(updatedTasks),
		"len(removedTasks)": len(removedTasks),
	}).Debug("(*worker).reconcileTaskState")

	tx, err := w.db.Begin(true)
	if err != nil {
		log.G(ctx).WithError(err).Error("failed starting transaction against task database")
		return err
	}
	defer tx.Rollback()

	assigned := map[string]struct{}{}

	for _, task := range updatedTasks {
		log.G(ctx).WithFields(
			logrus.Fields{
				"task.id":           task.ID,
				"task.desiredstate": task.DesiredState,
			}).Debug("assigned")
		if err := PutTask(tx, task); err != nil {
			return err
		}

		if err := SetTaskAssignment(tx, task.ID, true); err != nil {
			return err
		}

		if mgr, ok := w.taskManagers[task.ID]; ok {
			if err := mgr.Update(ctx, task); err != nil && err != ErrClosed {
				log.G(ctx).WithError(err).Error("failed updating assigned task")
			}
		} else {
			// we may have still seen the task, let's grab the status from
			// storage and replace it with our status, if we have it.
			status, err := GetTaskStatus(tx, task.ID)
			if err != nil {
				if err != errTaskUnknown {
					return err
				}

				// never seen before, register the provided status
				if err := PutTaskStatus(tx, task.ID, &task.Status); err != nil {
					return err
				}
			} else {
				task.Status = *status
			}
			w.startTask(ctx, tx, task)
		}

		assigned[task.ID] = struct{}{}
	}

	closeManager := func(tm *taskManager) {
		// when a task is no longer assigned, we shutdown the task manager for
		// it and leave cleanup to the sweeper.
		if err := tm.Close(); err != nil {
			log.G(ctx).WithError(err).Error("error closing task manager")
		}
	}

	removeTaskAssignment := func(taskID string) error {
		ctx := log.WithLogger(ctx, log.G(ctx).WithField("task.id", taskID))
		if err := SetTaskAssignment(tx, taskID, false); err != nil {
			log.G(ctx).WithError(err).Error("error setting task assignment in database")
		}
		return err
	}

	// If this was a complete set of assignments, we're going to remove all the
	// remaining tasks.
	if fullSnapshot {
		for id, tm := range w.taskManagers {
			if _, ok := assigned[id]; ok {
				continue
			}

			err := removeTaskAssignment(id)
			if err == nil {
				delete(w.taskManagers, id)
				go closeManager(tm)
			}
		}
	} else {
		// If this was an incremental set of assignments, we're going to remove
		// only the tasks in the removed set.
		for _, task := range removedTasks {
			err := removeTaskAssignment(task.ID)
			if err != nil {
				continue
			}

			tm, ok := w.taskManagers[task.ID]
			if ok {
				delete(w.taskManagers, task.ID)
				go closeManager(tm)
			}
		}
	}

	return tx.Commit()
}
// Run starts all manager sub-systems and the gRPC server at the configured
// address.
// The call never returns unless an error occurs or `Stop()` is called.
//
// TODO(aluzzardi): /!\ This function is *way* too complex. /!\
// It needs to be split into smaller manageable functions.
func (m *Manager) Run(parent context.Context) error {
	ctx, ctxCancel := context.WithCancel(parent)
	defer ctxCancel()

	// Harakiri.
	go func() {
		select {
		case <-ctx.Done():
		case <-m.stopped:
			ctxCancel()
		}
	}()

	leadershipCh, cancel := m.RaftNode.SubscribeLeadership()
	defer cancel()

	go func() {
		for leadershipEvent := range leadershipCh {
			// read out and discard all of the messages when we've stopped;
			// don't acquire the mutex yet. if stopped is closed, we don't need
			// it, and this keeps the loop from starving Run()'s attempt to Lock
			select {
			case <-m.stopped:
				continue
			default:
				// do nothing, we're not stopped
			}
			// we're not stopping so NOW acquire the mutex
			m.mu.Lock()
			newState := leadershipEvent.(raft.LeadershipState)

			if newState == raft.IsLeader {
				s := m.RaftNode.MemoryStore()

				rootCA := m.config.SecurityConfig.RootCA()
				nodeID := m.config.SecurityConfig.ClientTLSCreds.NodeID()

				raftCfg := raft.DefaultRaftConfig()
				raftCfg.ElectionTick = uint32(m.RaftNode.Config.ElectionTick)
				raftCfg.HeartbeatTick = uint32(m.RaftNode.Config.HeartbeatTick)

				clusterID := m.config.SecurityConfig.ClientTLSCreds.Organization()

				initialCAConfig := ca.DefaultCAConfig()
				initialCAConfig.ExternalCAs = m.config.ExternalCAs

				s.Update(func(tx store.Tx) error {
					// Add a default cluster object to the
					// store. Don't check the error because
					// we expect this to fail unless this
					// is a brand new cluster.
					store.CreateCluster(tx, &api.Cluster{
						ID: clusterID,
						Spec: api.ClusterSpec{
							Annotations: api.Annotations{
								Name: store.DefaultClusterName,
							},
							Orchestration: api.OrchestrationConfig{
								TaskHistoryRetentionLimit: defaultTaskHistoryRetentionLimit,
							},
							Dispatcher: api.DispatcherConfig{
								HeartbeatPeriod: ptypes.DurationProto(dispatcher.DefaultHeartBeatPeriod),
							},
							Raft:     raftCfg,
							CAConfig: initialCAConfig,
						},
						RootCA: api.RootCA{
							CAKey:      rootCA.Key,
							CACert:     rootCA.Cert,
							CACertHash: rootCA.Digest.String(),
							JoinTokens: api.JoinTokens{
								Worker:  ca.GenerateJoinToken(rootCA),
								Manager: ca.GenerateJoinToken(rootCA),
							},
						},
					})
					// Add Node entry for ourself, if one
					// doesn't exist already.
					store.CreateNode(tx, &api.Node{
						ID: nodeID,
						Certificate: api.Certificate{
							CN:   nodeID,
							Role: api.NodeRoleManager,
							Status: api.IssuanceStatus{
								State: api.IssuanceStateIssued,
							},
						},
						Spec: api.NodeSpec{
							Role:       api.NodeRoleManager,
							Membership: api.NodeMembershipAccepted,
						},
					})
					return nil
				})

				// Attempt to rotate the key-encrypting-key of the root CA key-material
				err := m.rotateRootCAKEK(ctx, clusterID)
				if err != nil {
					log.G(ctx).WithError(err).Error("root key-encrypting-key rotation failed")
				}

				m.replicatedOrchestrator = orchestrator.NewReplicatedOrchestrator(s)
				m.globalOrchestrator = orchestrator.NewGlobalOrchestrator(s)
				m.taskReaper = orchestrator.NewTaskReaper(s)
				m.scheduler = scheduler.New(s)
				m.keyManager = keymanager.New(m.RaftNode.MemoryStore(), keymanager.DefaultConfig())

				// TODO(stevvooe): Allocate a context that can be used to
				// shutdown underlying manager processes when leadership is
				// lost.

				m.allocator, err = allocator.New(s)
				if err != nil {
					log.G(ctx).WithError(err).Error("failed to create allocator")
					// TODO(stevvooe): It doesn't seem correct here to fail
					// creating the allocator but then use it anyway.
				}

				if m.keyManager != nil {
					go func(keyManager *keymanager.KeyManager) {
						if err := keyManager.Run(ctx); err != nil {
							log.G(ctx).WithError(err).Error("keymanager failed with an error")
						}
					}(m.keyManager)
				}

				go func(d *dispatcher.Dispatcher) {
					if err := d.Run(ctx); err != nil {
						log.G(ctx).WithError(err).Error("Dispatcher exited with an error")
					}
				}(m.Dispatcher)

				go func(server *ca.Server) {
					if err := server.Run(ctx); err != nil {
						log.G(ctx).WithError(err).Error("CA signer exited with an error")
					}
				}(m.caserver)

				// Start all sub-components in separate goroutines.
				// TODO(aluzzardi): This should have some kind of error handling so that
				// any component that goes down would bring the entire manager down.

				if m.allocator != nil {
					go func(allocator *allocator.Allocator) {
						if err := allocator.Run(ctx); err != nil {
							log.G(ctx).WithError(err).Error("allocator exited with an error")
						}
					}(m.allocator)
				}

				go func(scheduler *scheduler.Scheduler) {
					if err := scheduler.Run(ctx); err != nil {
						log.G(ctx).WithError(err).Error("scheduler exited with an error")
					}
				}(m.scheduler)

				go func(taskReaper *orchestrator.TaskReaper) {
					taskReaper.Run()
				}(m.taskReaper)

				go func(orchestrator *orchestrator.ReplicatedOrchestrator) {
					if err := orchestrator.Run(ctx); err != nil {
						log.G(ctx).WithError(err).Error("replicated orchestrator exited with an error")
					}
				}(m.replicatedOrchestrator)

				go func(globalOrchestrator *orchestrator.GlobalOrchestrator) {
					if err := globalOrchestrator.Run(ctx); err != nil {
						log.G(ctx).WithError(err).Error("global orchestrator exited with an error")
					}
				}(m.globalOrchestrator)
			} else if newState == raft.IsFollower {
				m.Dispatcher.Stop()
				m.caserver.Stop()

				if m.allocator != nil {
					m.allocator.Stop()
					m.allocator = nil
				}

				m.replicatedOrchestrator.Stop()
				m.replicatedOrchestrator = nil

				m.globalOrchestrator.Stop()
				m.globalOrchestrator = nil

				m.taskReaper.Stop()
				m.taskReaper = nil

				m.scheduler.Stop()
				m.scheduler = nil

				if m.keyManager != nil {
					m.keyManager.Stop()
					m.keyManager = nil
				}
			}
			m.mu.Unlock()
		}
	}()

	proxyOpts := []grpc.DialOption{
		grpc.WithTimeout(5 * time.Second),
		grpc.WithTransportCredentials(m.config.SecurityConfig.ClientTLSCreds),
	}

	cs := raftpicker.NewConnSelector(m.RaftNode, proxyOpts...)
	m.connSelector = cs

	// We need a special connSelector for controlapi because it provides automatic
	// leader tracking.
	// Other APIs are using connSelector which errors out on leader change, but
	// allows reacting quickly to reelections.
	controlAPIProxyOpts := []grpc.DialOption{
		grpc.WithBackoffMaxDelay(time.Second),
		grpc.WithTransportCredentials(m.config.SecurityConfig.ClientTLSCreds),
	}

	controlAPIConnSelector := hackpicker.NewConnSelector(m.RaftNode, controlAPIProxyOpts...)

	authorize := func(ctx context.Context, roles []string) error {
		// Authorize the remote roles, ensure they can only be forwarded by managers
		_, err := ca.AuthorizeForwardedRoleAndOrg(ctx, roles, []string{ca.ManagerRole}, m.config.SecurityConfig.ClientTLSCreds.Organization())
		return err
	}

	baseControlAPI := controlapi.NewServer(m.RaftNode.MemoryStore(), m.RaftNode, m.config.SecurityConfig.RootCA())
	healthServer := health.NewHealthServer()

	authenticatedControlAPI := api.NewAuthenticatedWrapperControlServer(baseControlAPI, authorize)
	authenticatedDispatcherAPI := api.NewAuthenticatedWrapperDispatcherServer(m.Dispatcher, authorize)
	authenticatedCAAPI := api.NewAuthenticatedWrapperCAServer(m.caserver, authorize)
	authenticatedNodeCAAPI := api.NewAuthenticatedWrapperNodeCAServer(m.caserver, authorize)
	authenticatedRaftAPI := api.NewAuthenticatedWrapperRaftServer(m.RaftNode, authorize)
	authenticatedHealthAPI := api.NewAuthenticatedWrapperHealthServer(healthServer, authorize)
	authenticatedRaftMembershipAPI := api.NewAuthenticatedWrapperRaftMembershipServer(m.RaftNode, authorize)

	proxyDispatcherAPI := api.NewRaftProxyDispatcherServer(authenticatedDispatcherAPI, cs, m.RaftNode, ca.WithMetadataForwardTLSInfo)
	proxyCAAPI := api.NewRaftProxyCAServer(authenticatedCAAPI, cs, m.RaftNode, ca.WithMetadataForwardTLSInfo)
	proxyNodeCAAPI := api.NewRaftProxyNodeCAServer(authenticatedNodeCAAPI, cs, m.RaftNode, ca.WithMetadataForwardTLSInfo)
	proxyRaftMembershipAPI := api.NewRaftProxyRaftMembershipServer(authenticatedRaftMembershipAPI, cs, m.RaftNode, ca.WithMetadataForwardTLSInfo)

	// localProxyControlAPI is a special kind of proxy. It is only wired up
	// to receive requests from a trusted local socket, and these requests
	// don't use TLS, therefore the requests it handles locally should
	// bypass authorization. When it proxies, it sends them as requests from
	// this manager rather than forwarded requests (it has no TLS
	// information to put in the metadata map).
	forwardAsOwnRequest := func(ctx context.Context) (context.Context, error) { return ctx, nil }
	localProxyControlAPI := api.NewRaftProxyControlServer(baseControlAPI, controlAPIConnSelector, m.RaftNode, forwardAsOwnRequest)

	// Everything registered on m.server should be an authenticated
	// wrapper, or a proxy wrapping an authenticated wrapper!
	api.RegisterCAServer(m.server, proxyCAAPI)
	api.RegisterNodeCAServer(m.server, proxyNodeCAAPI)
	api.RegisterRaftServer(m.server, authenticatedRaftAPI)
	api.RegisterHealthServer(m.server, authenticatedHealthAPI)
	api.RegisterRaftMembershipServer(m.server, proxyRaftMembershipAPI)
	api.RegisterControlServer(m.localserver, localProxyControlAPI)
	api.RegisterControlServer(m.server, authenticatedControlAPI)
	api.RegisterDispatcherServer(m.server, proxyDispatcherAPI)

	errServe := make(chan error, 2)
	for proto, l := range m.listeners {
		go func(proto string, lis net.Listener) {
			ctx := log.WithLogger(ctx, log.G(ctx).WithFields(
				logrus.Fields{
					"proto": lis.Addr().Network(),
					"addr":  lis.Addr().String(),
				}))
			if proto == "unix" {
				log.G(ctx).Info("Listening for local connections")
				// we need to disallow double closes because UnixListener.Close
				// can delete the unix-socket file of a newer listener. grpc calls
				// Close twice indeed: in Serve and in Stop.
				errServe <- m.localserver.Serve(&closeOnceListener{Listener: lis})
			} else {
				log.G(ctx).Info("Listening for connections")
				errServe <- m.server.Serve(lis)
			}
		}(proto, l)
	}

	// Set the raft server as serving for the health server
	healthServer.SetServingStatus("Raft", api.HealthCheckResponse_SERVING)

	if err := m.RaftNode.JoinAndStart(); err != nil {
		for _, lis := range m.listeners {
			lis.Close()
		}
		return fmt.Errorf("can't initialize raft node: %v", err)
	}

	close(m.started)

	go func() {
		err := m.RaftNode.Run(ctx)
		if err != nil {
			log.G(ctx).Error(err)
			m.Stop(ctx)
		}
	}()

	if err := raft.WaitForLeader(ctx, m.RaftNode); err != nil {
		m.server.Stop()
		return err
	}

	c, err := raft.WaitForCluster(ctx, m.RaftNode)
	if err != nil {
		m.server.Stop()
		return err
	}
	raftConfig := c.Spec.Raft

	if int(raftConfig.ElectionTick) != m.RaftNode.Config.ElectionTick {
		log.G(ctx).Warningf("election tick value (%ds) is different from the one defined in the cluster config (%vs), the cluster may be unstable", m.RaftNode.Config.ElectionTick, raftConfig.ElectionTick)
	}
	if int(raftConfig.HeartbeatTick) != m.RaftNode.Config.HeartbeatTick {
		log.G(ctx).Warningf("heartbeat tick value (%ds) is different from the one defined in the cluster config (%vs), the cluster may be unstable", m.RaftNode.Config.HeartbeatTick, raftConfig.HeartbeatTick)
	}

	// wait for an error in serving.
	err = <-errServe
	select {
	// check to see if stopped was posted to. if so, we're in the process of
	// stopping, or done and that's why we got the error. if stopping is
	// deliberate, stopped will ALWAYS be closed before the error is triggered,
	// so this path will ALWAYS be taken if the stop was deliberate
	case <-m.stopped:
		// shutdown was requested, do not return an error
		// but first, we wait to acquire a mutex to guarantee that stopping is
		// finished. as long as we acquire the mutex BEFORE we return, we know
		// that stopping has completed.
		m.mu.Lock()
		m.mu.Unlock()
		return nil

	// otherwise, we'll get something from errServe, which indicates that an
	// error in serving has actually occurred and this isn't a planned shutdown
	default:
		return err
	}
}
// Run is the main loop for a Raft node, it goes along the state machine,
// acting on the messages received from other Raft nodes in the cluster.
//
// Before running the main loop, it first starts the raft node based on saved
// cluster state. If no saved state exists, it starts a single-node cluster.
func (n *Node) Run(ctx context.Context) error {
	ctx = log.WithLogger(ctx, logrus.WithField("raft_id", fmt.Sprintf("%x", n.Config.ID)))
	ctx, cancel := context.WithCancel(ctx)

	// nodeRemoved indicates that the node was stopped due to its removal.
	nodeRemoved := false

	defer func() {
		cancel()
		n.stop(ctx)
		if nodeRemoved {
			// Move WAL and snapshot out of the way, since
			// they are no longer usable.
			if err := n.moveWALAndSnap(); err != nil {
				log.G(ctx).WithError(err).Error("failed to move wal after node removal")
			}
		}
		n.done()
	}()

	wasLeader := false

	for {
		select {
		case <-n.ticker.C():
			n.raftNode.Tick()
			n.cluster.Tick()
		case rd := <-n.raftNode.Ready():
			raftConfig := DefaultRaftConfig()
			n.memoryStore.View(func(readTx store.ReadTx) {
				clusters, err := store.FindClusters(readTx, store.ByName(store.DefaultClusterName))
				if err == nil && len(clusters) == 1 {
					raftConfig = clusters[0].Spec.Raft
				}
			})

			// Save entries to storage
			if err := n.saveToStorage(&raftConfig, rd.HardState, rd.Entries, rd.Snapshot); err != nil {
				log.G(ctx).WithError(err).Error("failed to save entries to storage")
			}

			if len(rd.Messages) != 0 {
				// Send raft messages to peers
				if err := n.send(ctx, rd.Messages); err != nil {
					log.G(ctx).WithError(err).Error("failed to send message to members")
				}
			}

			// Apply snapshot to memory store. The snapshot
			// was applied to the raft store in
			// saveToStorage.
			if !raft.IsEmptySnap(rd.Snapshot) {
				// Load the snapshot data into the store
				if err := n.restoreFromSnapshot(rd.Snapshot.Data, false); err != nil {
					log.G(ctx).WithError(err).Error("failed to restore from snapshot")
				}
				n.appliedIndex = rd.Snapshot.Metadata.Index
				n.snapshotIndex = rd.Snapshot.Metadata.Index
				n.confState = rd.Snapshot.Metadata.ConfState
			}

			// If we cease to be the leader, we must cancel any
			// proposals that are currently waiting for a quorum to
			// acknowledge them. It is still possible for these to
			// become committed, but if that happens we will apply
			// them as any follower would.
			//
			// It is important that we cancel these proposals before
			// calling processCommitted, so processCommitted does
			// not deadlock.
			if rd.SoftState != nil {
				if wasLeader && rd.SoftState.RaftState != raft.StateLeader {
					wasLeader = false
					if atomic.LoadUint32(&n.signalledLeadership) == 1 {
						atomic.StoreUint32(&n.signalledLeadership, 0)
						n.leadershipBroadcast.Publish(IsFollower)
					}

					// It is important that we set n.signalledLeadership to 0
					// before calling n.wait.cancelAll. When a new raft
					// request is registered, it checks n.signalledLeadership
					// afterwards, and cancels the registration if it is 0.
					// If cancelAll was called first, this call might run
					// before the new request registers, but
					// signalledLeadership would be set after the check.
					// Setting signalledLeadership before calling cancelAll
					// ensures that if a new request is registered during
					// this transition, it will either be cancelled by
					// cancelAll, or by its own check of signalledLeadership.
					n.wait.cancelAll()
				} else if !wasLeader && rd.SoftState.RaftState == raft.StateLeader {
					wasLeader = true
				}
			}

			// Process committed entries
			for _, entry := range rd.CommittedEntries {
				if err := n.processCommitted(ctx, entry); err != nil {
					log.G(ctx).WithError(err).Error("failed to process committed entries")
				}
			}

			// Trigger a snapshot every once in a while
			if n.snapshotInProgress == nil &&
				raftConfig.SnapshotInterval > 0 &&
				n.appliedIndex-n.snapshotIndex >= raftConfig.SnapshotInterval {
				n.doSnapshot(ctx, raftConfig)
			}

			if wasLeader && atomic.LoadUint32(&n.signalledLeadership) != 1 {
				// If all the entries in the log have become
				// committed, broadcast our leadership status.
				if n.caughtUp() {
					atomic.StoreUint32(&n.signalledLeadership, 1)
					n.leadershipBroadcast.Publish(IsLeader)
				}
			}

			// Advance the state machine
			n.raftNode.Advance()

			// On the first startup, or if we are the only
			// registered member after restoring from the state,
			// campaign to be the leader.
			if n.campaignWhenAble {
				members := n.cluster.Members()
				if len(members) >= 1 {
					n.campaignWhenAble = false
				}
				if len(members) == 1 && members[n.Config.ID] != nil {
					if err := n.raftNode.Campaign(ctx); err != nil {
						panic("raft: cannot campaign to be the leader on node restore")
					}
				}
			}
		case snapshotIndex := <-n.snapshotInProgress:
			if snapshotIndex > n.snapshotIndex {
				n.snapshotIndex = snapshotIndex
			}
			n.snapshotInProgress = nil
		case <-n.removeRaftCh:
			nodeRemoved = true
			// If the node was removed from other members,
			// send back an error to the caller to start
			// the shutdown process.
			return ErrMemberRemoved
		case <-ctx.Done():
			return nil
		}
	}
}
// Run runs dispatcher tasks which should be run on the leader dispatcher.
// The dispatcher can be stopped by cancelling ctx or by calling Stop().
func (d *Dispatcher) Run(ctx context.Context) error {
	d.mu.Lock()
	if d.isRunning() {
		d.mu.Unlock()
		return fmt.Errorf("dispatcher is already running")
	}
	logger := log.G(ctx).WithField("module", "dispatcher")
	ctx = log.WithLogger(ctx, logger)
	if err := d.markNodesUnknown(ctx); err != nil {
		logger.Errorf(`failed to move all nodes to "unknown" state: %v`, err)
	}
	configWatcher, cancel, err := store.ViewAndWatch(
		d.store,
		func(readTx store.ReadTx) error {
			clusters, err := store.FindClusters(readTx, store.ByName(store.DefaultClusterName))
			if err != nil {
				return err
			}
			if err == nil && len(clusters) == 1 {
				heartbeatPeriod, err := ptypes.Duration(clusters[0].Spec.Dispatcher.HeartbeatPeriod)
				if err == nil && heartbeatPeriod > 0 {
					d.config.HeartbeatPeriod = heartbeatPeriod
				}
				if clusters[0].NetworkBootstrapKeys != nil {
					d.networkBootstrapKeys = clusters[0].NetworkBootstrapKeys
				}
			}
			return nil
		},
		state.EventUpdateCluster{},
	)
	if err != nil {
		d.mu.Unlock()
		return err
	}
	defer cancel()
	d.ctx, d.cancel = context.WithCancel(ctx)
	d.mu.Unlock()

	publishManagers := func() {
		mgrs := getWeightedPeers(d.cluster)
		sort.Sort(weightedPeerByNodeID(mgrs))
		d.mu.Lock()
		if reflect.DeepEqual(mgrs, d.lastSeenManagers) {
			d.mu.Unlock()
			return
		}
		d.lastSeenManagers = mgrs
		d.mu.Unlock()
		d.mgrQueue.Publish(mgrs)
	}

	publishManagers()
	publishTicker := time.NewTicker(1 * time.Second)
	defer publishTicker.Stop()

	batchTimer := time.NewTimer(maxBatchInterval)
	defer batchTimer.Stop()

	for {
		select {
		case <-publishTicker.C:
			publishManagers()
		case <-d.processTaskUpdatesTrigger:
			d.processTaskUpdates()
			batchTimer.Reset(maxBatchInterval)
		case <-batchTimer.C:
			d.processTaskUpdates()
			batchTimer.Reset(maxBatchInterval)
		case v := <-configWatcher:
			cluster := v.(state.EventUpdateCluster)
			d.mu.Lock()
			if cluster.Cluster.Spec.Dispatcher.HeartbeatPeriod != nil {
				// ignore error, since Spec has passed validation before
				heartbeatPeriod, _ := ptypes.Duration(cluster.Cluster.Spec.Dispatcher.HeartbeatPeriod)
				if heartbeatPeriod != d.config.HeartbeatPeriod {
					// only call d.nodes.updatePeriod when heartbeatPeriod changes
					d.config.HeartbeatPeriod = heartbeatPeriod
					d.nodes.updatePeriod(d.config.HeartbeatPeriod, d.config.HeartbeatEpsilon, d.config.GracePeriodMultiplier)
				}
			}
			d.networkBootstrapKeys = cluster.Cluster.NetworkBootstrapKeys
			d.mu.Unlock()
			d.keyMgrQueue.Publish(struct{}{})
		case <-d.ctx.Done():
			return nil
		}
	}
}
// Run runs the CA signer main loop.
// The CA signer can be stopped by cancelling ctx or by calling Stop().
func (s *Server) Run(ctx context.Context) error {
	s.mu.Lock()
	if s.isRunning() {
		s.mu.Unlock()
		return fmt.Errorf("CA signer is already running")
	}
	s.wg.Add(1)
	s.mu.Unlock()

	defer s.wg.Done()
	logger := log.G(ctx).WithField("module", "ca")
	ctx = log.WithLogger(ctx, logger)

	// Run() should never be called twice, but just in case, we're
	// attempting to close the started channel in a safe way
	select {
	case <-s.started:
		return fmt.Errorf("CA server cannot be started more than once")
	default:
		close(s.started)
	}

	// Retrieve the channels to keep track of changes in the cluster
	// Retrieve all the currently registered nodes
	var nodes []*api.Node
	updates, cancel, err := store.ViewAndWatch(
		s.store,
		func(readTx store.ReadTx) error {
			clusters, err := store.FindClusters(readTx, store.ByName(store.DefaultClusterName))
			if err != nil {
				return err
			}
			if len(clusters) != 1 {
				return fmt.Errorf("could not find cluster object")
			}
			s.updateCluster(ctx, clusters[0])

			nodes, err = store.FindNodes(readTx, store.All)
			return err
		},
		state.EventCreateNode{},
		state.EventUpdateNode{},
		state.EventUpdateCluster{},
	)

	// Do this after updateCluster has been called, so isRunning never
	// returns true without joinTokens being set correctly.
	s.mu.Lock()
	s.ctx, s.cancel = context.WithCancel(ctx)
	s.mu.Unlock()

	if err != nil {
		log.G(ctx).WithFields(logrus.Fields{
			"method": "(*Server).Run",
		}).WithError(err).Errorf("snapshot store view failed")
		return err
	}
	defer cancel()

	// We might have missed some updates if there was a leader election,
	// so let's pick up the slack.
	if err := s.reconcileNodeCertificates(ctx, nodes); err != nil {
		// We don't return here because that means the Run loop would
		// never run. Log an error instead.
		log.G(ctx).WithFields(logrus.Fields{
			"method": "(*Server).Run",
		}).WithError(err).Errorf("error attempting to reconcile certificates")
	}

	// Watch for new nodes being created, new nodes being updated, and changes
	// to the cluster
	for {
		select {
		case event := <-updates:
			switch v := event.(type) {
			case state.EventCreateNode:
				s.evaluateAndSignNodeCert(ctx, v.Node)
			case state.EventUpdateNode:
				// If this certificate is already at a final state
				// no need to evaluate and sign it.
				if !isFinalState(v.Node.Certificate.Status) {
					s.evaluateAndSignNodeCert(ctx, v.Node)
				}
			case state.EventUpdateCluster:
				s.updateCluster(ctx, v.Cluster)
			}
		case <-ctx.Done():
			return ctx.Err()
		case <-s.ctx.Done():
			return nil
		}
	}
}
func (a *Agent) run(ctx context.Context) {
	ctx, cancel := context.WithCancel(ctx)
	defer cancel()
	defer close(a.closed) // full shutdown.

	ctx = log.WithLogger(ctx, log.G(ctx).WithField("module", "agent"))

	log.G(ctx).Debugf("(*Agent).run")
	defer log.G(ctx).Debugf("(*Agent).run exited")

	var (
		backoff    time.Duration
		session    = newSession(ctx, a, backoff) // start the initial session
		registered = session.registered
		ready      = a.ready // first session ready
		sessionq   chan sessionOperation
	)

	if err := a.worker.Init(ctx); err != nil {
		log.G(ctx).WithError(err).Error("worker initialization failed")
		a.err = err
		return // fatal?
	}

	// setup a reliable reporter to call back to us.
	reporter := newStatusReporter(ctx, a)
	defer reporter.Close()

	a.worker.Listen(ctx, reporter)

	for {
		select {
		case operation := <-sessionq:
			operation.response <- operation.fn(session)
		case msg := <-session.tasks:
			if err := a.worker.Assign(ctx, msg.Tasks); err != nil {
				log.G(ctx).WithError(err).Error("task assignment failed")
			}
		case msg := <-session.messages:
			if err := a.handleSessionMessage(ctx, msg); err != nil {
				log.G(ctx).WithError(err).Error("session message handler failed")
			}
		case <-registered:
			log.G(ctx).Debugln("agent: registered")
			if ready != nil {
				close(ready)
			}
			ready = nil
			registered = nil // we only care about this once per session
			backoff = 0      // reset backoff
			sessionq = a.sessionq
		case err := <-session.errs:
			// TODO(stevvooe): This may actually block if a session is closed
			// but no error was sent. Session.close must only be called here
			// for this to work.
			if err != nil {
				log.G(ctx).WithError(err).Error("agent: session failed")
				backoff = initialSessionFailureBackoff + 2*backoff
				if backoff > maxSessionFailureBackoff {
					backoff = maxSessionFailureBackoff
				}
			}

			if err := session.close(); err != nil {
				log.G(ctx).WithError(err).Error("agent: closing session failed")
			}
			sessionq = nil
			// if we're here before <-registered, do nothing for that event
			registered = nil

			// Bounce the connection.
			if a.config.Picker != nil {
				a.config.Picker.Reset()
			}
		case <-session.closed:
			log.G(ctx).Debugf("agent: rebuild session")

			// select a session registration delay from backoff range.
			delay := time.Duration(rand.Int63n(int64(backoff)))
			session = newSession(ctx, a, delay)
			registered = session.registered
			sessionq = a.sessionq
		case <-a.stopped:
			// TODO(stevvooe): Wait on shutdown and cleanup. May need to pump
			// this loop a few times.
			return
		case <-ctx.Done():
			if a.err == nil {
				a.err = ctx.Err()
			}
			return
		}
	}
}
func (n *Node) run(ctx context.Context) (err error) {
	defer func() {
		n.err = err
		close(n.closed)
	}()
	ctx, cancel := context.WithCancel(ctx)
	defer cancel()
	ctx = log.WithLogger(ctx, log.G(ctx).WithField("module", "node"))

	go func() {
		select {
		case <-ctx.Done():
		case <-n.stopped:
			cancel()
		}
	}()

	// NOTE: When this node is created by NewNode(), our nodeID is set if
	// n.loadCertificates() succeeded in loading TLS credentials.
	if n.config.JoinAddr == "" && n.nodeID == "" {
		if err := n.bootstrapCA(); err != nil {
			return err
		}
	}

	if n.config.JoinAddr != "" || n.config.ForceNewCluster {
		n.remotes = newPersistentRemotes(filepath.Join(n.config.StateDir, stateFilename))
		if n.config.JoinAddr != "" {
			n.remotes.Observe(api.Peer{Addr: n.config.JoinAddr}, 1)
		}
	}

	// Obtain new certs and setup TLS certificates renewal for this node:
	// - We call LoadOrCreateSecurityConfig which blocks until a valid certificate has been issued
	// - We retrieve the nodeID from LoadOrCreateSecurityConfig through the info channel. This allows
	//   us to display the ID before the certificate gets issued (for potential approval).
	// - We wait for LoadOrCreateSecurityConfig to finish since we need a certificate to operate.
	// - Given a valid certificate, spin a renewal go-routine that will ensure that certificates stay
	//   up to date.
	issueResponseChan := make(chan api.IssueNodeCertificateResponse, 1)
	go func() {
		select {
		case <-ctx.Done():
		case resp := <-issueResponseChan:
			logrus.Debugf("Requesting certificate for NodeID: %v", resp.NodeID)
			n.Lock()
			n.nodeID = resp.NodeID
			n.nodeMembership = resp.NodeMembership
			n.Unlock()
			close(n.certificateRequested)
		}
	}()

	certDir := filepath.Join(n.config.StateDir, "certificates")
	securityConfig, err := ca.LoadOrCreateSecurityConfig(ctx, certDir, n.config.JoinToken, ca.ManagerRole, picker.NewPicker(n.remotes), issueResponseChan)
	if err != nil {
		return err
	}

	taskDBPath := filepath.Join(n.config.StateDir, "worker/tasks.db")
	if err := os.MkdirAll(filepath.Dir(taskDBPath), 0777); err != nil {
		return err
	}

	db, err := bolt.Open(taskDBPath, 0666, nil)
	if err != nil {
		return err
	}
	defer db.Close()

	if err := n.loadCertificates(); err != nil {
		return err
	}

	forceCertRenewal := make(chan struct{})
	go func() {
		for {
			select {
			case <-ctx.Done():
				return
			case apirole := <-n.roleChangeReq:
				n.Lock()
				lastRole := n.role
				role := ca.AgentRole
				if apirole == api.NodeRoleManager {
					role = ca.ManagerRole
				}
				if lastRole == role {
					n.Unlock()
					continue
				}
				// switch role to agent immediately to shutdown manager early
				if role == ca.AgentRole {
					n.role = role
					n.roleCond.Broadcast()
				}
				n.Unlock()

				select {
				case forceCertRenewal <- struct{}{}:
				case <-ctx.Done():
					return
				}
			}
		}
	}()

	updates := ca.RenewTLSConfig(ctx, securityConfig, certDir, picker.NewPicker(n.remotes), forceCertRenewal)
	go func() {
		for {
			select {
			case certUpdate := <-updates:
				if certUpdate.Err != nil {
					logrus.Warnf("error renewing TLS certificate: %v", certUpdate.Err)
					continue
				}
				n.Lock()
				n.role = certUpdate.Role
				n.roleCond.Broadcast()
				n.Unlock()
			case <-ctx.Done():
				return
			}
		}
	}()

	role := n.role

	managerReady := make(chan struct{})
	agentReady := make(chan struct{})
	var managerErr error
	var agentErr error
	var wg sync.WaitGroup
	wg.Add(2)
	go func() {
		managerErr = n.runManager(ctx, securityConfig, managerReady) // store err and loop
		wg.Done()
		cancel()
	}()
	go func() {
		agentErr = n.runAgent(ctx, db, securityConfig.ClientTLSCreds, agentReady)
		wg.Done()
		cancel()
	}()

	go func() {
		<-agentReady
		if role == ca.ManagerRole {
			<-managerReady
		}
		close(n.ready)
	}()

	wg.Wait()
	if managerErr != nil && managerErr != context.Canceled {
		return managerErr
	}
	if agentErr != nil && agentErr != context.Canceled {
		return agentErr
	}
	return err
}
// Run is the main loop for a Raft node, it goes along the state machine,
// acting on the messages received from other Raft nodes in the cluster.
//
// Before running the main loop, it first starts the raft node based on saved
// cluster state. If no saved state exists, it starts a single-node cluster.
func (n *Node) Run(ctx context.Context) error {
	ctx = log.WithLogger(ctx, logrus.WithField("raft_id", fmt.Sprintf("%x", n.Config.ID)))
	ctx, cancel := context.WithCancel(ctx)

	// nodeRemoved indicates that the node was stopped due to its removal.
	nodeRemoved := false

	defer func() {
		cancel()
		n.stop(ctx)
		if nodeRemoved {
			// Move WAL and snapshot out of the way, since
			// they are no longer usable.
			if err := n.raftLogger.Clear(ctx); err != nil {
				log.G(ctx).WithError(err).Error("failed to move wal after node removal")
			}
			// clear out the DEKs
			if err := n.keyRotator.UpdateKeys(EncryptionKeys{}); err != nil {
				log.G(ctx).WithError(err).Error("could not remove DEKs")
			}
		}
		n.done()
	}()

	wasLeader := false

	for {
		select {
		case <-n.ticker.C():
			n.raftNode.Tick()
			n.cluster.Tick()
		case rd := <-n.raftNode.Ready():
			raftConfig := n.getCurrentRaftConfig()

			// Save entries to storage
			if err := n.saveToStorage(ctx, &raftConfig, rd.HardState, rd.Entries, rd.Snapshot); err != nil {
				log.G(ctx).WithError(err).Error("failed to save entries to storage")
			}

			if len(rd.Messages) != 0 {
				// Send raft messages to peers
				if err := n.send(ctx, rd.Messages); err != nil {
					log.G(ctx).WithError(err).Error("failed to send message to members")
				}
			}

			// Apply snapshot to memory store. The snapshot
			// was applied to the raft store in
			// saveToStorage.
			if !raft.IsEmptySnap(rd.Snapshot) {
				// Load the snapshot data into the store
				if err := n.restoreFromSnapshot(rd.Snapshot.Data, false); err != nil {
					log.G(ctx).WithError(err).Error("failed to restore from snapshot")
				}
				n.appliedIndex = rd.Snapshot.Metadata.Index
				n.snapshotIndex = rd.Snapshot.Metadata.Index
				n.confState = rd.Snapshot.Metadata.ConfState
			}

			// If we cease to be the leader, we must cancel any
			// proposals that are currently waiting for a quorum to
			// acknowledge them. It is still possible for these to
			// become committed, but if that happens we will apply
			// them as any follower would.
			//
			// It is important that we cancel these proposals before
			// calling processCommitted, so processCommitted does
			// not deadlock.
			if rd.SoftState != nil {
				if wasLeader && rd.SoftState.RaftState != raft.StateLeader {
					wasLeader = false
					if atomic.LoadUint32(&n.signalledLeadership) == 1 {
						atomic.StoreUint32(&n.signalledLeadership, 0)
						n.leadershipBroadcast.Publish(IsFollower)
					}

					// It is important that we set n.signalledLeadership to 0
					// before calling n.wait.cancelAll. When a new raft
					// request is registered, it checks n.signalledLeadership
					// afterwards, and cancels the registration if it is 0.
					// If cancelAll was called first, this call might run
					// before the new request registers, but
					// signalledLeadership would be set after the check.
					// Setting signalledLeadership before calling cancelAll
					// ensures that if a new request is registered during
					// this transition, it will either be cancelled by
					// cancelAll, or by its own check of signalledLeadership.
					n.wait.cancelAll()
				} else if !wasLeader && rd.SoftState.RaftState == raft.StateLeader {
					wasLeader = true
				}
			}

			// Process committed entries
			for _, entry := range rd.CommittedEntries {
				if err := n.processCommitted(ctx, entry); err != nil {
					log.G(ctx).WithError(err).Error("failed to process committed entries")
				}
			}

			// Trigger a snapshot every once in a while
			if n.snapshotInProgress == nil &&
				(n.keyRotator.NeedsRotation() || raftConfig.SnapshotInterval > 0 &&
					n.appliedIndex-n.snapshotIndex >= raftConfig.SnapshotInterval) {
				n.doSnapshot(ctx, raftConfig)
			}

			if wasLeader && atomic.LoadUint32(&n.signalledLeadership) != 1 {
				// If all the entries in the log have become
				// committed, broadcast our leadership status.
				if n.caughtUp() {
					atomic.StoreUint32(&n.signalledLeadership, 1)
					n.leadershipBroadcast.Publish(IsLeader)
				}
			}

			// Advance the state machine
			n.raftNode.Advance()

			// On the first startup, or if we are the only
			// registered member after restoring from the state,
			// campaign to be the leader.
			if n.campaignWhenAble {
				members := n.cluster.Members()
				if len(members) >= 1 {
					n.campaignWhenAble = false
				}
				if len(members) == 1 && members[n.Config.ID] != nil {
					if err := n.raftNode.Campaign(ctx); err != nil {
						panic("raft: cannot campaign to be the leader on node restore")
					}
				}
			}
		case snapshotIndex := <-n.snapshotInProgress:
			if snapshotIndex > n.snapshotIndex {
				n.snapshotIndex = snapshotIndex
			}
			n.snapshotInProgress = nil
			if n.rotationQueued {
				// a key rotation took place while the snapshot was in
				// progress - we have to take another snapshot and encrypt
				// with the new key
				n.doSnapshot(ctx, n.getCurrentRaftConfig())
			}
		case <-n.keyRotator.RotationNotify():
			// There are 2 separate checks: rotationQueued, and keyRotator.NeedsRotation().
			// We set rotationQueued so that when we are notified of a rotation, we try to
			// do a snapshot as soon as possible. However, if there is an error while doing
			// the snapshot, we don't want to hammer the node attempting to do snapshots over
			// and over. So if doing a snapshot fails, wait until the next entry comes in to
			// try again.
			switch {
			case n.snapshotInProgress != nil:
				n.rotationQueued = true
			case n.keyRotator.NeedsRotation():
				n.doSnapshot(ctx, n.getCurrentRaftConfig())
			}
		case <-n.removeRaftCh:
			nodeRemoved = true
			// If the node was removed from other members,
			// send back an error to the caller to start
			// the shutdown process.
			return ErrMemberRemoved
		case <-ctx.Done():
			return nil
		}
	}
}
// Assign assigns the set of tasks to the worker. Any tasks not previously
// known will be started. Any tasks that are in the task set and already
// running will be updated, if possible. Any tasks currently running on the
// worker outside the task set will be terminated.
func (w *worker) Assign(ctx context.Context, tasks []*api.Task) error {
	w.mu.Lock()
	defer w.mu.Unlock()

	tx, err := w.db.Begin(true)
	if err != nil {
		log.G(ctx).WithError(err).Error("failed starting transaction against task database")
		return err
	}
	defer tx.Rollback()

	log.G(ctx).WithField("len(tasks)", len(tasks)).Debug("(*worker).Assign")

	assigned := map[string]struct{}{}

	for _, task := range tasks {
		log.G(ctx).WithFields(
			logrus.Fields{
				"task.id":           task.ID,
				"task.desiredstate": task.DesiredState,
			}).Debug("assigned")
		if err := PutTask(tx, task); err != nil {
			return err
		}

		if err := SetTaskAssignment(tx, task.ID, true); err != nil {
			return err
		}

		if mgr, ok := w.taskManagers[task.ID]; ok {
			if err := mgr.Update(ctx, task); err != nil && err != ErrClosed {
				log.G(ctx).WithError(err).Error("failed updating assigned task")
			}
		} else {
			// we may have still seen the task, let's grab the status from
			// storage and replace it with our status, if we have it.
			status, err := GetTaskStatus(tx, task.ID)
			if err != nil {
				if err != errTaskUnknown {
					return err
				}

				// never seen before, register the provided status
				if err := PutTaskStatus(tx, task.ID, &task.Status); err != nil {
					return err
				}

				status = &task.Status
			} else {
				task.Status = *status // overwrite the stale manager status with ours.
			}

			w.startTask(ctx, tx, task)
		}

		assigned[task.ID] = struct{}{}
	}

	for id, tm := range w.taskManagers {
		if _, ok := assigned[id]; ok {
			continue
		}

		ctx := log.WithLogger(ctx, log.G(ctx).WithField("task.id", id))
		if err := SetTaskAssignment(tx, id, false); err != nil {
			log.G(ctx).WithError(err).Error("error setting task assignment in database")
			continue
		}

		delete(w.taskManagers, id)

		go func(tm *taskManager) {
			// when a task is no longer assigned, we shutdown the task manager
			// for it and leave cleanup to the sweeper.
			if err := tm.Close(); err != nil {
				log.G(ctx).WithError(err).Error("error closing task manager")
			}
		}(tm)
	}

	return tx.Commit()
}