func (rm *roleManager) reconcileRole(node *api.Node) { if node.Role == node.Spec.DesiredRole { // Nothing to do. delete(rm.pending, node.ID) return } // Promotion can proceed right away. if node.Spec.DesiredRole == api.NodeRoleManager && node.Role == api.NodeRoleWorker { err := rm.store.Update(func(tx store.Tx) error { updatedNode := store.GetNode(tx, node.ID) if updatedNode == nil || updatedNode.Spec.DesiredRole != node.Spec.DesiredRole || updatedNode.Role != node.Role { return nil } updatedNode.Role = api.NodeRoleManager return store.UpdateNode(tx, updatedNode) }) if err != nil { log.L.WithError(err).Errorf("failed to promote node %s", node.ID) } else { delete(rm.pending, node.ID) } } else if node.Spec.DesiredRole == api.NodeRoleWorker && node.Role == api.NodeRoleManager { // Check for node in memberlist member := rm.raft.GetMemberByNodeID(node.ID) if member != nil { // Quorum safeguard if !rm.raft.CanRemoveMember(member.RaftID) { // TODO(aaronl): Retry later log.L.Debugf("can't demote node %s at this time: removing member from raft would result in a loss of quorum", node.ID) return } rmCtx, rmCancel := context.WithTimeout(rm.ctx, 5*time.Second) defer rmCancel() if err := rm.raft.RemoveMember(rmCtx, member.RaftID); err != nil { // TODO(aaronl): Retry later log.L.WithError(err).Debugf("can't demote node %s at this time", node.ID) return } } err := rm.store.Update(func(tx store.Tx) error { updatedNode := store.GetNode(tx, node.ID) if updatedNode == nil || updatedNode.Spec.DesiredRole != node.Spec.DesiredRole || updatedNode.Role != node.Role { return nil } updatedNode.Role = api.NodeRoleWorker return store.UpdateNode(tx, updatedNode) }) if err != nil { log.L.WithError(err).Errorf("failed to demote node %s", node.ID) } else { delete(rm.pending, node.ID) } } }
func (a *Allocator) allocateNode(ctx context.Context, nc *networkContext, node *api.Node) error { if err := nc.nwkAllocator.AllocateNode(node); err != nil { return err } if err := a.store.Update(func(tx store.Tx) error { for { err := store.UpdateNode(tx, node) if err != nil && err != store.ErrSequenceConflict { return fmt.Errorf("failed updating state in store transaction for node %s: %v", node.ID, err) } if err == store.ErrSequenceConflict { storeNode := store.GetNode(tx, node.ID) storeNode.Attachment = node.Attachment.Copy() node = storeNode continue } break } return nil }); err != nil { if err := nc.nwkAllocator.DeallocateNode(node); err != nil { log.G(ctx).WithError(err).Errorf("failed rolling back allocation of node %s: %v", node.ID, err) } return err } return nil }
func updateNodeAvailability(t *testing.T, s *store.MemoryStore, node *api.Node, avail api.NodeSpec_Availability) { node.Spec.Availability = avail s.Update(func(tx store.Tx) error { assert.NoError(t, store.UpdateNode(tx, node)) return nil }) }
func TestGetRemoteSignedCertificateWithPending(t *testing.T) { tc := testutils.NewTestCA(t) defer tc.Stop() // Create a new CSR to be signed csr, _, err := ca.GenerateAndWriteNewKey(tc.Paths.Node) assert.NoError(t, err) updates, cancel := state.Watch(tc.MemoryStore.WatchQueue(), state.EventCreateNode{}) defer cancel() completed := make(chan error) go func() { _, err := ca.GetRemoteSignedCertificate(context.Background(), csr, tc.WorkerToken, tc.RootCA.Pool, tc.Remotes, nil, nil) completed <- err }() event := <-updates node := event.(state.EventCreateNode).Node.Copy() // Directly update the status of the store err = tc.MemoryStore.Update(func(tx store.Tx) error { node.Certificate.Status.State = api.IssuanceStateIssued return store.UpdateNode(tx, node) }) assert.NoError(t, err) // Make sure GetRemoteSignedCertificate didn't return an error assert.NoError(t, <-completed) }
// UpdateNode updates a Node referenced by NodeID with the given NodeSpec. // - Returns `NotFound` if the Node is not found. // - Returns `InvalidArgument` if the NodeSpec is malformed. // - Returns an error if the update fails. func (s *Server) UpdateNode(ctx context.Context, request *api.UpdateNodeRequest) (*api.UpdateNodeResponse, error) { if request.NodeID == "" || request.NodeVersion == nil { return nil, grpc.Errorf(codes.InvalidArgument, errInvalidArgument.Error()) } if err := validateNodeSpec(request.Spec); err != nil { return nil, err } var ( node *api.Node demote bool ) err := s.store.Update(func(tx store.Tx) error { node = store.GetNode(tx, request.NodeID) if node == nil { return nil } // Demotion sanity checks. if node.Spec.Role == api.NodeRoleManager && request.Spec.Role == api.NodeRoleWorker { demote = true managers, err := store.FindNodes(tx, store.ByRole(api.NodeRoleManager)) if err != nil { return grpc.Errorf(codes.Internal, "internal store error: %v", err) } if len(managers) == 1 && managers[0].ID == node.ID { return grpc.Errorf(codes.FailedPrecondition, "attempting to demote the last manager of the swarm") } } node.Meta.Version = *request.NodeVersion node.Spec = *request.Spec.Copy() return store.UpdateNode(tx, node) }) if err != nil { return nil, err } if node == nil { return nil, grpc.Errorf(codes.NotFound, "node %s not found", request.NodeID) } if demote && s.raft != nil { memberlist := s.raft.GetMemberlist() for raftID, member := range memberlist { if member.NodeID == request.NodeID { if err := s.raft.RemoveMember(ctx, raftID); err != nil { return nil, err } break } } } return &api.UpdateNodeResponse{ Node: node, }, nil }
// UpdateNode updates a Node referenced by NodeID with the given NodeSpec. // - Returns `NotFound` if the Node is not found. // - Returns `InvalidArgument` if the NodeSpec is malformed. // - Returns an error if the update fails. func (s *Server) UpdateNode(ctx context.Context, request *api.UpdateNodeRequest) (*api.UpdateNodeResponse, error) { if request.NodeID == "" || request.NodeVersion == nil { return nil, grpc.Errorf(codes.InvalidArgument, errInvalidArgument.Error()) } if err := validateNodeSpec(request.Spec); err != nil { return nil, err } var ( node *api.Node member *membership.Member ) err := s.store.Update(func(tx store.Tx) error { node = store.GetNode(tx, request.NodeID) if node == nil { return grpc.Errorf(codes.NotFound, "node %s not found", request.NodeID) } // Demotion sanity checks. if node.Spec.DesiredRole == api.NodeRoleManager && request.Spec.DesiredRole == api.NodeRoleWorker { // Check for manager entries in Store. managers, err := store.FindNodes(tx, store.ByRole(api.NodeRoleManager)) if err != nil { return grpc.Errorf(codes.Internal, "internal store error: %v", err) } if len(managers) == 1 && managers[0].ID == node.ID { return grpc.Errorf(codes.FailedPrecondition, "attempting to demote the last manager of the swarm") } // Check for node in memberlist if member = s.raft.GetMemberByNodeID(request.NodeID); member == nil { return grpc.Errorf(codes.NotFound, "can't find manager in raft memberlist") } // Quorum safeguard if !s.raft.CanRemoveMember(member.RaftID) { return grpc.Errorf(codes.FailedPrecondition, "can't remove member from the raft: this would result in a loss of quorum") } } node.Meta.Version = *request.NodeVersion node.Spec = *request.Spec.Copy() return store.UpdateNode(tx, node) }) if err != nil { return nil, err } return &api.UpdateNodeResponse{ Node: node, }, nil }
func (a *Allocator) commitAllocatedNode(ctx context.Context, batch *store.Batch, node *api.Node) error { if err := batch.Update(func(tx store.Tx) error { err := store.UpdateNode(tx, node) if err == store.ErrSequenceConflict { storeNode := store.GetNode(tx, node.ID) storeNode.Attachment = node.Attachment.Copy() err = store.UpdateNode(tx, storeNode) } return errors.Wrapf(err, "failed updating state in store transaction for node %s", node.ID) }); err != nil { if err := a.netCtx.nwkAllocator.DeallocateNode(node); err != nil { log.G(ctx).WithError(err).Errorf("failed rolling back allocation of node %s", node.ID) } return err } return nil }
func (d *Dispatcher) markNodesUnknown(ctx context.Context) error { log := log.G(ctx).WithField("method", "(*Dispatcher).markNodesUnknown") var nodes []*api.Node var err error d.store.View(func(tx store.ReadTx) { nodes, err = store.FindNodes(tx, store.All) }) if err != nil { return fmt.Errorf("failed to get list of nodes: %v", err) } _, err = d.store.Batch(func(batch *store.Batch) error { for _, n := range nodes { err := batch.Update(func(tx store.Tx) error { // check if node is still here node := store.GetNode(tx, n.ID) if node == nil { return nil } // do not try to resurrect down nodes if node.Status.State == api.NodeStatus_DOWN { return nil } node.Status = api.NodeStatus{ State: api.NodeStatus_UNKNOWN, Message: `Node moved to "unknown" state due to leadership change in cluster`, } nodeID := node.ID expireFunc := func() { log := log.WithField("node", nodeID) nodeStatus := api.NodeStatus{State: api.NodeStatus_DOWN, Message: `heartbeat failure for node in "unknown" state`} log.Debugf("heartbeat expiration for unknown node") if err := d.nodeRemove(nodeID, nodeStatus); err != nil { log.WithError(err).Errorf(`failed deregistering node after heartbeat expiration for node in "unknown" state`) } } if err := d.nodes.AddUnknown(node, expireFunc); err != nil { return fmt.Errorf(`adding node in "unknown" state to node store failed: %v`, err) } if err := store.UpdateNode(tx, node); err != nil { return fmt.Errorf("update failed %v", err) } return nil }) if err != nil { log.WithField("node", n.ID).WithError(err).Errorf(`failed to move node to "unknown" state`) } } return nil }) return err }
// register is used for registration of node with particular dispatcher. func (d *Dispatcher) register(ctx context.Context, nodeID string, description *api.NodeDescription) (string, error) { // prevent register until we're ready to accept it if err := d.isRunningLocked(); err != nil { return "", err } if err := d.nodes.CheckRateLimit(nodeID); err != nil { return "", err } // create or update node in store // TODO(stevvooe): Validate node specification. var node *api.Node err := d.store.Update(func(tx store.Tx) error { node = store.GetNode(tx, nodeID) if node == nil { return ErrNodeNotFound } node.Description = description node.Status = api.NodeStatus{ State: api.NodeStatus_READY, } return store.UpdateNode(tx, node) }) if err != nil { return "", err } expireFunc := func() { nodeStatus := api.NodeStatus{State: api.NodeStatus_DOWN, Message: "heartbeat failure"} log.G(ctx).Debugf("heartbeat expiration") if err := d.nodeRemove(nodeID, nodeStatus); err != nil { log.G(ctx).WithError(err).Errorf("failed deregistering node after heartbeat expiration") } } rn := d.nodes.Add(node, expireFunc) // NOTE(stevvooe): We need be a little careful with re-registration. The // current implementation just matches the node id and then gives away the // sessionID. If we ever want to use sessionID as a secret, which we may // want to, this is giving away the keys to the kitchen. // // The right behavior is going to be informed by identity. Basically, each // time a node registers, we invalidate the session and issue a new // session, once identity is proven. This will cause misbehaved agents to // be kicked when multiple connections are made. return rn.SessionID, nil }
// issueRenewCertificate receives a nodeID and a CSR and modifies the node's certificate entry with the new CSR // and changes the state to RENEW, so it can be picked up and signed by the signing reconciliation loop func (s *Server) issueRenewCertificate(ctx context.Context, nodeID string, csr []byte) (*api.IssueNodeCertificateResponse, error) { var ( cert api.Certificate node *api.Node ) err := s.store.Update(func(tx store.Tx) error { // Attempt to retrieve the node with nodeID node = store.GetNode(tx, nodeID) if node == nil { log.G(ctx).WithFields(logrus.Fields{ "node.id": nodeID, "method": "issueRenewCertificate", }).Warnf("node does not exist") // If this node doesn't exist, we shouldn't be renewing a certificate for it return grpc.Errorf(codes.NotFound, "node %s not found when attempting to renew certificate", nodeID) } // Create a new Certificate entry for this node with the new CSR and a RENEW state cert = api.Certificate{ CSR: csr, CN: node.ID, Role: node.Spec.Role, Status: api.IssuanceStatus{ State: api.IssuanceStateRenew, }, } node.Certificate = cert return store.UpdateNode(tx, node) }) if err != nil { return nil, err } log.G(ctx).WithFields(logrus.Fields{ "cert.cn": cert.CN, "cert.role": cert.Role, "method": "issueRenewCertificate", }).Debugf("node certificate updated") return &api.IssueNodeCertificateResponse{ NodeID: nodeID, NodeMembership: node.Spec.Membership, }, nil }
func TestForceRotationIsNoop(t *testing.T) { tc := testutils.NewTestCA(t) defer tc.Stop() // Get a new Certificate issued csr, _, err := ca.GenerateNewCSR() assert.NoError(t, err) issueRequest := &api.IssueNodeCertificateRequest{CSR: csr, Token: tc.WorkerToken} issueResponse, err := tc.NodeCAClients[0].IssueNodeCertificate(context.Background(), issueRequest) assert.NoError(t, err) assert.NotNil(t, issueResponse.NodeID) assert.Equal(t, api.NodeMembershipAccepted, issueResponse.NodeMembership) // Check that the Certificate is successfully issued statusRequest := &api.NodeCertificateStatusRequest{NodeID: issueResponse.NodeID} statusResponse, err := tc.NodeCAClients[0].NodeCertificateStatus(context.Background(), statusRequest) require.NoError(t, err) assert.Equal(t, api.IssuanceStateIssued, statusResponse.Status.State) assert.NotNil(t, statusResponse.Certificate.Certificate) assert.Equal(t, api.NodeRoleWorker, statusResponse.Certificate.Role) // Update the certificate status to IssuanceStateRotate which should be a server-side noop err = tc.MemoryStore.Update(func(tx store.Tx) error { // Attempt to retrieve the node with nodeID node := store.GetNode(tx, issueResponse.NodeID) assert.NotNil(t, node) node.Certificate.Status.State = api.IssuanceStateRotate return store.UpdateNode(tx, node) }) assert.NoError(t, err) // Wait a bit and check that the certificate hasn't changed/been reissued time.Sleep(250 * time.Millisecond) statusNewResponse, err := tc.NodeCAClients[0].NodeCertificateStatus(context.Background(), statusRequest) require.NoError(t, err) assert.Equal(t, statusResponse.Certificate.Certificate, statusNewResponse.Certificate.Certificate) assert.Equal(t, api.IssuanceStateRotate, statusNewResponse.Certificate.Status.State) assert.Equal(t, api.NodeRoleWorker, statusNewResponse.Certificate.Role) }
func (d *Dispatcher) nodeRemove(id string, status api.NodeStatus) error { if err := d.isRunningLocked(); err != nil { return err } // TODO(aaronl): Is it worth batching node removals? err := d.store.Update(func(tx store.Tx) error { node := store.GetNode(tx, id) if node == nil { return errors.New("node not found") } node.Status = status return store.UpdateNode(tx, node) }) if err != nil { return fmt.Errorf("failed to update node %s status to down: %v", id, err) } if rn := d.nodes.Delete(id); rn == nil { return fmt.Errorf("node %s is not found in local storage", id) } return nil }
func (d *Dispatcher) markNodesUnknown(ctx context.Context) error { log := log.G(ctx).WithField("method", "(*Dispatcher).markNodesUnknown") var nodes []*api.Node var err error d.store.View(func(tx store.ReadTx) { nodes, err = store.FindNodes(tx, store.All) }) if err != nil { return errors.Wrap(err, "failed to get list of nodes") } _, err = d.store.Batch(func(batch *store.Batch) error { for _, n := range nodes { err := batch.Update(func(tx store.Tx) error { // check if node is still here node := store.GetNode(tx, n.ID) if node == nil { return nil } // do not try to resurrect down nodes if node.Status.State == api.NodeStatus_DOWN { nodeCopy := node expireFunc := func() { if err := d.moveTasksToOrphaned(nodeCopy.ID); err != nil { log.WithError(err).Error(`failed to move all tasks to "ORPHANED" state`) } d.downNodes.Delete(nodeCopy.ID) } d.downNodes.Add(nodeCopy, expireFunc) return nil } node.Status.State = api.NodeStatus_UNKNOWN node.Status.Message = `Node moved to "unknown" state due to leadership change in cluster` nodeID := node.ID expireFunc := func() { log := log.WithField("node", nodeID) log.Debugf("heartbeat expiration for unknown node") if err := d.markNodeNotReady(nodeID, api.NodeStatus_DOWN, `heartbeat failure for node in "unknown" state`); err != nil { log.WithError(err).Errorf(`failed deregistering node after heartbeat expiration for node in "unknown" state`) } } if err := d.nodes.AddUnknown(node, expireFunc); err != nil { return errors.Wrap(err, `adding node in "unknown" state to node store failed`) } if err := store.UpdateNode(tx, node); err != nil { return errors.Wrap(err, "update failed") } return nil }) if err != nil { log.WithField("node", n.ID).WithError(err).Errorf(`failed to move node to "unknown" state`) } } return nil }) return err }
func TestDrain(t *testing.T) { ctx := context.Background() initialService := &api.Service{ ID: "id1", Spec: api.ServiceSpec{ Annotations: api.Annotations{ Name: "name1", }, Task: api.TaskSpec{ Runtime: &api.TaskSpec_Container{ Container: &api.ContainerSpec{}, }, Restart: &api.RestartPolicy{ Condition: api.RestartOnNone, }, }, Mode: &api.ServiceSpec_Replicated{ Replicated: &api.ReplicatedService{ Replicas: 6, }, }, }, } initialNodeSet := []*api.Node{ { ID: "id1", Spec: api.NodeSpec{ Annotations: api.Annotations{ Name: "name1", }, Availability: api.NodeAvailabilityActive, }, Status: api.NodeStatus{ State: api.NodeStatus_READY, }, }, { ID: "id2", Spec: api.NodeSpec{ Annotations: api.Annotations{ Name: "name2", }, Availability: api.NodeAvailabilityActive, }, Status: api.NodeStatus{ State: api.NodeStatus_DOWN, }, }, // We should NOT kick out tasks on UNKNOWN nodes. { ID: "id3", Spec: api.NodeSpec{ Annotations: api.Annotations{ Name: "name3", }, Availability: api.NodeAvailabilityActive, }, Status: api.NodeStatus{ State: api.NodeStatus_UNKNOWN, }, }, { ID: "id4", Spec: api.NodeSpec{ Annotations: api.Annotations{ Name: "name4", }, Availability: api.NodeAvailabilityPause, }, Status: api.NodeStatus{ State: api.NodeStatus_READY, }, }, { ID: "id5", Spec: api.NodeSpec{ Annotations: api.Annotations{ Name: "name5", }, Availability: api.NodeAvailabilityDrain, }, Status: api.NodeStatus{ State: api.NodeStatus_READY, }, }, } initialTaskSet := []*api.Task{ // Task not assigned to any node { ID: "id0", Status: api.TaskStatus{ State: api.TaskStateNew, }, Slot: 1, ServiceAnnotations: api.Annotations{ Name: "name0", }, ServiceID: "id1", }, // Tasks assigned to the nodes defined above { ID: "id1", Status: api.TaskStatus{ State: api.TaskStateNew, }, Slot: 2, ServiceAnnotations: api.Annotations{ Name: "name1", }, ServiceID: "id1", NodeID: "id1", }, { ID: "id2", Status: api.TaskStatus{ State: api.TaskStateNew, }, Slot: 3, ServiceAnnotations: api.Annotations{ Name: "name2", }, ServiceID: "id1", NodeID: "id2", }, { ID: "id3", Status: api.TaskStatus{ State: api.TaskStateNew, }, Slot: 4, ServiceAnnotations: api.Annotations{ Name: "name3", }, ServiceID: "id1", NodeID: "id3", }, { ID: "id4", Status: api.TaskStatus{ State: api.TaskStateNew, }, Slot: 5, ServiceAnnotations: api.Annotations{ Name: "name4", }, ServiceID: "id1", NodeID: "id4", }, { ID: "id5", Status: api.TaskStatus{ State: api.TaskStateNew, }, Slot: 6, ServiceAnnotations: api.Annotations{ Name: "name5", }, ServiceID: "id1", NodeID: "id5", }, } s := store.NewMemoryStore(nil) assert.NotNil(t, s) defer s.Close() err := s.Update(func(tx store.Tx) error { // Prepopulate service assert.NoError(t, store.CreateService(tx, initialService)) // Prepoulate nodes for _, n := range initialNodeSet { assert.NoError(t, store.CreateNode(tx, n)) } // Prepopulate tasks for _, task := range initialTaskSet { assert.NoError(t, store.CreateTask(tx, task)) } return nil }) assert.NoError(t, err) watch, cancel := state.Watch(s.WatchQueue(), state.EventUpdateTask{}) defer cancel() orchestrator := NewReplicatedOrchestrator(s) defer orchestrator.Stop() go func() { assert.NoError(t, orchestrator.Run(ctx)) }() // id2 and id5 should be killed immediately deletion1 := watchShutdownTask(t, watch) deletion2 := watchShutdownTask(t, watch) assert.Regexp(t, "id(2|5)", deletion1.ID) assert.Regexp(t, "id(2|5)", deletion1.NodeID) assert.Regexp(t, "id(2|5)", deletion2.ID) assert.Regexp(t, "id(2|5)", deletion2.NodeID) // Create a new task, assigned to node id2 err = s.Update(func(tx store.Tx) error { task := initialTaskSet[2].Copy() task.ID = "newtask" task.NodeID = "id2" assert.NoError(t, store.CreateTask(tx, task)) return nil }) assert.NoError(t, err) deletion3 := watchShutdownTask(t, watch) assert.Equal(t, "newtask", deletion3.ID) assert.Equal(t, "id2", deletion3.NodeID) // Set node id4 to the DRAINED state err = s.Update(func(tx store.Tx) error { n := initialNodeSet[3].Copy() n.Spec.Availability = api.NodeAvailabilityDrain assert.NoError(t, store.UpdateNode(tx, n)) return nil }) assert.NoError(t, err) deletion4 := watchShutdownTask(t, watch) assert.Equal(t, "id4", deletion4.ID) assert.Equal(t, "id4", deletion4.NodeID) // Delete node id1 err = s.Update(func(tx store.Tx) error { assert.NoError(t, store.DeleteNode(tx, "id1")) return nil }) assert.NoError(t, err) deletion5 := watchShutdownTask(t, watch) assert.Equal(t, "id1", deletion5.ID) assert.Equal(t, "id1", deletion5.NodeID) }
func TestIsStateDirty(t *testing.T) { ctx := context.Background() temp, err := ioutil.TempFile("", "test-socket") assert.NoError(t, err) assert.NoError(t, temp.Close()) assert.NoError(t, os.Remove(temp.Name())) defer os.RemoveAll(temp.Name()) stateDir, err := ioutil.TempDir("", "test-raft") assert.NoError(t, err) defer os.RemoveAll(stateDir) tc := testutils.NewTestCA(t, func(p ca.CertPaths) *ca.KeyReadWriter { return ca.NewKeyReadWriter(p, []byte("kek"), nil) }) defer tc.Stop() managerSecurityConfig, err := tc.NewNodeConfig(ca.ManagerRole) assert.NoError(t, err) m, err := New(&Config{ RemoteAPI: &RemoteAddrs{ListenAddr: "127.0.0.1:0"}, ControlAPI: temp.Name(), StateDir: stateDir, SecurityConfig: managerSecurityConfig, AutoLockManagers: true, UnlockKey: []byte("kek"), }) assert.NoError(t, err) assert.NotNil(t, m) go m.Run(ctx) defer m.Stop(ctx, false) // State should never be dirty just after creating the manager isDirty, err := m.IsStateDirty() assert.NoError(t, err) assert.False(t, isDirty) // Wait for cluster and node to be created. watch, cancel := state.Watch(m.raftNode.MemoryStore().WatchQueue()) defer cancel() <-watch <-watch // Updating the node should not cause the state to become dirty assert.NoError(t, m.raftNode.MemoryStore().Update(func(tx store.Tx) error { node := store.GetNode(tx, m.config.SecurityConfig.ClientTLSCreds.NodeID()) require.NotNil(t, node) node.Spec.Availability = api.NodeAvailabilityPause return store.UpdateNode(tx, node) })) isDirty, err = m.IsStateDirty() assert.NoError(t, err) assert.False(t, isDirty) // Adding a service should cause the state to become dirty assert.NoError(t, m.raftNode.MemoryStore().Update(func(tx store.Tx) error { return store.CreateService(tx, &api.Service{ID: "foo"}) })) isDirty, err = m.IsStateDirty() assert.NoError(t, err) assert.True(t, isDirty) }
func (d *Dispatcher) processUpdates(ctx context.Context) { var ( taskUpdates map[string]*api.TaskStatus nodeUpdates map[string]nodeUpdate ) d.taskUpdatesLock.Lock() if len(d.taskUpdates) != 0 { taskUpdates = d.taskUpdates d.taskUpdates = make(map[string]*api.TaskStatus) } d.taskUpdatesLock.Unlock() d.nodeUpdatesLock.Lock() if len(d.nodeUpdates) != 0 { nodeUpdates = d.nodeUpdates d.nodeUpdates = make(map[string]nodeUpdate) } d.nodeUpdatesLock.Unlock() if len(taskUpdates) == 0 && len(nodeUpdates) == 0 { return } log := log.G(ctx).WithFields(logrus.Fields{ "method": "(*Dispatcher).processUpdates", }) _, err := d.store.Batch(func(batch *store.Batch) error { for taskID, status := range taskUpdates { err := batch.Update(func(tx store.Tx) error { logger := log.WithField("task.id", taskID) task := store.GetTask(tx, taskID) if task == nil { logger.Errorf("task unavailable") return nil } logger = logger.WithField("state.transition", fmt.Sprintf("%v->%v", task.Status.State, status.State)) if task.Status == *status { logger.Debug("task status identical, ignoring") return nil } if task.Status.State > status.State { logger.Debug("task status invalid transition") return nil } task.Status = *status if err := store.UpdateTask(tx, task); err != nil { logger.WithError(err).Error("failed to update task status") return nil } logger.Debug("task status updated") return nil }) if err != nil { log.WithError(err).Error("dispatcher task update transaction failed") } } for nodeID, nodeUpdate := range nodeUpdates { err := batch.Update(func(tx store.Tx) error { logger := log.WithField("node.id", nodeID) node := store.GetNode(tx, nodeID) if node == nil { logger.Errorf("node unavailable") return nil } if nodeUpdate.status != nil { node.Status.State = nodeUpdate.status.State node.Status.Message = nodeUpdate.status.Message if nodeUpdate.status.Addr != "" { node.Status.Addr = nodeUpdate.status.Addr } } if nodeUpdate.description != nil { node.Description = nodeUpdate.description } if err := store.UpdateNode(tx, node); err != nil { logger.WithError(err).Error("failed to update node status") return nil } logger.Debug("node status updated") return nil }) if err != nil { log.WithError(err).Error("dispatcher node update transaction failed") } } return nil }) if err != nil { log.WithError(err).Error("dispatcher batch failed") } d.processUpdatesCond.Broadcast() }
func TestConstraintEnforcer(t *testing.T) { nodes := []*api.Node{ { ID: "id1", Spec: api.NodeSpec{ Annotations: api.Annotations{ Name: "name1", }, Availability: api.NodeAvailabilityActive, }, Status: api.NodeStatus{ State: api.NodeStatus_READY, }, Role: api.NodeRoleWorker, }, { ID: "id2", Spec: api.NodeSpec{ Annotations: api.Annotations{ Name: "name2", }, Availability: api.NodeAvailabilityActive, }, Status: api.NodeStatus{ State: api.NodeStatus_READY, }, Description: &api.NodeDescription{ Resources: &api.Resources{ NanoCPUs: 1e9, MemoryBytes: 1e9, }, }, }, } tasks := []*api.Task{ { ID: "id0", DesiredState: api.TaskStateRunning, Spec: api.TaskSpec{ Placement: &api.Placement{ Constraints: []string{"node.role == manager"}, }, }, Status: api.TaskStatus{ State: api.TaskStateNew, }, NodeID: "id1", }, { ID: "id1", DesiredState: api.TaskStateRunning, Status: api.TaskStatus{ State: api.TaskStateNew, }, NodeID: "id1", }, { ID: "id2", DesiredState: api.TaskStateRunning, Spec: api.TaskSpec{ Placement: &api.Placement{ Constraints: []string{"node.role == worker"}, }, }, Status: api.TaskStatus{ State: api.TaskStateRunning, }, NodeID: "id1", }, { ID: "id3", DesiredState: api.TaskStateNew, Status: api.TaskStatus{ State: api.TaskStateNew, }, NodeID: "id2", }, { ID: "id4", DesiredState: api.TaskStateReady, Spec: api.TaskSpec{ Resources: &api.ResourceRequirements{ Reservations: &api.Resources{ MemoryBytes: 9e8, }, }, }, Status: api.TaskStatus{ State: api.TaskStatePending, }, NodeID: "id2", }, } s := store.NewMemoryStore(nil) assert.NotNil(t, s) defer s.Close() err := s.Update(func(tx store.Tx) error { // Prepoulate nodes for _, n := range nodes { assert.NoError(t, store.CreateNode(tx, n)) } // Prepopulate tasks for _, task := range tasks { assert.NoError(t, store.CreateTask(tx, task)) } return nil }) assert.NoError(t, err) watch, cancel := state.Watch(s.WatchQueue(), state.EventUpdateTask{}) defer cancel() constraintEnforcer := New(s) defer constraintEnforcer.Stop() go constraintEnforcer.Run() // id0 should be killed immediately shutdown1 := testutils.WatchShutdownTask(t, watch) assert.Equal(t, "id0", shutdown1.ID) // Change node id1 to a manager err = s.Update(func(tx store.Tx) error { node := store.GetNode(tx, "id1") if node == nil { t.Fatal("could not get node id1") } node.Role = api.NodeRoleManager assert.NoError(t, store.UpdateNode(tx, node)) return nil }) assert.NoError(t, err) shutdown2 := testutils.WatchShutdownTask(t, watch) assert.Equal(t, "id2", shutdown2.ID) // Change resources on node id2 err = s.Update(func(tx store.Tx) error { node := store.GetNode(tx, "id2") if node == nil { t.Fatal("could not get node id2") } node.Description.Resources.MemoryBytes = 5e8 assert.NoError(t, store.UpdateNode(tx, node)) return nil }) assert.NoError(t, err) shutdown3 := testutils.WatchShutdownTask(t, watch) assert.Equal(t, "id4", shutdown3.ID) }
// signNodeCert does the bulk of the work for signing a certificate func (s *Server) signNodeCert(ctx context.Context, node *api.Node) { rootCA := s.securityConfig.RootCA() externalCA := s.securityConfig.externalCA node = node.Copy() nodeID := node.ID // Convert the role from proto format role, err := ParseRole(node.Certificate.Role) if err != nil { log.G(ctx).WithFields(logrus.Fields{ "node.id": node.ID, "method": "(*Server).signNodeCert", }).WithError(err).Errorf("failed to parse role") return } // Attempt to sign the CSR var ( rawCSR = node.Certificate.CSR cn = node.Certificate.CN ou = role org = s.securityConfig.ClientTLSCreds.Organization() ) // Try using the external CA first. cert, err := externalCA.Sign(PrepareCSR(rawCSR, cn, ou, org)) if err == ErrNoExternalCAURLs { // No external CA servers configured. Try using the local CA. cert, err = rootCA.ParseValidateAndSignCSR(rawCSR, cn, ou, org) } if err != nil { log.G(ctx).WithFields(logrus.Fields{ "node.id": node.ID, "method": "(*Server).signNodeCert", }).WithError(err).Errorf("failed to sign CSR") // If this error is due the lack of signer, maybe some other // manager in the future will pick it up. Return without // changing the state of the certificate. if err == ErrNoValidSigner { return } // If the current state is already Failed, no need to change it if node.Certificate.Status.State == api.IssuanceStateFailed { return } // We failed to sign this CSR, change the state to FAILED err = s.store.Update(func(tx store.Tx) error { node := store.GetNode(tx, nodeID) if node == nil { return fmt.Errorf("node %s not found", nodeID) } node.Certificate.Status = api.IssuanceStatus{ State: api.IssuanceStateFailed, Err: err.Error(), } return store.UpdateNode(tx, node) }) if err != nil { log.G(ctx).WithFields(logrus.Fields{ "node.id": nodeID, "method": "(*Server).signNodeCert", }).WithError(err).Errorf("transaction failed when setting state to FAILED") } return } // We were able to successfully sign the new CSR. Let's try to update the nodeStore for { err = s.store.Update(func(tx store.Tx) error { node.Certificate.Certificate = cert node.Certificate.Status = api.IssuanceStatus{ State: api.IssuanceStateIssued, } err := store.UpdateNode(tx, node) if err != nil { node = store.GetNode(tx, nodeID) if node == nil { err = fmt.Errorf("node %s does not exist", nodeID) } } return err }) if err == nil { log.G(ctx).WithFields(logrus.Fields{ "node.id": node.ID, "node.role": node.Certificate.Role, "method": "(*Server).signNodeCert", }).Debugf("certificate issued") break } if err == store.ErrSequenceConflict { continue } log.G(ctx).WithFields(logrus.Fields{ "node.id": nodeID, "method": "(*Server).signNodeCert", }).WithError(err).Errorf("transaction failed") return } }
func TestScheduler(t *testing.T) { ctx := context.Background() initialNodeSet := []*api.Node{ { ID: "id1", Spec: api.NodeSpec{ Annotations: api.Annotations{ Name: "name1", }, }, Status: api.NodeStatus{ State: api.NodeStatus_READY, }, }, { ID: "id2", Spec: api.NodeSpec{ Annotations: api.Annotations{ Name: "name2", }, }, Status: api.NodeStatus{ State: api.NodeStatus_READY, }, }, { ID: "id3", Spec: api.NodeSpec{ Annotations: api.Annotations{ Name: "name2", }, }, Status: api.NodeStatus{ State: api.NodeStatus_READY, }, }, } initialTaskSet := []*api.Task{ { ID: "id1", DesiredState: api.TaskStateRunning, ServiceAnnotations: api.Annotations{ Name: "name1", }, Status: api.TaskStatus{ State: api.TaskStateAssigned, }, NodeID: initialNodeSet[0].ID, }, { ID: "id2", DesiredState: api.TaskStateRunning, ServiceAnnotations: api.Annotations{ Name: "name2", }, Status: api.TaskStatus{ State: api.TaskStatePending, }, }, { ID: "id3", DesiredState: api.TaskStateRunning, ServiceAnnotations: api.Annotations{ Name: "name2", }, Status: api.TaskStatus{ State: api.TaskStatePending, }, }, } s := store.NewMemoryStore(nil) assert.NotNil(t, s) defer s.Close() err := s.Update(func(tx store.Tx) error { // Prepoulate nodes for _, n := range initialNodeSet { assert.NoError(t, store.CreateNode(tx, n)) } // Prepopulate tasks for _, task := range initialTaskSet { assert.NoError(t, store.CreateTask(tx, task)) } return nil }) assert.NoError(t, err) scheduler := New(s) watch, cancel := state.Watch(s.WatchQueue(), state.EventUpdateTask{}) defer cancel() go func() { assert.NoError(t, scheduler.Run(ctx)) }() defer scheduler.Stop() assignment1 := watchAssignment(t, watch) // must assign to id2 or id3 since id1 already has a task assert.Regexp(t, assignment1.NodeID, "(id2|id3)") assignment2 := watchAssignment(t, watch) // must assign to id2 or id3 since id1 already has a task if assignment1.NodeID == "id2" { assert.Equal(t, "id3", assignment2.NodeID) } else { assert.Equal(t, "id2", assignment2.NodeID) } err = s.Update(func(tx store.Tx) error { // Update each node to make sure this doesn't mess up the // scheduler's state. for _, n := range initialNodeSet { assert.NoError(t, store.UpdateNode(tx, n)) } return nil }) assert.NoError(t, err) err = s.Update(func(tx store.Tx) error { // Delete the task associated with node 1 so it's now the most lightly // loaded node. assert.NoError(t, store.DeleteTask(tx, "id1")) // Create a new task. It should get assigned to id1. t4 := &api.Task{ ID: "id4", DesiredState: api.TaskStateRunning, ServiceAnnotations: api.Annotations{ Name: "name4", }, Status: api.TaskStatus{ State: api.TaskStatePending, }, } assert.NoError(t, store.CreateTask(tx, t4)) return nil }) assert.NoError(t, err) assignment3 := watchAssignment(t, watch) assert.Equal(t, "id1", assignment3.NodeID) // Update a task to make it unassigned. It should get assigned by the // scheduler. err = s.Update(func(tx store.Tx) error { // Remove assignment from task id4. It should get assigned // to node id1. t4 := &api.Task{ ID: "id4", DesiredState: api.TaskStateRunning, ServiceAnnotations: api.Annotations{ Name: "name4", }, Status: api.TaskStatus{ State: api.TaskStatePending, }, } assert.NoError(t, store.UpdateTask(tx, t4)) return nil }) assert.NoError(t, err) assignment4 := watchAssignment(t, watch) assert.Equal(t, "id1", assignment4.NodeID) err = s.Update(func(tx store.Tx) error { // Create a ready node, then remove it. No tasks should ever // be assigned to it. node := &api.Node{ ID: "removednode", Spec: api.NodeSpec{ Annotations: api.Annotations{ Name: "removednode", }, }, Status: api.NodeStatus{ State: api.NodeStatus_DOWN, }, } assert.NoError(t, store.CreateNode(tx, node)) assert.NoError(t, store.DeleteNode(tx, node.ID)) // Create an unassigned task. task := &api.Task{ ID: "removednode", DesiredState: api.TaskStateRunning, ServiceAnnotations: api.Annotations{ Name: "removednode", }, Status: api.TaskStatus{ State: api.TaskStatePending, }, } assert.NoError(t, store.CreateTask(tx, task)) return nil }) assert.NoError(t, err) assignmentRemovedNode := watchAssignment(t, watch) assert.NotEqual(t, "removednode", assignmentRemovedNode.NodeID) err = s.Update(func(tx store.Tx) error { // Create a ready node. It should be used for the next // assignment. n4 := &api.Node{ ID: "id4", Spec: api.NodeSpec{ Annotations: api.Annotations{ Name: "name4", }, }, Status: api.NodeStatus{ State: api.NodeStatus_READY, }, } assert.NoError(t, store.CreateNode(tx, n4)) // Create an unassigned task. t5 := &api.Task{ ID: "id5", DesiredState: api.TaskStateRunning, ServiceAnnotations: api.Annotations{ Name: "name5", }, Status: api.TaskStatus{ State: api.TaskStatePending, }, } assert.NoError(t, store.CreateTask(tx, t5)) return nil }) assert.NoError(t, err) assignment5 := watchAssignment(t, watch) assert.Equal(t, "id4", assignment5.NodeID) err = s.Update(func(tx store.Tx) error { // Create a non-ready node. It should NOT be used for the next // assignment. n5 := &api.Node{ ID: "id5", Spec: api.NodeSpec{ Annotations: api.Annotations{ Name: "name5", }, }, Status: api.NodeStatus{ State: api.NodeStatus_DOWN, }, } assert.NoError(t, store.CreateNode(tx, n5)) // Create an unassigned task. t6 := &api.Task{ ID: "id6", DesiredState: api.TaskStateRunning, ServiceAnnotations: api.Annotations{ Name: "name6", }, Status: api.TaskStatus{ State: api.TaskStatePending, }, } assert.NoError(t, store.CreateTask(tx, t6)) return nil }) assert.NoError(t, err) assignment6 := watchAssignment(t, watch) assert.NotEqual(t, "id5", assignment6.NodeID) err = s.Update(func(tx store.Tx) error { // Update node id5 to put it in the READY state. n5 := &api.Node{ ID: "id5", Spec: api.NodeSpec{ Annotations: api.Annotations{ Name: "name5", }, }, Status: api.NodeStatus{ State: api.NodeStatus_READY, }, } assert.NoError(t, store.UpdateNode(tx, n5)) // Create an unassigned task. Should be assigned to the // now-ready node. t7 := &api.Task{ ID: "id7", DesiredState: api.TaskStateRunning, ServiceAnnotations: api.Annotations{ Name: "name7", }, Status: api.TaskStatus{ State: api.TaskStatePending, }, } assert.NoError(t, store.CreateTask(tx, t7)) return nil }) assert.NoError(t, err) assignment7 := watchAssignment(t, watch) assert.Equal(t, "id5", assignment7.NodeID) err = s.Update(func(tx store.Tx) error { // Create a ready node, then immediately take it down. The next // unassigned task should NOT be assigned to it. n6 := &api.Node{ ID: "id6", Spec: api.NodeSpec{ Annotations: api.Annotations{ Name: "name6", }, }, Status: api.NodeStatus{ State: api.NodeStatus_READY, }, } assert.NoError(t, store.CreateNode(tx, n6)) n6.Status.State = api.NodeStatus_DOWN assert.NoError(t, store.UpdateNode(tx, n6)) // Create an unassigned task. t8 := &api.Task{ ID: "id8", DesiredState: api.TaskStateRunning, ServiceAnnotations: api.Annotations{ Name: "name8", }, Status: api.TaskStatus{ State: api.TaskStatePending, }, } assert.NoError(t, store.CreateTask(tx, t8)) return nil }) assert.NoError(t, err) assignment8 := watchAssignment(t, watch) assert.NotEqual(t, "id6", assignment8.NodeID) }
// signNodeCert does the bulk of the work for signing a certificate func (s *Server) signNodeCert(ctx context.Context, node *api.Node) { if !s.securityConfig.RootCA().CanSign() { log.G(ctx).WithFields(logrus.Fields{ "node.id": node.ID, "method": "(*Server).signNodeCert", }).Errorf("no valid signer found") return } node = node.Copy() nodeID := node.ID // Convert the role from proto format role, err := ParseRole(node.Certificate.Role) if err != nil { log.G(ctx).WithFields(logrus.Fields{ "node.id": node.ID, "method": "(*Server).signNodeCert", }).WithError(err).Errorf("failed to parse role") return } // Attempt to sign the CSR cert, err := s.securityConfig.RootCA().ParseValidateAndSignCSR(node.Certificate.CSR, node.Certificate.CN, role, s.securityConfig.ClientTLSCreds.Organization()) if err != nil { log.G(ctx).WithFields(logrus.Fields{ "node.id": node.ID, "method": "(*Server).signNodeCert", }).WithError(err).Errorf("failed to sign CSR") // If this error is due the lack of signer, maybe some other // manager in the future will pick it up. Return without // changing the state of the certificate. if err == ErrNoValidSigner { return } // If the current state is already Failed, no need to change it if node.Certificate.Status.State == api.IssuanceStateFailed { return } // We failed to sign this CSR, change the state to FAILED err = s.store.Update(func(tx store.Tx) error { node := store.GetNode(tx, nodeID) if node == nil { return fmt.Errorf("node %s not found", nodeID) } node.Certificate.Status = api.IssuanceStatus{ State: api.IssuanceStateFailed, Err: err.Error(), } return store.UpdateNode(tx, node) }) if err != nil { log.G(ctx).WithFields(logrus.Fields{ "node.id": nodeID, "method": "(*Server).signNodeCert", }).WithError(err).Errorf("transaction failed when setting state to FAILED") } return } // We were able to successfully sign the new CSR. Let's try to update the nodeStore for { err = s.store.Update(func(tx store.Tx) error { // Remote nodes are expecting a full certificate chain, not just a signed certificate node.Certificate.Certificate = append(cert, s.securityConfig.RootCA().Cert...) node.Certificate.Status = api.IssuanceStatus{ State: api.IssuanceStateIssued, } err := store.UpdateNode(tx, node) if err != nil { node = store.GetNode(tx, nodeID) if node == nil { err = fmt.Errorf("node %s does not exist", nodeID) } } return err }) if err == nil { log.G(ctx).WithFields(logrus.Fields{ "node.id": node.ID, "node.role": node.Certificate.Role, "method": "(*Server).signNodeCert", }).Debugf("certificate issued") break } if err == store.ErrSequenceConflict { continue } log.G(ctx).WithFields(logrus.Fields{ "node.id": nodeID, "method": "(*Server).signNodeCert", }).WithError(err).Errorf("transaction failed") return } }