// reconcileRole drives a node's observed Role toward its Spec.DesiredRole.
// Promotions are applied immediately. Demotions of managers are only applied
// after the node has been removed from the raft memberlist (guarded by a
// quorum check), so the order raft-removal-then-role-flip is load-bearing.
// Nodes that are successfully reconciled (or need no work) are removed from
// rm.pending; nodes that cannot be reconciled yet are left pending for retry.
func (rm *roleManager) reconcileRole(node *api.Node) {
	if node.Role == node.Spec.DesiredRole {
		// Nothing to do.
		delete(rm.pending, node.ID)
		return
	}

	// Promotion can proceed right away.
	if node.Spec.DesiredRole == api.NodeRoleManager && node.Role == api.NodeRoleWorker {
		err := rm.store.Update(func(tx store.Tx) error {
			updatedNode := store.GetNode(tx, node.ID)
			// Re-check inside the transaction: bail out silently if the node
			// vanished or its role/desired-role changed since we observed it.
			if updatedNode == nil || updatedNode.Spec.DesiredRole != node.Spec.DesiredRole || updatedNode.Role != node.Role {
				return nil
			}
			updatedNode.Role = api.NodeRoleManager
			return store.UpdateNode(tx, updatedNode)
		})
		if err != nil {
			log.L.WithError(err).Errorf("failed to promote node %s", node.ID)
		} else {
			delete(rm.pending, node.ID)
		}
	} else if node.Spec.DesiredRole == api.NodeRoleWorker && node.Role == api.NodeRoleManager {
		// Check for node in memberlist
		member := rm.raft.GetMemberByNodeID(node.ID)
		if member != nil {
			// Quorum safeguard: refuse the demotion if removing this raft
			// member would drop the cluster below quorum.
			if !rm.raft.CanRemoveMember(member.RaftID) {
				// TODO(aaronl): Retry later
				log.L.Debugf("can't demote node %s at this time: removing member from raft would result in a loss of quorum", node.ID)
				return
			}
			// Bound the raft removal; rmCancel runs at function return, which
			// is fine because no further raft calls follow in this branch.
			rmCtx, rmCancel := context.WithTimeout(rm.ctx, 5*time.Second)
			defer rmCancel()
			if err := rm.raft.RemoveMember(rmCtx, member.RaftID); err != nil {
				// TODO(aaronl): Retry later
				log.L.WithError(err).Debugf("can't demote node %s at this time", node.ID)
				return
			}
		}
		err := rm.store.Update(func(tx store.Tx) error {
			updatedNode := store.GetNode(tx, node.ID)
			// Same re-check as the promotion path.
			if updatedNode == nil || updatedNode.Spec.DesiredRole != node.Spec.DesiredRole || updatedNode.Role != node.Role {
				return nil
			}
			updatedNode.Role = api.NodeRoleWorker
			return store.UpdateNode(tx, updatedNode)
		})
		if err != nil {
			log.L.WithError(err).Errorf("failed to demote node %s", node.ID)
		} else {
			delete(rm.pending, node.ID)
		}
	}
}
// CheckValuesOnNodes checks that all the nodes in the cluster have the same // replicated data, generally used to check if a node can catch up with the logs // correctly func CheckValuesOnNodes(t *testing.T, clockSource *fakeclock.FakeClock, checkNodes map[uint64]*TestNode, ids []string, values []*api.Node) { iteration := 0 for checkNodeID, node := range checkNodes { assert.NoError(t, PollFunc(clockSource, func() error { var err error node.MemoryStore().View(func(tx store.ReadTx) { var allNodes []*api.Node allNodes, err = store.FindNodes(tx, store.All) if err != nil { return } for i, id := range ids { n := store.GetNode(tx, id) if n == nil { err = fmt.Errorf("node %s not found on %d (iteration %d)", id, checkNodeID, iteration) return } if !reflect.DeepEqual(values[i], n) { err = fmt.Errorf("node %s did not match expected value on %d (iteration %d)", id, checkNodeID, iteration) return } } if len(allNodes) != len(ids) { err = fmt.Errorf("expected %d nodes, got %d (iteration %d)", len(ids), len(allNodes), iteration) return } }) return err })) iteration++ } }
// RemoveNode removes a Node referenced by NodeID with the given NodeSpec. // - Returns NotFound if the Node is not found. // - Returns FailedPrecondition if the Node has manager role (and is part of the memberlist) or is not shut down. // - Returns InvalidArgument if NodeID or NodeVersion is not valid. // - Returns an error if the delete fails. func (s *Server) RemoveNode(ctx context.Context, request *api.RemoveNodeRequest) (*api.RemoveNodeResponse, error) { if request.NodeID == "" { return nil, grpc.Errorf(codes.InvalidArgument, errInvalidArgument.Error()) } err := s.store.Update(func(tx store.Tx) error { node := store.GetNode(tx, request.NodeID) if node == nil { return grpc.Errorf(codes.NotFound, "node %s not found", request.NodeID) } if node.Spec.Role == api.NodeRoleManager { if s.raft == nil { return grpc.Errorf(codes.FailedPrecondition, "node %s is a manager but cannot access node information from the raft memberlist", request.NodeID) } if member := s.raft.GetMemberByNodeID(request.NodeID); member != nil { return grpc.Errorf(codes.FailedPrecondition, "node %s is a cluster manager and is a member of the raft cluster. It must be demoted to worker before removal", request.NodeID) } } if !request.Force && node.Status.State == api.NodeStatus_READY { return grpc.Errorf(codes.FailedPrecondition, "node %s is not down and can't be removed", request.NodeID) } return store.DeleteNode(tx, request.NodeID) }) if err != nil { return nil, err } return &api.RemoveNodeResponse{}, nil }
// GetNode returns a Node given a NodeID. // - Returns `InvalidArgument` if NodeID is not provided. // - Returns `NotFound` if the Node is not found. func (s *Server) GetNode(ctx context.Context, request *api.GetNodeRequest) (*api.GetNodeResponse, error) { if request.NodeID == "" { return nil, grpc.Errorf(codes.InvalidArgument, errInvalidArgument.Error()) } var node *api.Node s.store.View(func(tx store.ReadTx) { node = store.GetNode(tx, request.NodeID) }) if node == nil { return nil, grpc.Errorf(codes.NotFound, "node %s not found", request.NodeID) } if s.raft != nil { memberlist := s.raft.GetMemberlist() for _, member := range memberlist { if member.NodeID == node.ID { node.ManagerStatus = &api.ManagerStatus{ RaftID: member.RaftID, Addr: member.Addr, Leader: member.Status.Leader, Reachability: member.Status.Reachability, } break } } } return &api.GetNodeResponse{ Node: node, }, nil }
// TestRenewTLSConfigWithNoNode verifies that renewing a TLS config for a node
// that has been deleted from the store surfaces a "not found" error on the
// renewal channel rather than issuing a new certificate.
func TestRenewTLSConfigWithNoNode(t *testing.T) {
	t.Parallel()

	tc := testutils.NewTestCA(t)
	defer tc.Stop()

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	// Get a new nodeConfig with a TLS cert that has the default Cert duration
	nodeConfig, err := tc.WriteNewNodeConfig(ca.ManagerRole)
	assert.NoError(t, err)

	// Create a new RootCA, and change the policy to issue 6 minute certificates.
	// Because of the default backdate of 5 minutes, this issues certificates
	// valid for 1 minute.
	newRootCA, err := ca.NewRootCA(tc.RootCA.Cert, tc.RootCA.Key, ca.DefaultNodeCertExpiration)
	assert.NoError(t, err)
	newRootCA.Signer.SetPolicy(&cfconfig.Signing{
		Default: &cfconfig.SigningProfile{
			Usage:  []string{"signing", "key encipherment", "server auth", "client auth"},
			Expiry: 6 * time.Minute,
		},
	})

	// Create a new CSR and overwrite the key on disk
	csr, key, err := ca.GenerateNewCSR()
	assert.NoError(t, err)

	// Issue a new certificate with the same details as the current config, but with 1 min expiration time
	c := nodeConfig.ClientTLSCreds
	signedCert, err := newRootCA.ParseValidateAndSignCSR(csr, c.NodeID(), c.Role(), c.Organization())
	assert.NoError(t, err)
	assert.NotNil(t, signedCert)

	// Overwrite the certificate on disk with one that expires in 1 minute
	err = ioutils.AtomicWriteFile(tc.Paths.Node.Cert, signedCert, 0644)
	assert.NoError(t, err)
	err = ioutils.AtomicWriteFile(tc.Paths.Node.Key, key, 0600)
	assert.NoError(t, err)

	// Delete the node from the backend store so the renewal has no node to
	// renew against.
	err = tc.MemoryStore.Update(func(tx store.Tx) error {
		node := store.GetNode(tx, nodeConfig.ClientTLSCreds.NodeID())
		assert.NotNil(t, node)
		return store.DeleteNode(tx, nodeConfig.ClientTLSCreds.NodeID())
	})
	assert.NoError(t, err)

	renew := make(chan struct{})
	updates := ca.RenewTLSConfig(ctx, nodeConfig, tc.Remotes, renew)
	select {
	case <-time.After(10 * time.Second):
		assert.Fail(t, "TestRenewTLSConfig timed-out")
	case certUpdate := <-updates:
		// The renewal must fail because the node no longer exists.
		assert.Error(t, certUpdate.Err)
		assert.Contains(t, certUpdate.Err.Error(), "not found when attempting to renew certificate")
	}
}
// RemoveNode updates a Node referenced by NodeID with the given NodeSpec. // - Returns NotFound if the Node is not found. // - Returns FailedPrecondition if the Node has manager role or not shut down. // - Returns InvalidArgument if NodeID or NodeVersion is not valid. // - Returns an error if the delete fails. func (s *Server) RemoveNode(ctx context.Context, request *api.RemoveNodeRequest) (*api.RemoveNodeResponse, error) { if request.NodeID == "" { return nil, grpc.Errorf(codes.InvalidArgument, errInvalidArgument.Error()) } if s.raft != nil { memberlist := s.raft.GetMemberlist() for _, member := range memberlist { if member.NodeID == request.NodeID { return nil, grpc.Errorf(codes.FailedPrecondition, "node %s is a cluster manager and is part of the quorum. It must be demoted to worker before removal", request.NodeID) } } } err := s.store.Update(func(tx store.Tx) error { node := store.GetNode(tx, request.NodeID) if node == nil { return grpc.Errorf(codes.NotFound, "node %s not found", request.NodeID) } if node.Spec.Role == api.NodeRoleManager { return grpc.Errorf(codes.FailedPrecondition, "node %s role is set to manager. It should be demoted to worker for safe removal", request.NodeID) } if node.Status.State == api.NodeStatus_READY { return grpc.Errorf(codes.FailedPrecondition, "node %s is not down and can't be removed", request.NodeID) } return store.DeleteNode(tx, request.NodeID) }) if err != nil { return nil, err } return &api.RemoveNodeResponse{}, nil }
func (r *Orchestrator) handleTaskChange(ctx context.Context, t *api.Task) { // If we already set the desired state past TaskStateRunning, there is no // further action necessary. if t.DesiredState > api.TaskStateRunning { return } var ( n *api.Node service *api.Service ) r.store.View(func(tx store.ReadTx) { if t.NodeID != "" { n = store.GetNode(tx, t.NodeID) } if t.ServiceID != "" { service = store.GetService(tx, t.ServiceID) } }) if !orchestrator.IsReplicatedService(service) { return } if t.Status.State > api.TaskStateRunning || (t.NodeID != "" && invalidNode(n)) { r.restartTasks[t.ID] = struct{}{} } }
// GetNode returns a Node given a NodeID. // - Returns `InvalidArgument` if NodeID is not provided. // - Returns `NotFound` if the Node is not found. func (s *Server) GetNode(ctx context.Context, request *api.GetNodeRequest) (*api.GetNodeResponse, error) { if request.NodeID == "" { return nil, grpc.Errorf(codes.InvalidArgument, errInvalidArgument.Error()) } var node *api.Node s.store.View(func(tx store.ReadTx) { node = store.GetNode(tx, request.NodeID) }) if node == nil { return nil, grpc.Errorf(codes.NotFound, "node %s not found", request.NodeID) } if s.raft != nil { memberlist := s.raft.GetMemberlist() raftID, err := identity.ParseNodeID(request.NodeID) if err == nil && memberlist[raftID] != nil { node.ManagerStatus = &api.ManagerStatus{Raft: *memberlist[raftID]} } } return &api.GetNodeResponse{ Node: node, }, nil }
func (a *Allocator) allocateNode(ctx context.Context, nc *networkContext, node *api.Node) error { if err := nc.nwkAllocator.AllocateNode(node); err != nil { return err } if err := a.store.Update(func(tx store.Tx) error { for { err := store.UpdateNode(tx, node) if err != nil && err != store.ErrSequenceConflict { return fmt.Errorf("failed updating state in store transaction for node %s: %v", node.ID, err) } if err == store.ErrSequenceConflict { storeNode := store.GetNode(tx, node.ID) storeNode.Attachment = node.Attachment.Copy() node = storeNode continue } break } return nil }); err != nil { if err := nc.nwkAllocator.DeallocateNode(node); err != nil { log.G(ctx).WithError(err).Errorf("failed rolling back allocation of node %s: %v", node.ID, err) } return err } return nil }
// testRaftRestartCluster stops every node of a raft cluster, restarts them
// (optionally staggered by one clock tick per node), and verifies that both a
// value proposed before the shutdown and one proposed after the restart are
// replicated to every member's memory store.
func testRaftRestartCluster(t *testing.T, stagger bool) {
	nodes, clockSource := raftutils.NewRaftCluster(t, tc)
	defer raftutils.TeardownCluster(t, nodes)

	// Propose a value
	values := make([]*api.Node, 2)
	var err error
	values[0], err = raftutils.ProposeValue(t, nodes[1], "id1")
	assert.NoError(t, err, "failed to propose value")

	// Stop all nodes
	for _, node := range nodes {
		node.Server.Stop()
		node.Shutdown()
	}

	raftutils.AdvanceTicks(clockSource, 5)

	// Restart all nodes
	i := 0
	for k, node := range nodes {
		// When staggering, insert a tick between restarts (but not before
		// the first one).
		if stagger && i != 0 {
			raftutils.AdvanceTicks(clockSource, 1)
		}
		nodes[k] = raftutils.RestartNode(t, clockSource, node, false)
		i++
	}
	raftutils.WaitForCluster(t, clockSource, nodes)

	// Propose another value
	values[1], err = raftutils.ProposeValue(t, raftutils.Leader(nodes), "id2")
	assert.NoError(t, err, "failed to propose value")

	// Every member must eventually converge on exactly the two proposed
	// values.
	for _, node := range nodes {
		assert.NoError(t, raftutils.PollFunc(clockSource, func() error {
			var err error
			node.MemoryStore().View(func(tx store.ReadTx) {
				var allNodes []*api.Node
				allNodes, err = store.FindNodes(tx, store.All)
				if err != nil {
					return
				}
				if len(allNodes) != 2 {
					err = fmt.Errorf("expected 2 nodes, got %d", len(allNodes))
					return
				}
				for i, nodeID := range []string{"id1", "id2"} {
					n := store.GetNode(tx, nodeID)
					if !reflect.DeepEqual(n, values[i]) {
						err = fmt.Errorf("node %s did not match expected value", nodeID)
						return
					}
				}
			})
			return err
		}))
	}
}
// markNodeNotReady sets the node state to some state other than READY
//
// It records the node in the down-nodes store (so tasks assigned to it can be
// tracked and eventually orphaned), queues the status change into the batched
// nodeUpdates map, and removes the node from the live-nodes store. Returns an
// error if the dispatcher is not running, the node cannot be found, or the
// node is absent from local storage.
func (d *Dispatcher) markNodeNotReady(id string, state api.NodeStatus_State, message string) error {
	if err := d.isRunningLocked(); err != nil {
		return err
	}

	// Node is down. Add it to down nodes so that we can keep
	// track of tasks assigned to the node.
	var (
		node *api.Node
		err  error
	)
	d.store.View(func(readTx store.ReadTx) {
		node = store.GetNode(readTx, id)
		if node == nil {
			err = fmt.Errorf("could not find node %s while trying to add to down nodes store", id)
		}
	})
	if err != nil {
		return err
	}

	// When the down-node entry expires, move all its tasks to ORPHANED and
	// forget the node.
	expireFunc := func() {
		if err := d.moveTasksToOrphaned(id); err != nil {
			log.G(context.TODO()).WithError(err).Error(`failed to move all tasks to "ORPHANED" state`)
		}

		d.downNodes.Delete(id)
	}

	d.downNodes.Add(node, expireFunc)

	status := &api.NodeStatus{
		State:   state,
		Message: message,
	}

	d.nodeUpdatesLock.Lock()
	// pluck the description out of nodeUpdates. this protects against a case
	// where a node is marked ready and a description is added, but then the
	// node is immediately marked not ready. this preserves that description
	d.nodeUpdates[id] = nodeUpdate{status: status, description: d.nodeUpdates[id].description}
	numUpdates := len(d.nodeUpdates)
	d.nodeUpdatesLock.Unlock()

	// Flush the batch early once it reaches the size threshold; abort the
	// send if the dispatcher is shutting down.
	if numUpdates >= maxBatchItems {
		select {
		case d.processUpdatesTrigger <- struct{}{}:
		case <-d.ctx.Done():
		}
	}

	if rn := d.nodes.Delete(id); rn == nil {
		return errors.Errorf("node %s is not found in local storage", id)
	}

	return nil
}
// UpdateNode updates a Node referenced by NodeID with the given NodeSpec.
// - Returns `NotFound` if the Node is not found.
// - Returns `InvalidArgument` if the NodeSpec is malformed.
// - Returns an error if the update fails.
func (s *Server) UpdateNode(ctx context.Context, request *api.UpdateNodeRequest) (*api.UpdateNodeResponse, error) {
	if request.NodeID == "" || request.NodeVersion == nil {
		return nil, grpc.Errorf(codes.InvalidArgument, errInvalidArgument.Error())
	}
	if err := validateNodeSpec(request.Spec); err != nil {
		return nil, err
	}

	var (
		node   *api.Node
		demote bool
	)
	err := s.store.Update(func(tx store.Tx) error {
		node = store.GetNode(tx, request.NodeID)
		if node == nil {
			// NotFound is reported after the transaction, via the nil check
			// on node below.
			return nil
		}

		// Demotion sanity checks.
		if node.Spec.Role == api.NodeRoleManager && request.Spec.Role == api.NodeRoleWorker {
			demote = true
			// Refuse to demote the only remaining manager.
			managers, err := store.FindNodes(tx, store.ByRole(api.NodeRoleManager))
			if err != nil {
				return grpc.Errorf(codes.Internal, "internal store error: %v", err)
			}
			if len(managers) == 1 && managers[0].ID == node.ID {
				return grpc.Errorf(codes.FailedPrecondition, "attempting to demote the last manager of the swarm")
			}
		}

		node.Meta.Version = *request.NodeVersion
		node.Spec = *request.Spec.Copy()
		return store.UpdateNode(tx, node)
	})
	if err != nil {
		return nil, err
	}
	if node == nil {
		return nil, grpc.Errorf(codes.NotFound, "node %s not found", request.NodeID)
	}

	// NOTE(review): the raft member removal happens after the role change has
	// already been committed, outside the store transaction. If RemoveMember
	// fails here the node is left demoted in the store but still in the raft
	// memberlist — confirm whether a retry/reconcile path covers this.
	if demote && s.raft != nil {
		memberlist := s.raft.GetMemberlist()
		for raftID, member := range memberlist {
			if member.NodeID == request.NodeID {
				if err := s.raft.RemoveMember(ctx, raftID); err != nil {
					return nil, err
				}
				break
			}
		}
	}

	return &api.UpdateNodeResponse{
		Node: node,
	}, nil
}
// UpdateNode updates a Node referenced by NodeID with the given NodeSpec.
// - Returns `NotFound` if the Node is not found.
// - Returns `InvalidArgument` if the NodeSpec is malformed.
// - Returns an error if the update fails.
func (s *Server) UpdateNode(ctx context.Context, request *api.UpdateNodeRequest) (*api.UpdateNodeResponse, error) {
	if request.NodeID == "" || request.NodeVersion == nil {
		return nil, grpc.Errorf(codes.InvalidArgument, errInvalidArgument.Error())
	}
	if err := validateNodeSpec(request.Spec); err != nil {
		return nil, err
	}

	var (
		node   *api.Node
		member *membership.Member
	)
	err := s.store.Update(func(tx store.Tx) error {
		node = store.GetNode(tx, request.NodeID)
		if node == nil {
			return grpc.Errorf(codes.NotFound, "node %s not found", request.NodeID)
		}

		// Demotion sanity checks.
		if node.Spec.DesiredRole == api.NodeRoleManager && request.Spec.DesiredRole == api.NodeRoleWorker {
			// Check for manager entries in Store.
			managers, err := store.FindNodes(tx, store.ByRole(api.NodeRoleManager))
			if err != nil {
				return grpc.Errorf(codes.Internal, "internal store error: %v", err)
			}
			if len(managers) == 1 && managers[0].ID == node.ID {
				return grpc.Errorf(codes.FailedPrecondition, "attempting to demote the last manager of the swarm")
			}

			// Check for node in memberlist
			// NOTE(review): s.raft is used here without a nil guard, unlike
			// sibling handlers that check s.raft != nil — confirm s.raft is
			// always set by the time this server variant handles requests.
			if member = s.raft.GetMemberByNodeID(request.NodeID); member == nil {
				return grpc.Errorf(codes.NotFound, "can't find manager in raft memberlist")
			}

			// Quorum safeguard
			if !s.raft.CanRemoveMember(member.RaftID) {
				return grpc.Errorf(codes.FailedPrecondition, "can't remove member from the raft: this would result in a loss of quorum")
			}
		}

		node.Meta.Version = *request.NodeVersion
		node.Spec = *request.Spec.Copy()
		return store.UpdateNode(tx, node)
	})
	if err != nil {
		return nil, err
	}

	return &api.UpdateNodeResponse{
		Node: node,
	}, nil
}
// markNodesUnknown moves every node that is not already DOWN into the UNKNOWN
// state. It is used after a leadership change, when the new leader cannot yet
// trust the liveness of previously registered nodes. Each node gets a
// heartbeat-expiration callback that marks it DOWN if it never re-registers.
func (d *Dispatcher) markNodesUnknown(ctx context.Context) error {
	log := log.G(ctx).WithField("method", "(*Dispatcher).markNodesUnknown")
	var nodes []*api.Node
	var err error
	d.store.View(func(tx store.ReadTx) {
		nodes, err = store.FindNodes(tx, store.All)
	})
	if err != nil {
		return fmt.Errorf("failed to get list of nodes: %v", err)
	}
	_, err = d.store.Batch(func(batch *store.Batch) error {
		for _, n := range nodes {
			err := batch.Update(func(tx store.Tx) error {
				// check if node is still here
				node := store.GetNode(tx, n.ID)
				if node == nil {
					return nil
				}
				// do not try to resurrect down nodes
				if node.Status.State == api.NodeStatus_DOWN {
					return nil
				}
				node.Status = api.NodeStatus{
					State:   api.NodeStatus_UNKNOWN,
					Message: `Node moved to "unknown" state due to leadership change in cluster`,
				}
				// Copy the ID so the expiration closure does not capture the
				// loop/transaction variable.
				nodeID := node.ID

				expireFunc := func() {
					log := log.WithField("node", nodeID)
					nodeStatus := api.NodeStatus{State: api.NodeStatus_DOWN, Message: `heartbeat failure for node in "unknown" state`}
					log.Debugf("heartbeat expiration for unknown node")
					if err := d.nodeRemove(nodeID, nodeStatus); err != nil {
						log.WithError(err).Errorf(`failed deregistering node after heartbeat expiration for node in "unknown" state`)
					}
				}
				if err := d.nodes.AddUnknown(node, expireFunc); err != nil {
					return fmt.Errorf(`adding node in "unknown" state to node store failed: %v`, err)
				}
				if err := store.UpdateNode(tx, node); err != nil {
					return fmt.Errorf("update failed %v", err)
				}
				return nil
			})
			if err != nil {
				// A failure on one node does not abort the batch; log and
				// continue with the rest.
				log.WithField("node", n.ID).WithError(err).Errorf(`failed to move node to "unknown" state`)
			}
		}
		return nil
	})
	return err
}
// register is used for registration of node with particular dispatcher. func (d *Dispatcher) register(ctx context.Context, nodeID string, description *api.NodeDescription) (string, error) { // prevent register until we're ready to accept it if err := d.isRunningLocked(); err != nil { return "", err } if err := d.nodes.CheckRateLimit(nodeID); err != nil { return "", err } // create or update node in store // TODO(stevvooe): Validate node specification. var node *api.Node err := d.store.Update(func(tx store.Tx) error { node = store.GetNode(tx, nodeID) if node == nil { return ErrNodeNotFound } node.Description = description node.Status = api.NodeStatus{ State: api.NodeStatus_READY, } return store.UpdateNode(tx, node) }) if err != nil { return "", err } expireFunc := func() { nodeStatus := api.NodeStatus{State: api.NodeStatus_DOWN, Message: "heartbeat failure"} log.G(ctx).Debugf("heartbeat expiration") if err := d.nodeRemove(nodeID, nodeStatus); err != nil { log.G(ctx).WithError(err).Errorf("failed deregistering node after heartbeat expiration") } } rn := d.nodes.Add(node, expireFunc) // NOTE(stevvooe): We need be a little careful with re-registration. The // current implementation just matches the node id and then gives away the // sessionID. If we ever want to use sessionID as a secret, which we may // want to, this is giving away the keys to the kitchen. // // The right behavior is going to be informed by identity. Basically, each // time a node registers, we invalidate the session and issue a new // session, once identity is proven. This will cause misbehaved agents to // be kicked when multiple connections are made. return rn.SessionID, nil }
// issueRenewCertificate receives a nodeID and a CSR and modifies the node's certificate entry with the new CSR // and changes the state to RENEW, so it can be picked up and signed by the signing reconciliation loop func (s *Server) issueRenewCertificate(ctx context.Context, nodeID string, csr []byte) (*api.IssueNodeCertificateResponse, error) { var ( cert api.Certificate node *api.Node ) err := s.store.Update(func(tx store.Tx) error { // Attempt to retrieve the node with nodeID node = store.GetNode(tx, nodeID) if node == nil { log.G(ctx).WithFields(logrus.Fields{ "node.id": nodeID, "method": "issueRenewCertificate", }).Warnf("node does not exist") // If this node doesn't exist, we shouldn't be renewing a certificate for it return grpc.Errorf(codes.NotFound, "node %s not found when attempting to renew certificate", nodeID) } // Create a new Certificate entry for this node with the new CSR and a RENEW state cert = api.Certificate{ CSR: csr, CN: node.ID, Role: node.Spec.Role, Status: api.IssuanceStatus{ State: api.IssuanceStateRenew, }, } node.Certificate = cert return store.UpdateNode(tx, node) }) if err != nil { return nil, err } log.G(ctx).WithFields(logrus.Fields{ "cert.cn": cert.CN, "cert.role": cert.Role, "method": "issueRenewCertificate", }).Debugf("node certificate updated") return &api.IssueNodeCertificateResponse{ NodeID: nodeID, NodeMembership: node.Spec.Membership, }, nil }
// TestForceRotationIsNoop verifies that manually setting a node certificate's
// status to IssuanceStateRotate is ignored by the server: the stored
// certificate must remain unchanged and the state must stay ROTATE.
func TestForceRotationIsNoop(t *testing.T) {
	tc := testutils.NewTestCA(t)
	defer tc.Stop()

	// Get a new Certificate issued
	csr, _, err := ca.GenerateNewCSR()
	assert.NoError(t, err)

	issueRequest := &api.IssueNodeCertificateRequest{CSR: csr, Token: tc.WorkerToken}
	issueResponse, err := tc.NodeCAClients[0].IssueNodeCertificate(context.Background(), issueRequest)
	assert.NoError(t, err)
	assert.NotNil(t, issueResponse.NodeID)
	assert.Equal(t, api.NodeMembershipAccepted, issueResponse.NodeMembership)

	// Check that the Certificate is successfully issued
	statusRequest := &api.NodeCertificateStatusRequest{NodeID: issueResponse.NodeID}
	statusResponse, err := tc.NodeCAClients[0].NodeCertificateStatus(context.Background(), statusRequest)
	require.NoError(t, err)
	assert.Equal(t, api.IssuanceStateIssued, statusResponse.Status.State)
	assert.NotNil(t, statusResponse.Certificate.Certificate)
	assert.Equal(t, api.NodeRoleWorker, statusResponse.Certificate.Role)

	// Update the certificate status to IssuanceStateRotate which should be a server-side noop
	err = tc.MemoryStore.Update(func(tx store.Tx) error {
		// Attempt to retrieve the node with nodeID
		node := store.GetNode(tx, issueResponse.NodeID)
		assert.NotNil(t, node)

		node.Certificate.Status.State = api.IssuanceStateRotate
		return store.UpdateNode(tx, node)
	})
	assert.NoError(t, err)

	// Wait a bit and check that the certificate hasn't changed/been reissued
	time.Sleep(250 * time.Millisecond)

	statusNewResponse, err := tc.NodeCAClients[0].NodeCertificateStatus(context.Background(), statusRequest)
	require.NoError(t, err)
	// Same certificate bytes, state still ROTATE, role untouched.
	assert.Equal(t, statusResponse.Certificate.Certificate, statusNewResponse.Certificate.Certificate)
	assert.Equal(t, api.IssuanceStateRotate, statusNewResponse.Certificate.Status.State)
	assert.Equal(t, api.NodeRoleWorker, statusNewResponse.Certificate.Role)
}
func (a *Allocator) commitAllocatedNode(ctx context.Context, batch *store.Batch, node *api.Node) error { if err := batch.Update(func(tx store.Tx) error { err := store.UpdateNode(tx, node) if err == store.ErrSequenceConflict { storeNode := store.GetNode(tx, node.ID) storeNode.Attachment = node.Attachment.Copy() err = store.UpdateNode(tx, storeNode) } return errors.Wrapf(err, "failed updating state in store transaction for node %s", node.ID) }); err != nil { if err := a.netCtx.nwkAllocator.DeallocateNode(node); err != nil { log.G(ctx).WithError(err).Errorf("failed rolling back allocation of node %s", node.ID) } return err } return nil }
func (d *Dispatcher) nodeRemove(id string, status api.NodeStatus) error { if err := d.isRunningLocked(); err != nil { return err } // TODO(aaronl): Is it worth batching node removals? err := d.store.Update(func(tx store.Tx) error { node := store.GetNode(tx, id) if node == nil { return errors.New("node not found") } node.Status = status return store.UpdateNode(tx, node) }) if err != nil { return fmt.Errorf("failed to update node %s status to down: %v", id, err) } if rn := d.nodes.Delete(id); rn == nil { return fmt.Errorf("node %s is not found in local storage", id) } return nil }
// signNodeCert does the bulk of the work for signing a certificate
//
// It tries the external CA first, falling back to the local root CA. On a
// signing failure the node's certificate state is set to FAILED (unless no
// signer is available, in which case the state is left untouched for another
// manager to pick up). On success the issued certificate is committed to the
// store, retrying on sequence conflicts.
func (s *Server) signNodeCert(ctx context.Context, node *api.Node) {
	rootCA := s.securityConfig.RootCA()
	externalCA := s.securityConfig.externalCA

	// Work on a copy so we never mutate the caller's node.
	node = node.Copy()
	nodeID := node.ID
	// Convert the role from proto format
	role, err := ParseRole(node.Certificate.Role)
	if err != nil {
		log.G(ctx).WithFields(logrus.Fields{
			"node.id": node.ID,
			"method":  "(*Server).signNodeCert",
		}).WithError(err).Errorf("failed to parse role")
		return
	}

	// Attempt to sign the CSR
	var (
		rawCSR = node.Certificate.CSR
		cn     = node.Certificate.CN
		ou     = role
		org    = s.securityConfig.ClientTLSCreds.Organization()
	)

	// Try using the external CA first.
	cert, err := externalCA.Sign(PrepareCSR(rawCSR, cn, ou, org))
	if err == ErrNoExternalCAURLs {
		// No external CA servers configured. Try using the local CA.
		cert, err = rootCA.ParseValidateAndSignCSR(rawCSR, cn, ou, org)
	}

	if err != nil {
		log.G(ctx).WithFields(logrus.Fields{
			"node.id": node.ID,
			"method":  "(*Server).signNodeCert",
		}).WithError(err).Errorf("failed to sign CSR")
		// If this error is due the lack of signer, maybe some other
		// manager in the future will pick it up. Return without
		// changing the state of the certificate.
		if err == ErrNoValidSigner {
			return
		}
		// If the current state is already Failed, no need to change it
		if node.Certificate.Status.State == api.IssuanceStateFailed {
			return
		}
		// We failed to sign this CSR, change the state to FAILED
		// Note: inside the closure below, `err` still refers to the signing
		// error captured above; the assignment of the Update result to `err`
		// only happens after the closure has run.
		err = s.store.Update(func(tx store.Tx) error {
			node := store.GetNode(tx, nodeID)
			if node == nil {
				return fmt.Errorf("node %s not found", nodeID)
			}

			node.Certificate.Status = api.IssuanceStatus{
				State: api.IssuanceStateFailed,
				Err:   err.Error(),
			}

			return store.UpdateNode(tx, node)
		})
		if err != nil {
			log.G(ctx).WithFields(logrus.Fields{
				"node.id": nodeID,
				"method":  "(*Server).signNodeCert",
			}).WithError(err).Errorf("transaction failed when setting state to FAILED")
		}
		return
	}

	// We were able to successfully sign the new CSR. Let's try to update the nodeStore
	for {
		err = s.store.Update(func(tx store.Tx) error {
			node.Certificate.Certificate = cert
			node.Certificate.Status = api.IssuanceStatus{
				State: api.IssuanceStateIssued,
			}

			err := store.UpdateNode(tx, node)
			if err != nil {
				// Refresh our copy before the retry; if the node is gone,
				// replace the error so we stop retrying.
				node = store.GetNode(tx, nodeID)
				if node == nil {
					err = fmt.Errorf("node %s does not exist", nodeID)
				}
			}
			return err
		})
		if err == nil {
			log.G(ctx).WithFields(logrus.Fields{
				"node.id":   node.ID,
				"node.role": node.Certificate.Role,
				"method":    "(*Server).signNodeCert",
			}).Debugf("certificate issued")
			break
		}
		if err == store.ErrSequenceConflict {
			// Someone else updated the node; retry with the refreshed copy.
			continue
		}

		log.G(ctx).WithFields(logrus.Fields{
			"node.id": nodeID,
			"method":  "(*Server).signNodeCert",
		}).WithError(err).Errorf("transaction failed")
		return
	}
}
// initTasks seeds the orchestrator's state from the tasks already in the
// store: tasks on invalid nodes are queued for restart, tasks of deleted
// services are removed, and replicated-service tasks waiting in READY state
// are either delay-started (honoring any remaining restart delay) or started
// immediately.
func (r *Orchestrator) initTasks(ctx context.Context, readTx store.ReadTx) error {
	tasks, err := store.FindTasks(readTx, store.All)
	if err != nil {
		return err
	}
	for _, t := range tasks {
		if t.NodeID != "" {
			n := store.GetNode(readTx, t.NodeID)
			// Queue for restart if the assigned node is invalid and the task
			// is still supposed to be running.
			if invalidNode(n) && t.Status.State <= api.TaskStateRunning && t.DesiredState <= api.TaskStateRunning {
				r.restartTasks[t.ID] = struct{}{}
			}
		}
	}

	_, err = r.store.Batch(func(batch *store.Batch) error {
		for _, t := range tasks {
			if t.ServiceID == "" {
				continue
			}

			// TODO(aluzzardi): We should NOT retrieve the service here.
			service := store.GetService(readTx, t.ServiceID)
			if service == nil {
				// Service was deleted
				err := batch.Update(func(tx store.Tx) error {
					return store.DeleteTask(tx, t.ID)
				})
				if err != nil {
					log.G(ctx).WithError(err).Error("failed to set task desired state to dead")
				}
				continue
			}
			// TODO(aluzzardi): This is shady. We should have a more generic condition.
			if t.DesiredState != api.TaskStateReady || !orchestrator.IsReplicatedService(service) {
				continue
			}
			restartDelay := orchestrator.DefaultRestartDelay
			if t.Spec.Restart != nil && t.Spec.Restart.Delay != nil {
				var err error
				restartDelay, err = gogotypes.DurationFromProto(t.Spec.Restart.Delay)
				if err != nil {
					log.G(ctx).WithError(err).Error("invalid restart delay")
					restartDelay = orchestrator.DefaultRestartDelay
				}
			}
			if restartDelay != 0 {
				timestamp, err := gogotypes.TimestampFromProto(t.Status.Timestamp)
				if err == nil {
					restartTime := timestamp.Add(restartDelay)
					// Shrink the delay by the time already elapsed since the
					// task's status timestamp.
					calculatedRestartDelay := restartTime.Sub(time.Now())
					if calculatedRestartDelay < restartDelay {
						restartDelay = calculatedRestartDelay
					}
					if restartDelay > 0 {
						_ = batch.Update(func(tx store.Tx) error {
							// Shadowed `t`: re-read the task inside the
							// transaction before acting on it.
							t := store.GetTask(tx, t.ID)
							// TODO(aluzzardi): This is shady as well. We should have a more generic condition.
							if t == nil || t.DesiredState != api.TaskStateReady {
								return nil
							}
							r.restarts.DelayStart(ctx, tx, nil, t.ID, restartDelay, true)
							return nil
						})
						continue
					}
				} else {
					log.G(ctx).WithError(err).Error("invalid status timestamp")
				}
			}

			// Start now
			err := batch.Update(func(tx store.Tx) error {
				return r.restarts.StartNow(tx, t.ID)
			})
			if err != nil {
				log.G(ctx).WithError(err).WithField("task.id", t.ID).Error("moving task out of delayed state failed")
			}
		}
		return nil
	})

	return err
}
// TestConstraintEnforcer verifies that the constraint enforcer shuts down
// running tasks whose placement constraints or resource reservations stop
// being satisfied when their node's role or resources change.
func TestConstraintEnforcer(t *testing.T) {
	// Two ready nodes: id1 is a worker; id2 advertises 1e9 NanoCPUs and
	// 1e9 MemoryBytes of resources.
	nodes := []*api.Node{
		{
			ID: "id1",
			Spec: api.NodeSpec{
				Annotations: api.Annotations{
					Name: "name1",
				},
				Availability: api.NodeAvailabilityActive,
			},
			Status: api.NodeStatus{
				State: api.NodeStatus_READY,
			},
			Role: api.NodeRoleWorker,
		},
		{
			ID: "id2",
			Spec: api.NodeSpec{
				Annotations: api.Annotations{
					Name: "name2",
				},
				Availability: api.NodeAvailabilityActive,
			},
			Status: api.NodeStatus{
				State: api.NodeStatus_READY,
			},
			Description: &api.NodeDescription{
				Resources: &api.Resources{
					NanoCPUs:    1e9,
					MemoryBytes: 1e9,
				},
			},
		},
	}

	// Tasks covering the interesting cases:
	//   id0 - requires a manager but sits on worker id1 (violated from the start)
	//   id1 - unconstrained
	//   id2 - requires a worker, on id1 (violated once id1 becomes a manager)
	//   id3 - desired state NEW (not subject to enforcement)
	//   id4 - reserves 9e8 memory on id2 (violated once id2 shrinks to 5e8)
	tasks := []*api.Task{
		{
			ID:           "id0",
			DesiredState: api.TaskStateRunning,
			Spec: api.TaskSpec{
				Placement: &api.Placement{
					Constraints: []string{"node.role == manager"},
				},
			},
			Status: api.TaskStatus{
				State: api.TaskStateNew,
			},
			NodeID: "id1",
		},
		{
			ID:           "id1",
			DesiredState: api.TaskStateRunning,
			Status: api.TaskStatus{
				State: api.TaskStateNew,
			},
			NodeID: "id1",
		},
		{
			ID:           "id2",
			DesiredState: api.TaskStateRunning,
			Spec: api.TaskSpec{
				Placement: &api.Placement{
					Constraints: []string{"node.role == worker"},
				},
			},
			Status: api.TaskStatus{
				State: api.TaskStateRunning,
			},
			NodeID: "id1",
		},
		{
			ID:           "id3",
			DesiredState: api.TaskStateNew,
			Status: api.TaskStatus{
				State: api.TaskStateNew,
			},
			NodeID: "id2",
		},
		{
			ID:           "id4",
			DesiredState: api.TaskStateReady,
			Spec: api.TaskSpec{
				Resources: &api.ResourceRequirements{
					Reservations: &api.Resources{
						MemoryBytes: 9e8,
					},
				},
			},
			Status: api.TaskStatus{
				State: api.TaskStatePending,
			},
			NodeID: "id2",
		},
	}

	s := store.NewMemoryStore(nil)
	assert.NotNil(t, s)
	defer s.Close()

	err := s.Update(func(tx store.Tx) error {
		// Prepopulate nodes
		for _, n := range nodes {
			assert.NoError(t, store.CreateNode(tx, n))
		}

		// Prepopulate tasks
		for _, task := range tasks {
			assert.NoError(t, store.CreateTask(tx, task))
		}
		return nil
	})
	assert.NoError(t, err)

	// Watch for task updates so shutdowns can be observed.
	watch, cancel := state.Watch(s.WatchQueue(), state.EventUpdateTask{})
	defer cancel()

	constraintEnforcer := New(s)
	defer constraintEnforcer.Stop()

	go constraintEnforcer.Run()

	// id0 should be killed immediately
	shutdown1 := testutils.WatchShutdownTask(t, watch)
	assert.Equal(t, "id0", shutdown1.ID)

	// Change node id1 to a manager
	err = s.Update(func(tx store.Tx) error {
		node := store.GetNode(tx, "id1")
		if node == nil {
			t.Fatal("could not get node id1")
		}
		node.Role = api.NodeRoleManager
		assert.NoError(t, store.UpdateNode(tx, node))
		return nil
	})
	assert.NoError(t, err)

	// id2 requires a worker, so promoting id1 must shut it down.
	shutdown2 := testutils.WatchShutdownTask(t, watch)
	assert.Equal(t, "id2", shutdown2.ID)

	// Change resources on node id2
	err = s.Update(func(tx store.Tx) error {
		node := store.GetNode(tx, "id2")
		if node == nil {
			t.Fatal("could not get node id2")
		}
		node.Description.Resources.MemoryBytes = 5e8
		assert.NoError(t, store.UpdateNode(tx, node))
		return nil
	})
	assert.NoError(t, err)

	// id4 reserved 9e8 memory; shrinking id2 to 5e8 must shut it down.
	shutdown3 := testutils.WatchShutdownTask(t, watch)
	assert.Equal(t, "id4", shutdown3.ID)
}
// Session is a stream which controls agent connection.
// Each message contains list of backup Managers with weights. Also there is
// a special boolean field Disconnect which if true indicates that node should
// reconnect to another Manager immediately.
func (d *Dispatcher) Session(r *api.SessionRequest, stream api.Dispatcher_SessionServer) error {
	ctx := stream.Context()
	nodeInfo, err := ca.RemoteNode(ctx)
	if err != nil {
		return err
	}
	nodeID := nodeInfo.NodeID

	if err := d.isRunningLocked(); err != nil {
		return err
	}

	// register the node.
	sessionID, err := d.register(stream.Context(), nodeID, r.Description)
	if err != nil {
		return err
	}

	fields := logrus.Fields{
		"node.id":      nodeID,
		"node.session": sessionID,
		"method":       "(*Dispatcher).Session",
	}
	if nodeInfo.ForwardedBy != nil {
		fields["forwarder.id"] = nodeInfo.ForwardedBy.NodeID
	}
	log := log.G(ctx).WithFields(fields)

	// Load the node object and subscribe to subsequent updates of it in
	// one atomic step, so no update between the read and the watch can be
	// missed.
	var nodeObj *api.Node
	nodeUpdates, cancel, err := store.ViewAndWatch(d.store, func(readTx store.ReadTx) error {
		nodeObj = store.GetNode(readTx, nodeID)
		return nil
	}, state.EventUpdateNode{Node: &api.Node{ID: nodeID},
		Checks: []state.NodeCheckFunc{state.NodeCheckID}},
	)
	if cancel != nil {
		defer cancel()
	}

	if err != nil {
		log.WithError(err).Error("ViewAndWatch Node failed")
	}

	if _, err = d.nodes.GetWithSession(nodeID, sessionID); err != nil {
		return err
	}

	// Initial message: session ID, node object, current manager list and
	// network bootstrap keys.
	if err := stream.Send(&api.SessionMessage{
		SessionID:            sessionID,
		Node:                 nodeObj,
		Managers:             d.getManagers(),
		NetworkBootstrapKeys: d.networkBootstrapKeys,
	}); err != nil {
		return err
	}

	managerUpdates, mgrCancel := d.mgrQueue.Watch()
	defer mgrCancel()
	keyMgrUpdates, keyMgrCancel := d.keyMgrQueue.Watch()
	defer keyMgrCancel()

	// disconnectNode is a helper that forcibly shuts down the connection.
	disconnectNode := func() error {
		// force disconnect by shutting down the stream.
		transportStream, ok := transport.StreamFromContext(stream.Context())
		if ok {
			// if we have the transport stream, we can signal a disconnect
			// in the client.
			if err := transportStream.ServerTransport().Close(); err != nil {
				log.WithError(err).Error("session end")
			}
		}

		nodeStatus := api.NodeStatus{State: api.NodeStatus_DISCONNECTED, Message: "node is currently trying to find new manager"}
		if err := d.nodeRemove(nodeID, nodeStatus); err != nil {
			log.WithError(err).Error("failed to remove node")
		}
		// still return an abort if the transport closure was ineffective.
		return grpc.Errorf(codes.Aborted, "node must disconnect")
	}

	for {
		// After each message send, we need to check the nodes sessionID hasn't
		// changed. If it has, we close the stream and make the node
		// re-register.
		node, err := d.nodes.GetWithSession(nodeID, sessionID)
		if err != nil {
			return err
		}

		var mgrs []*api.WeightedPeer
		var disconnect bool

		// Block until something requires a new message: a manager-list
		// update, a node object update, a key-manager update (message is
		// resent with current keys), an explicit disconnect request, or
		// shutdown of the stream/dispatcher (which forces a disconnect).
		select {
		case ev := <-managerUpdates:
			mgrs = ev.([]*api.WeightedPeer)
		case ev := <-nodeUpdates:
			nodeObj = ev.(state.EventUpdateNode).Node
		case <-stream.Context().Done():
			return stream.Context().Err()
		case <-node.Disconnect:
			disconnect = true
		case <-d.ctx.Done():
			disconnect = true
		case <-keyMgrUpdates:
		}
		if mgrs == nil {
			mgrs = d.getManagers()
		}

		if err := stream.Send(&api.SessionMessage{
			SessionID:            sessionID,
			Node:                 nodeObj,
			Managers:             mgrs,
			NetworkBootstrapKeys: d.networkBootstrapKeys,
		}); err != nil {
			return err
		}
		if disconnect {
			return disconnectNode()
		}
	}
}
func (s *Scheduler) applySchedulingDecisions(ctx context.Context, schedulingDecisions map[string]schedulingDecision) (successful, failed []schedulingDecision) { if len(schedulingDecisions) == 0 { return } successful = make([]schedulingDecision, 0, len(schedulingDecisions)) // Apply changes to master store applied, err := s.store.Batch(func(batch *store.Batch) error { for len(schedulingDecisions) > 0 { err := batch.Update(func(tx store.Tx) error { // Update exactly one task inside this Update // callback. for taskID, decision := range schedulingDecisions { delete(schedulingDecisions, taskID) t := store.GetTask(tx, taskID) if t == nil { // Task no longer exists nodeInfo, err := s.nodeSet.nodeInfo(decision.new.NodeID) if err == nil && nodeInfo.removeTask(decision.new) { s.nodeSet.updateNode(nodeInfo) } delete(s.allTasks, decision.old.ID) continue } if t.Status.State == decision.new.Status.State && t.Status.Message == decision.new.Status.Message { // No changes, ignore continue } if t.Status.State >= api.TaskStateAssigned { nodeInfo, err := s.nodeSet.nodeInfo(decision.new.NodeID) if err != nil { failed = append(failed, decision) continue } node := store.GetNode(tx, decision.new.NodeID) if node == nil || node.Meta.Version != nodeInfo.Meta.Version { // node is out of date failed = append(failed, decision) continue } } if err := store.UpdateTask(tx, decision.new); err != nil { log.G(ctx).Debugf("scheduler failed to update task %s; will retry", taskID) failed = append(failed, decision) continue } successful = append(successful, decision) return nil } return nil }) if err != nil { return err } } return nil }) if err != nil { log.G(ctx).WithError(err).Error("scheduler tick transaction failed") failed = append(failed, successful[applied:]...) successful = successful[:applied] } return }
// signNodeCert does the bulk of the work for signing a certificate: it parses
// the node's requested role, signs the CSR with the root CA, and persists the
// resulting issuance status (ISSUED on success, FAILED on signing error) to
// the store, retrying on sequence conflicts.
func (s *Server) signNodeCert(ctx context.Context, node *api.Node) {
	if !s.securityConfig.RootCA().CanSign() {
		log.G(ctx).WithFields(logrus.Fields{
			"node.id": node.ID,
			"method":  "(*Server).signNodeCert",
		}).Errorf("no valid signer found")
		return
	}

	// Work on a copy so the caller's object is not mutated.
	node = node.Copy()
	nodeID := node.ID
	// Convert the role from proto format
	role, err := ParseRole(node.Certificate.Role)
	if err != nil {
		log.G(ctx).WithFields(logrus.Fields{
			"node.id": node.ID,
			"method":  "(*Server).signNodeCert",
		}).WithError(err).Errorf("failed to parse role")
		return
	}

	// Attempt to sign the CSR
	cert, err := s.securityConfig.RootCA().ParseValidateAndSignCSR(node.Certificate.CSR, node.Certificate.CN, role, s.securityConfig.ClientTLSCreds.Organization())
	if err != nil {
		log.G(ctx).WithFields(logrus.Fields{
			"node.id": node.ID,
			"method":  "(*Server).signNodeCert",
		}).WithError(err).Errorf("failed to sign CSR")
		// If this error is due the lack of signer, maybe some other
		// manager in the future will pick it up. Return without
		// changing the state of the certificate.
		if err == ErrNoValidSigner {
			return
		}
		// If the current state is already Failed, no need to change it
		if node.Certificate.Status.State == api.IssuanceStateFailed {
			return
		}
		// We failed to sign this CSR, change the state to FAILED
		err = s.store.Update(func(tx store.Tx) error {
			node := store.GetNode(tx, nodeID)
			if node == nil {
				return fmt.Errorf("node %s not found", nodeID)
			}

			// `err` here is still the signing error captured by the
			// closure; the assignment above only happens after Update
			// returns.
			node.Certificate.Status = api.IssuanceStatus{
				State: api.IssuanceStateFailed,
				Err:   err.Error(),
			}

			return store.UpdateNode(tx, node)
		})
		if err != nil {
			log.G(ctx).WithFields(logrus.Fields{
				"node.id": nodeID,
				"method":  "(*Server).signNodeCert",
			}).WithError(err).Errorf("transaction failed when setting state to FAILED")
		}
		return
	}

	// We were able to successfully sign the new CSR. Let's try to update the nodeStore.
	for {
		err = s.store.Update(func(tx store.Tx) error {
			// Remote nodes are expecting a full certificate chain, not just a signed certificate
			node.Certificate.Certificate = append(cert, s.securityConfig.RootCA().Cert...)
			node.Certificate.Status = api.IssuanceStatus{
				State: api.IssuanceStateIssued,
			}

			err := store.UpdateNode(tx, node)
			if err != nil {
				// Refresh our copy of the node so a retry works off the
				// latest version.
				node = store.GetNode(tx, nodeID)
				if node == nil {
					err = fmt.Errorf("node %s does not exist", nodeID)
				}
			}
			return err
		})
		if err == nil {
			log.G(ctx).WithFields(logrus.Fields{
				"node.id":   node.ID,
				"node.role": node.Certificate.Role,
				"method":    "(*Server).signNodeCert",
			}).Debugf("certificate issued")
			break
		}
		if err == store.ErrSequenceConflict {
			// Someone else updated the node concurrently; retry with the
			// refreshed copy.
			continue
		}

		log.G(ctx).WithFields(logrus.Fields{
			"node.id": nodeID,
			"method":  "(*Server).signNodeCert",
		}).WithError(err).Errorf("transaction failed")
		return
	}
}
func TestRaftForceNewCluster(t *testing.T) { t.Parallel() nodes, clockSource := raftutils.NewRaftCluster(t, tc) // Propose a value values := make([]*api.Node, 2) var err error values[0], err = raftutils.ProposeValue(t, nodes[1], "id1") assert.NoError(t, err, "failed to propose value") // The memberlist should contain 3 members on each node for i := 1; i <= 3; i++ { assert.Equal(t, len(nodes[uint64(i)].GetMemberlist()), 3) } // Stop all nodes for _, node := range nodes { node.Server.Stop() node.Shutdown() } raftutils.AdvanceTicks(clockSource, 5) toClean := map[uint64]*raftutils.TestNode{ 2: nodes[2], 3: nodes[3], } raftutils.TeardownCluster(t, toClean) delete(nodes, 2) delete(nodes, 3) // Only restart the first node with force-new-cluster option nodes[1] = raftutils.RestartNode(t, clockSource, nodes[1], true) raftutils.WaitForCluster(t, clockSource, nodes) // The memberlist should contain only one node (self) assert.Equal(t, len(nodes[1].GetMemberlist()), 1) // Add 2 more members nodes[2] = raftutils.NewJoinNode(t, clockSource, nodes[1].Address, tc) raftutils.WaitForCluster(t, clockSource, nodes) nodes[3] = raftutils.NewJoinNode(t, clockSource, nodes[1].Address, tc) raftutils.WaitForCluster(t, clockSource, nodes) newCluster := map[uint64]*raftutils.TestNode{ 1: nodes[1], 2: nodes[2], 3: nodes[3], } defer raftutils.TeardownCluster(t, newCluster) // The memberlist should contain 3 members on each node for i := 1; i <= 3; i++ { assert.Equal(t, len(nodes[uint64(i)].GetMemberlist()), 3) } // Propose another value values[1], err = raftutils.ProposeValue(t, raftutils.Leader(nodes), "id2") assert.NoError(t, err, "failed to propose value") for _, node := range nodes { assert.NoError(t, raftutils.PollFunc(clockSource, func() error { var err error node.MemoryStore().View(func(tx store.ReadTx) { var allNodes []*api.Node allNodes, err = store.FindNodes(tx, store.All) if err != nil { return } if len(allNodes) != 2 { err = fmt.Errorf("expected 2 nodes, got %d", len(allNodes)) 
return } for i, nodeID := range []string{"id1", "id2"} { n := store.GetNode(tx, nodeID) if !reflect.DeepEqual(n, values[i]) { err = fmt.Errorf("node %s did not match expected value", nodeID) return } } }) return err })) } }
// register is used for registration of node with particular dispatcher.
// It verifies the node exists in the store, queues a READY status update
// (flushed by processUpdates), waits for that batch to be committed, and
// returns the session ID allocated for the node.
func (d *Dispatcher) register(ctx context.Context, nodeID string, description *api.NodeDescription) (string, error) {
	// prevent register until we're ready to accept it
	if err := d.isRunningLocked(); err != nil {
		return "", err
	}

	if err := d.nodes.CheckRateLimit(nodeID); err != nil {
		return "", err
	}

	// TODO(stevvooe): Validate node specification.
	var node *api.Node
	d.store.View(func(tx store.ReadTx) {
		node = store.GetNode(tx, nodeID)
	})
	if node == nil {
		return "", ErrNodeNotFound
	}

	// Queue the status/description update; processUpdates will flush it.
	d.nodeUpdatesLock.Lock()
	d.nodeUpdates[nodeID] = nodeUpdate{status: &api.NodeStatus{State: api.NodeStatus_READY}, description: description}
	numUpdates := len(d.nodeUpdates)
	d.nodeUpdatesLock.Unlock()

	// Trigger an early flush once the pending batch is large enough.
	if numUpdates >= maxBatchItems {
		select {
		case d.processUpdatesTrigger <- struct{}{}:
		case <-d.ctx.Done():
			return "", d.ctx.Err()
		}
	}

	// Wait until the node update batch happens before unblocking register.
	d.processUpdatesLock.Lock()
	// NOTE(review): this only checks cancellation before starting to Wait;
	// if d.ctx is cancelled after this point, returning from Wait depends on
	// a Broadcast from elsewhere (e.g. processUpdates) — confirm shutdown
	// always broadcasts.
	select {
	case <-d.ctx.Done():
		return "", d.ctx.Err()
	default:
	}
	d.processUpdatesCond.Wait()
	d.processUpdatesLock.Unlock()

	expireFunc := func() {
		nodeStatus := api.NodeStatus{State: api.NodeStatus_DOWN, Message: "heartbeat failure"}
		log.G(ctx).Debugf("heartbeat expiration")
		if err := d.nodeRemove(nodeID, nodeStatus); err != nil {
			log.G(ctx).WithError(err).Errorf("failed deregistering node after heartbeat expiration")
		}
	}

	rn := d.nodes.Add(node, expireFunc)

	// NOTE(stevvooe): We need be a little careful with re-registration. The
	// current implementation just matches the node id and then gives away the
	// sessionID. If we ever want to use sessionID as a secret, which we may
	// want to, this is giving away the keys to the kitchen.
	//
	// The right behavior is going to be informed by identity. Basically, each
	// time a node registers, we invalidate the session and issue a new
	// session, once identity is proven. This will cause misbehaved agents to
	// be kicked when multiple connections are made.
	return rn.SessionID, nil
}
// NodeCertificateStatus returns the current issuance status of an issuance request identified by the nodeID func (s *Server) NodeCertificateStatus(ctx context.Context, request *api.NodeCertificateStatusRequest) (*api.NodeCertificateStatusResponse, error) { if request.NodeID == "" { return nil, grpc.Errorf(codes.InvalidArgument, codes.InvalidArgument.String()) } if err := s.addTask(); err != nil { return nil, err } defer s.doneTask() var node *api.Node event := state.EventUpdateNode{ Node: &api.Node{ID: request.NodeID}, Checks: []state.NodeCheckFunc{state.NodeCheckID}, } // Retrieve the current value of the certificate with this token, and create a watcher updates, cancel, err := store.ViewAndWatch( s.store, func(tx store.ReadTx) error { node = store.GetNode(tx, request.NodeID) return nil }, event, ) if err != nil { return nil, err } defer cancel() // This node ID doesn't exist if node == nil { return nil, grpc.Errorf(codes.NotFound, codes.NotFound.String()) } log.G(ctx).WithFields(logrus.Fields{ "node.id": node.ID, "status": node.Certificate.Status, "method": "NodeCertificateStatus", }) // If this certificate has a final state, return it immediately (both pending and renew are transition states) if isFinalState(node.Certificate.Status) { return &api.NodeCertificateStatusResponse{ Status: &node.Certificate.Status, Certificate: &node.Certificate, }, nil } log.G(ctx).WithFields(logrus.Fields{ "node.id": node.ID, "status": node.Certificate.Status, "method": "NodeCertificateStatus", }).Debugf("started watching for certificate updates") // Certificate is Pending or in an Unknown state, let's wait for changes. for { select { case event := <-updates: switch v := event.(type) { case state.EventUpdateNode: // We got an update on the certificate record. If the status is a final state, // return the certificate. 
if isFinalState(v.Node.Certificate.Status) { cert := v.Node.Certificate.Copy() return &api.NodeCertificateStatusResponse{ Status: &cert.Status, Certificate: cert, }, nil } } case <-ctx.Done(): return nil, ctx.Err() case <-s.ctx.Done(): return nil, s.ctx.Err() } } }
// processUpdates flushes all queued task-status and node-status/description
// updates to the store in a single batch, then broadcasts on
// processUpdatesCond to unblock callers (e.g. register) waiting for the
// flush. Individual update failures are logged and skipped, not propagated.
func (d *Dispatcher) processUpdates(ctx context.Context) {
	var (
		taskUpdates map[string]*api.TaskStatus
		nodeUpdates map[string]nodeUpdate
	)

	// Swap out the pending maps under their locks so producers can keep
	// queueing while this flush runs.
	d.taskUpdatesLock.Lock()
	if len(d.taskUpdates) != 0 {
		taskUpdates = d.taskUpdates
		d.taskUpdates = make(map[string]*api.TaskStatus)
	}
	d.taskUpdatesLock.Unlock()

	d.nodeUpdatesLock.Lock()
	if len(d.nodeUpdates) != 0 {
		nodeUpdates = d.nodeUpdates
		d.nodeUpdates = make(map[string]nodeUpdate)
	}
	d.nodeUpdatesLock.Unlock()

	if len(taskUpdates) == 0 && len(nodeUpdates) == 0 {
		return
	}

	log := log.G(ctx).WithFields(logrus.Fields{
		"method": "(*Dispatcher).processUpdates",
	})

	_, err := d.store.Batch(func(batch *store.Batch) error {
		for taskID, status := range taskUpdates {
			err := batch.Update(func(tx store.Tx) error {
				logger := log.WithField("task.id", taskID)
				task := store.GetTask(tx, taskID)
				if task == nil {
					logger.Errorf("task unavailable")
					return nil
				}

				logger = logger.WithField("state.transition", fmt.Sprintf("%v->%v", task.Status.State, status.State))

				if task.Status == *status {
					logger.Debug("task status identical, ignoring")
					return nil
				}

				// Never move a task's status backwards.
				if task.Status.State > status.State {
					logger.Debug("task status invalid transition")
					return nil
				}

				task.Status = *status
				if err := store.UpdateTask(tx, task); err != nil {
					logger.WithError(err).Error("failed to update task status")
					return nil
				}
				logger.Debug("task status updated")
				return nil
			})
			if err != nil {
				log.WithError(err).Error("dispatcher task update transaction failed")
			}
		}

		for nodeID, nodeUpdate := range nodeUpdates {
			err := batch.Update(func(tx store.Tx) error {
				logger := log.WithField("node.id", nodeID)
				node := store.GetNode(tx, nodeID)
				if node == nil {
					logger.Errorf("node unavailable")
					return nil
				}

				if nodeUpdate.status != nil {
					node.Status.State = nodeUpdate.status.State
					node.Status.Message = nodeUpdate.status.Message
					// Only overwrite the address when the update carries one.
					if nodeUpdate.status.Addr != "" {
						node.Status.Addr = nodeUpdate.status.Addr
					}
				}
				if nodeUpdate.description != nil {
					node.Description = nodeUpdate.description
				}

				if err := store.UpdateNode(tx, node); err != nil {
					logger.WithError(err).Error("failed to update node status")
					return nil
				}
				logger.Debug("node status updated")
				return nil
			})
			if err != nil {
				log.WithError(err).Error("dispatcher node update transaction failed")
			}
		}
		return nil
	})
	if err != nil {
		log.WithError(err).Error("dispatcher batch failed")
	}

	// Wake everyone blocked in processUpdatesCond.Wait() (see register).
	d.processUpdatesCond.Broadcast()
}
func (d *Dispatcher) markNodesUnknown(ctx context.Context) error { log := log.G(ctx).WithField("method", "(*Dispatcher).markNodesUnknown") var nodes []*api.Node var err error d.store.View(func(tx store.ReadTx) { nodes, err = store.FindNodes(tx, store.All) }) if err != nil { return errors.Wrap(err, "failed to get list of nodes") } _, err = d.store.Batch(func(batch *store.Batch) error { for _, n := range nodes { err := batch.Update(func(tx store.Tx) error { // check if node is still here node := store.GetNode(tx, n.ID) if node == nil { return nil } // do not try to resurrect down nodes if node.Status.State == api.NodeStatus_DOWN { nodeCopy := node expireFunc := func() { if err := d.moveTasksToOrphaned(nodeCopy.ID); err != nil { log.WithError(err).Error(`failed to move all tasks to "ORPHANED" state`) } d.downNodes.Delete(nodeCopy.ID) } d.downNodes.Add(nodeCopy, expireFunc) return nil } node.Status.State = api.NodeStatus_UNKNOWN node.Status.Message = `Node moved to "unknown" state due to leadership change in cluster` nodeID := node.ID expireFunc := func() { log := log.WithField("node", nodeID) log.Debugf("heartbeat expiration for unknown node") if err := d.markNodeNotReady(nodeID, api.NodeStatus_DOWN, `heartbeat failure for node in "unknown" state`); err != nil { log.WithError(err).Errorf(`failed deregistering node after heartbeat expiration for node in "unknown" state`) } } if err := d.nodes.AddUnknown(node, expireFunc); err != nil { return errors.Wrap(err, `adding node in "unknown" state to node store failed`) } if err := store.UpdateNode(tx, node); err != nil { return errors.Wrap(err, "update failed") } return nil }) if err != nil { log.WithField("node", n.ID).WithError(err).Errorf(`failed to move node to "unknown" state`) } } return nil }) return err }