// SetNodeRole sets the role for a node through the control api.
func (c *testCluster) SetNodeRole(id string, role api.NodeRole) error {
	node, ok := c.nodes[id]
	if !ok {
		return fmt.Errorf("set node role: node %s not found", id)
	}
	if node.IsManager() && role == api.NodeRoleManager {
		return fmt.Errorf("node is already a manager")
	}
	if !node.IsManager() && role == api.NodeRoleWorker {
		return fmt.Errorf("node is already a worker")
	}

	var initialTimeout time.Duration
	// the version might change between get and update, so retry
	for i := 0; i < 5; i++ {
		time.Sleep(initialTimeout)
		initialTimeout += 500 * time.Millisecond
		resp, err := c.api.GetNode(context.Background(), &api.GetNodeRequest{NodeID: id})
		if err != nil {
			return err
		}
		spec := resp.Node.Spec.Copy()
		spec.Role = role
		if _, err := c.api.UpdateNode(context.Background(), &api.UpdateNodeRequest{
			NodeID:      id,
			Spec:        spec,
			NodeVersion: &resp.Node.Meta.Version,
		}); err != nil {
			// UpdateNode can fail because the redirecting node or the leader
			// might be shutting down; retry only on sequence errors
			if grpc.ErrorDesc(err) == "update out of sequence" {
				continue
			}
			return err
		}
		if role == api.NodeRoleManager {
			// wait for the node to become a manager
			return raftutils.PollFuncWithTimeout(nil, func() error {
				if !node.IsManager() {
					return fmt.Errorf("node is still not a manager")
				}
				return nil
			}, opsTimeout)
		}
		// wait for the node to become a worker
		return raftutils.PollFuncWithTimeout(nil, func() error {
			if node.IsManager() {
				return fmt.Errorf("node is still not a worker")
			}
			return nil
		}, opsTimeout)
	}
	return fmt.Errorf("set role %s for node %s: got sequence error 5 times", role, id)
}

// RemoveNode removes a node entirely. It tries to demote managers first.
func (c *testCluster) RemoveNode(id string, graceful bool) error {
	node, ok := c.nodes[id]
	if !ok {
		return fmt.Errorf("remove node: node %s not found", id)
	}
	// demote before removal
	if node.IsManager() {
		if err := c.SetNodeRole(id, api.NodeRoleWorker); err != nil {
			return fmt.Errorf("demote manager: %v", err)
		}
	}

	if err := node.Stop(); err != nil {
		return err
	}
	delete(c.nodes, id)

	if graceful {
		if err := raftutils.PollFuncWithTimeout(nil, func() error {
			resp, err := c.api.GetNode(context.Background(), &api.GetNodeRequest{NodeID: id})
			if err != nil {
				return fmt.Errorf("get node: %v", err)
			}
			if resp.Node.Status.State != api.NodeStatus_DOWN {
				return fmt.Errorf("node %s is still not down", id)
			}
			return nil
		}, opsTimeout); err != nil {
			return err
		}
	}

	if _, err := c.api.RemoveNode(context.Background(), &api.RemoveNodeRequest{NodeID: id, Force: !graceful}); err != nil {
		return fmt.Errorf("remove node: %v", err)
	}
	return nil
}

// pollClusterReady calls the control api until all conditions are true:
// * all nodes are ready
// * all managers have membership == accepted
// * all managers have reachability == reachable
// * one node is leader
// * the numbers of workers and managers equal the expected counts
func pollClusterReady(t *testing.T, c *testCluster, numWorker, numManager int) {
	pollFunc := func() error {
		res, err := c.api.ListNodes(context.Background(), &api.ListNodesRequest{})
		if err != nil {
			return err
		}
		var mCount int
		var leaderFound bool
		for _, n := range res.Nodes {
			if n.Status.State != api.NodeStatus_READY {
				return fmt.Errorf("node %s with desired role %s isn't ready, status %s, message %s", n.ID, n.Spec.DesiredRole, n.Status.State, n.Status.Message)
			}
			if n.Spec.Membership != api.NodeMembershipAccepted {
				return fmt.Errorf("node %s with desired role %s isn't accepted to cluster, membership %s", n.ID, n.Spec.DesiredRole, n.Spec.Membership)
			}
			if n.Certificate.Role != n.Spec.DesiredRole {
				return fmt.Errorf("node %s has different roles in spec and certificate, %s and %s respectively", n.ID, n.Spec.DesiredRole, n.Certificate.Role)
			}
			if n.Certificate.Status.State != api.IssuanceStateIssued {
				return fmt.Errorf("node %s with desired role %s has no issued certificate, issuance state %s", n.ID, n.Spec.DesiredRole, n.Certificate.Status.State)
			}
			if n.Role == api.NodeRoleManager {
				if n.ManagerStatus == nil {
					return fmt.Errorf("manager node %s has no ManagerStatus field", n.ID)
				}
				if n.ManagerStatus.Reachability != api.RaftMemberStatus_REACHABLE {
					return fmt.Errorf("manager node %s is not reachable, status: %s", n.ID, n.ManagerStatus.Reachability)
				}
				mCount++
				if n.ManagerStatus.Leader {
					leaderFound = true
				}
			} else {
				if n.ManagerStatus != nil {
					return fmt.Errorf("worker node %s should not have manager status, returned %s", n.ID, n.ManagerStatus)
				}
			}
		}
		if !leaderFound {
			return fmt.Errorf("leader of cluster is not found")
		}
		wCount := len(res.Nodes) - mCount
		if mCount != numManager {
			return fmt.Errorf("unexpected number of managers: %d, expected %d", mCount, numManager)
		}
		if wCount != numWorker {
			return fmt.Errorf("unexpected number of workers: %d, expected %d", wCount, numWorker)
		}
		return nil
	}
	err := raftutils.PollFuncWithTimeout(nil, pollFunc, opsTimeout)
	require.NoError(t, err)
}

func TestClusterStorePassphraseRotationForRootCA(t *testing.T) {
	if !*integrationTests {
		t.Skip("integration test")
	}
	os.Setenv(ca.PassphraseENVVar, "password1")
	defer os.Setenv(ca.PassphraseENVVar, "")
	defer os.Setenv(ca.PassphraseENVVarPrev, "")

	mCount, aCount := 5, 15
	c := createManagersCluster(t, mCount, aCount)
	require.NoError(t, testutils.PollFunc(nil, c.pollRegister))

	// Get the leader
	leader, err := c.leader()
	assert.NoError(t, err)

	// check key material in store
	var clusters []*api.Cluster
	leader.m.RaftNode.MemoryStore().View(func(tx store.ReadTx) {
		clusters, err = store.FindClusters(tx, store.All)
	})
	assert.NoError(t, err)
	assert.Len(t, clusters, 1, "there should be one cluster")
	assert.NotNil(t, clusters[0].RootCA.CACert)
	assert.NotNil(t, clusters[0].RootCA.CAKey)
	assert.Contains(t, string(clusters[0].RootCA.CAKey), "Proc-Type: 4,ENCRYPTED")

	firstEncryptedKey := clusters[0].RootCA.CAKey

	// Set a new passphrase in the environment and kill the current leader
	os.Setenv(ca.PassphraseENVVarPrev, "password1")
	os.Setenv(ca.PassphraseENVVar, "password2")
	require.NoError(t, c.destroyLeader())

	// ensure that the cluster converges to the expected number of agents; we need a
	// big timeout because of heartbeat times
	require.NoError(t, testutils.PollFuncWithTimeout(nil, c.pollRegister, 30*time.Second))

	// Get the new leader
	leader, err = c.leader()
	assert.NoError(t, err)

	// check key material in store
	leader.m.RaftNode.MemoryStore().View(func(tx store.ReadTx) {
		clusters, err = store.FindClusters(tx, store.All)
	})
	assert.NoError(t, err)
	assert.Len(t, clusters, 1, "there should be one cluster")
	assert.NotNil(t, clusters[0].RootCA.CACert)
	assert.NotNil(t, clusters[0].RootCA.CAKey)
	assert.Contains(t, string(clusters[0].RootCA.CAKey), "Proc-Type: 4,ENCRYPTED")
	assert.NotEqual(t, firstEncryptedKey, clusters[0].RootCA.CAKey)
}

func TestGetUnlockKey(t *testing.T) {
	t.Parallel()

	tc := testutils.NewTestCA(t)
	defer tc.Stop()

	var cluster *api.Cluster
	tc.MemoryStore.View(func(tx store.ReadTx) {
		clusters, err := store.FindClusters(tx, store.ByName(store.DefaultClusterName))
		require.NoError(t, err)
		cluster = clusters[0]
	})

	resp, err := tc.CAClients[0].GetUnlockKey(context.Background(), &api.GetUnlockKeyRequest{})
	require.NoError(t, err)
	require.Nil(t, resp.UnlockKey)
	require.Equal(t, cluster.Meta.Version, resp.Version)

	// Update the unlock key
	require.NoError(t, tc.MemoryStore.Update(func(tx store.Tx) error {
		cluster = store.GetCluster(tx, cluster.ID)
		cluster.Spec.EncryptionConfig.AutoLockManagers = true
		cluster.UnlockKeys = []*api.EncryptionKey{{
			Subsystem: ca.ManagerRole,
			Key:       []byte("secret"),
		}}
		return store.UpdateCluster(tx, cluster)
	}))

	tc.MemoryStore.View(func(tx store.ReadTx) {
		cluster = store.GetCluster(tx, cluster.ID)
	})

	require.NoError(t, raftutils.PollFuncWithTimeout(nil, func() error {
		resp, err = tc.CAClients[0].GetUnlockKey(context.Background(), &api.GetUnlockKeyRequest{})
		if err != nil {
			return fmt.Errorf("get unlock key: %v", err)
		}
		if !bytes.Equal(resp.UnlockKey, []byte("secret")) {
			return fmt.Errorf("secret hasn't rotated yet")
		}
		if cluster.Meta.Version.Index > resp.Version.Index {
			return fmt.Errorf("hasn't updated to the right version yet")
		}
		return nil
	}, 250*time.Millisecond))
}

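// pollServiceReady polls the control API until the task list is non-empty and
// every listed task is running.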
func pollServiceReady(t *testing.T, c *testCluster, sid string) {
	pollFunc := func() error {
		req := &api.ListTasksRequest{}
		res, err := c.api.ListTasks(context.Background(), req)
		require.NoError(t, err)
		if len(res.Tasks) == 0 {
			return fmt.Errorf("tasks list is empty")
		}
		for _, task := range res.Tasks {
			if task.Status.State != api.TaskStateRunning {
				return fmt.Errorf("task %s is not running, status %s", task.ID, task.Status.State)
			}
		}
		return nil
	}
	require.NoError(t, raftutils.PollFuncWithTimeout(nil, pollFunc, opsTimeout))
}

// ControlClient returns a grpc client to the ControlAPI of the node. It will
// panic for non-manager nodes.
func (n *testNode) ControlClient(ctx context.Context) (api.ControlClient, error) {
	ctx, cancel := context.WithTimeout(ctx, opsTimeout)
	defer cancel()
	connChan := n.node.ListenControlSocket(ctx)
	var controlConn *grpc.ClientConn
	if err := raftutils.PollFuncWithTimeout(nil, func() error {
		select {
		case controlConn = <-connChan:
		default:
		}
		if controlConn == nil {
			return fmt.Errorf("didn't get control api connection")
		}
		return nil
	}, opsTimeout); err != nil {
		return nil, err
	}
	return api.NewControlClient(controlConn), nil
}

func TestRaftWipedState(t *testing.T) {
	t.Parallel()

	nodes, clockSource := raftutils.NewRaftCluster(t, tc)
	defer raftutils.TeardownCluster(t, nodes)

	// Stop node 3
	nodes[3].Server.Stop()
	nodes[3].ShutdownRaft()

	// Remove its state
	os.RemoveAll(nodes[3].StateDir)

	raftutils.AdvanceTicks(clockSource, 5)

	// Restart node 3
	nodes[3] = raftutils.RestartNode(t, clockSource, nodes[3], false)

	// Make sure this doesn't panic.
	raftutils.PollFuncWithTimeout(clockSource, func() error {
		return errors.New("keep the poll going")
	}, time.Second)
}

func TestClusterReelection(t *testing.T) {
	if !*integrationTests {
		t.Skip("integration test")
	}
	mCount, aCount := 5, 15

	c := createManagersCluster(t, mCount, aCount)
	require.NoError(t, testutils.PollFunc(nil, c.pollRegister))

	require.NoError(t, c.destroyLeader())
	// take down some agents in the meantime
	require.NoError(t, c.destroyAgents(5))

	// ensure that the cluster converges to the expected number of agents; we need a
	// big timeout because of heartbeat times
	require.NoError(t, testutils.PollFuncWithTimeout(nil, c.pollRegister, 30*time.Second))

	leader, err := c.leader()
	assert.NoError(t, err)

	// check nodes in store
	var nodes []*api.Node
	leader.m.RaftNode.MemoryStore().View(func(tx store.ReadTx) {
		ns, err := store.FindNodes(tx, store.All)
		assert.NoError(t, err)
		for _, n := range ns {
			if n.Spec.Role == api.NodeRoleWorker {
				nodes = append(nodes, n)
			}
		}
	})
	assert.NoError(t, err)
	assert.Len(t, nodes, aCount, "all worker nodes should be in the store")

	var downAgentsCount int
	for _, node := range nodes {
		if node.Status.State == api.NodeStatus_DOWN {
			downAgentsCount++
			continue
		}
		assert.Equal(t, api.NodeStatus_READY, node.Status.State, "there should be only down and ready nodes at this point")
	}
	assert.Equal(t, 5, downAgentsCount, "unexpected number of down agents")
}

func TestRestartLeader(t *testing.T) {
	t.Parallel()

	numWorker, numManager := 5, 3
	cl := newCluster(t, numWorker, numManager)
	defer func() {
		require.NoError(t, cl.Stop())
	}()
	leader, err := cl.Leader()
	require.NoError(t, err)

	origLeaderID := leader.node.NodeID()

	require.NoError(t, leader.Pause(false))

	require.NoError(t, raftutils.PollFuncWithTimeout(nil, func() error {
		resp, err := cl.api.ListNodes(context.Background(), &api.ListNodesRequest{})
		if err != nil {
			return err
		}
		for _, node := range resp.Nodes {
			if node.ID == origLeaderID {
				continue
			}
			require.False(t, node.Status.State == api.NodeStatus_DOWN, "nodes shouldn't go down")
			if node.Status.State != api.NodeStatus_READY {
				return errors.Errorf("node %s is still not ready", node.ID)
			}
		}
		return nil
	}, opsTimeout))

	require.NoError(t, cl.StartNode(origLeaderID))

	pollClusterReady(t, cl, numWorker, numManager)
}

// Tests locking and unlocking the manager and key rotations
func TestManagerLockUnlock(t *testing.T) {
	ctx := context.Background()

	temp, err := ioutil.TempFile("", "test-manager-lock")
	require.NoError(t, err)
	require.NoError(t, temp.Close())
	require.NoError(t, os.Remove(temp.Name()))
	defer os.RemoveAll(temp.Name())

	stateDir, err := ioutil.TempDir("", "test-raft")
	require.NoError(t, err)
	defer os.RemoveAll(stateDir)

	tc := testutils.NewTestCA(t)
	defer tc.Stop()

	managerSecurityConfig, err := tc.NewNodeConfig(ca.ManagerRole)
	require.NoError(t, err)

	_, _, err = managerSecurityConfig.KeyReader().Read()
	require.NoError(t, err)

	m, err := New(&Config{
		RemoteAPI:      RemoteAddrs{ListenAddr: "127.0.0.1:0"},
		ControlAPI:     temp.Name(),
		StateDir:       stateDir,
		SecurityConfig: managerSecurityConfig,
		// start off without any encryption
	})
	require.NoError(t, err)
	require.NotNil(t, m)

	done := make(chan error)
	defer close(done)
	go func() {
		done <- m.Run(ctx)
	}()

	opts := []grpc.DialOption{
		grpc.WithTimeout(10 * time.Second),
		grpc.WithTransportCredentials(managerSecurityConfig.ClientTLSCreds),
	}
	conn, err := grpc.Dial(m.Addr(), opts...)
	require.NoError(t, err)
	defer func() {
		require.NoError(t, conn.Close())
	}()

	// check that there is no kek currently - we are using the API because this
	// also lets us wait until the manager is up and listening
	var cluster *api.Cluster
	client := api.NewControlClient(conn)
	require.NoError(t, raftutils.PollFuncWithTimeout(nil, func() error {
		resp, err := client.ListClusters(ctx, &api.ListClustersRequest{})
		if err != nil {
			return err
		}
		if len(resp.Clusters) == 0 {
			return fmt.Errorf("no clusters yet")
		}
		cluster = resp.Clusters[0]
		return nil
	}, 1*time.Second))

	require.Nil(t, cluster.UnlockKeys)

	// the TLS key is unencrypted, but there is a DEK
	key, err := ioutil.ReadFile(tc.Paths.Node.Key)
	require.NoError(t, err)
	keyBlock, _ := pem.Decode(key)
	require.NotNil(t, keyBlock)
	require.False(t, x509.IsEncryptedPEMBlock(keyBlock))
	require.Len(t, keyBlock.Headers, 2)
	currentDEK, err := decodePEMHeaderValue(keyBlock.Headers[pemHeaderRaftDEK], nil)
	require.NoError(t, err)
	require.NotEmpty(t, currentDEK)

	// update the lock key - this may fail due to update out of sequence errors, so try again
	for {
		getResp, err := client.GetCluster(ctx, &api.GetClusterRequest{ClusterID: cluster.ID})
		require.NoError(t, err)
		cluster = getResp.Cluster

		spec := cluster.Spec.Copy()
		spec.EncryptionConfig.AutoLockManagers = true
		updateResp, err := client.UpdateCluster(ctx, &api.UpdateClusterRequest{
			ClusterID:      cluster.ID,
			ClusterVersion: &cluster.Meta.Version,
			Spec:           spec,
		})
		if grpc.ErrorDesc(err) == "update out of sequence" {
			continue
		}
		// if there is any other type of error, this should fail
		if err == nil {
			cluster = updateResp.Cluster
		}
		break
	}
	require.NoError(t, err)

	caConn := api.NewCAClient(conn)
	unlockKeyResp, err := caConn.GetUnlockKey(ctx, &api.GetUnlockKeyRequest{})
	require.NoError(t, err)

	// this should update the TLS key, rotate the DEK, and finish snapshotting
	var updatedKey []byte
	require.NoError(t, raftutils.PollFuncWithTimeout(nil, func() error {
		updatedKey, err = ioutil.ReadFile(tc.Paths.Node.Key)
		require.NoError(t, err) // this should never error due to atomic writes

		if bytes.Equal(key, updatedKey) {
			return fmt.Errorf("TLS key should have been re-encrypted at least")
		}

		keyBlock, _ = pem.Decode(updatedKey)
		require.NotNil(t, keyBlock) // this should never error due to atomic writes

		if !x509.IsEncryptedPEMBlock(keyBlock) {
			return fmt.Errorf("key not encrypted")
		}

		// we don't check that the TLS key has been rotated, because that may take
		// a little bit, and is best effort only
		currentDEKString, ok := keyBlock.Headers[pemHeaderRaftDEK]
		require.True(t, ok) // there should never NOT be a current header
		nowCurrentDEK, err := decodePEMHeaderValue(currentDEKString, unlockKeyResp.UnlockKey)
		require.NoError(t, err) // it should always be encrypted
		if bytes.Equal(currentDEK, nowCurrentDEK) {
			return fmt.Errorf("snapshot has not been finished yet")
		}
		currentDEK = nowCurrentDEK
		return nil
	}, 1*time.Second))

	_, ok := keyBlock.Headers[pemHeaderRaftPendingDEK]
	require.False(t, ok) // once the snapshot is done, there should be no pending DEK

	_, ok = keyBlock.Headers[pemHeaderRaftDEKNeedsRotation]
	require.False(t, ok)

	// verify that the snapshot is readable with the new DEK
	encrypter, decrypter := encryption.Defaults(currentDEK)
	// we can't use the raftLogger, because the WALs are still locked while the raft node is up. And once we remove
	// the manager, they'll be deleted.
	snapshot, err := storage.NewSnapFactory(encrypter, decrypter).New(filepath.Join(stateDir, "raft", "snap-v3-encrypted")).Load()
	require.NoError(t, err)
	require.NotNil(t, snapshot)

	// update the lock key to nil
	for i := 0; i < 3; i++ {
		getResp, err := client.GetCluster(ctx, &api.GetClusterRequest{ClusterID: cluster.ID})
		require.NoError(t, err)
		cluster = getResp.Cluster

		spec := cluster.Spec.Copy()
		spec.EncryptionConfig.AutoLockManagers = false
		_, err = client.UpdateCluster(ctx, &api.UpdateClusterRequest{
			ClusterID:      cluster.ID,
			ClusterVersion: &cluster.Meta.Version,
			Spec:           spec,
		})
		if grpc.ErrorDesc(err) == "update out of sequence" {
			continue
		}
		require.NoError(t, err)
	}

	// this should update the TLS key
	var unlockedKey []byte
	require.NoError(t, raftutils.PollFuncWithTimeout(nil, func() error {
		unlockedKey, err = ioutil.ReadFile(tc.Paths.Node.Key)
		if err != nil {
			return err
		}

		if bytes.Equal(unlockedKey, updatedKey) {
			return fmt.Errorf("TLS key should have been rotated")
		}
		return nil
	}, 1*time.Second))

	// the new key should not be encrypted, and the DEK should also be unencrypted
	// but not rotated
	keyBlock, _ = pem.Decode(unlockedKey)
	require.NotNil(t, keyBlock)
	require.False(t, x509.IsEncryptedPEMBlock(keyBlock))
	unencryptedDEK, err := decodePEMHeaderValue(keyBlock.Headers[pemHeaderRaftDEK], nil)
	require.NoError(t, err)
	require.NotNil(t, unencryptedDEK)
	require.Equal(t, currentDEK, unencryptedDEK)

	m.Stop(ctx)

	// After stopping we MAY receive an error from ListenAndServe if all this
	// happened before WaitForLeader completed, so don't check the error.
	<-done
}

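// startDispatcher starts a dispatcher with the given config on a local gRPC
// server and returns it wrapped in a grpcDispatcher, along with three clients:
// two with agent credentials and one over an insecure TLS connection.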
func startDispatcher(c *Config) (*grpcDispatcher, error) {
	l, err := net.Listen("tcp", "127.0.0.1:0")
	if err != nil {
		return nil, err
	}

	tca := testutils.NewTestCA(nil, testutils.AcceptancePolicy(true, true, ""))
	agentSecurityConfig1, err := tca.NewNodeConfig(ca.AgentRole)
	if err != nil {
		return nil, err
	}
	agentSecurityConfig2, err := tca.NewNodeConfig(ca.AgentRole)
	if err != nil {
		return nil, err
	}
	managerSecurityConfig, err := tca.NewNodeConfig(ca.ManagerRole)
	if err != nil {
		return nil, err
	}

	serverOpts := []grpc.ServerOption{grpc.Creds(managerSecurityConfig.ServerTLSCreds)}

	s := grpc.NewServer(serverOpts...)
	tc := &testCluster{addr: l.Addr().String(), store: tca.MemoryStore}
	d := New(tc, c)

	authorize := func(ctx context.Context, roles []string) error {
		_, err := ca.AuthorizeForwardedRoleAndOrg(ctx, roles, []string{ca.ManagerRole}, tca.Organization)
		return err
	}
	authenticatedDispatcherAPI := api.NewAuthenticatedWrapperDispatcherServer(d, authorize)

	api.RegisterDispatcherServer(s, authenticatedDispatcherAPI)
	go func() {
		// Serve will always return an error (even when properly stopped).
		// Explicitly ignore it.
		_ = s.Serve(l)
	}()
	go d.Run(context.Background())
	if err := raftutils.PollFuncWithTimeout(nil, func() error {
		d.mu.Lock()
		defer d.mu.Unlock()
		if !d.isRunning() {
			return fmt.Errorf("dispatcher is not running")
		}
		return nil
	}, 5*time.Second); err != nil {
		return nil, err
	}

	clientOpts := []grpc.DialOption{grpc.WithTimeout(10 * time.Second)}
	clientOpts1 := append(clientOpts, grpc.WithTransportCredentials(agentSecurityConfig1.ClientTLSCreds))
	clientOpts2 := append(clientOpts, grpc.WithTransportCredentials(agentSecurityConfig2.ClientTLSCreds))
	clientOpts3 := append(clientOpts, grpc.WithTransportCredentials(credentials.NewTLS(&tls.Config{InsecureSkipVerify: true})))

	conn1, err := grpc.Dial(l.Addr().String(), clientOpts1...)
	if err != nil {
		return nil, err
	}

	conn2, err := grpc.Dial(l.Addr().String(), clientOpts2...)
	if err != nil {
		return nil, err
	}

	conn3, err := grpc.Dial(l.Addr().String(), clientOpts3...)
	if err != nil {
		return nil, err
	}

	clients := []api.DispatcherClient{api.NewDispatcherClient(conn1), api.NewDispatcherClient(conn2), api.NewDispatcherClient(conn3)}
	securityConfigs := []*ca.SecurityConfig{agentSecurityConfig1, agentSecurityConfig2, managerSecurityConfig}
	conns := []*grpc.ClientConn{conn1, conn2, conn3}
	return &grpcDispatcher{
		Clients:          clients,
		SecurityConfigs:  securityConfigs,
		Store:            tc.MemoryStore(),
		dispatcherServer: d,
		conns:            conns,
		grpcServer:       s,
		testCA:           tca,
	}, nil
}

func TestForceNewCluster(t *testing.T) {
	t.Parallel()

	// create an external CA so that we can use it to generate expired certificates
	tempDir, err := ioutil.TempDir("", "external-ca")
	require.NoError(t, err)
	defer os.RemoveAll(tempDir)

	rootCA, err := ca.CreateRootCA("externalRoot", ca.NewConfigPaths(tempDir).RootCA)
	require.NoError(t, err)

	// start a new cluster with the external CA bootstrapped
	numWorker, numManager := 0, 1
	cl := newTestCluster()
	defer func() {
		require.NoError(t, cl.Stop())
	}()
	require.NoError(t, cl.AddManager(false, &rootCA), "manager number 1")
	pollClusterReady(t, cl, numWorker, numManager)

	leader, err := cl.Leader()
	require.NoError(t, err)

	sid, err := cl.CreateService("test_service", 2)
	require.NoError(t, err)
	pollServiceReady(t, cl, sid)

	// generate an expired certificate
	rootKey, err := helpers.ParsePrivateKeyPEM(rootCA.Key)
	require.NoError(t, err)
	rootCert, err := helpers.ParseCertificatePEM(rootCA.Cert)
	require.NoError(t, err)

	managerCertFile := filepath.Join(leader.stateDir, "certificates", "swarm-node.crt")
	certBytes, err := ioutil.ReadFile(managerCertFile)
	require.NoError(t, err)
	managerCerts, err := helpers.ParseCertificatesPEM(certBytes)
	require.NoError(t, err)
	expiredCertTemplate := managerCerts[0]
	expiredCertTemplate.NotBefore = time.Now().Add(time.Hour * -5)
	expiredCertTemplate.NotAfter = time.Now().Add(time.Hour * -3)
	expiredCertDERBytes, err := x509.CreateCertificate(rand.Reader, expiredCertTemplate, rootCert, expiredCertTemplate.PublicKey, rootKey)
	require.NoError(t, err)
	expiredCertPEM := pem.EncodeToMemory(&pem.Block{
		Type:  "CERTIFICATE",
		Bytes: expiredCertDERBytes,
	})

	// restart the node with an expired certificate while forcing a new cluster -
	// it should start without error and the certificate should be renewed
	nodeID := leader.node.NodeID()
	require.NoError(t, leader.Pause(true))
	require.NoError(t, ioutil.WriteFile(managerCertFile, expiredCertPEM, 0644))
	require.NoError(t, cl.StartNode(nodeID))
	pollClusterReady(t, cl, numWorker, numManager)
	pollServiceReady(t, cl, sid)

	err = raftutils.PollFuncWithTimeout(nil, func() error {
		certBytes, err := ioutil.ReadFile(managerCertFile)
		if err != nil {
			return err
		}
		managerCerts, err := helpers.ParseCertificatesPEM(certBytes)
		if err != nil {
			return err
		}
		if managerCerts[0].NotAfter.Before(time.Now()) {
			return errors.New("certificate hasn't been renewed yet")
		}
		return nil
	}, opsTimeout)
	require.NoError(t, err)

	// restart the node with an expired certificate without forcing a new cluster -
	// it should error on start
	require.NoError(t, leader.Pause(true))
	require.NoError(t, ioutil.WriteFile(managerCertFile, expiredCertPEM, 0644))
	require.Error(t, cl.StartNode(nodeID))
}

func TestDemoteDownedManager(t *testing.T) {
	t.Parallel()

	numWorker, numManager := 0, 3
	cl := newCluster(t, numWorker, numManager)
	defer func() {
		require.NoError(t, cl.Stop())
	}()

	leader, err := cl.Leader()
	require.NoError(t, err)

	// Find a manager (not the leader) to demote. It must not be the third
	// manager we added, because there may not have been enough time for
	// that one to write anything to its WAL.
	var demotee *testNode
	for _, n := range cl.nodes {
		nodeID := n.node.NodeID()
		if n.IsManager() && nodeID != leader.node.NodeID() && cl.nodesOrder[nodeID] != 3 {
			demotee = n
			break
		}
	}

	nodeID := demotee.node.NodeID()

	resp, err := cl.api.GetNode(context.Background(), &api.GetNodeRequest{NodeID: nodeID})
	require.NoError(t, err)
	spec := resp.Node.Spec.Copy()
	spec.DesiredRole = api.NodeRoleWorker

	// stop the node, then demote it, and start it back up again so that when it
	// comes back up it has to realize it is no longer a manager
	require.NoError(t, demotee.Pause(false))

	// demote the node, but don't use SetNodeRole, which waits until it successfully
	// becomes a worker, since the node is currently down
	require.NoError(t, raftutils.PollFuncWithTimeout(nil, func() error {
		_, err := cl.api.UpdateNode(context.Background(), &api.UpdateNodeRequest{
			NodeID:      nodeID,
			Spec:        spec,
			NodeVersion: &resp.Node.Meta.Version,
		})
		return err
	}, opsTimeout))

	// start it back up again
	require.NoError(t, cl.StartNode(nodeID))

	// wait for it to become a worker
	require.NoError(t, raftutils.PollFuncWithTimeout(nil, func() error {
		if demotee.IsManager() {
			return fmt.Errorf("node is still not a worker")
		}
		return nil
	}, opsTimeout))

	// agents 1, managers 2
	numWorker++
	numManager--

	pollClusterReady(t, cl, numWorker, numManager)
}