func TestClusterStoreWithPassphraseForRootCA(t *testing.T) {
	if !*integrationTests {
		t.Skip("integration test")
	}

	// Start with a passphrase from moment 0
	os.Setenv(ca.PassphraseENVVar, "password1")
	defer os.Setenv(ca.PassphraseENVVar, "")
	defer os.Setenv(ca.PassphraseENVVarPrev, "")

	mCount, aCount := 5, 15
	c := createManagersCluster(t, mCount, aCount)
	require.NoError(t, testutils.PollFunc(nil, c.pollRegister))

	// Get the leader
	leader, err := c.leader()
	assert.NoError(t, err)

	// check key material in store
	var clusters []*api.Cluster
	leader.m.RaftNode.MemoryStore().View(func(tx store.ReadTx) {
		clusters, err = store.FindClusters(tx, store.All)
	})
	assert.NoError(t, err)
	assert.Len(t, clusters, 1, "there should be one cluster")
	assert.NotNil(t, clusters[0].RootCA.CACert)
	assert.NotNil(t, clusters[0].RootCA.CAKey)
	assert.Contains(t, string(clusters[0].RootCA.CAKey), "Proc-Type: 4,ENCRYPTED")
}
func testRaftRestartCluster(t *testing.T, stagger bool) {
	nodes, clockSource := raftutils.NewRaftCluster(t, tc)
	defer raftutils.TeardownCluster(t, nodes)

	// Propose a value
	values := make([]*api.Node, 2)
	var err error
	values[0], err = raftutils.ProposeValue(t, nodes[1], "id1")
	assert.NoError(t, err, "failed to propose value")

	// Stop all nodes
	for _, node := range nodes {
		node.Server.Stop()
		node.Shutdown()
	}

	raftutils.AdvanceTicks(clockSource, 5)

	// Restart all nodes
	i := 0
	for k, node := range nodes {
		if stagger && i != 0 {
			raftutils.AdvanceTicks(clockSource, 1)
		}
		nodes[k] = raftutils.RestartNode(t, clockSource, node, false)
		i++
	}
	raftutils.WaitForCluster(t, clockSource, nodes)

	// Propose another value
	values[1], err = raftutils.ProposeValue(t, raftutils.Leader(nodes), "id2")
	assert.NoError(t, err, "failed to propose value")

	for _, node := range nodes {
		assert.NoError(t, raftutils.PollFunc(clockSource, func() error {
			var err error
			node.MemoryStore().View(func(tx store.ReadTx) {
				var allNodes []*api.Node
				allNodes, err = store.FindNodes(tx, store.All)
				if err != nil {
					return
				}
				if len(allNodes) != 2 {
					err = fmt.Errorf("expected 2 nodes, got %d", len(allNodes))
					return
				}

				for i, nodeID := range []string{"id1", "id2"} {
					n := store.GetNode(tx, nodeID)
					if !reflect.DeepEqual(n, values[i]) {
						err = fmt.Errorf("node %s did not match expected value", nodeID)
						return
					}
				}
			})
			return err
		}))
	}
}
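// The assertions in these tests lean on a poll-and-retry pattern
// (raftutils.PollFunc / testutils.PollFunc) rather than fixed sleeps. The
// helper below is a minimal, self-contained sketch of that pattern using only
// the standard library; the name pollUntil and its parameters are illustrative
// and not part of the raftutils or testutils packages.
func pollUntil(check func() error, timeout, interval time.Duration) error {
	deadline := time.Now().Add(timeout)
	for {
		// Re-run the check until it passes or the deadline expires.
		err := check()
		if err == nil {
			return nil
		}
		if time.Now().After(deadline) {
			return fmt.Errorf("condition not met before deadline: %v", err)
		}
		time.Sleep(interval)
	}
}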
// This test rotates the encryption key and restarts the node - the intent is to try to trigger
// race conditions if there is more than one node and hence consensus may take longer.
func TestRaftEncryptionKeyRotationStress(t *testing.T) {
	t.Parallel()

	// Bring up a 3-node cluster
	nodes, clockSource := raftutils.NewRaftCluster(t, tc)
	defer raftutils.TeardownCluster(t, nodes)
	leader := nodes[1]

	// constantly propose values
	done, stop, restart, clusterReady := make(chan struct{}), make(chan struct{}), make(chan struct{}), make(chan struct{})
	go func() {
		counter := len(nodes)
		for {
			select {
			case <-stop:
				close(done)
				return
			case <-restart:
				// the node restarts may trigger a leadership change, so wait until the cluster has 3
				// nodes again and a leader is selected before proposing more values
				<-clusterReady
				leader = raftutils.Leader(nodes)
			default:
				counter++
				raftutils.ProposeValue(t, leader, DefaultProposalTime, fmt.Sprintf("id%d", counter))
			}
		}
	}()

	for i := 0; i < 30; i++ {
		// rotate the encryption key
		nodes[3].KeyRotator.QueuePendingKey([]byte(fmt.Sprintf("newKey%d", i)))
		nodes[3].KeyRotator.RotationNotify() <- struct{}{}
		require.NoError(t, raftutils.PollFunc(clockSource, func() error {
			if nodes[3].KeyRotator.GetKeys().PendingDEK == nil {
				return nil
			}
			return fmt.Errorf("not done rotating yet")
		}))

		// restart the node and wait for everything to settle and a leader to be elected
		nodes[3].Server.Stop()
		nodes[3].ShutdownRaft()
		restart <- struct{}{}
		nodes[3] = raftutils.RestartNode(t, clockSource, nodes[3], false)
		raftutils.AdvanceTicks(clockSource, 1)
		raftutils.WaitForCluster(t, clockSource, nodes)
		clusterReady <- struct{}{}
	}

	close(stop)
	<-done
}
func TestCluster(t *testing.T) {
	if !*integrationTests {
		t.Skip("integration test")
	}

	c := createManagersCluster(t, 5, 15)
	defer c.Close()
	assert.NoError(t, testutils.PollFunc(nil, c.pollRegister))

	m := c.ms[0]
	nCount := m.m.Dispatcher.NodeCount()
	assert.Equal(t, 15, nCount)
}
func TestCanRemoveMember(t *testing.T) {
	nodes, clockSource := raftutils.NewRaftCluster(t, tc)
	defer raftutils.TeardownCluster(t, nodes)

	// Stop node 2 and node 3 (2 nodes out of 3)
	nodes[2].Server.Stop()
	nodes[2].Shutdown()
	nodes[3].Server.Stop()
	nodes[3].Shutdown()

	// Node 2 and Node 3 should be listed as Unreachable
	assert.NoError(t, raftutils.PollFunc(clockSource, func() error {
		members := nodes[1].GetMemberlist()
		if len(members) != 3 {
			return fmt.Errorf("expected 3 nodes, got %d", len(members))
		}
		if members[nodes[2].Config.ID].Status.Reachability == api.RaftMemberStatus_REACHABLE {
			return fmt.Errorf("expected node 2 to be unreachable")
		}
		if members[nodes[3].Config.ID].Status.Reachability == api.RaftMemberStatus_REACHABLE {
			return fmt.Errorf("expected node 3 to be unreachable")
		}
		return nil
	}))

	// Removing node 3 should fail
	ctx, _ := context.WithTimeout(context.Background(), 10*time.Second)
	err := nodes[1].RemoveMember(ctx, 3)
	assert.Error(t, err)
	assert.Equal(t, err, raft.ErrCannotRemoveMember)
	members := nodes[1].GetMemberlist()
	assert.Equal(t, len(members), 3)

	// Restart node 2 and node 3
	nodes[2] = raftutils.RestartNode(t, clockSource, nodes[2], false)
	nodes[3] = raftutils.RestartNode(t, clockSource, nodes[3], false)
	raftutils.WaitForCluster(t, clockSource, nodes)

	// Removing node 3 should succeed
	ctx, _ = context.WithTimeout(context.Background(), 10*time.Second)
	err = nodes[1].RemoveMember(ctx, nodes[3].Config.ID)
	assert.NoError(t, err)
	members = nodes[1].GetMemberlist()
	assert.Nil(t, members[nodes[3].Config.ID])
	assert.Equal(t, len(members), 2)

	// Removing node 2 should fail
	ctx, _ = context.WithTimeout(context.Background(), 10*time.Second)
	err = nodes[1].RemoveMember(ctx, nodes[2].Config.ID)
	assert.Error(t, err)
	assert.Equal(t, err, raft.ErrCannotRemoveMember)
	assert.Equal(t, len(members), 2)
}
func TestClusterStorePassphraseRotationForRootCA(t *testing.T) {
	if !*integrationTests {
		t.Skip("integration test")
	}

	os.Setenv(ca.PassphraseENVVar, "password1")
	defer os.Setenv(ca.PassphraseENVVar, "")
	defer os.Setenv(ca.PassphraseENVVarPrev, "")

	mCount, aCount := 5, 15
	c := createManagersCluster(t, mCount, aCount)
	require.NoError(t, testutils.PollFunc(nil, c.pollRegister))

	// Get the leader
	leader, err := c.leader()
	assert.NoError(t, err)

	// check key material in store
	var clusters []*api.Cluster
	leader.m.RaftNode.MemoryStore().View(func(tx store.ReadTx) {
		clusters, err = store.FindClusters(tx, store.All)
	})
	assert.NoError(t, err)
	assert.Len(t, clusters, 1, "there should be one cluster")
	assert.NotNil(t, clusters[0].RootCA.CACert)
	assert.NotNil(t, clusters[0].RootCA.CAKey)
	assert.Contains(t, string(clusters[0].RootCA.CAKey), "Proc-Type: 4,ENCRYPTED")
	firstEncryptedKey := clusters[0].RootCA.CAKey

	// Set an ENV passphrase and kill the current leader
	os.Setenv(ca.PassphraseENVVarPrev, "password1")
	os.Setenv(ca.PassphraseENVVar, "password2")
	require.NoError(t, c.destroyLeader())

	// ensure that the cluster will converge to the expected number of agents; we need a big timeout because of heartbeat times
	require.NoError(t, testutils.PollFuncWithTimeout(nil, c.pollRegister, 30*time.Second))

	// Get the new leader
	leader, err = c.leader()
	assert.NoError(t, err)

	// check key material in store
	leader.m.RaftNode.MemoryStore().View(func(tx store.ReadTx) {
		clusters, err = store.FindClusters(tx, store.All)
	})
	assert.NoError(t, err)
	assert.Len(t, clusters, 1, "there should be one cluster")
	assert.NotNil(t, clusters[0].RootCA.CACert)
	assert.NotNil(t, clusters[0].RootCA.CAKey)
	assert.Contains(t, string(clusters[0].RootCA.CAKey), "Proc-Type: 4,ENCRYPTED")
	assert.NotEqual(t, firstEncryptedKey, clusters[0].RootCA.CAKey)
}
func TestClusterReelection(t *testing.T) {
	if !*integrationTests {
		t.Skip("integration test")
	}
	mCount, aCount := 5, 15
	c := createManagersCluster(t, mCount, aCount)

	require.NoError(t, testutils.PollFunc(nil, c.pollRegister))

	require.NoError(t, c.destroyLeader())
	// let's down some agents in the meantime
	require.NoError(t, c.destroyAgents(5))
	// ensure that the cluster will converge to the expected number of agents; we need a big timeout because of heartbeat times
	require.NoError(t, testutils.PollFuncWithTimeout(nil, c.pollRegister, 30*time.Second))

	leader, err := c.leader()
	assert.NoError(t, err)

	// check nodes in store
	var nodes []*api.Node
	leader.m.RaftNode.MemoryStore().View(func(tx store.ReadTx) {
		ns, err := store.FindNodes(tx, store.All)
		assert.NoError(t, err)
		for _, n := range ns {
			if n.Spec.Role == api.NodeRoleWorker {
				nodes = append(nodes, n)
			}
		}
	})
	assert.NoError(t, err)
	assert.Len(t, nodes, aCount, "there should be all nodes in store")

	var downAgentsCount int
	for _, node := range nodes {
		if node.Status.State == api.NodeStatus_DOWN {
			downAgentsCount++
			continue
		}
		assert.Equal(t, api.NodeStatus_READY, node.Status.State, "there should be only down and ready nodes at this point")
	}
	assert.Equal(t, 5, downAgentsCount, "unexpected number of down agents")
}
func TestCanRemoveMember(t *testing.T) {
	nodes, clockSource := raftutils.NewRaftCluster(t, tc)
	defer raftutils.TeardownCluster(t, nodes)

	// Stop node 2 and node 3 (2 nodes out of 3)
	nodes[2].Server.Stop()
	nodes[2].Shutdown()
	nodes[3].Server.Stop()
	nodes[3].Shutdown()

	// Node 2 and Node 3 should be listed as Unreachable
	assert.NoError(t, raftutils.PollFunc(clockSource, func() error {
		members := nodes[1].GetMemberlist()
		if len(members) != 3 {
			return fmt.Errorf("expected 3 nodes, got %d", len(members))
		}
		if members[nodes[2].Config.ID].Status.Reachability == api.RaftMemberStatus_REACHABLE {
			return fmt.Errorf("expected node 2 to be unreachable")
		}
		if members[nodes[3].Config.ID].Status.Reachability == api.RaftMemberStatus_REACHABLE {
			return fmt.Errorf("expected node 3 to be unreachable")
		}
		return nil
	}))

	// Removing all nodes should fail
	for i := 1; i <= 3; i++ {
		ctx, _ := context.WithTimeout(context.Background(), 10*time.Second)
		err := nodes[1].RemoveMember(ctx, uint64(i))
		assert.Error(t, err)
		assert.Equal(t, err, raft.ErrCannotRemoveMember)
		members := nodes[1].GetMemberlist()
		assert.Equal(t, len(members), 3)
	}

	// Restart node 2 and node 3
	nodes[2] = raftutils.RestartNode(t, clockSource, nodes[2], false)
	nodes[3] = raftutils.RestartNode(t, clockSource, nodes[3], false)
	raftutils.WaitForCluster(t, clockSource, nodes)

	// Node 2 and Node 3 should be listed as Reachable
	assert.NoError(t, raftutils.PollFunc(clockSource, func() error {
		members := nodes[1].GetMemberlist()
		if len(members) != 3 {
			return fmt.Errorf("expected 3 nodes, got %d", len(members))
		}
		if members[nodes[2].Config.ID].Status.Reachability == api.RaftMemberStatus_UNREACHABLE {
			return fmt.Errorf("expected node 2 to be reachable")
		}
		if members[nodes[3].Config.ID].Status.Reachability == api.RaftMemberStatus_UNREACHABLE {
			return fmt.Errorf("expected node 3 to be reachable")
		}
		return nil
	}))

	// Stop Node 3 (1 node out of 3)
	nodes[3].Server.Stop()
	nodes[3].Shutdown()

	// Node 3 should be listed as Unreachable
	assert.NoError(t, raftutils.PollFunc(clockSource, func() error {
		members := nodes[1].GetMemberlist()
		if len(members) != 3 {
			return fmt.Errorf("expected 3 nodes, got %d", len(members))
		}
		if members[nodes[3].Config.ID].Status.Reachability == api.RaftMemberStatus_REACHABLE {
			return fmt.Errorf("expected node 3 to be unreachable")
		}
		return nil
	}))

	// Removing node 2 should fail (this would break the quorum)
	ctx, _ := context.WithTimeout(context.Background(), 10*time.Second)
	err := nodes[1].RemoveMember(ctx, nodes[2].Config.ID)
	assert.Error(t, err)
	assert.Equal(t, err, raft.ErrCannotRemoveMember)
	members := nodes[1].GetMemberlist()
	assert.Equal(t, len(members), 3)

	// Removing node 3 works fine because it is already unreachable
	ctx, _ = context.WithTimeout(context.Background(), 10*time.Second)
	err = nodes[1].RemoveMember(ctx, nodes[3].Config.ID)
	assert.NoError(t, err)
	members = nodes[1].GetMemberlist()
	assert.Nil(t, members[nodes[3].Config.ID])
	assert.Equal(t, len(members), 2)

	// Add back node 3
	raftutils.ShutdownNode(nodes[3])
	delete(nodes, 3)
	raftutils.AddRaftNode(t, clockSource, nodes, tc)

	// Node 2 and Node 3 should be listed as Reachable
	assert.NoError(t, raftutils.PollFunc(clockSource, func() error {
		members := nodes[1].GetMemberlist()
		if len(members) != 3 {
			return fmt.Errorf("expected 3 nodes, got %d", len(members))
		}
		if members[nodes[2].Config.ID].Status.Reachability != api.RaftMemberStatus_REACHABLE {
			return fmt.Errorf("expected node 2 to be reachable")
		}
		if members[nodes[3].Config.ID].Status.Reachability != api.RaftMemberStatus_REACHABLE {
			return fmt.Errorf("expected node 3 to be reachable")
		}
		return nil
	}))

	// Removing node 3 should succeed
	ctx, _ = context.WithTimeout(context.Background(), 10*time.Second)
	err = nodes[1].RemoveMember(ctx, nodes[3].Config.ID)
	assert.NoError(t, err)
	members = nodes[1].GetMemberlist()
	assert.Nil(t, members[nodes[3].Config.ID])
	assert.Equal(t, len(members), 2)

	// Removing node 2 should succeed
	ctx, _ = context.WithTimeout(context.Background(), 10*time.Second)
	err = nodes[1].RemoveMember(ctx, nodes[2].Config.ID)
	assert.NoError(t, err)
	members = nodes[1].GetMemberlist()
	assert.Nil(t, members[nodes[2].Config.ID])
	assert.Equal(t, len(members), 1)
}
func TestGetRemoteCA(t *testing.T) {
	tc := testutils.NewTestCA(t)
	defer tc.Stop()

	shaHash := sha256.New()
	shaHash.Write(tc.RootCA.Cert)
	md := shaHash.Sum(nil)
	mdStr := hex.EncodeToString(md)

	d, err := digest.Parse("sha256:" + mdStr)
	require.NoError(t, err)

	downloadedRootCA, err := ca.GetRemoteCA(tc.Context, d, tc.ConnBroker)
	require.NoError(t, err)
	require.Equal(t, downloadedRootCA.Cert, tc.RootCA.Cert)

	// update the test CA to include a multi-certificate bundle as the root - the digest
	// we use to verify with must be the digest of the whole bundle
	tmpDir, err := ioutil.TempDir("", "GetRemoteCA")
	require.NoError(t, err)
	defer os.RemoveAll(tmpDir)
	paths := ca.NewConfigPaths(tmpDir)
	otherRootCA, err := ca.CreateRootCA("other", paths.RootCA)
	require.NoError(t, err)

	comboCertBundle := append(tc.RootCA.Cert, otherRootCA.Cert...)
	require.NoError(t, tc.MemoryStore.Update(func(tx store.Tx) error {
		cluster := store.GetCluster(tx, tc.Organization)
		cluster.RootCA.CACert = comboCertBundle
		cluster.RootCA.CAKey = tc.RootCA.Key
		return store.UpdateCluster(tx, cluster)
	}))
	require.NoError(t, raftutils.PollFunc(nil, func() error {
		_, err := ca.GetRemoteCA(tc.Context, d, tc.ConnBroker)
		if err == nil {
			return fmt.Errorf("testca's rootca hasn't updated yet")
		}
		require.Contains(t, err.Error(), "remote CA does not match fingerprint")
		return nil
	}))

	// If we provide the right digest, the root CA is updated and we can validate
	// certs signed by either one
	d = digest.FromBytes(comboCertBundle)
	downloadedRootCA, err = ca.GetRemoteCA(tc.Context, d, tc.ConnBroker)
	require.NoError(t, err)
	require.Equal(t, comboCertBundle, downloadedRootCA.Cert)
	require.Equal(t, 2, len(downloadedRootCA.Pool.Subjects()))

	for _, rootCA := range []ca.RootCA{tc.RootCA, otherRootCA} {
		krw := ca.NewKeyReadWriter(paths.Node, nil, nil)
		_, err := rootCA.IssueAndSaveNewCertificates(krw, "cn", "ou", "org")
		require.NoError(t, err)

		certPEM, _, err := krw.Read()
		require.NoError(t, err)

		cert, err := helpers.ParseCertificatesPEM(certPEM)
		require.NoError(t, err)

		chains, err := cert[0].Verify(x509.VerifyOptions{
			Roots: downloadedRootCA.Pool,
		})
		require.NoError(t, err)
		require.Len(t, chains, 1)
	}
}
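// TestGetRemoteCA verifies a downloaded CA against the digest of the full PEM
// bundle. The helper below is a minimal sketch of that fingerprint-then-verify
// flow using only the standard library; verifyAgainstBundle is an illustrative
// name, not part of the ca package, and it assumes the bundle and leaf are
// well-formed PEM/x509 material.
func verifyAgainstBundle(caBundlePEM []byte, leaf *x509.Certificate) (string, error) {
	// The fingerprint is the SHA-256 of the whole bundle, mirroring
	// digest.FromBytes(comboCertBundle) in the test above.
	sum := sha256.Sum256(caBundlePEM)
	fingerprint := "sha256:" + hex.EncodeToString(sum[:])

	// Build a root pool from every certificate in the bundle and verify the
	// leaf against it, as the test does with downloadedRootCA.Pool.
	pool := x509.NewCertPool()
	if !pool.AppendCertsFromPEM(caBundlePEM) {
		return fingerprint, fmt.Errorf("no certificates could be parsed from bundle")
	}
	if _, err := leaf.Verify(x509.VerifyOptions{
		Roots:     pool,
		KeyUsages: []x509.ExtKeyUsage{x509.ExtKeyUsageAny},
	}); err != nil {
		return fingerprint, err
	}
	return fingerprint, nil
}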
func TestGCWAL(t *testing.T) {
	t.Parallel()

	// Additional log entries from cluster setup, leader election
	extraLogEntries := 5
	// Number of large entries to propose
	proposals := 8

	// Bring up a 3 node cluster
	nodes, clockSource := raftutils.NewRaftCluster(t, tc, &api.RaftConfig{SnapshotInterval: uint64(proposals + extraLogEntries), LogEntriesForSlowFollowers: 0})

	for i := 0; i != proposals; i++ {
		_, err := proposeLargeValue(t, nodes[1], DefaultProposalTime, fmt.Sprintf("id%d", i))
		assert.NoError(t, err, "failed to propose value")
	}

	time.Sleep(250 * time.Millisecond)

	// Snapshot should have been triggered just as the WAL rotated, so
	// both WAL files should be preserved
	assert.NoError(t, raftutils.PollFunc(clockSource, func() error {
		dirents, err := ioutil.ReadDir(filepath.Join(nodes[1].StateDir, "snap"))
		if err != nil {
			return err
		}
		if len(dirents) != 1 {
			return fmt.Errorf("expected 1 snapshot, found %d", len(dirents))
		}

		dirents, err = ioutil.ReadDir(filepath.Join(nodes[1].StateDir, "wal"))
		if err != nil {
			return err
		}
		var walCount int
		for _, f := range dirents {
			if strings.HasSuffix(f.Name(), ".wal") {
				walCount++
			}
		}
		if walCount != 2 {
			return fmt.Errorf("expected 2 WAL files, found %d", walCount)
		}
		return nil
	}))

	raftutils.TeardownCluster(t, nodes)

	// Repeat this test, but trigger the snapshot after the WAL has rotated
	proposals++
	nodes, clockSource = raftutils.NewRaftCluster(t, tc, &api.RaftConfig{SnapshotInterval: uint64(proposals + extraLogEntries), LogEntriesForSlowFollowers: 0})
	defer raftutils.TeardownCluster(t, nodes)

	for i := 0; i != proposals; i++ {
		_, err := proposeLargeValue(t, nodes[1], DefaultProposalTime, fmt.Sprintf("id%d", i))
		assert.NoError(t, err, "failed to propose value")
	}

	time.Sleep(250 * time.Millisecond)

	// This time only one WAL file should be saved.
	assert.NoError(t, raftutils.PollFunc(clockSource, func() error {
		dirents, err := ioutil.ReadDir(filepath.Join(nodes[1].StateDir, "snap"))
		if err != nil {
			return err
		}
		if len(dirents) != 1 {
			return fmt.Errorf("expected 1 snapshot, found %d", len(dirents))
		}

		dirents, err = ioutil.ReadDir(filepath.Join(nodes[1].StateDir, "wal"))
		if err != nil {
			return err
		}
		var walCount int
		for _, f := range dirents {
			if strings.HasSuffix(f.Name(), ".wal") {
				walCount++
			}
		}
		if walCount != 1 {
			return fmt.Errorf("expected 1 WAL file, found %d", walCount)
		}
		return nil
	}))

	// Restart the whole cluster
	for _, node := range nodes {
		node.Server.Stop()
		node.Shutdown()
	}

	raftutils.AdvanceTicks(clockSource, 5)

	i := 0
	for k, node := range nodes {
		nodes[k] = raftutils.RestartNode(t, clockSource, node, false)
		i++
	}
	raftutils.WaitForCluster(t, clockSource, nodes)

	// Is the data intact after restart?
	for _, node := range nodes {
		assert.NoError(t, raftutils.PollFunc(clockSource, func() error {
			var err error
			node.MemoryStore().View(func(tx store.ReadTx) {
				var allNodes []*api.Node
				allNodes, err = store.FindNodes(tx, store.All)
				if err != nil {
					return
				}
				if len(allNodes) != proposals {
					err = fmt.Errorf("expected %d nodes, got %d", proposals, len(allNodes))
					return
				}
			})
			return err
		}))
	}

	// It should still be possible to propose values
	_, err := raftutils.ProposeValue(t, raftutils.Leader(nodes), DefaultProposalTime, "newnode")
	assert.NoError(t, err, "failed to propose value")

	for _, node := range nodes {
		assert.NoError(t, raftutils.PollFunc(clockSource, func() error {
			var err error
			node.MemoryStore().View(func(tx store.ReadTx) {
				var allNodes []*api.Node
				allNodes, err = store.FindNodes(tx, store.All)
				if err != nil {
					return
				}
				if len(allNodes) != proposals+1 {
					err = fmt.Errorf("expected %d nodes, got %d", proposals+1, len(allNodes))
					return
				}
			})
			return err
		}))
	}
}
func TestRaftSnapshot(t *testing.T) {
	t.Parallel()

	// Bring up a 3 node cluster
	nodes, clockSource := raftutils.NewRaftCluster(t, tc, &api.RaftConfig{SnapshotInterval: 9, LogEntriesForSlowFollowers: 0})
	defer raftutils.TeardownCluster(t, nodes)

	nodeIDs := []string{"id1", "id2", "id3", "id4", "id5", "id6", "id7", "id8", "id9", "id10", "id11", "id12"}
	values := make([]*api.Node, len(nodeIDs))
	snapshotFilenames := make(map[uint64]string, 4)

	// Propose 3 values
	var err error
	for i, nodeID := range nodeIDs[:3] {
		values[i], err = raftutils.ProposeValue(t, nodes[1], DefaultProposalTime, nodeID)
		assert.NoError(t, err, "failed to propose value")
	}

	// None of the nodes should have snapshot files yet
	for _, node := range nodes {
		dirents, err := ioutil.ReadDir(filepath.Join(node.StateDir, "snap"))
		assert.NoError(t, err)
		assert.Len(t, dirents, 0)
	}

	// Check all nodes have all the data.
	// This also acts as a synchronization point so that the next value we
	// propose will arrive as a separate message to the raft state machine,
	// and it is guaranteed to have the right cluster settings when
	// deciding whether to create a new snapshot.
	raftutils.CheckValuesOnNodes(t, clockSource, nodes, nodeIDs[:3], values)

	// Propose a 4th value
	values[3], err = raftutils.ProposeValue(t, nodes[1], DefaultProposalTime, nodeIDs[3])
	assert.NoError(t, err, "failed to propose value")

	// All nodes should now have a snapshot file
	for nodeID, node := range nodes {
		assert.NoError(t, raftutils.PollFunc(clockSource, func() error {
			dirents, err := ioutil.ReadDir(filepath.Join(node.StateDir, "snap"))
			if err != nil {
				return err
			}
			if len(dirents) != 1 {
				return fmt.Errorf("expected 1 snapshot, found %d", len(dirents))
			}
			snapshotFilenames[nodeID] = dirents[0].Name()
			return nil
		}))
	}

	// Add a node to the cluster
	raftutils.AddRaftNode(t, clockSource, nodes, tc)

	// It should get a copy of the snapshot
	assert.NoError(t, raftutils.PollFunc(clockSource, func() error {
		dirents, err := ioutil.ReadDir(filepath.Join(nodes[4].StateDir, "snap"))
		if err != nil {
			return err
		}
		if len(dirents) != 1 {
			return fmt.Errorf("expected 1 snapshot, found %d on new node", len(dirents))
		}
		snapshotFilenames[4] = dirents[0].Name()
		return nil
	}))

	// It should know about the other nodes
	stripMembers := func(memberList map[uint64]*api.RaftMember) map[uint64]*api.RaftMember {
		raftNodes := make(map[uint64]*api.RaftMember)
		for k, v := range memberList {
			raftNodes[k] = &api.RaftMember{
				RaftID: v.RaftID,
				Addr:   v.Addr,
			}
		}
		return raftNodes
	}
	assert.Equal(t, stripMembers(nodes[1].GetMemberlist()), stripMembers(nodes[4].GetMemberlist()))

	// All nodes should have all the data
	raftutils.CheckValuesOnNodes(t, clockSource, nodes, nodeIDs[:4], values)

	// Propose more values to provoke a second snapshot
	for i := 4; i != len(nodeIDs); i++ {
		values[i], err = raftutils.ProposeValue(t, nodes[1], DefaultProposalTime, nodeIDs[i])
		assert.NoError(t, err, "failed to propose value")
	}

	// All nodes should have a snapshot under a *different* name
	for nodeID, node := range nodes {
		assert.NoError(t, raftutils.PollFunc(clockSource, func() error {
			dirents, err := ioutil.ReadDir(filepath.Join(node.StateDir, "snap"))
			if err != nil {
				return err
			}
			if len(dirents) != 1 {
				return fmt.Errorf("expected 1 snapshot, found %d on node %d", len(dirents), nodeID)
			}
			if dirents[0].Name() == snapshotFilenames[nodeID] {
				return fmt.Errorf("snapshot %s did not get replaced", snapshotFilenames[nodeID])
			}
			return nil
		}))
	}

	// All nodes should have all the data
	raftutils.CheckValuesOnNodes(t, clockSource, nodes, nodeIDs, values)
}
func TestRaftSnapshotRestart(t *testing.T) {
	t.Parallel()

	// Bring up a 3 node cluster
	nodes, clockSource := raftutils.NewRaftCluster(t, tc, &api.RaftConfig{SnapshotInterval: 10, LogEntriesForSlowFollowers: 0})
	defer raftutils.TeardownCluster(t, nodes)

	nodeIDs := []string{"id1", "id2", "id3", "id4", "id5", "id6", "id7"}
	values := make([]*api.Node, len(nodeIDs))

	// Propose 3 values
	var err error
	for i, nodeID := range nodeIDs[:3] {
		values[i], err = raftutils.ProposeValue(t, nodes[1], DefaultProposalTime, nodeID)
		assert.NoError(t, err, "failed to propose value")
	}

	// Take down node 3
	nodes[3].Server.Stop()
	nodes[3].Shutdown()

	// Propose a 4th value before the snapshot
	values[3], err = raftutils.ProposeValue(t, nodes[1], DefaultProposalTime, nodeIDs[3])
	assert.NoError(t, err, "failed to propose value")

	// Remaining nodes shouldn't have snapshot files yet
	for _, node := range []*raftutils.TestNode{nodes[1], nodes[2]} {
		dirents, err := ioutil.ReadDir(filepath.Join(node.StateDir, "snap"))
		assert.NoError(t, err)
		assert.Len(t, dirents, 0)
	}

	// Add a node to the cluster before the snapshot. This is the event
	// that triggers the snapshot.
	nodes[4] = raftutils.NewJoinNode(t, clockSource, nodes[1].Address, tc)
	raftutils.WaitForCluster(t, clockSource, map[uint64]*raftutils.TestNode{1: nodes[1], 2: nodes[2], 4: nodes[4]})

	// Remaining nodes should now have a snapshot file
	for nodeIdx, node := range []*raftutils.TestNode{nodes[1], nodes[2]} {
		assert.NoError(t, raftutils.PollFunc(clockSource, func() error {
			dirents, err := ioutil.ReadDir(filepath.Join(node.StateDir, "snap"))
			if err != nil {
				return err
			}
			if len(dirents) != 1 {
				return fmt.Errorf("expected 1 snapshot, found %d on node %d", len(dirents), nodeIdx+1)
			}
			return nil
		}))
	}
	raftutils.CheckValuesOnNodes(t, clockSource, map[uint64]*raftutils.TestNode{1: nodes[1], 2: nodes[2]}, nodeIDs[:4], values[:4])

	// Propose a 5th value
	values[4], err = raftutils.ProposeValue(t, nodes[1], DefaultProposalTime, nodeIDs[4])
	require.NoError(t, err)

	// Add another node to the cluster
	nodes[5] = raftutils.NewJoinNode(t, clockSource, nodes[1].Address, tc)
	raftutils.WaitForCluster(t, clockSource, map[uint64]*raftutils.TestNode{1: nodes[1], 2: nodes[2], 4: nodes[4], 5: nodes[5]})

	// New node should get a copy of the snapshot
	assert.NoError(t, raftutils.PollFunc(clockSource, func() error {
		dirents, err := ioutil.ReadDir(filepath.Join(nodes[5].StateDir, "snap"))
		if err != nil {
			return err
		}
		if len(dirents) != 1 {
			return fmt.Errorf("expected 1 snapshot, found %d on new node", len(dirents))
		}
		return nil
	}))

	dirents, err := ioutil.ReadDir(filepath.Join(nodes[5].StateDir, "snap"))
	assert.NoError(t, err)
	assert.Len(t, dirents, 1)
	raftutils.CheckValuesOnNodes(t, clockSource, map[uint64]*raftutils.TestNode{1: nodes[1], 2: nodes[2]}, nodeIDs[:5], values[:5])

	// It should know about the other nodes, including the one that was just added
	stripMembers := func(memberList map[uint64]*api.RaftMember) map[uint64]*api.RaftMember {
		raftNodes := make(map[uint64]*api.RaftMember)
		for k, v := range memberList {
			raftNodes[k] = &api.RaftMember{
				RaftID: v.RaftID,
				Addr:   v.Addr,
			}
		}
		return raftNodes
	}
	assert.Equal(t, stripMembers(nodes[1].GetMemberlist()), stripMembers(nodes[4].GetMemberlist()))

	// Restart node 3
	nodes[3] = raftutils.RestartNode(t, clockSource, nodes[3], false)
	raftutils.WaitForCluster(t, clockSource, nodes)

	// Node 3 should know about other nodes, including the new one
	assert.Len(t, nodes[3].GetMemberlist(), 5)
	assert.Equal(t, stripMembers(nodes[1].GetMemberlist()), stripMembers(nodes[3].GetMemberlist()))

	// Propose yet another value, to make sure the rejoined node is still
	// receiving new logs
	values[5], err = raftutils.ProposeValue(t, raftutils.Leader(nodes), DefaultProposalTime, nodeIDs[5])
	require.NoError(t, err)

	// All nodes should have all the data
	raftutils.CheckValuesOnNodes(t, clockSource, nodes, nodeIDs[:6], values[:6])

	// Restart node 3 again. It should load the snapshot.
	nodes[3].Server.Stop()
	nodes[3].Shutdown()
	nodes[3] = raftutils.RestartNode(t, clockSource, nodes[3], false)
	raftutils.WaitForCluster(t, clockSource, nodes)

	assert.Len(t, nodes[3].GetMemberlist(), 5)
	assert.Equal(t, stripMembers(nodes[1].GetMemberlist()), stripMembers(nodes[3].GetMemberlist()))
	raftutils.CheckValuesOnNodes(t, clockSource, nodes, nodeIDs[:6], values[:6])

	// Propose again. Just to check consensus after this latest restart.
	values[6], err = raftutils.ProposeValue(t, raftutils.Leader(nodes), DefaultProposalTime, nodeIDs[6])
	require.NoError(t, err)
	raftutils.CheckValuesOnNodes(t, clockSource, nodes, nodeIDs, values)
}
func TestListManagerNodes(t *testing.T) {
	t.Parallel()

	tc := cautils.NewTestCA(nil)
	defer tc.Stop()
	ts := newTestServer(t)
	defer ts.Stop()

	nodes, clockSource := raftutils.NewRaftCluster(t, tc)
	defer raftutils.TeardownCluster(t, nodes)

	// Create a node object for each of the managers
	assert.NoError(t, nodes[1].MemoryStore().Update(func(tx store.Tx) error {
		assert.NoError(t, store.CreateNode(tx, &api.Node{ID: nodes[1].SecurityConfig.ClientTLSCreds.NodeID()}))
		assert.NoError(t, store.CreateNode(tx, &api.Node{ID: nodes[2].SecurityConfig.ClientTLSCreds.NodeID()}))
		assert.NoError(t, store.CreateNode(tx, &api.Node{ID: nodes[3].SecurityConfig.ClientTLSCreds.NodeID()}))
		return nil
	}))

	// Assign one of the raft nodes to the test server
	ts.Server.raft = nodes[1].Node
	ts.Server.store = nodes[1].MemoryStore()

	// There should be 3 reachable managers listed
	r, err := ts.Client.ListNodes(context.Background(), &api.ListNodesRequest{})
	assert.NoError(t, err)
	assert.NotNil(t, r)
	managers := getMap(t, r.Nodes)
	assert.Len(t, ts.Server.raft.GetMemberlist(), 3)
	assert.Len(t, r.Nodes, 3)

	// Node 1 should be the leader
	for i := 1; i <= 3; i++ {
		if i == 1 {
			assert.True(t, managers[nodes[uint64(i)].Config.ID].Leader)
			continue
		}
		assert.False(t, managers[nodes[uint64(i)].Config.ID].Leader)
	}

	// All nodes should be reachable
	for i := 1; i <= 3; i++ {
		assert.Equal(t, api.RaftMemberStatus_REACHABLE, managers[nodes[uint64(i)].Config.ID].Reachability)
	}

	// Add two more nodes to the cluster
	raftutils.AddRaftNode(t, clockSource, nodes, tc)
	raftutils.AddRaftNode(t, clockSource, nodes, tc)
	raftutils.WaitForCluster(t, clockSource, nodes)

	// Add node entries for these
	assert.NoError(t, nodes[1].MemoryStore().Update(func(tx store.Tx) error {
		assert.NoError(t, store.CreateNode(tx, &api.Node{ID: nodes[4].SecurityConfig.ClientTLSCreds.NodeID()}))
		assert.NoError(t, store.CreateNode(tx, &api.Node{ID: nodes[5].SecurityConfig.ClientTLSCreds.NodeID()}))
		return nil
	}))

	// There should be 5 reachable managers listed
	r, err = ts.Client.ListNodes(context.Background(), &api.ListNodesRequest{})
	assert.NoError(t, err)
	assert.NotNil(t, r)
	managers = getMap(t, r.Nodes)
	assert.Len(t, ts.Server.raft.GetMemberlist(), 5)
	assert.Len(t, r.Nodes, 5)
	for i := 1; i <= 5; i++ {
		assert.Equal(t, api.RaftMemberStatus_REACHABLE, managers[nodes[uint64(i)].Config.ID].Reachability)
	}

	// Stop 2 nodes
	nodes[4].Server.Stop()
	nodes[4].ShutdownRaft()
	nodes[5].Server.Stop()
	nodes[5].ShutdownRaft()

	// Node 4 and Node 5 should be listed as Unreachable
	assert.NoError(t, raftutils.PollFunc(clockSource, func() error {
		r, err = ts.Client.ListNodes(context.Background(), &api.ListNodesRequest{})
		if err != nil {
			return err
		}

		managers = getMap(t, r.Nodes)

		if len(r.Nodes) != 5 {
			return fmt.Errorf("expected 5 nodes, got %d", len(r.Nodes))
		}

		if managers[nodes[4].Config.ID].Reachability == api.RaftMemberStatus_REACHABLE {
			return fmt.Errorf("expected node 4 to be unreachable")
		}

		if managers[nodes[5].Config.ID].Reachability == api.RaftMemberStatus_REACHABLE {
			return fmt.Errorf("expected node 5 to be unreachable")
		}
		return nil
	}))

	// Restart the 2 nodes
	nodes[4] = raftutils.RestartNode(t, clockSource, nodes[4], false)
	nodes[5] = raftutils.RestartNode(t, clockSource, nodes[5], false)
	raftutils.WaitForCluster(t, clockSource, nodes)

	assert.Len(t, ts.Server.raft.GetMemberlist(), 5)

	// All the nodes should be reachable again
	assert.NoError(t, raftutils.PollFunc(clockSource, func() error {
		r, err = ts.Client.ListNodes(context.Background(), &api.ListNodesRequest{})
		if err != nil {
			return err
		}
		managers = getMap(t, r.Nodes)
		for i := 1; i <= 5; i++ {
			if managers[nodes[uint64(i)].Config.ID].Reachability != api.RaftMemberStatus_REACHABLE {
				return fmt.Errorf("node %x is unreachable", nodes[uint64(i)].Config.ID)
			}
		}
		return nil
	}))

	// Switch the raft node used by the server
	ts.Server.raft = nodes[2].Node

	// Stop node 1 (leader)
	nodes[1].Server.Stop()
	nodes[1].ShutdownRaft()

	newCluster := map[uint64]*raftutils.TestNode{
		2: nodes[2],
		3: nodes[3],
		4: nodes[4],
		5: nodes[5],
	}

	// Wait for the re-election to occur
	raftutils.WaitForCluster(t, clockSource, newCluster)

	// Node 1 should not be the leader anymore
	assert.NoError(t, raftutils.PollFunc(clockSource, func() error {
		r, err = ts.Client.ListNodes(context.Background(), &api.ListNodesRequest{})
		if err != nil {
			return err
		}

		managers = getMap(t, r.Nodes)

		if managers[nodes[1].Config.ID].Leader {
			return fmt.Errorf("expected node 1 not to be the leader")
		}

		if managers[nodes[1].Config.ID].Reachability == api.RaftMemberStatus_REACHABLE {
			return fmt.Errorf("expected node 1 to be unreachable")
		}

		return nil
	}))

	// Restart node 1
	nodes[1].ShutdownRaft()
	nodes[1] = raftutils.RestartNode(t, clockSource, nodes[1], false)
	raftutils.WaitForCluster(t, clockSource, nodes)

	// Ensure that node 1 is not the leader
	assert.False(t, managers[nodes[uint64(1)].Config.ID].Leader)

	// Check that another node got the leader status
	var leader uint64
	leaderCount := 0
	for i := 1; i <= 5; i++ {
		if managers[nodes[uint64(i)].Config.ID].Leader {
			leader = nodes[uint64(i)].Config.ID
			leaderCount++
		}
	}

	// There should be only one leader after node 1 recovery and it
	// should be different than node 1
	assert.Equal(t, 1, leaderCount)
	assert.NotEqual(t, leader, nodes[1].Config.ID)
}
// This test rotates the encryption key and waits for the expected thing to happen
func TestRaftEncryptionKeyRotationWait(t *testing.T) {
	t.Parallel()
	nodes := make(map[uint64]*raftutils.TestNode)
	var clockSource *fakeclock.FakeClock

	raftConfig := raft.DefaultRaftConfig()
	nodes[1], clockSource = raftutils.NewInitNode(t, tc, &raftConfig)
	defer raftutils.TeardownCluster(t, nodes)

	nodeIDs := []string{"id1", "id2", "id3"}
	values := make([]*api.Node, len(nodeIDs))

	// Propose 3 values
	var err error
	for i, nodeID := range nodeIDs[:3] {
		values[i], err = raftutils.ProposeValue(t, nodes[1], DefaultProposalTime, nodeID)
		require.NoError(t, err, "failed to propose value")
	}

	snapDir := filepath.Join(nodes[1].StateDir, "snap-v3-encrypted")

	startingKeys := nodes[1].KeyRotator.GetKeys()

	// rotate the encryption key
	nodes[1].KeyRotator.QueuePendingKey([]byte("key2"))
	nodes[1].KeyRotator.RotationNotify() <- struct{}{}

	// the rotation should trigger a snapshot, which should notify the rotator when it's done
	require.NoError(t, raftutils.PollFunc(clockSource, func() error {
		snapshots, err := storage.ListSnapshots(snapDir)
		if err != nil {
			return err
		}
		if len(snapshots) != 1 {
			return fmt.Errorf("expected 1 snapshot, found %d on new node", len(snapshots))
		}
		if nodes[1].KeyRotator.NeedsRotation() {
			return fmt.Errorf("rotation never finished")
		}
		return nil
	}))
	raftutils.CheckValuesOnNodes(t, clockSource, nodes, nodeIDs, values)

	// Propose a 4th value
	nodeIDs = append(nodeIDs, "id4")
	v, err := raftutils.ProposeValue(t, nodes[1], DefaultProposalTime, "id4")
	require.NoError(t, err, "failed to propose value")
	values = append(values, v)
	raftutils.CheckValuesOnNodes(t, clockSource, nodes, nodeIDs, values)

	nodes[1].Server.Stop()
	nodes[1].ShutdownRaft()

	// Try to restart node 1. Without the new unlock key, it can't actually start
	n, ctx := raftutils.CopyNode(t, clockSource, nodes[1], false, raftutils.NewSimpleKeyRotator(startingKeys))
	require.Error(t, n.Node.JoinAndStart(ctx),
		"should not have been able to restart since we can't read snapshots")

	// with the right key, it can start, even if the right key is only the pending key
	newKeys := startingKeys
	newKeys.PendingDEK = []byte("key2")
	nodes[1].KeyRotator = raftutils.NewSimpleKeyRotator(newKeys)
	nodes[1] = raftutils.RestartNode(t, clockSource, nodes[1], false)

	raftutils.WaitForCluster(t, clockSource, nodes)

	// as soon as we joined, it should have finished rotating the key
	require.False(t, nodes[1].KeyRotator.NeedsRotation())
	raftutils.CheckValuesOnNodes(t, clockSource, nodes, nodeIDs, values)

	// break snapshotting, and ensure that key rotation never finishes
	tempSnapDir := filepath.Join(nodes[1].StateDir, "snap-backup")
	require.NoError(t, os.Rename(snapDir, tempSnapDir))
	require.NoError(t, ioutil.WriteFile(snapDir, []byte("this is no longer a directory"), 0644))

	nodes[1].KeyRotator.QueuePendingKey([]byte("key3"))
	nodes[1].KeyRotator.RotationNotify() <- struct{}{}

	time.Sleep(250 * time.Millisecond)

	// rotation has not been finished, because we cannot take a snapshot
	require.True(t, nodes[1].KeyRotator.NeedsRotation())

	// Propose a 5th value, so we have WALs written with the new key
	nodeIDs = append(nodeIDs, "id5")
	v, err = raftutils.ProposeValue(t, nodes[1], DefaultProposalTime, "id5")
	require.NoError(t, err, "failed to propose value")
	values = append(values, v)
	raftutils.CheckValuesOnNodes(t, clockSource, nodes, nodeIDs, values)

	nodes[1].Server.Stop()
	nodes[1].ShutdownRaft()

	// restore the snapshot dir
	require.NoError(t, os.RemoveAll(snapDir))
	require.NoError(t, os.Rename(tempSnapDir, snapDir))

	// Now the WALs are a mix of key2 and key3 - we can't actually start with either key alone
	singleKey := raft.EncryptionKeys{CurrentDEK: []byte("key2")}
	n, ctx = raftutils.CopyNode(t, clockSource, nodes[1], false, raftutils.NewSimpleKeyRotator(singleKey))
	require.Error(t, n.Node.JoinAndStart(ctx),
		"should not have been able to restart since we can't read all the WALs, even if we can read the snapshot")

	singleKey = raft.EncryptionKeys{CurrentDEK: []byte("key3")}
	n, ctx = raftutils.CopyNode(t, clockSource, nodes[1], false, raftutils.NewSimpleKeyRotator(singleKey))
	require.Error(t, n.Node.JoinAndStart(ctx),
		"should not have been able to restart since we can't read all the WALs, and also not the snapshot")

	nodes[1], ctx = raftutils.CopyNode(t, clockSource, nodes[1], false,
		raftutils.NewSimpleKeyRotator(raft.EncryptionKeys{
			CurrentDEK: []byte("key2"),
			PendingDEK: []byte("key3"),
		}))
	require.NoError(t, nodes[1].Node.JoinAndStart(ctx))

	// we can load, but we still need a snapshot because rotation hasn't finished
	snapshots, err := storage.ListSnapshots(snapDir)
	require.NoError(t, err)
	require.Len(t, snapshots, 1, "expected 1 snapshot")
	require.True(t, nodes[1].KeyRotator.NeedsRotation())
	currSnapshot := snapshots[0]

	// start the node - everything should fix itself
	go nodes[1].Node.Run(ctx)
	raftutils.WaitForCluster(t, clockSource, nodes)

	require.NoError(t, raftutils.PollFunc(clockSource, func() error {
		snapshots, err := storage.ListSnapshots(snapDir)
		if err != nil {
			return err
		}
		if len(snapshots) != 1 {
			return fmt.Errorf("expected 1 snapshot, found %d on new node", len(snapshots))
		}
		if snapshots[0] == currSnapshot {
			return fmt.Errorf("new snapshot not done yet")
		}
		if nodes[1].KeyRotator.NeedsRotation() {
			return fmt.Errorf("rotation never finished")
		}
		currSnapshot = snapshots[0]
		return nil
	}))
	raftutils.CheckValuesOnNodes(t, clockSource, nodes, nodeIDs, values)

	// If we can't update the keys, we wait for the next snapshot to do so
	nodes[1].KeyRotator.SetUpdateFunc(func() error { return fmt.Errorf("nope!") })
	nodes[1].KeyRotator.QueuePendingKey([]byte("key4"))
	nodes[1].KeyRotator.RotationNotify() <- struct{}{}

	require.NoError(t, raftutils.PollFunc(clockSource, func() error {
		snapshots, err := storage.ListSnapshots(snapDir)
		if err != nil {
			return err
		}
		if len(snapshots) != 1 {
			return fmt.Errorf("expected 1 snapshot, found %d on new node", len(snapshots))
		}
		if snapshots[0] == currSnapshot {
			return fmt.Errorf("new snapshot not done yet")
		}
		currSnapshot = snapshots[0]
		return nil
	}))
	require.True(t, nodes[1].KeyRotator.NeedsRotation())

	// Fix updating the key rotator, and propose a 6th value - this should trigger the key
	// rotation to finish
	nodes[1].KeyRotator.SetUpdateFunc(nil)
	nodeIDs = append(nodeIDs, "id6")
	v, err = raftutils.ProposeValue(t, nodes[1], DefaultProposalTime, "id6")
	require.NoError(t, err, "failed to propose value")
	values = append(values, v)
	raftutils.CheckValuesOnNodes(t, clockSource, nodes, nodeIDs, values)

	require.NoError(t, raftutils.PollFunc(clockSource, func() error {
		if nodes[1].KeyRotator.NeedsRotation() {
			return fmt.Errorf("rotation never finished")
		}
		return nil
	}))

	// no new snapshot
	snapshots, err = storage.ListSnapshots(snapDir)
	require.NoError(t, err)
	require.Len(t, snapshots, 1)
	require.Equal(t, currSnapshot, snapshots[0])

	// Even if something goes wrong with getting keys, and needs rotation returns a false positive,
	// if there's no PendingDEK nothing happens.
	fakeTrue := true
	nodes[1].KeyRotator.SetNeedsRotation(&fakeTrue)
	nodes[1].KeyRotator.RotationNotify() <- struct{}{}

	// propose another value
	nodeIDs = append(nodeIDs, "id7")
	v, err = raftutils.ProposeValue(t, nodes[1], DefaultProposalTime, "id7")
	require.NoError(t, err, "failed to propose value")
	values = append(values, v)
	raftutils.CheckValuesOnNodes(t, clockSource, nodes, nodeIDs, values)

	// no new snapshot
	snapshots, err = storage.ListSnapshots(snapDir)
	require.NoError(t, err)
	require.Len(t, snapshots, 1)
	require.Equal(t, currSnapshot, snapshots[0])

	// and when we restart, we can restart with the original key (the WAL written for the newly
	// proposed value is written with the old key)
	nodes[1].Server.Stop()
	nodes[1].ShutdownRaft()

	nodes[1].KeyRotator = raftutils.NewSimpleKeyRotator(raft.EncryptionKeys{
		CurrentDEK: []byte("key4"),
	})
	nodes[1] = raftutils.RestartNode(t, clockSource, nodes[1], false)
	raftutils.WaitForCluster(t, clockSource, nodes)
	raftutils.CheckValuesOnNodes(t, clockSource, nodes, nodeIDs, values)
}
func TestUpdateNodeDemote(t *testing.T) {
	tc := cautils.NewTestCA(nil, cautils.AcceptancePolicy(true, true, ""))
	ts := newTestServer(t)

	nodes, clockSource := raftutils.NewRaftCluster(t, tc)
	defer raftutils.TeardownCluster(t, nodes)

	// Assign one of the raft nodes to the test server
	ts.Server.raft = nodes[1].Node
	ts.Server.store = nodes[1].MemoryStore()

	// Create a node object for each of the managers
	assert.NoError(t, nodes[1].MemoryStore().Update(func(tx store.Tx) error {
		assert.NoError(t, store.CreateNode(tx, &api.Node{
			ID: nodes[1].SecurityConfig.ClientTLSCreds.NodeID(),
			Spec: api.NodeSpec{
				Role:       api.NodeRoleManager,
				Membership: api.NodeMembershipAccepted,
			},
		}))
		assert.NoError(t, store.CreateNode(tx, &api.Node{
			ID: nodes[2].SecurityConfig.ClientTLSCreds.NodeID(),
			Spec: api.NodeSpec{
				Role:       api.NodeRoleManager,
				Membership: api.NodeMembershipAccepted,
			},
		}))
		assert.NoError(t, store.CreateNode(tx, &api.Node{
			ID: nodes[3].SecurityConfig.ClientTLSCreds.NodeID(),
			Spec: api.NodeSpec{
				Role:       api.NodeRoleManager,
				Membership: api.NodeMembershipAccepted,
			},
		}))
		return nil
	}))

	// Stop Node 3 (1 node out of 3)
	nodes[3].Server.Stop()
	nodes[3].Shutdown()

	// Node 3 should be listed as Unreachable
	assert.NoError(t, raftutils.PollFunc(clockSource, func() error {
		members := nodes[1].GetMemberlist()
		if len(members) != 3 {
			return fmt.Errorf("expected 3 nodes, got %d", len(members))
		}
		if members[nodes[3].Config.ID].Status.Reachability == api.RaftMemberStatus_REACHABLE {
			return fmt.Errorf("expected node 3 to be unreachable")
		}
		return nil
	}))

	// Try to demote Node 2, this should fail because of the quorum safeguard
	r, err := ts.Client.GetNode(context.Background(), &api.GetNodeRequest{NodeID: nodes[2].SecurityConfig.ClientTLSCreds.NodeID()})
	assert.NoError(t, err)
	spec := r.Node.Spec.Copy()
	spec.Role = api.NodeRoleWorker
	version := &r.Node.Meta.Version
	_, err = ts.Client.UpdateNode(context.Background(), &api.UpdateNodeRequest{
		NodeID:      nodes[2].SecurityConfig.ClientTLSCreds.NodeID(),
		Spec:        spec,
		NodeVersion: version,
	})
	assert.Error(t, err)
	assert.Equal(t, codes.FailedPrecondition, grpc.Code(err))

	// Restart Node 3
	nodes[3] = raftutils.RestartNode(t, clockSource, nodes[3], false)
	raftutils.WaitForCluster(t, clockSource, nodes)

	// Node 3 should be listed as Reachable
	assert.NoError(t, raftutils.PollFunc(clockSource, func() error {
		members := nodes[1].GetMemberlist()
		if len(members) != 3 {
			return fmt.Errorf("expected 3 nodes, got %d", len(members))
		}
		if members[nodes[3].Config.ID].Status.Reachability == api.RaftMemberStatus_UNREACHABLE {
			return fmt.Errorf("expected node 3 to be reachable")
		}
		return nil
	}))

	// Try to demote Node 3, this should succeed
	r, err = ts.Client.GetNode(context.Background(), &api.GetNodeRequest{NodeID: nodes[3].SecurityConfig.ClientTLSCreds.NodeID()})
	assert.NoError(t, err)
	spec = r.Node.Spec.Copy()
	spec.Role = api.NodeRoleWorker
	version = &r.Node.Meta.Version
	_, err = ts.Client.UpdateNode(context.Background(), &api.UpdateNodeRequest{
		NodeID:      nodes[3].SecurityConfig.ClientTLSCreds.NodeID(),
		Spec:        spec,
		NodeVersion: version,
	})
	assert.NoError(t, err)

	newCluster := map[uint64]*raftutils.TestNode{
		1: nodes[1],
		2: nodes[2],
	}

	raftutils.WaitForCluster(t, clockSource, newCluster)

	// Server should list 2 members
	assert.NoError(t, raftutils.PollFunc(clockSource, func() error {
		members := nodes[1].GetMemberlist()
		if len(members) != 2 {
			return fmt.Errorf("expected 2 nodes, got %d", len(members))
		}
		return nil
	}))

	// Try to demote Node 2
	r, err = ts.Client.GetNode(context.Background(), &api.GetNodeRequest{NodeID: nodes[2].SecurityConfig.ClientTLSCreds.NodeID()})
	assert.NoError(t, err)
	spec = r.Node.Spec.Copy()
	spec.Role = api.NodeRoleWorker
	version = &r.Node.Meta.Version
	_, err = ts.Client.UpdateNode(context.Background(), &api.UpdateNodeRequest{
		NodeID:      nodes[2].SecurityConfig.ClientTLSCreds.NodeID(),
		Spec:        spec,
		NodeVersion: version,
	})
	assert.NoError(t, err)

	newCluster = map[uint64]*raftutils.TestNode{
		1: nodes[1],
	}

	raftutils.WaitForCluster(t, clockSource, newCluster)

	// New server should list 1 member
	assert.NoError(t, raftutils.PollFunc(clockSource, func() error {
		members := nodes[1].GetMemberlist()
		if len(members) != 1 {
			return fmt.Errorf("expected 1 node, got %d", len(members))
		}
		return nil
	}))

	// Make sure we can't demote the last manager.
	r, err = ts.Client.GetNode(context.Background(), &api.GetNodeRequest{NodeID: nodes[1].SecurityConfig.ClientTLSCreds.NodeID()})
	assert.NoError(t, err)
	spec = r.Node.Spec.Copy()
	spec.Role = api.NodeRoleWorker
	version = &r.Node.Meta.Version
	_, err = ts.Client.UpdateNode(context.Background(), &api.UpdateNodeRequest{
		NodeID:      nodes[1].SecurityConfig.ClientTLSCreds.NodeID(),
		Spec:        spec,
		NodeVersion: version,
	})
	assert.Error(t, err)
	assert.Equal(t, codes.FailedPrecondition, grpc.Code(err))
}
func TestRaftForceNewCluster(t *testing.T) {
	t.Parallel()

	nodes, clockSource := raftutils.NewRaftCluster(t, tc)

	// Propose a value
	values := make([]*api.Node, 2)
	var err error
	values[0], err = raftutils.ProposeValue(t, nodes[1], "id1")
	assert.NoError(t, err, "failed to propose value")

	// The memberlist should contain 3 members on each node
	for i := 1; i <= 3; i++ {
		assert.Equal(t, len(nodes[uint64(i)].GetMemberlist()), 3)
	}

	// Stop all nodes
	for _, node := range nodes {
		node.Server.Stop()
		node.Shutdown()
	}

	raftutils.AdvanceTicks(clockSource, 5)

	toClean := map[uint64]*raftutils.TestNode{
		2: nodes[2],
		3: nodes[3],
	}
	raftutils.TeardownCluster(t, toClean)
	delete(nodes, 2)
	delete(nodes, 3)

	// Only restart the first node with force-new-cluster option
	nodes[1] = raftutils.RestartNode(t, clockSource, nodes[1], true)
	raftutils.WaitForCluster(t, clockSource, nodes)

	// The memberlist should contain only one node (self)
	assert.Equal(t, len(nodes[1].GetMemberlist()), 1)

	// Add 2 more members
	nodes[2] = raftutils.NewJoinNode(t, clockSource, nodes[1].Address, tc)
	raftutils.WaitForCluster(t, clockSource, nodes)

	nodes[3] = raftutils.NewJoinNode(t, clockSource, nodes[1].Address, tc)
	raftutils.WaitForCluster(t, clockSource, nodes)

	newCluster := map[uint64]*raftutils.TestNode{
		1: nodes[1],
		2: nodes[2],
		3: nodes[3],
	}
	defer raftutils.TeardownCluster(t, newCluster)

	// The memberlist should contain 3 members on each node
	for i := 1; i <= 3; i++ {
		assert.Equal(t, len(nodes[uint64(i)].GetMemberlist()), 3)
	}

	// Propose another value
	values[1], err = raftutils.ProposeValue(t, raftutils.Leader(nodes), "id2")
	assert.NoError(t, err, "failed to propose value")

	for _, node := range nodes {
		assert.NoError(t, raftutils.PollFunc(clockSource, func() error {
			var err error
			node.MemoryStore().View(func(tx store.ReadTx) {
				var allNodes []*api.Node
				allNodes, err = store.FindNodes(tx, store.All)
				if err != nil {
					return
				}
				if len(allNodes) != 2 {
					err = fmt.Errorf("expected 2 nodes, got %d", len(allNodes))
					return
				}

				for i, nodeID := range []string{"id1", "id2"} {
					n := store.GetNode(tx, nodeID)
					if !reflect.DeepEqual(n, values[i]) {
						err = fmt.Errorf("node %s did not match expected value", nodeID)
						return
					}
				}
			})
			return err
		}))
	}
}
func testUpdateNodeDemote(leader bool, t *testing.T) {
	tc := cautils.NewTestCA(nil)
	defer tc.Stop()
	ts := newTestServer(t)
	defer ts.Stop()

	nodes, clockSource := raftutils.NewRaftCluster(t, tc)
	defer raftutils.TeardownCluster(t, nodes)

	// Assign one of the raft nodes to the test server
	ts.Server.raft = nodes[1].Node
	ts.Server.store = nodes[1].MemoryStore()

	// Create a node object for each of the managers
	assert.NoError(t, nodes[1].MemoryStore().Update(func(tx store.Tx) error {
		assert.NoError(t, store.CreateNode(tx, &api.Node{
			ID: nodes[1].SecurityConfig.ClientTLSCreds.NodeID(),
			Spec: api.NodeSpec{
				Role:       api.NodeRoleManager,
				Membership: api.NodeMembershipAccepted,
			},
		}))
		assert.NoError(t, store.CreateNode(tx, &api.Node{
			ID: nodes[2].SecurityConfig.ClientTLSCreds.NodeID(),
			Spec: api.NodeSpec{
				Role:       api.NodeRoleManager,
				Membership: api.NodeMembershipAccepted,
			},
		}))
		assert.NoError(t, store.CreateNode(tx, &api.Node{
			ID: nodes[3].SecurityConfig.ClientTLSCreds.NodeID(),
			Spec: api.NodeSpec{
				Role:       api.NodeRoleManager,
				Membership: api.NodeMembershipAccepted,
			},
		}))
		return nil
	}))

	// Stop Node 3 (1 node out of 3)
	nodes[3].Server.Stop()
	nodes[3].ShutdownRaft()

	// Node 3 should be listed as Unreachable
	assert.NoError(t, raftutils.PollFunc(clockSource, func() error {
		members := nodes[1].GetMemberlist()
		if len(members) != 3 {
			return fmt.Errorf("expected 3 nodes, got %d", len(members))
		}
		if members[nodes[3].Config.ID].Status.Reachability == api.RaftMemberStatus_REACHABLE {
			return fmt.Errorf("expected node 3 to be unreachable")
		}
		return nil
	}))

	// Try to demote Node 2, this should fail because of the quorum safeguard
	r, err := ts.Client.GetNode(context.Background(), &api.GetNodeRequest{NodeID: nodes[2].SecurityConfig.ClientTLSCreds.NodeID()})
	assert.NoError(t, err)
	spec := r.Node.Spec.Copy()
	spec.Role = api.NodeRoleWorker
	version := &r.Node.Meta.Version
	_, err = ts.Client.UpdateNode(context.Background(), &api.UpdateNodeRequest{
		NodeID:      nodes[2].SecurityConfig.ClientTLSCreds.NodeID(),
		Spec:        spec,
		NodeVersion: version,
	})
	assert.Error(t, err)
	assert.Equal(t, codes.FailedPrecondition, grpc.Code(err))

	// Restart Node 3
	nodes[3] = raftutils.RestartNode(t, clockSource, nodes[3], false)
	raftutils.WaitForCluster(t, clockSource, nodes)

	// Node 3 should be listed as Reachable
	assert.NoError(t, raftutils.PollFunc(clockSource, func() error {
		members := nodes[1].GetMemberlist()
		if len(members) != 3 {
			return fmt.Errorf("expected 3 nodes, got %d", len(members))
		}
		if members[nodes[3].Config.ID].Status.Reachability == api.RaftMemberStatus_UNREACHABLE {
			return fmt.Errorf("expected node 3 to be reachable")
		}
		return nil
	}))

	// Try to demote Node 3, this should succeed
	r, err = ts.Client.GetNode(context.Background(), &api.GetNodeRequest{NodeID: nodes[3].SecurityConfig.ClientTLSCreds.NodeID()})
	assert.NoError(t, err)
	spec = r.Node.Spec.Copy()
	spec.Role = api.NodeRoleWorker
	version = &r.Node.Meta.Version
	_, err = ts.Client.UpdateNode(context.Background(), &api.UpdateNodeRequest{
		NodeID:      nodes[3].SecurityConfig.ClientTLSCreds.NodeID(),
		Spec:        spec,
		NodeVersion: version,
	})
	assert.NoError(t, err)

	newCluster := map[uint64]*raftutils.TestNode{
		1: nodes[1],
		2: nodes[2],
	}

	raftutils.WaitForCluster(t, clockSource, newCluster)

	// Server should list 2 members
	assert.NoError(t, raftutils.PollFunc(clockSource, func() error {
		members := nodes[1].GetMemberlist()
		if len(members) != 2 {
			return fmt.Errorf("expected 2 nodes, got %d", len(members))
		}
		return nil
	}))

	var demoteNode, lastNode *raftutils.TestNode
	if leader {
		demoteNode = nodes[1]
		lastNode = nodes[2]
	} else {
		demoteNode = nodes[2]
		lastNode = nodes[1]
	}

	// Try to demote a node and scale down to 1
	r, err = ts.Client.GetNode(context.Background(), &api.GetNodeRequest{NodeID: demoteNode.SecurityConfig.ClientTLSCreds.NodeID()})
	assert.NoError(t, err)
	spec = r.Node.Spec.Copy()
	spec.Role = api.NodeRoleWorker
	version = &r.Node.Meta.Version
	_, err = ts.Client.UpdateNode(context.Background(), &api.UpdateNodeRequest{
		NodeID:      demoteNode.SecurityConfig.ClientTLSCreds.NodeID(),
		Spec:        spec,
		NodeVersion: version,
	})
	assert.NoError(t, err)

	// Update the server
	ts.Server.raft = lastNode.Node
	ts.Server.store = lastNode.MemoryStore()

	newCluster = map[uint64]*raftutils.TestNode{
		1: lastNode,
	}

	raftutils.WaitForCluster(t, clockSource, newCluster)

	assert.NoError(t, raftutils.PollFunc(clockSource, func() error {
		members := lastNode.GetMemberlist()
		if len(members) != 1 {
			return fmt.Errorf("expected 1 node, got %d", len(members))
		}
		return nil
	}))

	// Make sure we can't demote the last manager.
	r, err = ts.Client.GetNode(context.Background(), &api.GetNodeRequest{NodeID: lastNode.SecurityConfig.ClientTLSCreds.NodeID()})
	assert.NoError(t, err)
	spec = r.Node.Spec.Copy()
	spec.Role = api.NodeRoleWorker
	version = &r.Node.Meta.Version
	_, err = ts.Client.UpdateNode(context.Background(), &api.UpdateNodeRequest{
		NodeID:      lastNode.SecurityConfig.ClientTLSCreds.NodeID(),
		Spec:        spec,
		NodeVersion: version,
	})
	assert.Error(t, err)
	assert.Equal(t, codes.FailedPrecondition, grpc.Code(err))

	// Propose a change in the spec and check if the remaining node can still process updates
	r, err = ts.Client.GetNode(context.Background(), &api.GetNodeRequest{NodeID: lastNode.SecurityConfig.ClientTLSCreds.NodeID()})
	assert.NoError(t, err)
	spec = r.Node.Spec.Copy()
	spec.Availability = api.NodeAvailabilityDrain
	version = &r.Node.Meta.Version
	_, err = ts.Client.UpdateNode(context.Background(), &api.UpdateNodeRequest{
		NodeID:      lastNode.SecurityConfig.ClientTLSCreds.NodeID(),
		Spec:        spec,
		NodeVersion: version,
	})
	assert.NoError(t, err)

	// Get node information and check that the availability is set to drain
	r, err = ts.Client.GetNode(context.Background(), &api.GetNodeRequest{NodeID: lastNode.SecurityConfig.ClientTLSCreds.NodeID()})
	assert.NoError(t, err)
	assert.Equal(t, r.Node.Spec.Availability, api.NodeAvailabilityDrain)
}
func TestNewNodeCertificateRequiresToken(t *testing.T) {
	t.Parallel()

	tc := testutils.NewTestCA(t)
	defer tc.Stop()

	csr, _, err := ca.GenerateNewCSR()
	assert.NoError(t, err)

	// Issuance fails if no secret is provided
	role := api.NodeRoleManager
	issueRequest := &api.IssueNodeCertificateRequest{CSR: csr, Role: role}
	_, err = tc.NodeCAClients[0].IssueNodeCertificate(context.Background(), issueRequest)
	assert.EqualError(t, err, "rpc error: code = 3 desc = A valid join token is necessary to join this cluster")

	role = api.NodeRoleWorker
	issueRequest = &api.IssueNodeCertificateRequest{CSR: csr, Role: role}
	_, err = tc.NodeCAClients[0].IssueNodeCertificate(context.Background(), issueRequest)
	assert.EqualError(t, err, "rpc error: code = 3 desc = A valid join token is necessary to join this cluster")

	// Issuance fails if wrong secret is provided
	role = api.NodeRoleManager
	issueRequest = &api.IssueNodeCertificateRequest{CSR: csr, Role: role, Token: "invalid-secret"}
	_, err = tc.NodeCAClients[0].IssueNodeCertificate(context.Background(), issueRequest)
	assert.EqualError(t, err, "rpc error: code = 3 desc = A valid join token is necessary to join this cluster")

	role = api.NodeRoleWorker
	issueRequest = &api.IssueNodeCertificateRequest{CSR: csr, Role: role, Token: "invalid-secret"}
	_, err = tc.NodeCAClients[0].IssueNodeCertificate(context.Background(), issueRequest)
	assert.EqualError(t, err, "rpc error: code = 3 desc = A valid join token is necessary to join this cluster")

	// Issuance succeeds if correct token is provided
	role = api.NodeRoleManager
	issueRequest = &api.IssueNodeCertificateRequest{CSR: csr, Role: role, Token: tc.ManagerToken}
	_, err = tc.NodeCAClients[0].IssueNodeCertificate(context.Background(), issueRequest)
	assert.NoError(t, err)

	role = api.NodeRoleWorker
	issueRequest = &api.IssueNodeCertificateRequest{CSR: csr, Role: role, Token: tc.WorkerToken}
	_, err = tc.NodeCAClients[0].IssueNodeCertificate(context.Background(), issueRequest)
	assert.NoError(t, err)

	// Rotate manager and worker tokens
	var (
		newManagerToken string
		newWorkerToken  string
	)
	assert.NoError(t, tc.MemoryStore.Update(func(tx store.Tx) error {
		clusters, _ := store.FindClusters(tx, store.ByName(store.DefaultClusterName))
		newWorkerToken = ca.GenerateJoinToken(&tc.RootCA)
		clusters[0].RootCA.JoinTokens.Worker = newWorkerToken
		newManagerToken = ca.GenerateJoinToken(&tc.RootCA)
		clusters[0].RootCA.JoinTokens.Manager = newManagerToken
		return store.UpdateCluster(tx, clusters[0])
	}))

	// updating the join token may take a little bit in order to register on the CA server, so poll
	assert.NoError(t, raftutils.PollFunc(nil, func() error {
		// Old token should fail
		role = api.NodeRoleManager
		issueRequest = &api.IssueNodeCertificateRequest{CSR: csr, Role: role, Token: tc.ManagerToken}
		_, err = tc.NodeCAClients[0].IssueNodeCertificate(context.Background(), issueRequest)
		if err == nil {
			return fmt.Errorf("join token not updated yet")
		}
		return nil
	}))

	// Old token should fail
	assert.EqualError(t, err, "rpc error: code = 3 desc = A valid join token is necessary to join this cluster")

	role = api.NodeRoleWorker
	issueRequest = &api.IssueNodeCertificateRequest{CSR: csr, Role: role, Token: tc.WorkerToken}
	_, err = tc.NodeCAClients[0].IssueNodeCertificate(context.Background(), issueRequest)
	assert.EqualError(t, err, "rpc error: code = 3 desc = A valid join token is necessary to join this cluster")

	// New token should succeed
	role = api.NodeRoleManager
	issueRequest = &api.IssueNodeCertificateRequest{CSR: csr, Role: role, Token: newManagerToken}
	_, err = tc.NodeCAClients[0].IssueNodeCertificate(context.Background(), issueRequest)
	assert.NoError(t, err)

	role = api.NodeRoleWorker
	issueRequest = &api.IssueNodeCertificateRequest{CSR: csr, Role: role, Token: newWorkerToken}
	_, err = tc.NodeCAClients[0].IssueNodeCertificate(context.Background(), issueRequest)
	assert.NoError(t, err)
}
func TestRaftSnapshotForceNewCluster(t *testing.T) {
	t.Parallel()

	// Bring up a 3 node cluster
	nodes, clockSource := raftutils.NewRaftCluster(t, tc, &api.RaftConfig{SnapshotInterval: 10, LogEntriesForSlowFollowers: 0})
	defer raftutils.TeardownCluster(t, nodes)

	nodeIDs := []string{"id1", "id2", "id3", "id4", "id5"}

	// Propose 3 values.
	for _, nodeID := range nodeIDs[:3] {
		_, err := raftutils.ProposeValue(t, nodes[1], DefaultProposalTime, nodeID)
		assert.NoError(t, err, "failed to propose value")
	}

	// Remove one of the original nodes
	// Use gRPC instead of calling handler directly because of
	// authorization check.
	cc, err := dial(nodes[1], nodes[1].Address)
	assert.NoError(t, err)
	raftClient := api.NewRaftMembershipClient(cc)
	defer cc.Close()
	ctx, _ := context.WithTimeout(context.Background(), 10*time.Second)
	resp, err := raftClient.Leave(ctx, &api.LeaveRequest{Node: &api.RaftMember{RaftID: nodes[2].Config.ID}})
	assert.NoError(t, err, "error sending message to leave the raft")
	assert.NotNil(t, resp, "leave response message is nil")

	raftutils.ShutdownNode(nodes[2])
	delete(nodes, 2)

	// Nodes shouldn't have snapshot files yet
	for _, node := range nodes {
		dirents, err := ioutil.ReadDir(filepath.Join(node.StateDir, "snap-v3-encrypted"))
		assert.NoError(t, err)
		assert.Len(t, dirents, 0)
	}

	// Trigger a snapshot, with a 4th proposal
	_, err = raftutils.ProposeValue(t, nodes[1], DefaultProposalTime, nodeIDs[3])
	assert.NoError(t, err, "failed to propose value")

	// Nodes should now have a snapshot file
	for nodeIdx, node := range nodes {
		assert.NoError(t, raftutils.PollFunc(clockSource, func() error {
			dirents, err := ioutil.ReadDir(filepath.Join(node.StateDir, "snap-v3-encrypted"))
			if err != nil {
				return err
			}
			if len(dirents) != 1 {
				return fmt.Errorf("expected 1 snapshot, found %d on node %d", len(dirents), nodeIdx+1)
			}
			return nil
		}))
	}

	// Join another node
	nodes[4] = raftutils.NewJoinNode(t, clockSource, nodes[1].Address, tc)
	raftutils.WaitForCluster(t, clockSource, nodes)

	// Only restart the first node with force-new-cluster option
	nodes[1].Server.Stop()
	nodes[1].ShutdownRaft()
	nodes[1] = raftutils.RestartNode(t, clockSource, nodes[1], true)
	delete(nodes, 3)
	delete(nodes, 4)
	raftutils.WaitForCluster(t, clockSource, nodes)

	// The memberlist should contain exactly one node (self)
	memberlist := nodes[1].GetMemberlist()
	require.Len(t, memberlist, 1)

	// Propose a 5th value
	_, err = raftutils.ProposeValue(t, nodes[1], DefaultProposalTime, nodeIDs[4])
	require.NoError(t, err)
}