func TestRaftQuorumRecovery(t *testing.T) { t.Parallel() // Bring up a 5 nodes cluster nodes, clockSource := raftutils.NewRaftCluster(t, tc) raftutils.AddRaftNode(t, clockSource, nodes, tc) raftutils.AddRaftNode(t, clockSource, nodes, tc) defer raftutils.TeardownCluster(t, nodes) // Lose a majority for i := uint64(1); i <= 3; i++ { nodes[i].Server.Stop() nodes[i].ShutdownRaft() } raftutils.AdvanceTicks(clockSource, 5) // Restore the majority by restarting node 3 nodes[3] = raftutils.RestartNode(t, clockSource, nodes[3], false) raftutils.ShutdownNode(nodes[1]) delete(nodes, 1) raftutils.ShutdownNode(nodes[2]) delete(nodes, 2) raftutils.WaitForCluster(t, clockSource, nodes) // Propose a value value, err := raftutils.ProposeValue(t, raftutils.Leader(nodes), DefaultProposalTime) assert.NoError(t, err) for _, node := range nodes { raftutils.CheckValue(t, clockSource, node, value) } }
func TestRaftJoinWithIncorrectAddress(t *testing.T) { t.Parallel() nodes := make(map[uint64]*raftutils.TestNode) var clockSource *fakeclock.FakeClock nodes[1], clockSource = raftutils.NewInitNode(t, tc, nil) defer raftutils.ShutdownNode(nodes[1]) // Try joining a new node with an incorrect address n := raftutils.NewNode(t, clockSource, tc, raft.NodeOptions{JoinAddr: nodes[1].Address, Addr: "1.2.3.4:1234"}) defer raftutils.CleanupNonRunningNode(n) err := n.JoinAndStart(context.Background()) assert.NotNil(t, err) assert.Contains(t, grpc.ErrorDesc(err), "could not connect to prospective new cluster member using its advertised address") // Check if first node still has only itself registered in the memberlist assert.Len(t, nodes[1].GetMemberlist(), 1) }
func TestRaftFollowerLeave(t *testing.T) { t.Parallel() // Bring up a 5 nodes cluster nodes, clockSource := raftutils.NewRaftCluster(t, tc) raftutils.AddRaftNode(t, clockSource, nodes, tc) raftutils.AddRaftNode(t, clockSource, nodes, tc) defer raftutils.TeardownCluster(t, nodes) // Node 5 leaves the cluster // Use gRPC instead of calling handler directly because of // authorization check. cc, err := dial(nodes[1], nodes[1].Address) assert.NoError(t, err) raftClient := api.NewRaftMembershipClient(cc) defer cc.Close() ctx, _ := context.WithTimeout(context.Background(), 10*time.Second) resp, err := raftClient.Leave(ctx, &api.LeaveRequest{Node: &api.RaftMember{RaftID: nodes[5].Config.ID}}) assert.NoError(t, err, "error sending message to leave the raft") assert.NotNil(t, resp, "leave response message is nil") raftutils.ShutdownNode(nodes[5]) delete(nodes, 5) raftutils.WaitForPeerNumber(t, clockSource, nodes, 4) // Propose a value value, err := raftutils.ProposeValue(t, nodes[1], DefaultProposalTime) assert.NoError(t, err, "failed to propose value") // Value should be replicated on every node raftutils.CheckValue(t, clockSource, nodes[1], value) assert.Len(t, nodes[1].GetMemberlist(), 4) raftutils.CheckValue(t, clockSource, nodes[2], value) assert.Len(t, nodes[2].GetMemberlist(), 4) raftutils.CheckValue(t, clockSource, nodes[3], value) assert.Len(t, nodes[3].GetMemberlist(), 4) raftutils.CheckValue(t, clockSource, nodes[4], value) assert.Len(t, nodes[4].GetMemberlist(), 4) }
func TestRaftJoinWithIncorrectAddress(t *testing.T) { t.Parallel() nodes := make(map[uint64]*raftutils.TestNode) var clockSource *fakeclock.FakeClock nodes[1], clockSource = raftutils.NewInitNode(t, tc, nil) defer raftutils.ShutdownNode(nodes[1]) // Try joining a new node with an incorrect address n := raftutils.NewNode(t, clockSource, tc, raft.NewNodeOptions{JoinAddr: nodes[1].Address, Addr: "1.2.3.4:1234"}) defer raftutils.CleanupNonRunningNode(n) err := n.JoinAndStart() assert.NotNil(t, err) assert.Equal(t, grpc.ErrorDesc(err), raft.ErrHealthCheckFailure.Error()) // Check if first node still has only itself registered in the memberlist assert.Equal(t, len(nodes[1].GetMemberlist()), 1) }
func TestCanRemoveMember(t *testing.T) { nodes, clockSource := raftutils.NewRaftCluster(t, tc) defer raftutils.TeardownCluster(t, nodes) // Stop node 2 and node 3 (2 nodes out of 3) nodes[2].Server.Stop() nodes[2].Shutdown() nodes[3].Server.Stop() nodes[3].Shutdown() // Node 2 and Node 3 should be listed as Unreachable assert.NoError(t, raftutils.PollFunc(clockSource, func() error { members := nodes[1].GetMemberlist() if len(members) != 3 { return fmt.Errorf("expected 3 nodes, got %d", len(members)) } if members[nodes[2].Config.ID].Status.Reachability == api.RaftMemberStatus_REACHABLE { return fmt.Errorf("expected node 2 to be unreachable") } if members[nodes[3].Config.ID].Status.Reachability == api.RaftMemberStatus_REACHABLE { return fmt.Errorf("expected node 3 to be unreachable") } return nil })) // Removing all nodes should fail for i := 1; i <= 3; i++ { ctx, _ := context.WithTimeout(context.Background(), 10*time.Second) err := nodes[1].RemoveMember(ctx, uint64(i)) assert.Error(t, err) assert.Equal(t, err, raft.ErrCannotRemoveMember) members := nodes[1].GetMemberlist() assert.Equal(t, len(members), 3) } // Restart node 2 and node 3 nodes[2] = raftutils.RestartNode(t, clockSource, nodes[2], false) nodes[3] = raftutils.RestartNode(t, clockSource, nodes[3], false) raftutils.WaitForCluster(t, clockSource, nodes) // Node 2 and Node 3 should be listed as Reachable assert.NoError(t, raftutils.PollFunc(clockSource, func() error { members := nodes[1].GetMemberlist() if len(members) != 3 { return fmt.Errorf("expected 3 nodes, got %d", len(members)) } if members[nodes[2].Config.ID].Status.Reachability == api.RaftMemberStatus_UNREACHABLE { return fmt.Errorf("expected node 2 to be reachable") } if members[nodes[3].Config.ID].Status.Reachability == api.RaftMemberStatus_UNREACHABLE { return fmt.Errorf("expected node 3 to be reachable") } return nil })) // Stop Node 3 (1 node out of 3) nodes[3].Server.Stop() nodes[3].Shutdown() // Node 3 should be listed as Unreachable assert.NoError(t, raftutils.PollFunc(clockSource, func() error { members := nodes[1].GetMemberlist() if len(members) != 3 { return fmt.Errorf("expected 3 nodes, got %d", len(members)) } if members[nodes[3].Config.ID].Status.Reachability == api.RaftMemberStatus_REACHABLE { return fmt.Errorf("expected node 3 to be unreachable") } return nil })) // Removing node 2 should fail (this would break the quorum) ctx, _ := context.WithTimeout(context.Background(), 10*time.Second) err := nodes[1].RemoveMember(ctx, nodes[2].Config.ID) assert.Error(t, err) assert.Equal(t, err, raft.ErrCannotRemoveMember) members := nodes[1].GetMemberlist() assert.Equal(t, len(members), 3) // Removing node 3 works fine because it is already unreachable ctx, _ = context.WithTimeout(context.Background(), 10*time.Second) err = nodes[1].RemoveMember(ctx, nodes[3].Config.ID) assert.NoError(t, err) members = nodes[1].GetMemberlist() assert.Nil(t, members[nodes[3].Config.ID]) assert.Equal(t, len(members), 2) // Add back node 3 raftutils.ShutdownNode(nodes[3]) delete(nodes, 3) raftutils.AddRaftNode(t, clockSource, nodes, tc) // Node 2 and Node 3 should be listed as Reachable assert.NoError(t, raftutils.PollFunc(clockSource, func() error { members := nodes[1].GetMemberlist() if len(members) != 3 { return fmt.Errorf("expected 3 nodes, got %d", len(members)) } if members[nodes[2].Config.ID].Status.Reachability != api.RaftMemberStatus_REACHABLE { return fmt.Errorf("expected node 2 to be reachable") } if members[nodes[3].Config.ID].Status.Reachability != api.RaftMemberStatus_REACHABLE { return fmt.Errorf("expected node 3 to be reachable") } return nil })) // Removing node 3 should succeed ctx, _ = context.WithTimeout(context.Background(), 10*time.Second) err = nodes[1].RemoveMember(ctx, nodes[3].Config.ID) assert.NoError(t, err) members = nodes[1].GetMemberlist() assert.Nil(t, members[nodes[3].Config.ID]) assert.Equal(t, len(members), 2) // Removing node 2 should succeed ctx, _ = context.WithTimeout(context.Background(), 10*time.Second) err = nodes[1].RemoveMember(ctx, nodes[2].Config.ID) assert.NoError(t, err) members = nodes[1].GetMemberlist() assert.Nil(t, members[nodes[2].Config.ID]) assert.Equal(t, len(members), 1) }
func TestRaftSnapshotForceNewCluster(t *testing.T) { t.Parallel() // Bring up a 3 node cluster nodes, clockSource := raftutils.NewRaftCluster(t, tc, &api.RaftConfig{SnapshotInterval: 10, LogEntriesForSlowFollowers: 0}) defer raftutils.TeardownCluster(t, nodes) nodeIDs := []string{"id1", "id2", "id3", "id4", "id5"} // Propose 3 values. for _, nodeID := range nodeIDs[:3] { _, err := raftutils.ProposeValue(t, nodes[1], DefaultProposalTime, nodeID) assert.NoError(t, err, "failed to propose value") } // Remove one of the original nodes // Use gRPC instead of calling handler directly because of // authorization check. cc, err := dial(nodes[1], nodes[1].Address) assert.NoError(t, err) raftClient := api.NewRaftMembershipClient(cc) defer cc.Close() ctx, _ := context.WithTimeout(context.Background(), 10*time.Second) resp, err := raftClient.Leave(ctx, &api.LeaveRequest{Node: &api.RaftMember{RaftID: nodes[2].Config.ID}}) assert.NoError(t, err, "error sending message to leave the raft") assert.NotNil(t, resp, "leave response message is nil") raftutils.ShutdownNode(nodes[2]) delete(nodes, 2) // Nodes shouldn't have snapshot files yet for _, node := range nodes { dirents, err := ioutil.ReadDir(filepath.Join(node.StateDir, "snap-v3-encrypted")) assert.NoError(t, err) assert.Len(t, dirents, 0) } // Trigger a snapshot, with a 4th proposal _, err = raftutils.ProposeValue(t, nodes[1], DefaultProposalTime, nodeIDs[3]) assert.NoError(t, err, "failed to propose value") // Nodes should now have a snapshot file for nodeIdx, node := range nodes { assert.NoError(t, raftutils.PollFunc(clockSource, func() error { dirents, err := ioutil.ReadDir(filepath.Join(node.StateDir, "snap-v3-encrypted")) if err != nil { return err } if len(dirents) != 1 { return fmt.Errorf("expected 1 snapshot, found %d on node %d", len(dirents), nodeIdx+1) } return nil })) } // Join another node nodes[4] = raftutils.NewJoinNode(t, clockSource, nodes[1].Address, tc) raftutils.WaitForCluster(t, clockSource, nodes) // Only restart the first node with force-new-cluster option nodes[1].Server.Stop() nodes[1].ShutdownRaft() nodes[1] = raftutils.RestartNode(t, clockSource, nodes[1], true) delete(nodes, 3) delete(nodes, 4) raftutils.WaitForCluster(t, clockSource, nodes) // The memberlist should contain exactly one node (self) memberlist := nodes[1].GetMemberlist() require.Len(t, memberlist, 1) // Propose a 5th value _, err = raftutils.ProposeValue(t, nodes[1], DefaultProposalTime, nodeIDs[4]) require.NoError(t, err) }