Esempio n. 1
0
// TestRaftQuorumRecovery stops three members of a five-member cluster,
// restarts one of them, drops the other two for good, and then checks that
// the remaining members can still agree on a newly proposed value.
func TestRaftQuorumRecovery(t *testing.T) {
	t.Parallel()

	// Build a five-member cluster: the initial cluster plus two added nodes.
	cluster, clock := raftutils.NewRaftCluster(t, tc)
	raftutils.AddRaftNode(t, clock, cluster, tc)
	raftutils.AddRaftNode(t, clock, cluster, tc)
	defer raftutils.TeardownCluster(t, cluster)

	// Stop members 1 through 3, i.e. a majority of the cluster.
	for _, id := range []uint64{1, 2, 3} {
		member := cluster[id]
		member.Server.Stop()
		member.ShutdownRaft()
	}

	raftutils.AdvanceTicks(clock, 5)

	// Member 3 comes back, which restores the majority.
	cluster[3] = raftutils.RestartNode(t, clock, cluster[3], false)

	// Members 1 and 2 stay down; remove them from the map so the helpers
	// below only consider the members that are still running.
	for _, id := range []uint64{1, 2} {
		raftutils.ShutdownNode(cluster[id])
		delete(cluster, id)
	}
	raftutils.WaitForCluster(t, clock, cluster)

	// Propose through the current leader and expect every remaining member
	// to observe the value.
	proposed, err := raftutils.ProposeValue(t, raftutils.Leader(cluster), DefaultProposalTime)
	assert.NoError(t, err)

	for _, member := range cluster {
		raftutils.CheckValue(t, clock, member, proposed)
	}
}
Esempio n. 2
0
// TestRaftJoinWithIncorrectAddress starts a single-node cluster and tries to
// join a second node that advertises an address different from the one it can
// be reached at. The join must fail and the first node's member list must
// still contain one entry.
func TestRaftJoinWithIncorrectAddress(t *testing.T) {
	t.Parallel()

	leader, clock := raftutils.NewInitNode(t, tc, nil)
	defer raftutils.ShutdownNode(leader)

	// The joining node points at the leader but advertises a bogus address.
	joinOpts := raft.NodeOptions{JoinAddr: leader.Address, Addr: "1.2.3.4:1234"}
	joiner := raftutils.NewNode(t, clock, tc, joinOpts)
	defer raftutils.CleanupNonRunningNode(joiner)

	joinErr := joiner.JoinAndStart(context.Background())
	assert.NotNil(t, joinErr)
	assert.Contains(t, grpc.ErrorDesc(joinErr), "could not connect to prospective new cluster member using its advertised address")

	// The failed join must not have added anything to the member list.
	assert.Len(t, leader.GetMemberlist(), 1)
}
Esempio n. 3
0
// TestRaftFollowerLeave brings up a five-node cluster, asks node 1 over gRPC
// to remove node 5, and then checks that the four remaining nodes replicate a
// newly proposed value and each report a four-entry member list.
func TestRaftFollowerLeave(t *testing.T) {
	t.Parallel()

	// Bring up a 5 nodes cluster
	nodes, clockSource := raftutils.NewRaftCluster(t, tc)
	raftutils.AddRaftNode(t, clockSource, nodes, tc)
	raftutils.AddRaftNode(t, clockSource, nodes, tc)
	defer raftutils.TeardownCluster(t, nodes)

	// Node 5 leaves the cluster
	// Use gRPC instead of calling handler directly because of
	// authorization check.
	cc, err := dial(nodes[1], nodes[1].Address)
	if !assert.NoError(t, err) {
		// Without a connection the rest of the test cannot run; stop here
		// instead of using and closing an unusable connection.
		t.FailNow()
	}
	defer cc.Close()
	raftClient := api.NewRaftMembershipClient(cc)

	// Keep the cancel func so the timeout's resources are released as soon
	// as the test returns rather than when the deadline fires.
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()
	resp, err := raftClient.Leave(ctx, &api.LeaveRequest{Node: &api.RaftMember{RaftID: nodes[5].Config.ID}})
	assert.NoError(t, err, "error sending message to leave the raft")
	assert.NotNil(t, resp, "leave response message is nil")

	raftutils.ShutdownNode(nodes[5])
	delete(nodes, 5)

	raftutils.WaitForPeerNumber(t, clockSource, nodes, 4)

	// Propose a value
	value, err := raftutils.ProposeValue(t, nodes[1], DefaultProposalTime)
	assert.NoError(t, err, "failed to propose value")

	// Value should be replicated on every node, and every node should see
	// exactly the four remaining members. Iterate by ID to keep the check
	// order deterministic.
	for id := uint64(1); id <= 4; id++ {
		raftutils.CheckValue(t, clockSource, nodes[id], value)
		assert.Len(t, nodes[id].GetMemberlist(), 4)
	}
}
Esempio n. 4
0
// TestRaftJoinWithIncorrectAddress starts a single-node cluster and tries to
// join a second node whose advertised address is wrong. The join must fail
// with the health-check failure error, and the first node's member list must
// still contain only one entry.
func TestRaftJoinWithIncorrectAddress(t *testing.T) {
	t.Parallel()

	nodes := make(map[uint64]*raftutils.TestNode)
	var clockSource *fakeclock.FakeClock
	nodes[1], clockSource = raftutils.NewInitNode(t, tc, nil)
	defer raftutils.ShutdownNode(nodes[1])

	// Try joining a new node with an incorrect address
	n := raftutils.NewNode(t, clockSource, tc, raft.NewNodeOptions{JoinAddr: nodes[1].Address, Addr: "1.2.3.4:1234"})
	defer raftutils.CleanupNonRunningNode(n)

	err := n.JoinAndStart()
	assert.NotNil(t, err)
	// testify expects (expected, actual); keeping that order makes the
	// failure output label the two values correctly.
	assert.Equal(t, raft.ErrHealthCheckFailure.Error(), grpc.ErrorDesc(err))

	// Check if first node still has only itself registered in the memberlist
	assert.Len(t, nodes[1].GetMemberlist(), 1)
}
Esempio n. 5
0
// TestCanRemoveMember exercises RemoveMember on node 1 of a three-node
// cluster while other nodes are stopped, restarted, removed and re-added. It
// checks that removals are rejected with raft.ErrCannotRemoveMember when
// expected and that the member list has the expected size after each step.
func TestCanRemoveMember(t *testing.T) {
	nodes, clockSource := raftutils.NewRaftCluster(t, tc)
	defer raftutils.TeardownCluster(t, nodes)

	// Stop node 2 and node 3 (2 nodes out of 3)
	nodes[2].Server.Stop()
	nodes[2].Shutdown()
	nodes[3].Server.Stop()
	nodes[3].Shutdown()

	// Node 2 and Node 3 should be listed as Unreachable
	assert.NoError(t, raftutils.PollFunc(clockSource, func() error {
		members := nodes[1].GetMemberlist()
		if len(members) != 3 {
			return fmt.Errorf("expected 3 nodes, got %d", len(members))
		}
		if members[nodes[2].Config.ID].Status.Reachability == api.RaftMemberStatus_REACHABLE {
			return fmt.Errorf("expected node 2 to be unreachable")
		}
		if members[nodes[3].Config.ID].Status.Reachability == api.RaftMemberStatus_REACHABLE {
			return fmt.Errorf("expected node 3 to be unreachable")
		}
		return nil
	}))

	// Removing all nodes should fail
	// NOTE(review): the loop index itself is passed as the raft ID rather
	// than nodes[i].Config.ID — confirm this is intentional.
	for i := 1; i <= 3; i++ {
		ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
		err := nodes[1].RemoveMember(ctx, uint64(i))
		// Release the timeout right away; a defer inside the loop would
		// only run when the whole test returns.
		cancel()
		assert.Error(t, err)
		assert.Equal(t, raft.ErrCannotRemoveMember, err)
		members := nodes[1].GetMemberlist()
		assert.Len(t, members, 3)
	}

	// Restart node 2 and node 3
	nodes[2] = raftutils.RestartNode(t, clockSource, nodes[2], false)
	nodes[3] = raftutils.RestartNode(t, clockSource, nodes[3], false)
	raftutils.WaitForCluster(t, clockSource, nodes)

	// Node 2 and Node 3 should be listed as Reachable
	assert.NoError(t, raftutils.PollFunc(clockSource, func() error {
		members := nodes[1].GetMemberlist()
		if len(members) != 3 {
			return fmt.Errorf("expected 3 nodes, got %d", len(members))
		}
		if members[nodes[2].Config.ID].Status.Reachability == api.RaftMemberStatus_UNREACHABLE {
			return fmt.Errorf("expected node 2 to be reachable")
		}
		if members[nodes[3].Config.ID].Status.Reachability == api.RaftMemberStatus_UNREACHABLE {
			return fmt.Errorf("expected node 3 to be reachable")
		}
		return nil
	}))

	// Stop Node 3 (1 node out of 3)
	nodes[3].Server.Stop()
	nodes[3].Shutdown()

	// Node 3 should be listed as Unreachable
	assert.NoError(t, raftutils.PollFunc(clockSource, func() error {
		members := nodes[1].GetMemberlist()
		if len(members) != 3 {
			return fmt.Errorf("expected 3 nodes, got %d", len(members))
		}
		if members[nodes[3].Config.ID].Status.Reachability == api.RaftMemberStatus_REACHABLE {
			return fmt.Errorf("expected node 3 to be unreachable")
		}
		return nil
	}))

	// Removing node 2 should fail (this would break the quorum)
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	err := nodes[1].RemoveMember(ctx, nodes[2].Config.ID)
	cancel()
	assert.Error(t, err)
	assert.Equal(t, raft.ErrCannotRemoveMember, err)
	members := nodes[1].GetMemberlist()
	assert.Len(t, members, 3)

	// Removing node 3 works fine because it is already unreachable
	ctx, cancel = context.WithTimeout(context.Background(), 10*time.Second)
	err = nodes[1].RemoveMember(ctx, nodes[3].Config.ID)
	cancel()
	assert.NoError(t, err)
	members = nodes[1].GetMemberlist()
	assert.Nil(t, members[nodes[3].Config.ID])
	assert.Len(t, members, 2)

	// Add back node 3
	raftutils.ShutdownNode(nodes[3])
	delete(nodes, 3)
	raftutils.AddRaftNode(t, clockSource, nodes, tc)

	// Node 2 and Node 3 should be listed as Reachable
	assert.NoError(t, raftutils.PollFunc(clockSource, func() error {
		members := nodes[1].GetMemberlist()
		if len(members) != 3 {
			return fmt.Errorf("expected 3 nodes, got %d", len(members))
		}
		if members[nodes[2].Config.ID].Status.Reachability != api.RaftMemberStatus_REACHABLE {
			return fmt.Errorf("expected node 2 to be reachable")
		}
		if members[nodes[3].Config.ID].Status.Reachability != api.RaftMemberStatus_REACHABLE {
			return fmt.Errorf("expected node 3 to be reachable")
		}
		return nil
	}))

	// Removing node 3 should succeed
	ctx, cancel = context.WithTimeout(context.Background(), 10*time.Second)
	err = nodes[1].RemoveMember(ctx, nodes[3].Config.ID)
	cancel()
	assert.NoError(t, err)
	members = nodes[1].GetMemberlist()
	assert.Nil(t, members[nodes[3].Config.ID])
	assert.Len(t, members, 2)

	// Removing node 2 should succeed
	ctx, cancel = context.WithTimeout(context.Background(), 10*time.Second)
	err = nodes[1].RemoveMember(ctx, nodes[2].Config.ID)
	cancel()
	assert.NoError(t, err)
	members = nodes[1].GetMemberlist()
	assert.Nil(t, members[nodes[2].Config.ID])
	assert.Len(t, members, 1)
}
Esempio n. 6
0
// TestRaftSnapshotForceNewCluster proposes values on a three-node cluster
// configured with SnapshotInterval 10, removes node 2, waits for the
// remaining nodes to write a snapshot file, joins a fourth node, and then
// restarts node 1 with the force-new-cluster flag. Afterwards node 1 must
// report a one-entry member list and still accept a proposal.
func TestRaftSnapshotForceNewCluster(t *testing.T) {
	t.Parallel()

	// Bring up a 3 node cluster
	nodes, clockSource := raftutils.NewRaftCluster(t, tc, &api.RaftConfig{SnapshotInterval: 10, LogEntriesForSlowFollowers: 0})
	defer raftutils.TeardownCluster(t, nodes)

	nodeIDs := []string{"id1", "id2", "id3", "id4", "id5"}

	// Propose 3 values.
	for _, nodeID := range nodeIDs[:3] {
		_, err := raftutils.ProposeValue(t, nodes[1], DefaultProposalTime, nodeID)
		assert.NoError(t, err, "failed to propose value")
	}

	// Remove one of the original nodes

	// Use gRPC instead of calling handler directly because of
	// authorization check.
	cc, err := dial(nodes[1], nodes[1].Address)
	if !assert.NoError(t, err) {
		// Without a connection the rest of the test cannot run; stop here
		// instead of using and closing an unusable connection.
		t.FailNow()
	}
	defer cc.Close()
	raftClient := api.NewRaftMembershipClient(cc)

	// Keep the cancel func so the timeout's resources are released as soon
	// as the test returns rather than when the deadline fires.
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()
	resp, err := raftClient.Leave(ctx, &api.LeaveRequest{Node: &api.RaftMember{RaftID: nodes[2].Config.ID}})
	assert.NoError(t, err, "error sending message to leave the raft")
	assert.NotNil(t, resp, "leave response message is nil")

	raftutils.ShutdownNode(nodes[2])
	delete(nodes, 2)

	// Nodes shouldn't have snapshot files yet
	for _, node := range nodes {
		dirents, err := ioutil.ReadDir(filepath.Join(node.StateDir, "snap-v3-encrypted"))
		assert.NoError(t, err)
		assert.Len(t, dirents, 0)
	}

	// Trigger a snapshot, with a 4th proposal
	_, err = raftutils.ProposeValue(t, nodes[1], DefaultProposalTime, nodeIDs[3])
	assert.NoError(t, err, "failed to propose value")

	// Nodes should now have a snapshot file
	for nodeIdx, node := range nodes {
		assert.NoError(t, raftutils.PollFunc(clockSource, func() error {
			dirents, err := ioutil.ReadDir(filepath.Join(node.StateDir, "snap-v3-encrypted"))
			if err != nil {
				return err
			}
			if len(dirents) != 1 {
				// nodeIdx is the map key, which already is the 1-based
				// node number used throughout this test.
				return fmt.Errorf("expected 1 snapshot, found %d on node %d", len(dirents), nodeIdx)
			}
			return nil
		}))
	}

	// Join another node
	nodes[4] = raftutils.NewJoinNode(t, clockSource, nodes[1].Address, tc)
	raftutils.WaitForCluster(t, clockSource, nodes)

	// Only restart the first node with force-new-cluster option
	nodes[1].Server.Stop()
	nodes[1].ShutdownRaft()
	nodes[1] = raftutils.RestartNode(t, clockSource, nodes[1], true)
	// Drop the other nodes from the map so WaitForCluster only looks at the
	// restarted node.
	delete(nodes, 3)
	delete(nodes, 4)
	raftutils.WaitForCluster(t, clockSource, nodes)

	// The memberlist should contain exactly one node (self)
	memberlist := nodes[1].GetMemberlist()
	require.Len(t, memberlist, 1)

	// Propose a 5th value
	_, err = raftutils.ProposeValue(t, nodes[1], DefaultProposalTime, nodeIDs[4])
	require.NoError(t, err)
}