Exemple #1
0
// TestRaftQuorumRecovery checks that a 5-node cluster which lost its majority
// accepts and replicates a proposal again once one of the stopped nodes has
// been restarted.
func TestRaftQuorumRecovery(t *testing.T) {
	t.Parallel()

	// Start from the base cluster and add two nodes to reach 5 members.
	nodes, clockSource := raftutils.NewRaftCluster(t, tc)
	for added := 0; added < 2; added++ {
		raftutils.AddRaftNode(t, clockSource, nodes, tc)
	}
	defer raftutils.TeardownCluster(t, nodes)

	// Take down nodes 1, 2 and 3: three out of five is a majority.
	for _, id := range []uint64{1, 2, 3} {
		stopped := nodes[id]
		stopped.Server.Stop()
		stopped.Shutdown()
	}

	raftutils.AdvanceTicks(clockSource, 5)

	// Restarting node 3 leaves nodes 3, 4 and 5 running, a majority again.
	nodes[3] = raftutils.RestartNode(t, clockSource, nodes[3], false)

	// Nodes 1 and 2 stay down, so only wait on the remaining ones.
	// NOTE(review): the deferred TeardownCluster receives this same map, so
	// nodes 1 and 2 are no longer passed to it after these deletes.
	delete(nodes, 1)
	delete(nodes, 2)
	raftutils.WaitForCluster(t, clockSource, nodes)

	// A proposal sent through the current leader must succeed ...
	leader := raftutils.Leader(nodes)
	value, err := raftutils.ProposeValue(t, leader)
	assert.NoError(t, err)

	// ... and be visible on every remaining node.
	for _, member := range nodes {
		raftutils.CheckValue(t, clockSource, member, value)
	}
}
Exemple #2
0
// testRaftRestartCluster stops every node of a cluster, restarts all of them
// and checks that a value proposed before the restart and a value proposed
// after it are both present in every node's store. When stagger is true the
// clock is advanced by one tick before each restart except the first.
func testRaftRestartCluster(t *testing.T, stagger bool) {
	nodes, clockSource := raftutils.NewRaftCluster(t, tc)
	defer raftutils.TeardownCluster(t, nodes)

	ids := []string{"id1", "id2"}
	values := make([]*api.Node, len(ids))

	// First value is proposed while the original cluster is still running.
	var err error
	values[0], err = raftutils.ProposeValue(t, nodes[1], ids[0])
	assert.NoError(t, err, "failed to propose value")

	// Bring the whole cluster down.
	for _, n := range nodes {
		n.Server.Stop()
		n.Shutdown()
	}

	raftutils.AdvanceTicks(clockSource, 5)

	// Bring every node back, replacing the map entry with the restarted node.
	restarted := 0
	for id, n := range nodes {
		if stagger && restarted > 0 {
			raftutils.AdvanceTicks(clockSource, 1)
		}
		nodes[id] = raftutils.RestartNode(t, clockSource, n, false)
		restarted++
	}
	raftutils.WaitForCluster(t, clockSource, nodes)

	// Second value goes through whichever node is the leader now.
	values[1], err = raftutils.ProposeValue(t, raftutils.Leader(nodes), ids[1])
	assert.NoError(t, err, "failed to propose value")

	// storeMatches returns nil once n's store holds exactly two node objects
	// and both are deeply equal to the proposed values.
	storeMatches := func(n *raftutils.TestNode) error {
		var checkErr error
		n.MemoryStore().View(func(tx store.ReadTx) {
			allNodes, findErr := store.FindNodes(tx, store.All)
			if findErr != nil {
				checkErr = findErr
				return
			}
			if len(allNodes) != 2 {
				checkErr = fmt.Errorf("expected 2 nodes, got %d", len(allNodes))
				return
			}

			for idx, nodeID := range ids {
				if stored := store.GetNode(tx, nodeID); !reflect.DeepEqual(stored, values[idx]) {
					checkErr = fmt.Errorf("node %s did not match expected value", nodeID)
					return
				}
			}
		})
		return checkErr
	}

	for _, n := range nodes {
		assert.NoError(t, raftutils.PollFunc(clockSource, func() error {
			return storeMatches(n)
		}))
	}
}
Exemple #3
0
// TestRaftBootstrap checks that, right after the cluster is created, each of
// nodes 1, 2 and 3 reports a member list with 3 entries.
func TestRaftBootstrap(t *testing.T) {
	t.Parallel()

	nodes, _ := raftutils.NewRaftCluster(t, tc)
	defer raftutils.TeardownCluster(t, nodes)

	for _, id := range []uint64{1, 2, 3} {
		memberCount := len(nodes[id].GetMemberlist())
		assert.Equal(t, 3, memberCount)
	}
}
Exemple #4
0
// TestRaftEncryptionKeyRotationStress repeatedly rotates the encryption key on
// node 3 and restarts that node while a background goroutine keeps proposing
// values. The intent is to try to trigger race conditions when there is more
// than one node and hence consensus may take longer.
func TestRaftEncryptionKeyRotationStress(t *testing.T) {
	t.Parallel()

	// Bring up a 3 nodes cluster
	nodes, clockSource := raftutils.NewRaftCluster(t, tc)
	defer raftutils.TeardownCluster(t, nodes)
	leader := nodes[1]

	// constantly propose values
	//
	// Channel roles:
	//   stop         - closed by the test body to end the proposer goroutine
	//   done         - closed by the goroutine just before it returns
	//   restart      - tells the goroutine that node 3 is being restarted
	//   clusterReady - tells the goroutine that WaitForCluster has returned
	done, stop, restart, clusterReady := make(chan struct{}), make(chan struct{}), make(chan struct{}), make(chan struct{})
	go func() {
		// proposal ids are "id<counter>", with the counter starting at len(nodes)
		counter := len(nodes)
		for {
			select {
			case <-stop:
				close(done)
				return
			case <-restart:
				// the node restarts may trigger a leadership change, so wait until the cluster has 3
				// nodes again and a leader is selected before proposing more values
				<-clusterReady
				leader = raftutils.Leader(nodes)
			default:
				counter += 1
				// the return values of ProposeValue are discarded here
				raftutils.ProposeValue(t, leader, DefaultProposalTime, fmt.Sprintf("id%d", counter))
			}
		}
	}()

	for i := 0; i < 30; i++ {
		// rotate the encryption key
		nodes[3].KeyRotator.QueuePendingKey([]byte(fmt.Sprintf("newKey%d", i)))
		nodes[3].KeyRotator.RotationNotify() <- struct{}{}

		// poll until node 3 no longer reports a pending DEK
		require.NoError(t, raftutils.PollFunc(clockSource, func() error {
			if nodes[3].KeyRotator.GetKeys().PendingDEK == nil {
				return nil
			}
			return fmt.Errorf("not done rotating yet")
		}))

		// restart the node and wait for everything to settle and a leader to be elected
		nodes[3].Server.Stop()
		nodes[3].ShutdownRaft()
		// restart is unbuffered, so this send blocks until the proposer goroutine
		// receives it; the goroutine then waits on clusterReady before it reads nodes
		restart <- struct{}{}
		nodes[3] = raftutils.RestartNode(t, clockSource, nodes[3], false)
		raftutils.AdvanceTicks(clockSource, 1)

		raftutils.WaitForCluster(t, clockSource, nodes)
		clusterReady <- struct{}{}
	}

	// stop the proposer goroutine and wait for it to exit
	close(stop)
	<-done
}
Exemple #5
0
// TestCanRemoveMember checks when node 1 is allowed to remove a raft member:
// removal must be rejected with raft.ErrCannotRemoveMember while nodes 2 and 3
// are stopped, must succeed for node 3 once both are back, and must be
// rejected again for node 2 when only two members are left.
func TestCanRemoveMember(t *testing.T) {
	nodes, clockSource := raftutils.NewRaftCluster(t, tc)
	defer raftutils.TeardownCluster(t, nodes)

	// Stop node 2 and node 3 (2 nodes out of 3)
	nodes[2].Server.Stop()
	nodes[2].Shutdown()
	nodes[3].Server.Stop()
	nodes[3].Shutdown()

	// Node 2 and Node 3 should be listed as Unreachable
	assert.NoError(t, raftutils.PollFunc(clockSource, func() error {
		members := nodes[1].GetMemberlist()
		if len(members) != 3 {
			return fmt.Errorf("expected 3 nodes, got %d", len(members))
		}
		if members[nodes[2].Config.ID].Status.Reachability == api.RaftMemberStatus_REACHABLE {
			return fmt.Errorf("expected node 2 to be unreachable")
		}
		if members[nodes[3].Config.ID].Status.Reachability == api.RaftMemberStatus_REACHABLE {
			return fmt.Errorf("expected node 3 to be unreachable")
		}
		return nil
	}))

	// Removing node 3 should fail. The member is addressed by its raft ID
	// (Config.ID), like the other RemoveMember calls below, not by its map key.
	// The cancel func is deferred so the timeout context is released.
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()
	err := nodes[1].RemoveMember(ctx, nodes[3].Config.ID)
	assert.Error(t, err)
	assert.Equal(t, raft.ErrCannotRemoveMember, err)
	members := nodes[1].GetMemberlist()
	assert.Equal(t, 3, len(members))

	// Restart node 2 and node 3
	nodes[2] = raftutils.RestartNode(t, clockSource, nodes[2], false)
	nodes[3] = raftutils.RestartNode(t, clockSource, nodes[3], false)
	raftutils.WaitForCluster(t, clockSource, nodes)

	// Removing node 3 should succeed
	ctx, cancel = context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()
	err = nodes[1].RemoveMember(ctx, nodes[3].Config.ID)
	assert.NoError(t, err)
	members = nodes[1].GetMemberlist()
	assert.Nil(t, members[nodes[3].Config.ID])
	assert.Equal(t, 2, len(members))

	// Removing node 2 should fail
	ctx, cancel = context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()
	err = nodes[1].RemoveMember(ctx, nodes[2].Config.ID)
	assert.Error(t, err)
	assert.Equal(t, raft.ErrCannotRemoveMember, err)
	// Re-read the member list so the check reflects the state after the
	// rejected removal rather than the snapshot taken before it.
	members = nodes[1].GetMemberlist()
	assert.Equal(t, 2, len(members))
}
Exemple #6
0
// TestRaftLeaderDown stops node 1 and checks that nodes 2 and 3 agree on a new
// leader that is not node 1, and that a value proposed through that leader is
// replicated to both of them.
func TestRaftLeaderDown(t *testing.T) {
	t.Parallel()

	nodes, clockSource := raftutils.NewRaftCluster(t, tc)
	defer raftutils.TeardownCluster(t, nodes)

	// Stop node 1
	nodes[1].Stop()

	newCluster := map[uint64]*raftutils.TestNode{
		2: nodes[2],
		3: nodes[3],
	}
	// Wait for the re-election to occur
	raftutils.WaitForCluster(t, clockSource, newCluster)

	// Leader should not be 1
	assert.NotEqual(t, nodes[1].Config.ID, nodes[2].Leader())

	// Ensure that node 2 and node 3 have the same leader
	assert.Equal(t, nodes[3].Leader(), nodes[2].Leader())

	// Find the leader node and a follower node
	var (
		leaderNode   *raftutils.TestNode
		followerNode *raftutils.TestNode
	)
	for i, n := range newCluster {
		// A node is the leader when the leader ID it reports is its own ID;
		// the other entry of the two-node map is then the follower.
		if n.Config.ID == n.Leader() {
			leaderNode = n
			if i == 2 {
				followerNode = newCluster[3]
			} else {
				followerNode = newCluster[2]
			}
		}
	}

	require.NotNil(t, leaderNode)
	require.NotNil(t, followerNode)

	// Propose a value
	value, err := raftutils.ProposeValue(t, leaderNode)
	assert.NoError(t, err, "failed to propose value")

	// The value should be replicated on all remaining nodes. The member list
	// is still expected to have 3 entries: node 1 was stopped, not removed.
	raftutils.CheckValue(t, clockSource, leaderNode, value)
	assert.Equal(t, 3, len(leaderNode.GetMemberlist()))

	raftutils.CheckValue(t, clockSource, followerNode, value)
	assert.Equal(t, 3, len(followerNode.GetMemberlist()))
}
Exemple #7
0
// TestRaftLeader checks that node 1 is the leader of a freshly created
// cluster and that nodes 1, 2 and 3 all report node 1's ID as the leader.
func TestRaftLeader(t *testing.T) {
	t.Parallel()

	nodes, _ := raftutils.NewRaftCluster(t, tc)
	defer raftutils.TeardownCluster(t, nodes)

	assert.True(t, nodes[1].IsLeader(), "error: node 1 is not the Leader")

	// nodes should all have the same leader; the expected value goes first so
	// that failure messages label expected and actual correctly
	for _, id := range []uint64{1, 2, 3} {
		assert.Equal(t, nodes[1].Config.ID, nodes[id].Leader())
	}
}
Exemple #8
0
// TestRaftLogReplication proposes a value on node 1 and checks that it shows
// up on nodes 1, 2 and 3.
func TestRaftLogReplication(t *testing.T) {
	t.Parallel()

	nodes, clockSource := raftutils.NewRaftCluster(t, tc)
	defer raftutils.TeardownCluster(t, nodes)

	// Send the proposal through node 1.
	proposed, err := raftutils.ProposeValue(t, nodes[1])
	assert.NoError(t, err, "failed to propose value")

	// Every node should end up with the value in its store.
	for _, id := range []uint64{1, 2, 3} {
		raftutils.CheckValue(t, clockSource, nodes[id], proposed)
	}
}
Exemple #9
0
// TestRaftLogReplicationWithoutLeader stops node 1 and checks that a proposal
// sent to node 2 returns an error and that neither node 2 nor node 3 ends up
// with a value.
func TestRaftLogReplicationWithoutLeader(t *testing.T) {
	t.Parallel()
	nodes, clockSource := raftutils.NewRaftCluster(t, tc)
	defer raftutils.TeardownCluster(t, nodes)

	// Take the leader (node 1) down.
	nodes[1].Stop()

	// The proposal is expected to be rejected.
	_, proposeErr := raftutils.ProposeValue(t, nodes[2])
	assert.Error(t, proposeErr)

	// Nothing should have been stored on the remaining nodes.
	for _, id := range []uint64{2, 3} {
		raftutils.CheckNoValue(t, clockSource, nodes[id])
	}
}
Exemple #10
0
// TestRaftFollowerLeave makes node 5 of a 5-node cluster leave through the
// RaftMembership gRPC API and checks that the remaining 4 nodes still
// replicate a proposed value and each report 4 members.
func TestRaftFollowerLeave(t *testing.T) {
	t.Parallel()

	// Bring up a 5 nodes cluster
	nodes, clockSource := raftutils.NewRaftCluster(t, tc)
	raftutils.AddRaftNode(t, clockSource, nodes, tc)
	raftutils.AddRaftNode(t, clockSource, nodes, tc)
	defer raftutils.TeardownCluster(t, nodes)

	// Node 5 leaves the cluster
	// Use gRPC instead of calling handler directly because of
	// authorization check.
	cc, err := dial(nodes[1], nodes[1].Address)
	if !assert.NoError(t, err) {
		// Without a connection the rest of the test cannot run.
		t.FailNow()
	}
	raftClient := api.NewRaftMembershipClient(cc)
	defer cc.Close()
	// Keep the cancel func and defer it so the timeout context is released.
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()
	resp, err := raftClient.Leave(ctx, &api.LeaveRequest{Node: &api.RaftMember{RaftID: nodes[5].Config.ID}})
	assert.NoError(t, err, "error sending message to leave the raft")
	assert.NotNil(t, resp, "leave response message is nil")

	raftutils.ShutdownNode(nodes[5])
	delete(nodes, 5)

	raftutils.WaitForPeerNumber(t, clockSource, nodes, 4)

	// Propose a value
	value, err := raftutils.ProposeValue(t, nodes[1], DefaultProposalTime)
	assert.NoError(t, err, "failed to propose value")

	// Value should be replicated on every node, and every node should see
	// exactly the 4 remaining members
	for _, id := range []uint64{1, 2, 3, 4} {
		raftutils.CheckValue(t, clockSource, nodes[id], value)
		assert.Len(t, nodes[id].GetMemberlist(), 4)
	}
}
Exemple #11
0
// TestRaftJoinTwice checks that a Join request sent with node 3's credentials
// to node 1, after node 3 is already part of the cluster, is rejected with
// codes.AlreadyExists.
func TestRaftJoinTwice(t *testing.T) {
	t.Parallel()

	nodes, _ := raftutils.NewRaftCluster(t, tc)
	defer raftutils.TeardownCluster(t, nodes)

	// Node 3 tries to join again
	// Use gRPC instead of calling handler directly because of
	// authorization check.
	cc, err := dial(nodes[3], nodes[1].Address)
	if !assert.NoError(t, err) {
		// Without a connection the rest of the test cannot run.
		t.FailNow()
	}
	raftClient := api.NewRaftMembershipClient(cc)
	defer cc.Close()
	// Keep the cancel func and defer it so the timeout context is released.
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()
	_, err = raftClient.Join(ctx, &api.JoinRequest{})
	assert.Error(t, err, "expected error on duplicate Join")
	// Expected value first, so failure messages label the two sides correctly.
	assert.Equal(t, codes.AlreadyExists, grpc.Code(err))
	assert.Equal(t, "a raft member with this node ID already exists", grpc.ErrorDesc(err))
}
Exemple #12
0
// TestRaftWipedState stops node 3, deletes its state directory, restarts it
// and then keeps polling for a second to make sure nothing panics.
func TestRaftWipedState(t *testing.T) {
	t.Parallel()

	nodes, clockSource := raftutils.NewRaftCluster(t, tc)
	defer raftutils.TeardownCluster(t, nodes)

	// Stop node 3
	nodes[3].Server.Stop()
	nodes[3].ShutdownRaft()

	// Remove its state. If this fails the node would restart with its old
	// state and the test would not exercise the wiped-state path, so abort.
	if err := os.RemoveAll(nodes[3].StateDir); err != nil {
		t.Fatalf("removing state dir %q: %v", nodes[3].StateDir, err)
	}

	raftutils.AdvanceTicks(clockSource, 5)

	// Restart node 3
	nodes[3] = raftutils.RestartNode(t, clockSource, nodes[3], false)

	// Make sure this doesn't panic. The callback always returns an error so
	// that polling continues for the whole timeout; the outcome of the poll
	// itself is not checked.
	raftutils.PollFuncWithTimeout(clockSource, func() error { return errors.New("keep the poll going") }, time.Second)
}
Exemple #13
0
// TestRaftNewNodeGetsData proposes a value on a 3-node cluster, adds a fourth
// node and checks that every node, including the new one, has the value and
// reports 4 members.
func TestRaftNewNodeGetsData(t *testing.T) {
	t.Parallel()

	// Bring up a 3 node cluster
	nodes, clockSource := raftutils.NewRaftCluster(t, tc)
	defer raftutils.TeardownCluster(t, nodes)

	// Propose a value
	value, err := raftutils.ProposeValue(t, nodes[1])
	assert.NoError(t, err, "failed to propose value")

	// Add a new node
	raftutils.AddRaftNode(t, clockSource, nodes, tc)

	// NOTE(review): fixed wall-clock wait before checking the member lists;
	// presumably gives the new node time to catch up - confirm whether a
	// poll could replace it.
	time.Sleep(500 * time.Millisecond)

	// Value should be replicated on every node
	for _, node := range nodes {
		raftutils.CheckValue(t, clockSource, node, value)
		// Expected value first, so failure messages label the sides correctly.
		assert.Equal(t, 4, len(node.GetMemberlist()))
	}
}
Exemple #14
0
// TestRaftQuorumFailure stops three nodes of a 5-node cluster and checks that
// a proposal on node 1 returns an error and that no value shows up on the two
// nodes that are still running.
func TestRaftQuorumFailure(t *testing.T) {
	t.Parallel()

	// Start from the base cluster and add two nodes to reach 5 members.
	nodes, clockSource := raftutils.NewRaftCluster(t, tc)
	for added := 0; added < 2; added++ {
		raftutils.AddRaftNode(t, clockSource, nodes, tc)
	}
	defer raftutils.TeardownCluster(t, nodes)

	// Stop nodes 3, 4 and 5: three out of five is a majority.
	for _, id := range []uint64{3, 4, 5} {
		victim := nodes[id]
		victim.Server.Stop()
		victim.Stop()
	}

	// With the majority gone the proposal is expected to fail.
	_, proposeErr := raftutils.ProposeValue(t, nodes[1])
	assert.Error(t, proposeErr)

	// Neither surviving node should have stored a value.
	for _, id := range []uint64{2, 1} {
		raftutils.CheckNoValue(t, clockSource, nodes[id])
	}
}
Exemple #15
0
// TestRaftFollowerDown stops node 3 and checks that node 1 stays the leader
// and that a value proposed on node 1 is still replicated to nodes 1 and 2.
func TestRaftFollowerDown(t *testing.T) {
	t.Parallel()

	nodes, clockSource := raftutils.NewRaftCluster(t, tc)
	defer raftutils.TeardownCluster(t, nodes)

	// Stop node 3
	nodes[3].Stop()

	// Leader should still be 1
	assert.True(t, nodes[1].IsLeader(), "node 1 is not a leader anymore")
	assert.Equal(t, nodes[1].Config.ID, nodes[2].Leader())

	// Propose a value
	value, err := raftutils.ProposeValue(t, nodes[1])
	assert.NoError(t, err, "failed to propose value")

	// The value should be replicated on all remaining nodes. The member list
	// is still expected to have 3 entries: node 3 was stopped, not removed.
	for _, id := range []uint64{1, 2} {
		raftutils.CheckValue(t, clockSource, nodes[id], value)
		assert.Equal(t, 3, len(nodes[id].GetMemberlist()))
	}
}
Exemple #16
0
// TestRaftRejoin checks that a node which was stopped while a value was
// proposed receives that value after it has been restarted.
func TestRaftRejoin(t *testing.T) {
	t.Parallel()

	nodes, clockSource := raftutils.NewRaftCluster(t, tc)
	defer raftutils.TeardownCluster(t, nodes)

	ids := []string{"id1", "id2"}

	// Propose a value
	values := make([]*api.Node, 2)
	var err error
	values[0], err = raftutils.ProposeValue(t, nodes[1], ids[0])
	assert.NoError(t, err, "failed to propose value")

	// The value should be replicated on node 3
	raftutils.CheckValue(t, clockSource, nodes[3], values[0])
	// Expected value first, so failure messages label the sides correctly.
	assert.Equal(t, 3, len(nodes[3].GetMemberlist()))

	// Stop node 3
	nodes[3].Server.Stop()
	nodes[3].Shutdown()

	// Propose another value
	values[1], err = raftutils.ProposeValue(t, nodes[1], ids[1])
	assert.NoError(t, err, "failed to propose value")

	// Nodes 1 and 2 should have the new value
	raftutils.CheckValuesOnNodes(t, clockSource, map[uint64]*raftutils.TestNode{1: nodes[1], 2: nodes[2]}, ids, values)

	nodes[3] = raftutils.RestartNode(t, clockSource, nodes[3], false)
	raftutils.WaitForCluster(t, clockSource, nodes)

	// Node 3 should have all values, including the one proposed while
	// it was unavailable.
	raftutils.CheckValuesOnNodes(t, clockSource, nodes, ids, values)
}
Exemple #17
0
// TestRaftForceNewCluster stops a whole cluster, restarts only node 1 with
// the force-new-cluster option, joins two fresh nodes to it and checks that
// the value proposed before the restart and a value proposed afterwards are
// both present on every node.
func TestRaftForceNewCluster(t *testing.T) {
	t.Parallel()

	nodes, clockSource := raftutils.NewRaftCluster(t, tc)

	// Propose a value
	values := make([]*api.Node, 2)
	var err error
	values[0], err = raftutils.ProposeValue(t, nodes[1], "id1")
	assert.NoError(t, err, "failed to propose value")

	// The memberlist should contain 3 members on each node
	for i := uint64(1); i <= 3; i++ {
		assert.Equal(t, 3, len(nodes[i].GetMemberlist()))
	}

	// Stop all nodes
	for _, node := range nodes {
		node.Server.Stop()
		node.Shutdown()
	}

	raftutils.AdvanceTicks(clockSource, 5)

	// Nodes 2 and 3 are not restarted: tear them down now and drop them from
	// the map so their slots can be reused by the nodes that join below.
	toClean := map[uint64]*raftutils.TestNode{
		2: nodes[2],
		3: nodes[3],
	}
	raftutils.TeardownCluster(t, toClean)
	delete(nodes, 2)
	delete(nodes, 3)

	// Only restart the first node with force-new-cluster option
	nodes[1] = raftutils.RestartNode(t, clockSource, nodes[1], true)
	raftutils.WaitForCluster(t, clockSource, nodes)

	// The memberlist should contain only one node (self)
	assert.Equal(t, 1, len(nodes[1].GetMemberlist()))

	// Add 2 more members
	nodes[2] = raftutils.NewJoinNode(t, clockSource, nodes[1].Address, tc)
	raftutils.WaitForCluster(t, clockSource, nodes)

	nodes[3] = raftutils.NewJoinNode(t, clockSource, nodes[1].Address, tc)
	raftutils.WaitForCluster(t, clockSource, nodes)

	newCluster := map[uint64]*raftutils.TestNode{
		1: nodes[1],
		2: nodes[2],
		3: nodes[3],
	}
	defer raftutils.TeardownCluster(t, newCluster)

	// The memberlist should contain 3 members on each node
	for i := uint64(1); i <= 3; i++ {
		assert.Equal(t, 3, len(nodes[i].GetMemberlist()))
	}

	// Propose another value
	values[1], err = raftutils.ProposeValue(t, raftutils.Leader(nodes), "id2")
	assert.NoError(t, err, "failed to propose value")

	// Poll each node until its store holds exactly the two proposed values.
	for _, node := range nodes {
		assert.NoError(t, raftutils.PollFunc(clockSource, func() error {
			var err error
			node.MemoryStore().View(func(tx store.ReadTx) {
				var allNodes []*api.Node
				allNodes, err = store.FindNodes(tx, store.All)
				if err != nil {
					return
				}
				if len(allNodes) != 2 {
					err = fmt.Errorf("expected 2 nodes, got %d", len(allNodes))
					return
				}

				for i, nodeID := range []string{"id1", "id2"} {
					n := store.GetNode(tx, nodeID)
					if !reflect.DeepEqual(n, values[i]) {
						err = fmt.Errorf("node %s did not match expected value", nodeID)
						return
					}
				}
			})
			return err
		}))
	}
}
Exemple #18
0
// TestListManagerNodes checks the Leader and Reachability fields that
// ListNodes reports for manager nodes while raft members are added, stopped
// and restarted, and after the leader (node 1) has been stopped and brought
// back.
func TestListManagerNodes(t *testing.T) {
	t.Parallel()

	tc := cautils.NewTestCA(nil)
	defer tc.Stop()
	ts := newTestServer(t)
	defer ts.Stop()

	nodes, clockSource := raftutils.NewRaftCluster(t, tc)
	defer raftutils.TeardownCluster(t, nodes)

	// Create a node object for each of the managers
	assert.NoError(t, nodes[1].MemoryStore().Update(func(tx store.Tx) error {
		assert.NoError(t, store.CreateNode(tx, &api.Node{ID: nodes[1].SecurityConfig.ClientTLSCreds.NodeID()}))
		assert.NoError(t, store.CreateNode(tx, &api.Node{ID: nodes[2].SecurityConfig.ClientTLSCreds.NodeID()}))
		assert.NoError(t, store.CreateNode(tx, &api.Node{ID: nodes[3].SecurityConfig.ClientTLSCreds.NodeID()}))
		return nil
	}))

	// Assign one of the raft node to the test server
	ts.Server.raft = nodes[1].Node
	ts.Server.store = nodes[1].MemoryStore()

	// There should be 3 reachable managers listed
	r, err := ts.Client.ListNodes(context.Background(), &api.ListNodesRequest{})
	assert.NoError(t, err)
	assert.NotNil(t, r)
	// managers is indexed by raft ID (Config.ID) in all the lookups below
	managers := getMap(t, r.Nodes)
	assert.Len(t, ts.Server.raft.GetMemberlist(), 3)
	assert.Len(t, r.Nodes, 3)

	// Node 1 should be the leader
	for i := 1; i <= 3; i++ {
		if i == 1 {
			assert.True(t, managers[nodes[uint64(i)].Config.ID].Leader)
			continue
		}
		assert.False(t, managers[nodes[uint64(i)].Config.ID].Leader)
	}

	// All nodes should be reachable
	for i := 1; i <= 3; i++ {
		assert.Equal(t, api.RaftMemberStatus_REACHABLE, managers[nodes[uint64(i)].Config.ID].Reachability)
	}

	// Add two more nodes to the cluster
	raftutils.AddRaftNode(t, clockSource, nodes, tc)
	raftutils.AddRaftNode(t, clockSource, nodes, tc)
	raftutils.WaitForCluster(t, clockSource, nodes)

	// Add node entries for these
	assert.NoError(t, nodes[1].MemoryStore().Update(func(tx store.Tx) error {
		assert.NoError(t, store.CreateNode(tx, &api.Node{ID: nodes[4].SecurityConfig.ClientTLSCreds.NodeID()}))
		assert.NoError(t, store.CreateNode(tx, &api.Node{ID: nodes[5].SecurityConfig.ClientTLSCreds.NodeID()}))
		return nil
	}))

	// There should be 5 reachable managers listed
	r, err = ts.Client.ListNodes(context.Background(), &api.ListNodesRequest{})
	assert.NoError(t, err)
	assert.NotNil(t, r)
	managers = getMap(t, r.Nodes)
	assert.Len(t, ts.Server.raft.GetMemberlist(), 5)
	assert.Len(t, r.Nodes, 5)
	for i := 1; i <= 5; i++ {
		assert.Equal(t, api.RaftMemberStatus_REACHABLE, managers[nodes[uint64(i)].Config.ID].Reachability)
	}

	// Stops 2 nodes
	nodes[4].Server.Stop()
	nodes[4].ShutdownRaft()
	nodes[5].Server.Stop()
	nodes[5].ShutdownRaft()

	// Node 4 and Node 5 should be listed as Unreachable
	// (r, err and managers are the outer variables; each poll attempt overwrites them)
	assert.NoError(t, raftutils.PollFunc(clockSource, func() error {
		r, err = ts.Client.ListNodes(context.Background(), &api.ListNodesRequest{})
		if err != nil {
			return err
		}

		managers = getMap(t, r.Nodes)

		if len(r.Nodes) != 5 {
			return fmt.Errorf("expected 5 nodes, got %d", len(r.Nodes))
		}

		if managers[nodes[4].Config.ID].Reachability == api.RaftMemberStatus_REACHABLE {
			return fmt.Errorf("expected node 4 to be unreachable")
		}

		if managers[nodes[5].Config.ID].Reachability == api.RaftMemberStatus_REACHABLE {
			return fmt.Errorf("expected node 5 to be unreachable")
		}

		return nil
	}))

	// Restart the 2 nodes
	nodes[4] = raftutils.RestartNode(t, clockSource, nodes[4], false)
	nodes[5] = raftutils.RestartNode(t, clockSource, nodes[5], false)
	raftutils.WaitForCluster(t, clockSource, nodes)

	assert.Len(t, ts.Server.raft.GetMemberlist(), 5)
	// All the nodes should be reachable again
	assert.NoError(t, raftutils.PollFunc(clockSource, func() error {
		r, err = ts.Client.ListNodes(context.Background(), &api.ListNodesRequest{})
		if err != nil {
			return err
		}
		managers = getMap(t, r.Nodes)
		for i := 1; i <= 5; i++ {
			if managers[nodes[uint64(i)].Config.ID].Reachability != api.RaftMemberStatus_REACHABLE {
				return fmt.Errorf("node %x is unreachable", nodes[uint64(i)].Config.ID)
			}
		}
		return nil
	}))

	// Switch the raft node used by the server
	// (only ts.Server.raft is changed; ts.Server.store still refers to node 1's store)
	ts.Server.raft = nodes[2].Node

	// Stop node 1 (leader)
	nodes[1].Server.Stop()
	nodes[1].ShutdownRaft()

	newCluster := map[uint64]*raftutils.TestNode{
		2: nodes[2],
		3: nodes[3],
		4: nodes[4],
		5: nodes[5],
	}

	// Wait for the re-election to occur
	raftutils.WaitForCluster(t, clockSource, newCluster)

	// Node 1 should not be the leader anymore
	assert.NoError(t, raftutils.PollFunc(clockSource, func() error {
		r, err = ts.Client.ListNodes(context.Background(), &api.ListNodesRequest{})
		if err != nil {
			return err
		}

		managers = getMap(t, r.Nodes)

		if managers[nodes[1].Config.ID].Leader {
			return fmt.Errorf("expected node 1 not to be the leader")
		}

		if managers[nodes[1].Config.ID].Reachability == api.RaftMemberStatus_REACHABLE {
			return fmt.Errorf("expected node 1 to be unreachable")
		}

		return nil
	}))

	// Restart node 1
	// NOTE(review): ShutdownRaft was already called on node 1 above; this is a second call.
	nodes[1].ShutdownRaft()
	nodes[1] = raftutils.RestartNode(t, clockSource, nodes[1], false)
	raftutils.WaitForCluster(t, clockSource, nodes)

	// Ensure that node 1 is not the leader
	// NOTE(review): managers was last refreshed by the poll above, before node 1
	// was restarted; the checks below read that snapshot, not a fresh ListNodes.
	assert.False(t, managers[nodes[uint64(1)].Config.ID].Leader)

	// Check that another node got the leader status
	var leader uint64
	leaderCount := 0
	for i := 1; i <= 5; i++ {
		if managers[nodes[uint64(i)].Config.ID].Leader {
			leader = nodes[uint64(i)].Config.ID
			leaderCount++
		}
	}

	// There should be only one leader after node 1 recovery and it
	// should be different than node 1
	assert.Equal(t, 1, leaderCount)
	assert.NotEqual(t, leader, nodes[1].Config.ID)
}
Exemple #19
0
// testUpdateNodeDemote exercises demoting managers to workers through the
// UpdateNode API on a 3-manager cluster: a demotion is rejected while node 3
// is unreachable, succeeds for node 3 once it is back, then the cluster is
// scaled down to a single manager, which cannot be demoted but still accepts
// other spec updates.
//
// When leader is true, node 1 is the manager demoted in the scale-down-to-one
// step and node 2 is the last manager; otherwise the roles are swapped.
func testUpdateNodeDemote(leader bool, t *testing.T) {
	tc := cautils.NewTestCA(nil)
	defer tc.Stop()
	ts := newTestServer(t)
	defer ts.Stop()

	nodes, clockSource := raftutils.NewRaftCluster(t, tc)
	defer raftutils.TeardownCluster(t, nodes)

	// Assign one of the raft node to the test server
	ts.Server.raft = nodes[1].Node
	ts.Server.store = nodes[1].MemoryStore()

	// demote reads n's node object through the API, sets its role to worker
	// and submits the change with the version just read. It returns the error
	// from UpdateNode. A failed GetNode aborts the test, since there is then
	// no response to build the update from.
	demote := func(n *raftutils.TestNode) error {
		nodeID := n.SecurityConfig.ClientTLSCreds.NodeID()
		r, err := ts.Client.GetNode(context.Background(), &api.GetNodeRequest{NodeID: nodeID})
		if !assert.NoError(t, err) {
			t.FailNow()
		}
		spec := r.Node.Spec.Copy()
		spec.Role = api.NodeRoleWorker
		_, err = ts.Client.UpdateNode(context.Background(), &api.UpdateNodeRequest{
			NodeID:      nodeID,
			Spec:        spec,
			NodeVersion: &r.Node.Meta.Version,
		})
		return err
	}

	// Create a node object for each of the managers
	assert.NoError(t, nodes[1].MemoryStore().Update(func(tx store.Tx) error {
		for _, id := range []uint64{1, 2, 3} {
			assert.NoError(t, store.CreateNode(tx, &api.Node{
				ID: nodes[id].SecurityConfig.ClientTLSCreds.NodeID(),
				Spec: api.NodeSpec{
					Role:       api.NodeRoleManager,
					Membership: api.NodeMembershipAccepted,
				},
			}))
		}
		return nil
	}))

	// Stop Node 3 (1 node out of 3)
	nodes[3].Server.Stop()
	nodes[3].ShutdownRaft()

	// Node 3 should be listed as Unreachable
	assert.NoError(t, raftutils.PollFunc(clockSource, func() error {
		members := nodes[1].GetMemberlist()
		if len(members) != 3 {
			return fmt.Errorf("expected 3 nodes, got %d", len(members))
		}
		if members[nodes[3].Config.ID].Status.Reachability == api.RaftMemberStatus_REACHABLE {
			return fmt.Errorf("expected node 3 to be unreachable")
		}
		return nil
	}))

	// Try to demote Node 2, this should fail because of the quorum safeguard
	err := demote(nodes[2])
	assert.Error(t, err)
	assert.Equal(t, codes.FailedPrecondition, grpc.Code(err))

	// Restart Node 3
	nodes[3] = raftutils.RestartNode(t, clockSource, nodes[3], false)
	raftutils.WaitForCluster(t, clockSource, nodes)

	// Node 3 should be listed as Reachable
	assert.NoError(t, raftutils.PollFunc(clockSource, func() error {
		members := nodes[1].GetMemberlist()
		if len(members) != 3 {
			return fmt.Errorf("expected 3 nodes, got %d", len(members))
		}
		if members[nodes[3].Config.ID].Status.Reachability == api.RaftMemberStatus_UNREACHABLE {
			return fmt.Errorf("expected node 3 to be reachable")
		}
		return nil
	}))

	// Try to demote Node 3, this should succeed
	assert.NoError(t, demote(nodes[3]))

	newCluster := map[uint64]*raftutils.TestNode{
		1: nodes[1],
		2: nodes[2],
	}

	raftutils.WaitForCluster(t, clockSource, newCluster)

	// Server should list 2 members
	assert.NoError(t, raftutils.PollFunc(clockSource, func() error {
		members := nodes[1].GetMemberlist()
		if len(members) != 2 {
			return fmt.Errorf("expected 2 nodes, got %d", len(members))
		}
		return nil
	}))

	var demoteNode, lastNode *raftutils.TestNode
	if leader {
		demoteNode = nodes[1]
		lastNode = nodes[2]
	} else {
		demoteNode = nodes[2]
		lastNode = nodes[1]
	}

	// Try to demote a Node and scale down to 1
	assert.NoError(t, demote(demoteNode))

	// Update the server
	ts.Server.raft = lastNode.Node
	ts.Server.store = lastNode.MemoryStore()

	newCluster = map[uint64]*raftutils.TestNode{
		1: lastNode,
	}

	raftutils.WaitForCluster(t, clockSource, newCluster)

	assert.NoError(t, raftutils.PollFunc(clockSource, func() error {
		members := lastNode.GetMemberlist()
		if len(members) != 1 {
			return fmt.Errorf("expected 1 node, got %d", len(members))
		}
		return nil
	}))

	// Make sure we can't demote the last manager.
	err = demote(lastNode)
	assert.Error(t, err)
	assert.Equal(t, codes.FailedPrecondition, grpc.Code(err))

	// Propose a change in the spec and check if the remaining node can still process updates
	lastID := lastNode.SecurityConfig.ClientTLSCreds.NodeID()
	r, err := ts.Client.GetNode(context.Background(), &api.GetNodeRequest{NodeID: lastID})
	if !assert.NoError(t, err) {
		t.FailNow()
	}
	spec := r.Node.Spec.Copy()
	spec.Availability = api.NodeAvailabilityDrain
	_, err = ts.Client.UpdateNode(context.Background(), &api.UpdateNodeRequest{
		NodeID:      lastID,
		Spec:        spec,
		NodeVersion: &r.Node.Meta.Version,
	})
	assert.NoError(t, err)

	// Get node information and check that the availability is set to drain
	r, err = ts.Client.GetNode(context.Background(), &api.GetNodeRequest{NodeID: lastID})
	if !assert.NoError(t, err) {
		t.FailNow()
	}
	assert.Equal(t, api.NodeAvailabilityDrain, r.Node.Spec.Availability)
}
Exemple #20
0
// TestRaftLeaderLeave makes node 1 (the leader) leave the cluster through the
// RaftMembership gRPC API and checks that nodes 2 and 3 agree on a new leader
// and still replicate a proposed value, each reporting 2 members.
func TestRaftLeaderLeave(t *testing.T) {
	t.Parallel()

	nodes, clockSource := raftutils.NewRaftCluster(t, tc)

	// node 1 is the leader
	assert.Equal(t, nodes[1].Config.ID, nodes[1].Leader())

	// Try to leave the raft
	// Use gRPC instead of calling handler directly because of
	// authorization check.
	client, err := nodes[1].ConnectToMember(nodes[1].Address, 10*time.Second)
	// Abort on failure: client is dereferenced right below.
	require.NoError(t, err)
	defer client.Conn.Close()
	raftClient := api.NewRaftMembershipClient(client.Conn)
	// Keep the cancel func and defer it so the timeout context is released.
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()
	resp, err := raftClient.Leave(ctx, &api.LeaveRequest{Node: &api.RaftMember{RaftID: nodes[1].Config.ID}})
	assert.NoError(t, err, "error sending message to leave the raft")
	assert.NotNil(t, resp, "leave response message is nil")

	newCluster := map[uint64]*raftutils.TestNode{
		2: nodes[2],
		3: nodes[3],
	}
	// Deferred so that nodes 2 and 3 are torn down even when one of the
	// require checks below aborts the test.
	// NOTE(review): node 1 is not passed to TeardownCluster anywhere in this
	// test - confirm that leaving the cluster is enough to clean it up.
	defer raftutils.TeardownCluster(t, newCluster)

	// Wait for election tick
	raftutils.WaitForCluster(t, clockSource, newCluster)

	// Leader should not be 1
	assert.NotEqual(t, nodes[1].Config.ID, nodes[2].Leader())
	assert.Equal(t, nodes[2].Leader(), nodes[3].Leader())

	leader := nodes[2].Leader()

	// Find the leader node and a follower node
	var (
		leaderNode   *raftutils.TestNode
		followerNode *raftutils.TestNode
	)
	for i, n := range nodes {
		if n.Config.ID == leader {
			leaderNode = n
			if i == 2 {
				followerNode = nodes[3]
			} else {
				followerNode = nodes[2]
			}
		}
	}

	require.NotNil(t, leaderNode)
	require.NotNil(t, followerNode)

	// Propose a value
	value, err := raftutils.ProposeValue(t, leaderNode)
	assert.NoError(t, err, "failed to propose value")

	// The value should be replicated on all remaining nodes
	raftutils.CheckValue(t, clockSource, leaderNode, value)
	assert.Equal(t, 2, len(leaderNode.GetMemberlist()))

	raftutils.CheckValue(t, clockSource, followerNode, value)
	assert.Equal(t, 2, len(followerNode.GetMemberlist()))
}
Exemple #21
0
// TestCanRemoveMember checks the safeguard around member removal on a
// three-node cluster, always issuing the removal from node 1:
//   - with nodes 2 and 3 stopped, every removal is rejected with
//     raft.ErrCannotRemoveMember;
//   - with only node 3 stopped, removing node 2 is rejected while removing
//     node 3 succeeds;
//   - after a replacement node 3 has joined, nodes 3 and 2 can be removed in
//     turn, leaving a single member.
func TestCanRemoveMember(t *testing.T) {
	nodes, clockSource := raftutils.NewRaftCluster(t, tc)
	defer raftutils.TeardownCluster(t, nodes)

	// removeMember asks node 1 to remove the member with the given raft ID.
	// The call is bounded to 10 seconds and the context is always cancelled
	// afterwards so its timer is released immediately.
	removeMember := func(id uint64) error {
		ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
		defer cancel()
		return nodes[1].RemoveMember(ctx, id)
	}

	// Stop node 2 and node 3 (2 nodes out of 3)
	nodes[2].Server.Stop()
	nodes[2].Shutdown()
	nodes[3].Server.Stop()
	nodes[3].Shutdown()

	// Node 2 and Node 3 should be listed as Unreachable
	assert.NoError(t, raftutils.PollFunc(clockSource, func() error {
		members := nodes[1].GetMemberlist()
		if len(members) != 3 {
			return fmt.Errorf("expected 3 nodes, got %d", len(members))
		}
		if members[nodes[2].Config.ID].Status.Reachability == api.RaftMemberStatus_REACHABLE {
			return fmt.Errorf("expected node 2 to be unreachable")
		}
		if members[nodes[3].Config.ID].Status.Reachability == api.RaftMemberStatus_REACHABLE {
			return fmt.Errorf("expected node 3 to be unreachable")
		}
		return nil
	}))

	// Removing all nodes should fail
	for i := 1; i <= 3; i++ {
		err := removeMember(uint64(i))
		assert.Error(t, err)
		assert.Equal(t, err, raft.ErrCannotRemoveMember)
		members := nodes[1].GetMemberlist()
		assert.Equal(t, len(members), 3)
	}

	// Restart node 2 and node 3
	nodes[2] = raftutils.RestartNode(t, clockSource, nodes[2], false)
	nodes[3] = raftutils.RestartNode(t, clockSource, nodes[3], false)
	raftutils.WaitForCluster(t, clockSource, nodes)

	// Node 2 and Node 3 should be listed as Reachable
	assert.NoError(t, raftutils.PollFunc(clockSource, func() error {
		members := nodes[1].GetMemberlist()
		if len(members) != 3 {
			return fmt.Errorf("expected 3 nodes, got %d", len(members))
		}
		if members[nodes[2].Config.ID].Status.Reachability == api.RaftMemberStatus_UNREACHABLE {
			return fmt.Errorf("expected node 2 to be reachable")
		}
		if members[nodes[3].Config.ID].Status.Reachability == api.RaftMemberStatus_UNREACHABLE {
			return fmt.Errorf("expected node 3 to be reachable")
		}
		return nil
	}))

	// Stop Node 3 (1 node out of 3)
	nodes[3].Server.Stop()
	nodes[3].Shutdown()

	// Node 3 should be listed as Unreachable
	assert.NoError(t, raftutils.PollFunc(clockSource, func() error {
		members := nodes[1].GetMemberlist()
		if len(members) != 3 {
			return fmt.Errorf("expected 3 nodes, got %d", len(members))
		}
		if members[nodes[3].Config.ID].Status.Reachability == api.RaftMemberStatus_REACHABLE {
			return fmt.Errorf("expected node 3 to be unreachable")
		}
		return nil
	}))

	// Removing node 2 should fail (this would break the quorum)
	err := removeMember(nodes[2].Config.ID)
	assert.Error(t, err)
	assert.Equal(t, err, raft.ErrCannotRemoveMember)
	members := nodes[1].GetMemberlist()
	assert.Equal(t, len(members), 3)

	// Removing node 3 works fine because it is already unreachable
	err = removeMember(nodes[3].Config.ID)
	assert.NoError(t, err)
	members = nodes[1].GetMemberlist()
	assert.Nil(t, members[nodes[3].Config.ID])
	assert.Equal(t, len(members), 2)

	// Add back node 3
	raftutils.ShutdownNode(nodes[3])
	delete(nodes, 3)
	raftutils.AddRaftNode(t, clockSource, nodes, tc)

	// Node 2 and Node 3 should be listed as Reachable
	assert.NoError(t, raftutils.PollFunc(clockSource, func() error {
		members := nodes[1].GetMemberlist()
		if len(members) != 3 {
			return fmt.Errorf("expected 3 nodes, got %d", len(members))
		}
		if members[nodes[2].Config.ID].Status.Reachability != api.RaftMemberStatus_REACHABLE {
			return fmt.Errorf("expected node 2 to be reachable")
		}
		if members[nodes[3].Config.ID].Status.Reachability != api.RaftMemberStatus_REACHABLE {
			return fmt.Errorf("expected node 3 to be reachable")
		}
		return nil
	}))

	// Removing node 3 should succeed
	err = removeMember(nodes[3].Config.ID)
	assert.NoError(t, err)
	members = nodes[1].GetMemberlist()
	assert.Nil(t, members[nodes[3].Config.ID])
	assert.Equal(t, len(members), 2)

	// Removing node 2 should succeed
	err = removeMember(nodes[2].Config.ID)
	assert.NoError(t, err)
	members = nodes[1].GetMemberlist()
	assert.Nil(t, members[nodes[2].Config.ID])
	assert.Equal(t, len(members), 1)
}
Exemple #22
0
func TestRaftSnapshotRestart(t *testing.T) {
	t.Parallel()

	// Bring up a 3 node cluster
	nodes, clockSource := raftutils.NewRaftCluster(t, tc, &api.RaftConfig{SnapshotInterval: 10, LogEntriesForSlowFollowers: 0})
	defer raftutils.TeardownCluster(t, nodes)

	nodeIDs := []string{"id1", "id2", "id3", "id4", "id5", "id6", "id7"}
	values := make([]*api.Node, len(nodeIDs))

	// Propose 3 values
	var err error
	for i, nodeID := range nodeIDs[:3] {
		values[i], err = raftutils.ProposeValue(t, nodes[1], DefaultProposalTime, nodeID)
		assert.NoError(t, err, "failed to propose value")
	}

	// Take down node 3
	nodes[3].Server.Stop()
	nodes[3].Shutdown()

	// Propose a 4th value before the snapshot
	values[3], err = raftutils.ProposeValue(t, nodes[1], DefaultProposalTime, nodeIDs[3])
	assert.NoError(t, err, "failed to propose value")

	// Remaining nodes shouldn't have snapshot files yet
	for _, node := range []*raftutils.TestNode{nodes[1], nodes[2]} {
		dirents, err := ioutil.ReadDir(filepath.Join(node.StateDir, "snap"))
		assert.NoError(t, err)
		assert.Len(t, dirents, 0)
	}

	// Add a node to the cluster before the snapshot. This is the event
	// that triggers the snapshot.
	nodes[4] = raftutils.NewJoinNode(t, clockSource, nodes[1].Address, tc)
	raftutils.WaitForCluster(t, clockSource, map[uint64]*raftutils.TestNode{1: nodes[1], 2: nodes[2], 4: nodes[4]})

	// Remaining nodes should now have a snapshot file
	for nodeIdx, node := range []*raftutils.TestNode{nodes[1], nodes[2]} {
		assert.NoError(t, raftutils.PollFunc(clockSource, func() error {
			dirents, err := ioutil.ReadDir(filepath.Join(node.StateDir, "snap"))
			if err != nil {
				return err
			}
			if len(dirents) != 1 {
				return fmt.Errorf("expected 1 snapshot, found %d on node %d", len(dirents), nodeIdx+1)
			}
			return nil
		}))
	}
	raftutils.CheckValuesOnNodes(t, clockSource, map[uint64]*raftutils.TestNode{1: nodes[1], 2: nodes[2]}, nodeIDs[:4], values[:4])

	// Propose a 5th value
	values[4], err = raftutils.ProposeValue(t, nodes[1], DefaultProposalTime, nodeIDs[4])
	require.NoError(t, err)

	// Add another node to the cluster
	nodes[5] = raftutils.NewJoinNode(t, clockSource, nodes[1].Address, tc)
	raftutils.WaitForCluster(t, clockSource, map[uint64]*raftutils.TestNode{1: nodes[1], 2: nodes[2], 4: nodes[4], 5: nodes[5]})

	// New node should get a copy of the snapshot
	assert.NoError(t, raftutils.PollFunc(clockSource, func() error {
		dirents, err := ioutil.ReadDir(filepath.Join(nodes[5].StateDir, "snap"))
		if err != nil {
			return err
		}
		if len(dirents) != 1 {
			return fmt.Errorf("expected 1 snapshot, found %d on new node", len(dirents))
		}
		return nil
	}))

	dirents, err := ioutil.ReadDir(filepath.Join(nodes[5].StateDir, "snap"))
	assert.NoError(t, err)
	assert.Len(t, dirents, 1)
	raftutils.CheckValuesOnNodes(t, clockSource, map[uint64]*raftutils.TestNode{1: nodes[1], 2: nodes[2]}, nodeIDs[:5], values[:5])

	// It should know about the other nodes, including the one that was just added
	stripMembers := func(memberList map[uint64]*api.RaftMember) map[uint64]*api.RaftMember {
		raftNodes := make(map[uint64]*api.RaftMember)
		for k, v := range memberList {
			raftNodes[k] = &api.RaftMember{
				RaftID: v.RaftID,
				Addr:   v.Addr,
			}
		}
		return raftNodes
	}
	assert.Equal(t, stripMembers(nodes[1].GetMemberlist()), stripMembers(nodes[4].GetMemberlist()))

	// Restart node 3
	nodes[3] = raftutils.RestartNode(t, clockSource, nodes[3], false)
	raftutils.WaitForCluster(t, clockSource, nodes)

	// Node 3 should know about other nodes, including the new one
	assert.Len(t, nodes[3].GetMemberlist(), 5)
	assert.Equal(t, stripMembers(nodes[1].GetMemberlist()), stripMembers(nodes[3].GetMemberlist()))

	// Propose yet another value, to make sure the rejoined node is still
	// receiving new logs
	values[5], err = raftutils.ProposeValue(t, raftutils.Leader(nodes), DefaultProposalTime, nodeIDs[5])
	require.NoError(t, err)

	// All nodes should have all the data
	raftutils.CheckValuesOnNodes(t, clockSource, nodes, nodeIDs[:6], values[:6])

	// Restart node 3 again. It should load the snapshot.
	nodes[3].Server.Stop()
	nodes[3].Shutdown()
	nodes[3] = raftutils.RestartNode(t, clockSource, nodes[3], false)
	raftutils.WaitForCluster(t, clockSource, nodes)

	assert.Len(t, nodes[3].GetMemberlist(), 5)
	assert.Equal(t, stripMembers(nodes[1].GetMemberlist()), stripMembers(nodes[3].GetMemberlist()))
	raftutils.CheckValuesOnNodes(t, clockSource, nodes, nodeIDs[:6], values[:6])

	// Propose again. Just to check consensus after this latest restart.
	values[6], err = raftutils.ProposeValue(t, raftutils.Leader(nodes), DefaultProposalTime, nodeIDs[6])
	require.NoError(t, err)
	raftutils.CheckValuesOnNodes(t, clockSource, nodes, nodeIDs, values)
}
Exemple #23
0
func TestRaftSnapshot(t *testing.T) {
	t.Parallel()

	// Bring up a 3 node cluster
	nodes, clockSource := raftutils.NewRaftCluster(t, tc, &api.RaftConfig{SnapshotInterval: 9, LogEntriesForSlowFollowers: 0})
	defer raftutils.TeardownCluster(t, nodes)

	nodeIDs := []string{"id1", "id2", "id3", "id4", "id5", "id6", "id7", "id8", "id9", "id10", "id11", "id12"}
	values := make([]*api.Node, len(nodeIDs))
	snapshotFilenames := make(map[uint64]string, 4)

	// Propose 3 values
	var err error
	for i, nodeID := range nodeIDs[:3] {
		values[i], err = raftutils.ProposeValue(t, nodes[1], DefaultProposalTime, nodeID)
		assert.NoError(t, err, "failed to propose value")
	}

	// None of the nodes should have snapshot files yet
	for _, node := range nodes {
		dirents, err := ioutil.ReadDir(filepath.Join(node.StateDir, "snap"))
		assert.NoError(t, err)
		assert.Len(t, dirents, 0)
	}

	// Check all nodes have all the data.
	// This also acts as a synchronization point so that the next value we
	// propose will arrive as a separate message to the raft state machine,
	// and it is guaranteed to have the right cluster settings when
	// deciding whether to create a new snapshot.
	raftutils.CheckValuesOnNodes(t, clockSource, nodes, nodeIDs[:3], values)

	// Propose a 4th value
	values[3], err = raftutils.ProposeValue(t, nodes[1], DefaultProposalTime, nodeIDs[3])
	assert.NoError(t, err, "failed to propose value")

	// All nodes should now have a snapshot file
	for nodeID, node := range nodes {
		assert.NoError(t, raftutils.PollFunc(clockSource, func() error {
			dirents, err := ioutil.ReadDir(filepath.Join(node.StateDir, "snap"))
			if err != nil {
				return err
			}
			if len(dirents) != 1 {
				return fmt.Errorf("expected 1 snapshot, found %d", len(dirents))
			}
			snapshotFilenames[nodeID] = dirents[0].Name()
			return nil
		}))
	}

	// Add a node to the cluster
	raftutils.AddRaftNode(t, clockSource, nodes, tc)

	// It should get a copy of the snapshot
	assert.NoError(t, raftutils.PollFunc(clockSource, func() error {
		dirents, err := ioutil.ReadDir(filepath.Join(nodes[4].StateDir, "snap"))
		if err != nil {
			return err
		}
		if len(dirents) != 1 {
			return fmt.Errorf("expected 1 snapshot, found %d on new node", len(dirents))
		}
		snapshotFilenames[4] = dirents[0].Name()
		return nil
	}))

	// It should know about the other nodes
	stripMembers := func(memberList map[uint64]*api.RaftMember) map[uint64]*api.RaftMember {
		raftNodes := make(map[uint64]*api.RaftMember)
		for k, v := range memberList {
			raftNodes[k] = &api.RaftMember{
				RaftID: v.RaftID,
				Addr:   v.Addr,
			}
		}
		return raftNodes
	}
	assert.Equal(t, stripMembers(nodes[1].GetMemberlist()), stripMembers(nodes[4].GetMemberlist()))

	// All nodes should have all the data
	raftutils.CheckValuesOnNodes(t, clockSource, nodes, nodeIDs[:4], values)

	// Propose more values to provoke a second snapshot
	for i := 4; i != len(nodeIDs); i++ {
		values[i], err = raftutils.ProposeValue(t, nodes[1], DefaultProposalTime, nodeIDs[i])
		assert.NoError(t, err, "failed to propose value")
	}

	// All nodes should have a snapshot under a *different* name
	for nodeID, node := range nodes {
		assert.NoError(t, raftutils.PollFunc(clockSource, func() error {
			dirents, err := ioutil.ReadDir(filepath.Join(node.StateDir, "snap"))
			if err != nil {
				return err
			}
			if len(dirents) != 1 {
				return fmt.Errorf("expected 1 snapshot, found %d on node %d", len(dirents), nodeID)
			}
			if dirents[0].Name() == snapshotFilenames[nodeID] {
				return fmt.Errorf("snapshot %s did not get replaced", snapshotFilenames[nodeID])
			}
			return nil
		}))
	}

	// All nodes should have all the data
	raftutils.CheckValuesOnNodes(t, clockSource, nodes, nodeIDs, values)
}
Exemple #24
0
// TestUpdateNodeDemote drives manager demotion through the control API backed
// by node 1 of a three-node raft cluster. Demoting node 2 while node 3 is
// down must fail with FailedPrecondition. Once node 3 is back, nodes 3 and 2
// can be demoted in turn. Demoting the last remaining manager (node 1) must
// fail with FailedPrecondition again.
func TestUpdateNodeDemote(t *testing.T) {
	tc := cautils.NewTestCA(nil, cautils.AcceptancePolicy(true, true, ""))
	ts := newTestServer(t)

	nodes, clockSource := raftutils.NewRaftCluster(t, tc)
	defer raftutils.TeardownCluster(t, nodes)

	// Assign one of the raft node to the test server
	ts.Server.raft = nodes[1].Node
	ts.Server.store = nodes[1].MemoryStore()

	// demote fetches the node object of the given manager through the API,
	// switches its role to worker and submits the update, returning the
	// error from UpdateNode.
	demote := func(manager *raftutils.TestNode) error {
		nodeID := manager.SecurityConfig.ClientTLSCreds.NodeID()
		got, err := ts.Client.GetNode(context.Background(), &api.GetNodeRequest{NodeID: nodeID})
		assert.NoError(t, err)
		workerSpec := got.Node.Spec.Copy()
		workerSpec.Role = api.NodeRoleWorker
		_, err = ts.Client.UpdateNode(context.Background(), &api.UpdateNodeRequest{
			NodeID:      nodeID,
			Spec:        workerSpec,
			NodeVersion: &got.Node.Meta.Version,
		})
		return err
	}

	// Create a node object for each of the managers
	assert.NoError(t, nodes[1].MemoryStore().Update(func(tx store.Tx) error {
		for _, raftID := range []uint64{1, 2, 3} {
			assert.NoError(t, store.CreateNode(tx, &api.Node{
				ID: nodes[raftID].SecurityConfig.ClientTLSCreds.NodeID(),
				Spec: api.NodeSpec{
					Role:       api.NodeRoleManager,
					Membership: api.NodeMembershipAccepted,
				},
			}))
		}
		return nil
	}))

	// Stop Node 3 (1 node out of 3)
	nodes[3].Server.Stop()
	nodes[3].Shutdown()

	// Node 3 should be listed as Unreachable
	assert.NoError(t, raftutils.PollFunc(clockSource, func() error {
		memberList := nodes[1].GetMemberlist()
		if count := len(memberList); count != 3 {
			return fmt.Errorf("expected 3 nodes, got %d", count)
		}
		if memberList[nodes[3].Config.ID].Status.Reachability == api.RaftMemberStatus_REACHABLE {
			return fmt.Errorf("expected node 3 to be unreachable")
		}
		return nil
	}))

	// Try to demote Node 2, this should fail because of the quorum safeguard
	err := demote(nodes[2])
	assert.Error(t, err)
	assert.Equal(t, codes.FailedPrecondition, grpc.Code(err))

	// Restart Node 3
	nodes[3] = raftutils.RestartNode(t, clockSource, nodes[3], false)
	raftutils.WaitForCluster(t, clockSource, nodes)

	// Node 3 should be listed as Reachable
	assert.NoError(t, raftutils.PollFunc(clockSource, func() error {
		memberList := nodes[1].GetMemberlist()
		if count := len(memberList); count != 3 {
			return fmt.Errorf("expected 3 nodes, got %d", count)
		}
		if memberList[nodes[3].Config.ID].Status.Reachability == api.RaftMemberStatus_UNREACHABLE {
			return fmt.Errorf("expected node 3 to be reachable")
		}
		return nil
	}))

	// Try to demote Node 3, this should succeed
	err = demote(nodes[3])
	assert.NoError(t, err)

	remaining := map[uint64]*raftutils.TestNode{
		1: nodes[1],
		2: nodes[2],
	}

	raftutils.WaitForCluster(t, clockSource, remaining)

	// Server should list 2 members
	assert.NoError(t, raftutils.PollFunc(clockSource, func() error {
		if count := len(nodes[1].GetMemberlist()); count != 2 {
			return fmt.Errorf("expected 2 nodes, got %d", count)
		}
		return nil
	}))

	// Try to demote Node 2
	err = demote(nodes[2])
	assert.NoError(t, err)

	remaining = map[uint64]*raftutils.TestNode{
		1: nodes[1],
	}

	raftutils.WaitForCluster(t, clockSource, remaining)

	// New server should list 1 member
	assert.NoError(t, raftutils.PollFunc(clockSource, func() error {
		if count := len(nodes[1].GetMemberlist()); count != 1 {
			return fmt.Errorf("expected 1 node, got %d", count)
		}
		return nil
	}))

	// Make sure we can't demote the last manager.
	err = demote(nodes[1])
	assert.Error(t, err)
	assert.Equal(t, codes.FailedPrecondition, grpc.Code(err))
}
Exemple #25
0
// TestGCWAL checks which WAL files survive a snapshot. It runs two clusters:
// in the first the snapshot interval equals 8 proposals plus the assumed
// setup entries and two ".wal" files are expected to remain; in the second
// one more proposal is made before the snapshot and only one ".wal" file is
// expected. The second cluster is then restarted in full to confirm that the
// stored data is intact and that new values can still be proposed.
func TestGCWAL(t *testing.T) {
	t.Parallel()

	// Additional log entries from cluster setup, leader election
	extraLogEntries := 5
	// Number of large entries to propose
	proposals := 8

	// Bring up a 3 node cluster
	nodes, clockSource := raftutils.NewRaftCluster(t, tc, &api.RaftConfig{SnapshotInterval: uint64(proposals + extraLogEntries), LogEntriesForSlowFollowers: 0})

	// expectSnapshotAndWALs polls node 1 of the current cluster until its
	// "snap" directory holds exactly one entry and its "wal" directory holds
	// exactly wantWALs files with a ".wal" suffix.
	expectSnapshotAndWALs := func(wantWALs int) {
		assert.NoError(t, raftutils.PollFunc(clockSource, func() error {
			dirents, err := ioutil.ReadDir(filepath.Join(nodes[1].StateDir, "snap"))
			if err != nil {
				return err
			}
			if len(dirents) != 1 {
				return fmt.Errorf("expected 1 snapshot, found %d", len(dirents))
			}

			dirents, err = ioutil.ReadDir(filepath.Join(nodes[1].StateDir, "wal"))
			if err != nil {
				return err
			}
			var walCount int
			for _, f := range dirents {
				if strings.HasSuffix(f.Name(), ".wal") {
					walCount++
				}
			}
			if walCount != wantWALs {
				noun := "files"
				if wantWALs == 1 {
					noun = "file"
				}
				return fmt.Errorf("expected %d WAL %s, found %d", wantWALs, noun, walCount)
			}
			return nil
		}))
	}

	// expectStoredNodes polls every node of the current cluster until its
	// memory store contains exactly want node objects.
	expectStoredNodes := func(want int) {
		for _, node := range nodes {
			assert.NoError(t, raftutils.PollFunc(clockSource, func() error {
				var err error
				node.MemoryStore().View(func(tx store.ReadTx) {
					var allNodes []*api.Node
					allNodes, err = store.FindNodes(tx, store.All)
					if err != nil {
						return
					}
					if len(allNodes) != want {
						// Report the count that was actually checked.
						err = fmt.Errorf("expected %d nodes, got %d", want, len(allNodes))
					}
				})
				return err
			}))
		}
	}

	for i := 0; i != proposals; i++ {
		_, err := proposeLargeValue(t, nodes[1], DefaultProposalTime, fmt.Sprintf("id%d", i))
		assert.NoError(t, err, "failed to propose value")
	}

	time.Sleep(250 * time.Millisecond)

	// Snapshot should have been triggered just as the WAL rotated, so
	// both WAL files should be preserved
	expectSnapshotAndWALs(2)

	raftutils.TeardownCluster(t, nodes)

	// Repeat this test, but trigger the snapshot after the WAL has rotated
	proposals++
	nodes, clockSource = raftutils.NewRaftCluster(t, tc, &api.RaftConfig{SnapshotInterval: uint64(proposals + extraLogEntries), LogEntriesForSlowFollowers: 0})
	defer raftutils.TeardownCluster(t, nodes)

	for i := 0; i != proposals; i++ {
		_, err := proposeLargeValue(t, nodes[1], DefaultProposalTime, fmt.Sprintf("id%d", i))
		assert.NoError(t, err, "failed to propose value")
	}

	time.Sleep(250 * time.Millisecond)

	// This time only one WAL file should be saved.
	expectSnapshotAndWALs(1)

	// Restart the whole cluster
	for _, node := range nodes {
		node.Server.Stop()
		node.Shutdown()
	}

	raftutils.AdvanceTicks(clockSource, 5)

	for k, node := range nodes {
		nodes[k] = raftutils.RestartNode(t, clockSource, node, false)
	}
	raftutils.WaitForCluster(t, clockSource, nodes)

	// Is the data intact after restart?
	expectStoredNodes(proposals)

	// It should still be possible to propose values
	_, err := raftutils.ProposeValue(t, raftutils.Leader(nodes), DefaultProposalTime, "newnode")
	assert.NoError(t, err, "failed to propose value")

	// The extra proposal must show up on every node as well.
	expectStoredNodes(proposals + 1)
}
Exemple #26
0
// TestStress runs 1000 rounds against a five-node cluster. Each round it
// proposes a value on the nodes in turn until one accepts, advances the fake
// clock by a random amount, randomly stops a running node once a proposal has
// succeeded, and randomly restarts a stopped node whenever fewer than three
// are running. Afterwards all stopped nodes are restarted, one final value is
// proposed, and the test checks that every node holds the same values and
// that every stored ID is one that was proposed.
func TestStress(t *testing.T) {
	t.Parallel()

	// Bring up a 5 nodes cluster
	nodes, clockSource := raftutils.NewRaftCluster(t, tc)
	raftutils.AddRaftNode(t, clockSource, nodes, tc)
	raftutils.AddRaftNode(t, clockSource, nodes, tc)
	defer raftutils.TeardownCluster(t, nodes)

	var (
		// number of nodes that are running
		running = len(nodes)
		// record of nodes that are down
		stopped = make(map[int]struct{})
		// record of ids that proposed successfully or time-out
		proposed = []string{}
		// index of the node that last accepted a proposal, -1 if unknown
		leader = -1
	)

	for round := 0; round < 1000; round++ {
		// keep proposing new values and killing leader
		id := strconv.Itoa(round)
		for i := 1; i <= 5; i++ {
			candidate := nodes[uint64(i)]
			if candidate == nil {
				continue
			}

			_, err := raftutils.ProposeValue(t, candidate, id)
			if err != nil {
				// though it's timing out, we still record this value
				// for it may be proposed successfully and stored in Raft some time later
				if strings.Contains(err.Error(), "context deadline exceeded") {
					proposed = append(proposed, id)
				}
				continue
			}

			proposed = append(proposed, id)
			// if propose successfully, at least there are 3 running nodes
			assert.True(t, running >= 3)
			// only leader can propose value
			assert.True(t, leader == i || leader == -1)
			// update leader
			leader = i
			break
		}

		if rand.Intn(100) < 10 {
			// increase clock to make potential election finish quickly
			clockSource.Increment(200 * time.Millisecond)
			time.Sleep(10 * time.Millisecond)
		} else {
			step := time.Duration(rand.Intn(10)) * time.Millisecond
			clockSource.Increment(step)
		}

		if leader != -1 {
			// if propose successfully, try to kill a node in random
			victim := rand.Intn(5) + 1
			if _, down := stopped[victim]; !down {
				raftID := uint64(victim)
				nodes[raftID].Server.Stop()
				nodes[raftID].Shutdown()
				stopped[victim] = struct{}{}
				running--
				if victim == leader {
					// leader is killed
					leader = -1
				}
			}
		}

		if running < 3 {
			// if quorum is lost, try to bring back a node
			revived := rand.Intn(5) + 1
			if _, down := stopped[revived]; down {
				raftID := uint64(revived)
				nodes[raftID] = raftutils.RestartNode(t, clockSource, nodes[raftID], false)
				delete(stopped, revived)
				running++
			}
		}
	}

	// bring back all nodes and propose the final value
	for idx := range stopped {
		raftID := uint64(idx)
		nodes[raftID] = raftutils.RestartNode(t, clockSource, nodes[raftID], false)
	}
	raftutils.WaitForCluster(t, clockSource, nodes)
	finalID := strconv.Itoa(1000)
	finalValue, err := raftutils.ProposeValue(t, raftutils.Leader(nodes), finalID)
	assert.NoError(t, err, "failed to propose value")
	proposed = append(proposed, finalID)

	// increase clock to make cluster stable
	time.Sleep(500 * time.Millisecond)
	clockSource.Increment(500 * time.Millisecond)

	ids, values := raftutils.GetAllValuesOnNode(t, clockSource, nodes[1])

	// since cluster is stable, final value must be in the raft store
	found := false
	for _, stored := range values {
		if reflect.DeepEqual(stored, finalValue) {
			found = true
			break
		}
	}
	assert.True(t, found)

	// all nodes must have the same value
	raftutils.CheckValuesOnNodes(t, clockSource, nodes, ids, values)

	// ids should be a subset of pIDs
	known := make(map[string]struct{}, len(proposed))
	for _, pid := range proposed {
		known[pid] = struct{}{}
	}
	for _, storedID := range ids {
		_, ok := known[storedID]
		assert.True(t, ok)
	}
}
Exemple #27
0
// TestRaftSnapshotForceNewCluster restarts a node with the force-new-cluster
// option after a snapshot has been taken. Node 2 is removed from a three-node
// cluster, a fourth proposal triggers a snapshot, node 4 joins, and node 1 is
// then restarted with force-new-cluster. Node 1 must end up with a member
// list containing only itself and still accept a new proposal.
func TestRaftSnapshotForceNewCluster(t *testing.T) {
	t.Parallel()

	// Bring up a 3 node cluster
	nodes, clockSource := raftutils.NewRaftCluster(t, tc, &api.RaftConfig{SnapshotInterval: 10, LogEntriesForSlowFollowers: 0})
	defer raftutils.TeardownCluster(t, nodes)

	nodeIDs := []string{"id1", "id2", "id3", "id4", "id5"}

	// Propose 3 values.
	for _, nodeID := range nodeIDs[:3] {
		_, err := raftutils.ProposeValue(t, nodes[1], DefaultProposalTime, nodeID)
		assert.NoError(t, err, "failed to propose value")
	}

	// Remove one of the original nodes

	// Use gRPC instead of calling handler directly because of
	// authorization check.
	cc, err := dial(nodes[1], nodes[1].Address)
	// Stop here on failure: the connection is used and closed below.
	require.NoError(t, err)
	defer cc.Close()
	raftClient := api.NewRaftMembershipClient(cc)
	// Keep the cancel func so the timeout's resources are released when the
	// test returns instead of lingering until the deadline fires.
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()
	resp, err := raftClient.Leave(ctx, &api.LeaveRequest{Node: &api.RaftMember{RaftID: nodes[2].Config.ID}})
	assert.NoError(t, err, "error sending message to leave the raft")
	assert.NotNil(t, resp, "leave response message is nil")

	raftutils.ShutdownNode(nodes[2])
	delete(nodes, 2)

	// Nodes shouldn't have snapshot files yet
	for _, node := range nodes {
		dirents, err := ioutil.ReadDir(filepath.Join(node.StateDir, "snap-v3-encrypted"))
		assert.NoError(t, err)
		assert.Len(t, dirents, 0)
	}

	// Trigger a snapshot, with a 4th proposal
	_, err = raftutils.ProposeValue(t, nodes[1], DefaultProposalTime, nodeIDs[3])
	assert.NoError(t, err, "failed to propose value")

	// Nodes should now have a snapshot file
	for nodeID, node := range nodes {
		assert.NoError(t, raftutils.PollFunc(clockSource, func() error {
			dirents, err := ioutil.ReadDir(filepath.Join(node.StateDir, "snap-v3-encrypted"))
			if err != nil {
				return err
			}
			if len(dirents) != 1 {
				// nodeID is the map key, i.e. the node's number, so it is
				// reported as-is.
				return fmt.Errorf("expected 1 snapshot, found %d on node %d", len(dirents), nodeID)
			}
			return nil
		}))
	}

	// Join another node
	nodes[4] = raftutils.NewJoinNode(t, clockSource, nodes[1].Address, tc)
	raftutils.WaitForCluster(t, clockSource, nodes)

	// Only restart the first node with force-new-cluster option
	nodes[1].Server.Stop()
	nodes[1].ShutdownRaft()
	nodes[1] = raftutils.RestartNode(t, clockSource, nodes[1], true)
	delete(nodes, 3)
	delete(nodes, 4)
	raftutils.WaitForCluster(t, clockSource, nodes)

	// The memberlist should contain exactly one node (self)
	memberlist := nodes[1].GetMemberlist()
	require.Len(t, memberlist, 1)

	// Propose a 5th value
	_, err = raftutils.ProposeValue(t, nodes[1], DefaultProposalTime, nodeIDs[4])
	require.NoError(t, err)
}