func TestRaftJoinTwice(t *testing.T) {
	t.Parallel()

	nodes, _ := raftutils.NewRaftCluster(t, tc)
	defer raftutils.TeardownCluster(t, nodes)

	// Node 3 tries to join again
	// Use gRPC instead of calling handler directly because of
	// authorization check.
	cc, err := dial(nodes[3], nodes[1].Address)
	assert.NoError(t, err)
	raftClient := api.NewRaftMembershipClient(cc)
	defer cc.Close()
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()
	_, err = raftClient.Join(ctx, &api.JoinRequest{})
	assert.Error(t, err, "expected error on duplicate Join")
	assert.Equal(t, grpc.Code(err), codes.AlreadyExists)
	assert.Equal(t, grpc.ErrorDesc(err), "a raft member with this node ID already exists")
}
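The dial helper used by these tests is not shown in this section. A minimal sketch of what it might look like, assuming the test node exposes its TLS client credentials through a SecurityConfig field (an assumption, not confirmed by the code here): the helper must dial with the node's client TLS credentials, since the membership handlers reject unauthenticated callers.

// dial is a sketch of the helper used above: it opens a gRPC connection to
// addr using the test node's client TLS credentials so that the server-side
// authorization check accepts the request. The SecurityConfig field and its
// ClientTLSCreds are assumptions, not shown in this section.
func dial(n *raftutils.TestNode, addr string) (*grpc.ClientConn, error) {
	grpcOptions := []grpc.DialOption{
		grpc.WithBlock(),
		grpc.WithTimeout(10 * time.Second),
		grpc.WithTransportCredentials(n.SecurityConfig.ClientTLSCreds),
	}
	return grpc.Dial(addr, grpcOptions...)
}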
func TestRaftFollowerLeave(t *testing.T) {
	t.Parallel()

	// Bring up a 5-node cluster
	nodes, clockSource := raftutils.NewRaftCluster(t, tc)
	raftutils.AddRaftNode(t, clockSource, nodes, tc)
	raftutils.AddRaftNode(t, clockSource, nodes, tc)
	defer raftutils.TeardownCluster(t, nodes)

	// Node 5 leaves the cluster
	// Use gRPC instead of calling handler directly because of
	// authorization check.
	cc, err := dial(nodes[1], nodes[1].Address)
	assert.NoError(t, err)
	raftClient := api.NewRaftMembershipClient(cc)
	defer cc.Close()
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()
	resp, err := raftClient.Leave(ctx, &api.LeaveRequest{Node: &api.RaftMember{RaftID: nodes[5].Config.ID}})
	assert.NoError(t, err, "error sending message to leave the raft")
	assert.NotNil(t, resp, "leave response message is nil")

	raftutils.ShutdownNode(nodes[5])
	delete(nodes, 5)

	raftutils.WaitForPeerNumber(t, clockSource, nodes, 4)

	// Propose a value
	value, err := raftutils.ProposeValue(t, nodes[1], DefaultProposalTime)
	assert.NoError(t, err, "failed to propose value")

	// Value should be replicated on every node
	raftutils.CheckValue(t, clockSource, nodes[1], value)
	assert.Len(t, nodes[1].GetMemberlist(), 4)

	raftutils.CheckValue(t, clockSource, nodes[2], value)
	assert.Len(t, nodes[2].GetMemberlist(), 4)

	raftutils.CheckValue(t, clockSource, nodes[3], value)
	assert.Len(t, nodes[3].GetMemberlist(), 4)

	raftutils.CheckValue(t, clockSource, nodes[4], value)
	assert.Len(t, nodes[4].GetMemberlist(), 4)
}
// JoinAndStart joins and starts the raft server
func (n *Node) JoinAndStart(ctx context.Context) (err error) {
	ctx, cancel := n.WithContext(ctx)
	defer func() {
		cancel()
		if err != nil {
			n.done()
		}
	}()

	loadAndStartErr := n.loadAndStart(ctx, n.opts.ForceNewCluster)
	if loadAndStartErr != nil && loadAndStartErr != errNoWAL {
		return loadAndStartErr
	}

	snapshot, err := n.raftStore.Snapshot()
	// Snapshot never returns an error
	if err != nil {
		panic("could not get snapshot of raft store")
	}

	n.confState = snapshot.Metadata.ConfState
	n.appliedIndex = snapshot.Metadata.Index
	n.snapshotIndex = snapshot.Metadata.Index

	if loadAndStartErr == errNoWAL {
		if n.opts.JoinAddr != "" {
			c, err := n.ConnectToMember(n.opts.JoinAddr, 10*time.Second)
			if err != nil {
				return err
			}
			client := api.NewRaftMembershipClient(c.Conn)
			defer func() {
				_ = c.Conn.Close()
			}()

			joinCtx, joinCancel := context.WithTimeout(ctx, 10*time.Second)
			defer joinCancel()
			resp, err := client.Join(joinCtx, &api.JoinRequest{
				Addr: n.opts.Addr,
			})
			if err != nil {
				return err
			}

			n.Config.ID = resp.RaftID

			if _, err := n.createWAL(n.opts.ID); err != nil {
				return err
			}

			n.raftNode = raft.StartNode(n.Config, []raft.Peer{})

			if err := n.registerNodes(resp.Members); err != nil {
				if walErr := n.wal.Close(); walErr != nil {
					log.G(ctx).WithError(walErr).Error("raft: error closing WAL")
				}
				return err
			}
		} else {
			// First member in the cluster, self-assign ID
			n.Config.ID = uint64(rand.Int63()) + 1
			peer, err := n.createWAL(n.opts.ID)
			if err != nil {
				return err
			}
			n.raftNode = raft.StartNode(n.Config, []raft.Peer{peer})
			n.campaignWhenAble = true
		}
		atomic.StoreUint32(&n.isMember, 1)
		return nil
	}

	if n.opts.JoinAddr != "" {
		log.G(ctx).Warning("ignoring request to join cluster, because raft state already exists")
	}
	n.campaignWhenAble = true
	n.raftNode = raft.RestartNode(n.Config)
	atomic.StoreUint32(&n.isMember, 1)
	return nil
}
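For orientation, a caller might drive this context-aware JoinAndStart roughly as follows. This is only a sketch: the Run method and the surrounding error handling are assumptions, not shown in this section.

// Sketch of a plausible caller (assumes Node has a Run(ctx) method that
// drives the raft loop, which is not shown in this section).
runCtx, cancelRun := context.WithCancel(context.Background())
if err := n.JoinAndStart(runCtx); err != nil {
	cancelRun()
	return fmt.Errorf("can't initialize raft node: %v", err)
}
go func() {
	defer cancelRun()
	_ = n.Run(runCtx)
}()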
func TestRaftLeaderLeave(t *testing.T) {
	t.Parallel()

	nodes, clockSource := raftutils.NewRaftCluster(t, tc)

	// node 1 is the leader
	assert.Equal(t, nodes[1].Leader(), nodes[1].Config.ID)

	// Try to leave the raft
	// Use gRPC instead of calling handler directly because of
	// authorization check.
	client, err := nodes[1].ConnectToMember(nodes[1].Address, 10*time.Second)
	assert.NoError(t, err)
	defer client.Conn.Close()
	raftClient := api.NewRaftMembershipClient(client.Conn)
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()
	resp, err := raftClient.Leave(ctx, &api.LeaveRequest{Node: &api.RaftMember{RaftID: nodes[1].Config.ID}})
	assert.NoError(t, err, "error sending message to leave the raft")
	assert.NotNil(t, resp, "leave response message is nil")

	newCluster := map[uint64]*raftutils.TestNode{
		2: nodes[2],
		3: nodes[3],
	}
	// Wait for election tick
	raftutils.WaitForCluster(t, clockSource, newCluster)

	// The leader should no longer be node 1
	assert.NotEqual(t, nodes[2].Leader(), nodes[1].Config.ID)
	assert.Equal(t, nodes[2].Leader(), nodes[3].Leader())

	leader := nodes[2].Leader()

	// Find the leader node and a follower node
	var (
		leaderNode   *raftutils.TestNode
		followerNode *raftutils.TestNode
	)
	for i, n := range nodes {
		if n.Config.ID == leader {
			leaderNode = n
			if i == 2 {
				followerNode = nodes[3]
			} else {
				followerNode = nodes[2]
			}
		}
	}

	require.NotNil(t, leaderNode)
	require.NotNil(t, followerNode)

	// Propose a value
	value, err := raftutils.ProposeValue(t, leaderNode)
	assert.NoError(t, err, "failed to propose value")

	// The value should be replicated on all remaining nodes
	raftutils.CheckValue(t, clockSource, leaderNode, value)
	assert.Equal(t, len(leaderNode.GetMemberlist()), 2)

	raftutils.CheckValue(t, clockSource, followerNode, value)
	assert.Equal(t, len(followerNode.GetMemberlist()), 2)

	raftutils.TeardownCluster(t, newCluster)
}
// JoinAndStart joins and starts the raft server
func (n *Node) JoinAndStart() error {
	loadAndStartErr := n.loadAndStart(n.Ctx, n.opts.ForceNewCluster)
	if loadAndStartErr != nil && loadAndStartErr != errNoWAL {
		n.ticker.Stop()
		return loadAndStartErr
	}

	snapshot, err := n.raftStore.Snapshot()
	// Snapshot never returns an error
	if err != nil {
		panic("could not get snapshot of raft store")
	}

	n.confState = snapshot.Metadata.ConfState
	n.appliedIndex = snapshot.Metadata.Index
	n.snapshotIndex = snapshot.Metadata.Index

	if loadAndStartErr == errNoWAL {
		if n.joinAddr != "" {
			c, err := n.ConnectToMember(n.joinAddr, 10*time.Second)
			if err != nil {
				return err
			}
			client := api.NewRaftMembershipClient(c.Conn)
			defer func() {
				_ = c.Conn.Close()
			}()

			ctx, cancel := context.WithTimeout(n.Ctx, 10*time.Second)
			defer cancel()
			resp, err := client.Join(ctx, &api.JoinRequest{
				Addr: n.Address,
			})
			if err != nil {
				return err
			}

			n.Config.ID = resp.RaftID

			if _, err := n.createWAL(n.opts.ID); err != nil {
				return err
			}

			n.Node = raft.StartNode(n.Config, []raft.Peer{})

			if err := n.registerNodes(resp.Members); err != nil {
				return err
			}
		} else {
			// First member in the cluster, self-assign ID
			n.Config.ID = uint64(rand.Int63()) + 1
			peer, err := n.createWAL(n.opts.ID)
			if err != nil {
				return err
			}
			n.Node = raft.StartNode(n.Config, []raft.Peer{peer})
			if err := n.Campaign(n.Ctx); err != nil {
				return err
			}
		}
		atomic.StoreUint32(&n.isMember, 1)
		return nil
	}

	if n.joinAddr != "" {
		n.Config.Logger.Warning("ignoring request to join cluster, because raft state already exists")
	}
	n.Node = raft.RestartNode(n.Config)
	atomic.StoreUint32(&n.isMember, 1)
	return nil
}
func TestManager(t *testing.T) {
	ctx := context.TODO()

	store := store.NewMemoryStore(nil)
	assert.NotNil(t, store)

	temp, err := ioutil.TempFile("", "test-socket")
	assert.NoError(t, err)
	assert.NoError(t, temp.Close())
	assert.NoError(t, os.Remove(temp.Name()))
	defer os.RemoveAll(temp.Name())

	lunix, err := net.Listen("unix", temp.Name())
	assert.NoError(t, err)
	ltcp, err := net.Listen("tcp", "127.0.0.1:0")
	assert.NoError(t, err)

	stateDir, err := ioutil.TempDir("", "test-raft")
	assert.NoError(t, err)
	defer os.RemoveAll(stateDir)

	tc := testutils.NewTestCA(t)
	defer tc.Stop()

	agentSecurityConfig, err := tc.NewNodeConfig(ca.AgentRole)
	assert.NoError(t, err)
	agentDiffOrgSecurityConfig, err := tc.NewNodeConfigOrg(ca.AgentRole, "another-org")
	assert.NoError(t, err)
	managerSecurityConfig, err := tc.NewNodeConfig(ca.ManagerRole)
	assert.NoError(t, err)

	m, err := manager.New(&manager.Config{
		ProtoListener:  map[string]net.Listener{"unix": lunix, "tcp": ltcp},
		StateDir:       stateDir,
		SecurityConfig: managerSecurityConfig,
	})
	assert.NoError(t, err)
	assert.NotNil(t, m)

	done := make(chan error)
	defer close(done)
	go func() {
		done <- m.Run(ctx)
	}()

	opts := []grpc.DialOption{
		grpc.WithTimeout(10 * time.Second),
		grpc.WithTransportCredentials(agentSecurityConfig.ClientTLSCreds),
	}

	conn, err := grpc.Dial(ltcp.Addr().String(), opts...)
	assert.NoError(t, err)
	defer func() {
		assert.NoError(t, conn.Close())
	}()

	// We have to send a dummy request to verify that the connection is actually up.
	client := api.NewDispatcherClient(conn)
	_, err = client.Heartbeat(context.Background(), &api.HeartbeatRequest{})
	assert.Equal(t, dispatcher.ErrNodeNotRegistered.Error(), grpc.ErrorDesc(err))

	// Try to have a client in a different org access this manager
	opts = []grpc.DialOption{
		grpc.WithTimeout(10 * time.Second),
		grpc.WithTransportCredentials(agentDiffOrgSecurityConfig.ClientTLSCreds),
	}

	conn2, err := grpc.Dial(ltcp.Addr().String(), opts...)
	assert.NoError(t, err)
	defer func() {
		assert.NoError(t, conn2.Close())
	}()

	// We have to send a dummy request to verify that the connection is actually up.
	client = api.NewDispatcherClient(conn2)
	_, err = client.Heartbeat(context.Background(), &api.HeartbeatRequest{})
	assert.Contains(t, grpc.ErrorDesc(err), "Permission denied: unauthorized peer role: rpc error: code = 7 desc = Permission denied: remote certificate not part of organization")

	// Verify that requests to the various GRPC services running on TCP
	// are rejected if they don't have certs.
	opts = []grpc.DialOption{
		grpc.WithTimeout(10 * time.Second),
		grpc.WithTransportCredentials(credentials.NewTLS(&tls.Config{InsecureSkipVerify: true})),
	}

	noCertConn, err := grpc.Dial(ltcp.Addr().String(), opts...)
	assert.NoError(t, err)
	defer func() {
		assert.NoError(t, noCertConn.Close())
	}()

	client = api.NewDispatcherClient(noCertConn)
	_, err = client.Heartbeat(context.Background(), &api.HeartbeatRequest{})
	assert.EqualError(t, err, "rpc error: code = 7 desc = Permission denied: unauthorized peer role: rpc error: code = 7 desc = no client certificates in request")

	controlClient := api.NewControlClient(noCertConn)
	_, err = controlClient.ListNodes(context.Background(), &api.ListNodesRequest{})
	assert.EqualError(t, err, "rpc error: code = 7 desc = Permission denied: unauthorized peer role: rpc error: code = 7 desc = no client certificates in request")

	raftClient := api.NewRaftMembershipClient(noCertConn)
	_, err = raftClient.Join(context.Background(), &api.JoinRequest{})
	assert.EqualError(t, err, "rpc error: code = 7 desc = Permission denied: unauthorized peer role: rpc error: code = 7 desc = no client certificates in request")

	m.Stop(ctx)

	// After stopping we may receive an error from ListenAndServe if all this
	// happened before WaitForLeader completed, so don't check the error.
	<-done
}
func TestManager(t *testing.T) {
	ctx := context.Background()

	temp, err := ioutil.TempFile("", "test-socket")
	assert.NoError(t, err)
	assert.NoError(t, temp.Close())
	assert.NoError(t, os.Remove(temp.Name()))
	defer os.RemoveAll(temp.Name())

	stateDir, err := ioutil.TempDir("", "test-raft")
	assert.NoError(t, err)
	defer os.RemoveAll(stateDir)

	tc := testutils.NewTestCA(t, func(p ca.CertPaths) *ca.KeyReadWriter {
		return ca.NewKeyReadWriter(p, []byte("kek"), nil)
	})
	defer tc.Stop()

	agentSecurityConfig, err := tc.NewNodeConfig(ca.WorkerRole)
	assert.NoError(t, err)
	agentDiffOrgSecurityConfig, err := tc.NewNodeConfigOrg(ca.WorkerRole, "another-org")
	assert.NoError(t, err)
	managerSecurityConfig, err := tc.NewNodeConfig(ca.ManagerRole)
	assert.NoError(t, err)

	m, err := New(&Config{
		RemoteAPI:        RemoteAddrs{ListenAddr: "127.0.0.1:0"},
		ControlAPI:       temp.Name(),
		StateDir:         stateDir,
		SecurityConfig:   managerSecurityConfig,
		AutoLockManagers: true,
		UnlockKey:        []byte("kek"),
	})
	assert.NoError(t, err)
	assert.NotNil(t, m)

	tcpAddr := m.Addr()

	done := make(chan error)
	defer close(done)
	go func() {
		done <- m.Run(ctx)
	}()

	opts := []grpc.DialOption{
		grpc.WithTimeout(10 * time.Second),
		grpc.WithTransportCredentials(agentSecurityConfig.ClientTLSCreds),
	}

	conn, err := grpc.Dial(tcpAddr, opts...)
	assert.NoError(t, err)
	defer func() {
		assert.NoError(t, conn.Close())
	}()

	// We have to send a dummy request to verify that the connection is actually up.
	client := api.NewDispatcherClient(conn)
	_, err = client.Heartbeat(ctx, &api.HeartbeatRequest{})
	assert.Equal(t, dispatcher.ErrNodeNotRegistered.Error(), grpc.ErrorDesc(err))
	_, err = client.Session(ctx, &api.SessionRequest{})
	assert.NoError(t, err)

	// Try to have a client in a different org access this manager
	opts = []grpc.DialOption{
		grpc.WithTimeout(10 * time.Second),
		grpc.WithTransportCredentials(agentDiffOrgSecurityConfig.ClientTLSCreds),
	}

	conn2, err := grpc.Dial(tcpAddr, opts...)
	assert.NoError(t, err)
	defer func() {
		assert.NoError(t, conn2.Close())
	}()

	client = api.NewDispatcherClient(conn2)
	_, err = client.Heartbeat(context.Background(), &api.HeartbeatRequest{})
	assert.Contains(t, grpc.ErrorDesc(err), "Permission denied: unauthorized peer role: rpc error: code = 7 desc = Permission denied: remote certificate not part of organization")

	// Verify that requests to the various GRPC services running on TCP
	// are rejected if they don't have certs.
	opts = []grpc.DialOption{
		grpc.WithTimeout(10 * time.Second),
		grpc.WithTransportCredentials(credentials.NewTLS(&tls.Config{InsecureSkipVerify: true})),
	}

	noCertConn, err := grpc.Dial(tcpAddr, opts...)
	assert.NoError(t, err)
	defer func() {
		assert.NoError(t, noCertConn.Close())
	}()

	client = api.NewDispatcherClient(noCertConn)
	_, err = client.Heartbeat(context.Background(), &api.HeartbeatRequest{})
	assert.EqualError(t, err, "rpc error: code = 7 desc = Permission denied: unauthorized peer role: rpc error: code = 7 desc = no client certificates in request")

	controlClient := api.NewControlClient(noCertConn)
	_, err = controlClient.ListNodes(context.Background(), &api.ListNodesRequest{})
	assert.EqualError(t, err, "rpc error: code = 7 desc = Permission denied: unauthorized peer role: rpc error: code = 7 desc = no client certificates in request")

	raftClient := api.NewRaftMembershipClient(noCertConn)
	_, err = raftClient.Join(context.Background(), &api.JoinRequest{})
	assert.EqualError(t, err, "rpc error: code = 7 desc = Permission denied: unauthorized peer role: rpc error: code = 7 desc = no client certificates in request")

	opts = []grpc.DialOption{
		grpc.WithTimeout(10 * time.Second),
		grpc.WithTransportCredentials(managerSecurityConfig.ClientTLSCreds),
	}

	controlConn, err := grpc.Dial(tcpAddr, opts...)
	assert.NoError(t, err)
	defer func() {
		assert.NoError(t, controlConn.Close())
	}()

	// Check that the kek is added to the config
	var cluster api.Cluster
	m.raftNode.MemoryStore().View(func(tx store.ReadTx) {
		clusters, err := store.FindClusters(tx, store.All)
		require.NoError(t, err)
		require.Len(t, clusters, 1)
		cluster = *clusters[0]
	})
	require.NotNil(t, cluster)
	require.Len(t, cluster.UnlockKeys, 1)
	require.Equal(t, &api.EncryptionKey{
		Subsystem: ca.ManagerRole,
		Key:       []byte("kek"),
	}, cluster.UnlockKeys[0])

	// Test removal of the agent node
	agentID := agentSecurityConfig.ClientTLSCreds.NodeID()
	assert.NoError(t, m.raftNode.MemoryStore().Update(func(tx store.Tx) error {
		return store.CreateNode(tx,
			&api.Node{
				ID: agentID,
				Certificate: api.Certificate{
					Role: api.NodeRoleWorker,
					CN:   agentID,
				},
			},
		)
	}))
	controlClient = api.NewControlClient(controlConn)
	_, err = controlClient.RemoveNode(context.Background(),
		&api.RemoveNodeRequest{
			NodeID: agentID,
			Force:  true,
		},
	)
	assert.NoError(t, err)

	client = api.NewDispatcherClient(conn)
	_, err = client.Heartbeat(context.Background(), &api.HeartbeatRequest{})
	assert.Contains(t, grpc.ErrorDesc(err), "removed from swarm")

	m.Stop(ctx)

	// After stopping we may receive an error from ListenAndServe if all this
	// happened before WaitForLeader completed, so don't check the error.
	<-done
}
// NewNode generates a new Raft node
func NewNode(ctx context.Context, opts NewNodeOptions) (*Node, error) {
	cfg := opts.Config
	if cfg == nil {
		cfg = DefaultNodeConfig()
	}
	if opts.TickInterval == 0 {
		opts.TickInterval = time.Second
	}

	raftStore := raft.NewMemoryStorage()

	ctx, cancel := context.WithCancel(ctx)

	n := &Node{
		Ctx:            ctx,
		cancel:         cancel,
		cluster:        membership.NewCluster(),
		tlsCredentials: opts.TLSCredentials,
		raftStore:      raftStore,
		Address:        opts.Addr,
		Config: &raft.Config{
			ElectionTick:    cfg.ElectionTick,
			HeartbeatTick:   cfg.HeartbeatTick,
			Storage:         raftStore,
			MaxSizePerMsg:   cfg.MaxSizePerMsg,
			MaxInflightMsgs: cfg.MaxInflightMsgs,
			Logger:          cfg.Logger,
		},
		forceNewCluster:     opts.ForceNewCluster,
		stopCh:              make(chan struct{}),
		doneCh:              make(chan struct{}),
		StateDir:            opts.StateDir,
		joinAddr:            opts.JoinAddr,
		sendTimeout:         2 * time.Second,
		leadershipBroadcast: events.NewBroadcaster(),
	}
	n.memoryStore = store.NewMemoryStore(n)

	if opts.ClockSource == nil {
		n.ticker = clock.NewClock().NewTicker(opts.TickInterval)
	} else {
		n.ticker = opts.ClockSource.NewTicker(opts.TickInterval)
	}
	if opts.SendTimeout != 0 {
		n.sendTimeout = opts.SendTimeout
	}

	loadAndStartErr := n.loadAndStart(ctx, opts.ForceNewCluster)
	if loadAndStartErr != nil && loadAndStartErr != errNoWAL {
		n.ticker.Stop()
		return nil, loadAndStartErr
	}

	snapshot, err := raftStore.Snapshot()
	// Snapshot never returns an error
	if err != nil {
		panic("could not get snapshot of raft store")
	}

	n.confState = snapshot.Metadata.ConfState
	n.appliedIndex = snapshot.Metadata.Index
	n.snapshotIndex = snapshot.Metadata.Index
	n.reqIDGen = idutil.NewGenerator(uint16(n.Config.ID), time.Now())
	n.wait = newWait()

	if loadAndStartErr == errNoWAL {
		if n.joinAddr != "" {
			c, err := n.ConnectToMember(n.joinAddr, 10*time.Second)
			if err != nil {
				return nil, err
			}
			client := api.NewRaftMembershipClient(c.Conn)
			defer func() {
				_ = c.Conn.Close()
			}()

			ctx, cancel := context.WithTimeout(n.Ctx, 10*time.Second)
			defer cancel()
			resp, err := client.Join(ctx, &api.JoinRequest{
				Addr: n.Address,
			})
			if err != nil {
				return nil, err
			}

			n.Config.ID = resp.RaftID

			if _, err := n.createWAL(opts.ID); err != nil {
				return nil, err
			}

			n.Node = raft.StartNode(n.Config, []raft.Peer{})

			if err := n.registerNodes(resp.Members); err != nil {
				return nil, err
			}
		} else {
			// First member in the cluster, self-assign ID
			n.Config.ID = uint64(rand.Int63()) + 1
			peer, err := n.createWAL(opts.ID)
			if err != nil {
				return nil, err
			}
			n.Node = raft.StartNode(n.Config, []raft.Peer{peer})
			if err := n.Campaign(n.Ctx); err != nil {
				return nil, err
			}
		}
		return n, nil
	}

	if n.joinAddr != "" {
		n.Config.Logger.Warning("ignoring request to join cluster, because raft state already exists")
	}
	n.Node = raft.RestartNode(n.Config)
	return n, nil
}
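A minimal usage sketch of NewNode, based only on the options visible above; every concrete value below is an illustrative placeholder, and the creds parameter stands in for transport credentials created elsewhere.

// newBootstrapNode shows one plausible way to construct the first member of
// a fresh cluster. JoinAddr is left empty, so per the branch above the node
// self-assigns an ID and campaigns. All values are placeholders.
func newBootstrapNode(ctx context.Context, stateDir string, creds credentials.TransportCredentials) (*Node, error) {
	return NewNode(ctx, NewNodeOptions{
		ID:             "node-1",         // assumed string node ID, per opts.ID above
		Addr:           "127.0.0.1:4242", // address advertised to joining peers
		StateDir:       stateDir,         // where the WAL and snapshots live
		TLSCredentials: creds,            // used when dialing other members
	})
}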
func TestRaftSnapshotForceNewCluster(t *testing.T) {
	t.Parallel()

	// Bring up a 3 node cluster
	nodes, clockSource := raftutils.NewRaftCluster(t, tc, &api.RaftConfig{SnapshotInterval: 10, LogEntriesForSlowFollowers: 0})
	defer raftutils.TeardownCluster(t, nodes)

	nodeIDs := []string{"id1", "id2", "id3", "id4", "id5"}

	// Propose 3 values.
	for _, nodeID := range nodeIDs[:3] {
		_, err := raftutils.ProposeValue(t, nodes[1], DefaultProposalTime, nodeID)
		assert.NoError(t, err, "failed to propose value")
	}

	// Remove one of the original nodes
	// Use gRPC instead of calling handler directly because of
	// authorization check.
	cc, err := dial(nodes[1], nodes[1].Address)
	assert.NoError(t, err)
	raftClient := api.NewRaftMembershipClient(cc)
	defer cc.Close()
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()
	resp, err := raftClient.Leave(ctx, &api.LeaveRequest{Node: &api.RaftMember{RaftID: nodes[2].Config.ID}})
	assert.NoError(t, err, "error sending message to leave the raft")
	assert.NotNil(t, resp, "leave response message is nil")

	raftutils.ShutdownNode(nodes[2])
	delete(nodes, 2)

	// Nodes shouldn't have snapshot files yet
	for _, node := range nodes {
		dirents, err := ioutil.ReadDir(filepath.Join(node.StateDir, "snap-v3-encrypted"))
		assert.NoError(t, err)
		assert.Len(t, dirents, 0)
	}

	// Trigger a snapshot, with a 4th proposal
	_, err = raftutils.ProposeValue(t, nodes[1], DefaultProposalTime, nodeIDs[3])
	assert.NoError(t, err, "failed to propose value")

	// Nodes should now have a snapshot file
	for nodeIdx, node := range nodes {
		assert.NoError(t, raftutils.PollFunc(clockSource, func() error {
			dirents, err := ioutil.ReadDir(filepath.Join(node.StateDir, "snap-v3-encrypted"))
			if err != nil {
				return err
			}
			if len(dirents) != 1 {
				return fmt.Errorf("expected 1 snapshot, found %d on node %d", len(dirents), nodeIdx+1)
			}
			return nil
		}))
	}

	// Join another node
	nodes[4] = raftutils.NewJoinNode(t, clockSource, nodes[1].Address, tc)
	raftutils.WaitForCluster(t, clockSource, nodes)

	// Restart only the first node, with the force-new-cluster option
	nodes[1].Server.Stop()
	nodes[1].ShutdownRaft()
	nodes[1] = raftutils.RestartNode(t, clockSource, nodes[1], true)
	delete(nodes, 3)
	delete(nodes, 4)
	raftutils.WaitForCluster(t, clockSource, nodes)

	// The memberlist should contain exactly one node (self)
	memberlist := nodes[1].GetMemberlist()
	require.Len(t, memberlist, 1)

	// Propose a 5th value
	_, err = raftutils.ProposeValue(t, nodes[1], DefaultProposalTime, nodeIDs[4])
	require.NoError(t, err)
}