// maybeSendLeaderEvent processes a raft.Ready to send events in response to leadership
// changes (this includes both sending an event to the app and retrying any pending
// proposals).
// It may call into Storage so it must be called with the storage lock held.
func (s *state) maybeSendLeaderEvent(groupID roachpb.RangeID, g *group, ready *raft.Ready) {
	s.assertStorageLocked()
	term := g.committedTerm
	if ready.SoftState != nil {
		// Always save the leader whenever it changes.
		if roachpb.ReplicaID(ready.SoftState.Lead) != g.leader.ReplicaID {
			if ready.SoftState.Lead == 0 {
				// A Lead of zero means the leader is currently unknown;
				// clear the cached descriptor.
				g.leader = roachpb.ReplicaDescriptor{}
			} else {
				if repl, err := s.ReplicaDescriptor(g.groupID, roachpb.ReplicaID(ready.SoftState.Lead)); err != nil {
					// Lookup failure is non-fatal: leave the leader unknown and
					// retry on a later Ready.
					log.Warningf("node %s: failed to look up address of replica %d in group %d: %s",
						s.nodeID, ready.SoftState.Lead, g.groupID, err)
					g.leader = roachpb.ReplicaDescriptor{}
				} else {
					g.leader = repl
				}
			}
		}
	}
	if len(ready.CommittedEntries) > 0 {
		// Track the term of the most recently committed entry.
		term = ready.CommittedEntries[len(ready.CommittedEntries)-1].Term
	}
	if term != g.committedTerm && g.leader.ReplicaID != 0 {
		// Whenever the committed term has advanced and we know our leader,
		// emit an event.
		g.committedTerm = term
		s.sendEvent(&EventLeaderElection{
			GroupID:   groupID,
			ReplicaID: g.leader.ReplicaID,
			Term:      g.committedTerm,
		})
	}
}
// TestRaftAfterRemoveRange verifies that the raft state removes
// a remote node correctly after the Replica was removed from the Store.
func TestRaftAfterRemoveRange(t *testing.T) {
	defer leaktest.AfterTest(t)
	mtc := startMultiTestContext(t, 3)
	defer mtc.Stop()

	// Make the split.
	splitArgs := adminSplitArgs(roachpb.KeyMin, []byte("b"))
	if _, err := client.SendWrapped(rg1(mtc.stores[0]), nil, &splitArgs); err != nil {
		t.Fatal(err)
	}

	rangeID := roachpb.RangeID(2)
	// Replicate the new range onto stores 1 and 2, then remove both
	// replicas again so store 1 no longer holds the range.
	mtc.replicateRange(rangeID, 1, 2)
	mtc.unreplicateRange(rangeID, 2)
	mtc.unreplicateRange(rangeID, 1)

	// Wait for the removal to be processed.
	util.SucceedsWithin(t, time.Second, func() error {
		_, err := mtc.stores[1].GetReplica(rangeID)
		if _, ok := err.(*roachpb.RangeNotFoundError); ok {
			return nil
		} else if err != nil {
			return err
		}
		return util.Errorf("range still exists")
	})

	// NOTE(review): in this test harness node, store and replica IDs are
	// derived from the same value, so the conversions below are safe.
	replica1 := roachpb.ReplicaDescriptor{
		ReplicaID: roachpb.ReplicaID(mtc.stores[1].StoreID()),
		NodeID:    roachpb.NodeID(mtc.stores[1].StoreID()),
		StoreID:   mtc.stores[1].StoreID(),
	}
	replica2 := roachpb.ReplicaDescriptor{
		ReplicaID: roachpb.ReplicaID(mtc.stores[2].StoreID()),
		NodeID:    roachpb.NodeID(mtc.stores[2].StoreID()),
		StoreID:   mtc.stores[2].StoreID(),
	}
	// Send a stray heartbeat addressed to the now-removed replica; the
	// raft state should tolerate it rather than resurrecting the range.
	if err := mtc.transport.Send(&storage.RaftMessageRequest{
		GroupID:     0,
		ToReplica:   replica1,
		FromReplica: replica2,
		Message: raftpb.Message{
			From: uint64(replica2.ReplicaID),
			To:   uint64(replica1.ReplicaID),
			Type: raftpb.MsgHeartbeat,
		}}); err != nil {
		t.Fatal(err)
	}

	// Execute another replica change to ensure that raft has processed
	// the heartbeat just sent.
	mtc.replicateRange(roachpb.RangeID(1), 1)

	// Expire leases to ensure any remaining intent resolutions can complete.
	// TODO(bdarnell): understand why some tests need this.
	mtc.expireLeaderLeases()
}
// TestMembershipChange starts a single-node group and then adds each of the
// other three cluster nodes to it, applying the membership-change events as
// they are emitted.
func TestMembershipChange(t *testing.T) {
	defer leaktest.AfterTest(t)
	stopper := stop.NewStopper()
	cluster := newTestCluster(nil, 4, stopper, t)
	defer stopper.Stop()

	// Create a group with a single member, cluster.nodes[0].
	groupID := roachpb.RangeID(1)
	cluster.createGroup(groupID, 0, 1)
	// An automatic election is triggered since this is a single-node Raft group,
	// so we don't need to call triggerElection.

	// Consume and apply the membership change events.
	for i := 0; i < 4; i++ {
		go func(i int) {
			for {
				e, ok := <-cluster.events[i].MembershipChangeCommitted
				if !ok {
					// Channel closed: the cluster is shutting down.
					return
				}
				e.Callback(nil)
			}
		}(i)
	}

	// Add each of the other three nodes to the cluster.
	for i := 1; i < 4; i++ {
		ch := cluster.nodes[0].ChangeGroupMembership(groupID, makeCommandID(),
			raftpb.ConfChangeAddNode,
			roachpb.ReplicaDescriptor{
				NodeID:    cluster.nodes[i].nodeID,
				StoreID:   roachpb.StoreID(cluster.nodes[i].nodeID),
				ReplicaID: roachpb.ReplicaID(cluster.nodes[i].nodeID),
			}, nil)
		// Block until the change commits before proposing the next one.
		<-ch
	}

	// TODO(bdarnell): verify that the channel events are sent out correctly.
	/*
		for i := 0; i < 10; i++ {
			log.Infof("tick %d", i)
			cluster.tickers[0].Tick()
			time.Sleep(5 * time.Millisecond)
		}

		// Each node is notified of each other node's joining.
		for i := 0; i < 4; i++ {
			for j := 1; j < 4; j++ {
				select {
				case e := <-cluster.events[i].MembershipChangeCommitted:
					if e.NodeID != cluster.nodes[j].nodeID {
						t.Errorf("node %d expected event for %d, got %d", i, j, e.NodeID)
					}
				default:
					t.Errorf("node %d did not get expected event for %d", i, j)
				}
			}
		}*/
}
// elect is a simplified wrapper around triggerElection and waitForElection which // waits for the election to complete on all members of a group. // TODO(bdarnell): make this work when membership has been changed after creation. func (c *testCluster) elect(leaderIndex int, groupID roachpb.RangeID) { c.triggerElection(leaderIndex, groupID) for _, i := range c.groups[groupID] { el := c.waitForElection(i) // With the in-memory storage used in these tests, replica and node IDs are interchangeable. if el.ReplicaID != roachpb.ReplicaID(c.nodes[leaderIndex].nodeID) { c.t.Fatalf("wrong leader elected; wanted node %d but got event %v", leaderIndex, el) } if el.GroupID != groupID { c.t.Fatalf("expected election event for group %d but got %d", groupID, el.GroupID) } } }
// TestReproposeConfigChange verifies the behavior when multiple
// configuration changes are in flight at once. Raft prohibits this,
// but any configuration changes that are dropped by this rule should
// be reproposed when the previous change completes.
func TestReproposeConfigChange(t *testing.T) {
	defer leaktest.AfterTest(t)
	stopper := stop.NewStopper()
	defer stopper.Stop()
	const clusterSize = 4
	const groupSize = 3
	cluster := newTestCluster(nil, clusterSize, stopper, t)

	const groupID = roachpb.RangeID(1)
	const leader = 0
	const proposer = 1
	cluster.createGroup(groupID, leader, groupSize)
	cluster.elect(leader, groupID)

	// The fourth node (index groupSize) is not yet a member of the group;
	// it is the target of both the add and the remove below.
	targetDesc := roachpb.ReplicaDescriptor{
		NodeID:    cluster.nodes[groupSize].nodeID,
		StoreID:   roachpb.StoreID(cluster.nodes[groupSize].nodeID),
		ReplicaID: roachpb.ReplicaID(cluster.nodes[groupSize].nodeID),
	}

	// Add a node and immediately remove it without waiting for the
	// first change to commit.
	addErrCh := cluster.nodes[proposer].ChangeGroupMembership(groupID, makeCommandID(),
		raftpb.ConfChangeAddNode, targetDesc, nil)
	removeErrCh := cluster.nodes[proposer].ChangeGroupMembership(groupID, makeCommandID(),
		raftpb.ConfChangeRemoveNode, targetDesc, nil)

	// The add command will commit first; then it needs to be applied.
	// Apply it on the proposer node before the leader.
	e := <-cluster.events[proposer].MembershipChangeCommitted
	e.Callback(nil)
	e = <-cluster.events[leader].MembershipChangeCommitted
	e.Callback(nil)

	// Now wait for both commands to commit.
	select {
	case err := <-addErrCh:
		if err != nil {
			t.Errorf("add failed: %s", err)
		}
	case <-time.After(time.Second):
		t.Errorf("add timed out")
	}
	select {
	case err := <-removeErrCh:
		if err != nil {
			t.Errorf("remove failed: %s", err)
		}
	case <-time.After(time.Second):
		t.Errorf("remove timed out")
	}
}
// ReplicaIDForStore implements the Storage interface. func (m *MemoryStorage) ReplicaIDForStore(groupID roachpb.RangeID, storeID roachpb.StoreID) (roachpb.ReplicaID, error) { return roachpb.ReplicaID(storeID), nil }
// TestInOrderDelivery verifies that for a given pair of nodes, raft
// messages are delivered in order.
func TestInOrderDelivery(t *testing.T) {
	defer leaktest.AfterTest(t)
	stopper := stop.NewStopper()
	defer stopper.Stop()
	nodeRPCContext := rpc.NewContext(nodeTestBaseContext, hlc.NewClock(hlc.UnixNano), stopper)
	g := gossip.New(nodeRPCContext, gossip.TestBootstrap, stopper)
	g.SetNodeID(roachpb.NodeID(1))
	rpcServer := rpc.NewServer(nodeRPCContext)
	grpcServer := grpc.NewServer()
	tlsConfig, err := nodeRPCContext.GetServerTLSConfig()
	if err != nil {
		t.Fatal(err)
	}
	ln, err := util.ListenAndServe(stopper, grpcutil.GRPCHandlerFunc(grpcServer, rpcServer), util.CreateTestAddr("tcp"), tlsConfig)
	if err != nil {
		t.Fatal(err)
	}

	const numMessages = 100
	// NOTE(review): the inner roachpb.NodeID conversion is redundant;
	// roachpb.NodeID(2) alone would suffice.
	nodeID := roachpb.NodeID(roachpb.NodeID(2))
	serverTransport := newRPCTransport(g, grpcServer, nodeRPCContext)
	defer serverTransport.Close()
	serverChannel := newChannelServer(numMessages, 10*time.Millisecond)
	if err := serverTransport.Listen(roachpb.StoreID(nodeID), serverChannel.RaftMessage); err != nil {
		t.Fatal(err)
	}
	addr := ln.Addr()
	// Have to set gossip.NodeID before calling gossip.AddInfoXXX.
	g.SetNodeID(nodeID)
	if err := g.AddInfoProto(gossip.MakeNodeIDKey(nodeID),
		&roachpb.NodeDescriptor{
			Address: util.MakeUnresolvedAddr(addr.Network(), addr.String()),
		},
		time.Hour); err != nil {
		t.Fatal(err)
	}

	clientNodeID := roachpb.NodeID(2)
	clientTransport := newRPCTransport(g, nil, nodeRPCContext)
	defer clientTransport.Close()

	// Send numbered messages and verify they arrive in commit order.
	for i := 0; i < numMessages; i++ {
		req := &storage.RaftMessageRequest{
			GroupID: 1,
			Message: raftpb.Message{
				To:     uint64(nodeID),
				From:   uint64(clientNodeID),
				Commit: uint64(i),
			},
			ToReplica: roachpb.ReplicaDescriptor{
				NodeID:    nodeID,
				StoreID:   roachpb.StoreID(nodeID),
				ReplicaID: roachpb.ReplicaID(nodeID),
			},
			FromReplica: roachpb.ReplicaDescriptor{
				NodeID:    clientNodeID,
				StoreID:   roachpb.StoreID(clientNodeID),
				ReplicaID: roachpb.ReplicaID(clientNodeID),
			},
		}
		if err := clientTransport.Send(req); err != nil {
			t.Errorf("failed to send message %d: %s", i, err)
		}
	}
	for i := 0; i < numMessages; i++ {
		req := <-serverChannel.ch
		if req.Message.Commit != uint64(i) {
			t.Errorf("messages out of order: got %d while expecting %d", req.Message.Commit, i)
		}
	}
}
// TestRemoveLeader ensures that a group will recover if a node is // removed from the group while it is leader. Since visibility into // the raft state is limited, we create a three-node group in a // six-node cluster. This group is migrated one node at a time from // the first three nodes to the last three. In the process the initial // leader must have removed itself. func TestRemoveLeader(t *testing.T) { defer leaktest.AfterTest(t) stopper := stop.NewStopper() const clusterSize = 6 const groupSize = 3 cluster := newTestCluster(nil, clusterSize, stopper, t) defer stopper.Stop() // Consume and apply the membership change events. for i := 0; i < clusterSize; i++ { go func(i int) { for { if e, ok := <-cluster.events[i].MembershipChangeCommitted; ok { e.Callback(nil) } else { return } } }(i) } // Tick all the clocks in the background to ensure that all the // necessary elections are triggered. // TODO(bdarnell): newTestCluster should have an option to use a // real clock instead of a manual one. stopper.RunWorker(func() { ticker := time.NewTicker(10 * time.Millisecond) defer ticker.Stop() for { select { case <-stopper.ShouldStop(): return case <-ticker.C: for _, t := range cluster.tickers { t.NonBlockingTick() } } } }) // Create a group with three members. groupID := roachpb.RangeID(1) cluster.createGroup(groupID, 0, groupSize) // Move the group one node at a time from the first three nodes to // the last three. In the process, we necessarily remove the leader // and trigger at least one new election among the new nodes. 
for i := 0; i < groupSize; i++ { log.Infof("adding node %d", i+groupSize) ch := cluster.nodes[i].ChangeGroupMembership(groupID, makeCommandID(), raftpb.ConfChangeAddNode, roachpb.ReplicaDescriptor{ NodeID: cluster.nodes[i+groupSize].nodeID, StoreID: roachpb.StoreID(cluster.nodes[i+groupSize].nodeID), ReplicaID: roachpb.ReplicaID(cluster.nodes[i+groupSize].nodeID), }, nil) if err := <-ch; err != nil { t.Fatal(err) } log.Infof("removing node %d", i) ch = cluster.nodes[i].ChangeGroupMembership(groupID, makeCommandID(), raftpb.ConfChangeRemoveNode, roachpb.ReplicaDescriptor{ NodeID: cluster.nodes[i].nodeID, StoreID: roachpb.StoreID(cluster.nodes[i].nodeID), ReplicaID: roachpb.ReplicaID(cluster.nodes[i].nodeID), }, nil) if err := <-ch; err != nil { t.Fatal(err) } } }
// createGroup is called in two situations: by the application at
// startup (in which case the replicaID argument is zero and the
// replicaID will be loaded from storage), and in response to incoming
// messages (in which case the replicaID comes from the incoming
// message, since nothing is on disk yet).
func (s *state) createGroup(groupID roachpb.RangeID, replicaID roachpb.ReplicaID) error {
	// Creating an already-existing group is a no-op.
	if _, ok := s.groups[groupID]; ok {
		return nil
	}
	if log.V(3) {
		log.Infof("node %v creating group %v", s.nodeID, groupID)
	}

	gs, err := s.Storage.GroupStorage(groupID, replicaID)
	if err != nil {
		return err
	}
	_, cs, err := gs.InitialState()
	if err != nil {
		return err
	}

	// Find our store ID in the replicas list.
	for _, r := range cs.Nodes {
		repDesc, err := s.ReplicaDescriptor(groupID, roachpb.ReplicaID(r))
		if err != nil {
			return err
		}
		if repDesc.StoreID == s.storeID {
			if replicaID == 0 {
				// Startup path: adopt the replica ID found in storage.
				replicaID = repDesc.ReplicaID
			} else if replicaID != repDesc.ReplicaID {
				return util.Errorf("inconsistent replica ID: passed %d, but found %s by scanning ConfState for store %s",
					replicaID, repDesc.ReplicaID, s.storeID)
			}
			// NOTE(review): this assignment is redundant — both branches
			// above already leave replicaID equal to repDesc.ReplicaID.
			replicaID = repDesc.ReplicaID
			break
		}
	}
	if replicaID == 0 {
		return util.Errorf("couldn't find replica ID for this store (%s) in range %d",
			s.storeID, groupID)
	}
	s.CacheReplicaDescriptor(groupID, roachpb.ReplicaDescriptor{
		ReplicaID: replicaID,
		NodeID:    s.nodeID,
		StoreID:   s.storeID,
	})

	// The applied index tells raft where to resume; without a state
	// machine it defaults to zero.
	var appliedIndex uint64
	if s.StateMachine != nil {
		appliedIndex, err = s.StateMachine.AppliedIndex(groupID)
		if err != nil {
			return err
		}
	}

	raftCfg := &raft.Config{
		ID:            uint64(replicaID),
		Applied:       appliedIndex,
		ElectionTick:  s.ElectionTimeoutTicks,
		HeartbeatTick: s.HeartbeatIntervalTicks,
		Storage:       gs,
		// TODO(bdarnell): make these configurable; evaluate defaults.
		MaxSizePerMsg:   1024 * 1024,
		MaxInflightMsgs: 256,
		Logger:          &raftLogger{group: uint64(groupID)},
	}
	if err := s.multiNode.CreateGroup(uint64(groupID), raftCfg, nil); err != nil {
		return err
	}
	g := &group{
		id:      groupID,
		pending: map[string]*proposal{},
	}
	s.groups[groupID] = g

	// Connect this group to every node that appears in its ConfState.
	for _, id := range cs.Nodes {
		replicaID := roachpb.ReplicaID(id)
		replica, err := s.ReplicaDescriptor(groupID, replicaID)
		if err != nil {
			return err
		}
		if err := s.addNode(replica.NodeID, g); err != nil {
			return err
		}
	}

	// Automatically campaign and elect a leader for this group if there's
	// exactly one known node for this group.
	//
	// A grey area for this being correct happens in the case when we're
	// currently in the progress of adding a second node to the group,
	// with the change committed but not applied.
	// Upon restarting, the node would immediately elect itself and only
	// then apply the config change, where really it should be applying
	// first and then waiting for the majority (which would now require
	// two votes, not only its own).
	// However, in that special case, the second node has no chance to
	// be elected master while this node restarts (as it's aware of the
	// configuration and knows it needs two votes), so the worst that
	// could happen is both nodes ending up in candidate state, timing
	// out and then voting again. This is expected to be an extremely
	// rare event.
	if len(cs.Nodes) == 1 {
		replica, err := s.ReplicaDescriptor(groupID, roachpb.ReplicaID(cs.Nodes[0]))
		if err != nil {
			return err
		}
		if replica.StoreID == s.storeID {
			log.Infof("node %s campaigning because initial confstate is %v", s.nodeID, cs.Nodes)
			if err := s.multiNode.Campaign(context.Background(), uint64(groupID)); err != nil {
				return err
			}
		}
	}
	return nil
}
// sendMessage sends a raft message on the given group. Coalesced heartbeats
// address nodes, not groups; they will use the noGroup constant as groupID.
func (s *state) sendMessage(g *group, msg raftpb.Message) {
	if log.V(6) {
		log.Infof("node %v sending message %.200s to %v", s.nodeID,
			raft.DescribeMessage(msg, s.EntryFormatter), msg.To)
	}
	groupID := noGroup
	var toReplica roachpb.ReplicaDescriptor
	var fromReplica roachpb.ReplicaDescriptor
	if g == nil {
		// No group (a coalesced heartbeat): To/From fields are NodeIDs.
		// TODO(bdarnell): test transports route by store ID, not node ID.
		// In tests they're always the same, so we can hack it here but
		// it would be better to fix the transports.
		// I think we need to fix this before we can support a range
		// with two replicas on different stores of the same node.
		toReplica.NodeID = roachpb.NodeID(msg.To)
		toReplica.StoreID = roachpb.StoreID(msg.To)
		fromReplica.NodeID = roachpb.NodeID(msg.From)
		fromReplica.StoreID = roachpb.StoreID(msg.From)
	} else {
		// Regular message: To/From fields are replica IDs.
		groupID = g.id
		var err error
		toReplica, err = s.ReplicaDescriptor(groupID, roachpb.ReplicaID(msg.To))
		if err != nil {
			// Descriptor lookup failure: drop the message rather than crash;
			// raft will retry.
			log.Warningf("failed to lookup recipient replica %d in group %d: %s", msg.To, groupID, err)
			return
		}
		fromReplica, err = s.ReplicaDescriptor(groupID, roachpb.ReplicaID(msg.From))
		if err != nil {
			log.Warningf("failed to lookup sender replica %d in group %d: %s", msg.From, groupID, err)
			return
		}
	}
	// Lazily establish a connection to the destination node if needed.
	if _, ok := s.nodes[toReplica.NodeID]; !ok {
		if log.V(4) {
			log.Infof("node %v: connecting to new node %v", s.nodeID, toReplica.NodeID)
		}
		if err := s.addNode(toReplica.NodeID, g); err != nil {
			log.Errorf("node %v: error adding group %v to node %v: %v",
				s.nodeID, groupID, toReplica.NodeID, err)
		}
	}
	err := s.Transport.Send(&RaftMessageRequest{
		GroupID:     groupID,
		ToReplica:   toReplica,
		FromReplica: fromReplica,
		Message:     msg,
	})
	snapStatus := raft.SnapshotFinish
	if err != nil {
		log.Warningf("node %v failed to send message to %v: %s", s.nodeID, toReplica.NodeID, err)
		// Coalesced heartbeats (noGroup) have no group to report failures to.
		if groupID != noGroup {
			s.multiNode.ReportUnreachable(msg.To, uint64(groupID))
		}
		snapStatus = raft.SnapshotFailure
	}
	if msg.Type == raftpb.MsgSnap {
		// TODO(bdarnell): add an ack for snapshots and don't report status until
		// ack, error, or timeout.
		if groupID != noGroup {
			s.multiNode.ReportSnapshot(msg.To, uint64(groupID), snapStatus)
		}
	}
}
// processCommittedEntry tells the application that a command was committed.
// Returns the commandID, or an empty string if the given entry was not a command.
func (s *state) processCommittedEntry(groupID roachpb.RangeID, g *group, entry raftpb.Entry) string {
	var commandID string
	switch entry.Type {
	case raftpb.EntryNormal:
		// etcd raft occasionally adds a nil entry (e.g. upon election); ignore these.
		if entry.Data != nil {
			var command []byte
			commandID, command = decodeCommand(entry.Data)
			s.sendEvent(&EventCommandCommitted{
				GroupID:   groupID,
				CommandID: commandID,
				Command:   command,
				Index:     entry.Index,
			})
		}

	case raftpb.EntryConfChange:
		cc := raftpb.ConfChange{}
		if err := cc.Unmarshal(entry.Data); err != nil {
			log.Fatalf("invalid ConfChange data: %s", err)
		}
		var payload []byte
		if len(cc.Context) > 0 {
			var ctx ConfChangeContext
			if err := ctx.Unmarshal(cc.Context); err != nil {
				log.Fatalf("invalid ConfChangeContext: %s", err)
			}
			commandID = ctx.CommandID
			payload = ctx.Payload
			// Cache the descriptor carried in the message so the lookup
			// below can succeed before anything is on disk.
			s.CacheReplicaDescriptor(groupID, ctx.Replica)
		}
		replica, err := s.ReplicaDescriptor(groupID, roachpb.ReplicaID(cc.NodeID))
		if err != nil {
			// TODO(bdarnell): stash Replica information somewhere so we can have it here
			// with no chance of failure.
			log.Fatalf("could not look up replica info (node %s, group %d, replica %d): %s",
				s.nodeID, groupID, cc.NodeID, err)
		}
		// Track the outstanding callback; proposals are held until it fires.
		g.waitForCallback++
		s.sendEvent(&EventMembershipChangeCommitted{
			GroupID:    groupID,
			CommandID:  commandID,
			Index:      entry.Index,
			Replica:    replica,
			ChangeType: cc.Type,
			Payload:    payload,
			Callback: func(err error) {
				var errStr string
				if err != nil {
					errStr = err.Error() // can't leak err into the callback
				}
				select {
				case s.callbackChan <- func() {
					if errStr == "" {
						if log.V(3) {
							log.Infof("node %v applying configuration change %v", s.nodeID, cc)
						}
						// TODO(bdarnell): dedupe by keeping a record of recently-applied commandIDs
						var err error
						switch cc.Type {
						case raftpb.ConfChangeAddNode:
							err = s.addNode(replica.NodeID, g)
						case raftpb.ConfChangeRemoveNode:
							err = s.removeNode(replica.NodeID, g)
						case raftpb.ConfChangeUpdateNode:
							// Updates don't concern multiraft, they are simply passed through.
						}
						if err != nil {
							log.Errorf("error applying configuration change %v: %s", cc, err)
						}
						s.multiNode.ApplyConfChange(uint64(groupID), cc)
					} else {
						// An empty ConfChange tells raft to abandon the pending change.
						log.Warningf("aborting configuration change: %s", errStr)
						s.multiNode.ApplyConfChange(uint64(groupID), raftpb.ConfChange{})
					}

					// Re-submit all pending proposals that were held
					// while the config change was pending
					g.waitForCallback--
					if g.waitForCallback <= 0 {
						for _, prop := range g.pending {
							s.propose(prop)
						}
					}
				}:
				case <-s.stopper.ShouldStop():
				}
			},
		})
	}
	return commandID
}
// TestInOrderDelivery verifies that for a given pair of nodes, raft // messages are delivered in order. func TestInOrderDelivery(t *testing.T) { defer leaktest.AfterTest(t)() stopper := stop.NewStopper() defer stopper.Stop() nodeRPCContext := rpc.NewContext(testutils.NewNodeTestBaseContext(), nil, stopper) g := gossip.New(nodeRPCContext, nil, stopper) grpcServer := rpc.NewServer(nodeRPCContext) ln, err := util.ListenAndServeGRPC(stopper, grpcServer, util.TestAddr) if err != nil { t.Fatal(err) } const numMessages = 100 nodeID := roachpb.NodeID(roachpb.NodeID(2)) serverTransport := storage.NewRaftTransport(storage.GossipAddressResolver(g), grpcServer, nodeRPCContext) serverChannel := newChannelServer(numMessages, 10*time.Millisecond) serverTransport.Listen(roachpb.StoreID(nodeID), serverChannel.RaftMessage) addr := ln.Addr() // Have to set gossip.NodeID before call gossip.AddInofXXX. g.SetNodeID(nodeID) if err := g.AddInfoProto(gossip.MakeNodeIDKey(nodeID), &roachpb.NodeDescriptor{ Address: util.MakeUnresolvedAddr(addr.Network(), addr.String()), }, time.Hour); err != nil { t.Fatal(err) } clientNodeID := roachpb.NodeID(2) clientTransport := storage.NewRaftTransport(storage.GossipAddressResolver(g), nil, nodeRPCContext) for i := 0; i < numMessages; i++ { req := &storage.RaftMessageRequest{ GroupID: 1, Message: raftpb.Message{ To: uint64(nodeID), From: uint64(clientNodeID), Commit: uint64(i), }, ToReplica: roachpb.ReplicaDescriptor{ NodeID: nodeID, StoreID: roachpb.StoreID(nodeID), ReplicaID: roachpb.ReplicaID(nodeID), }, FromReplica: roachpb.ReplicaDescriptor{ NodeID: clientNodeID, StoreID: roachpb.StoreID(clientNodeID), ReplicaID: roachpb.ReplicaID(clientNodeID), }, } if err := clientTransport.Send(req); err != nil { t.Errorf("failed to send message %d: %s", i, err) } } for i := 0; i < numMessages; i++ { req := <-serverChannel.ch if req.Message.Commit != uint64(i) { t.Errorf("messages out of order: got %d while expecting %d", req.Message.Commit, i) } } }
// processCommittedEntry tells the application that a command was committed.
// Returns the commandID, or an empty string if the given entry was not a command.
func (s *state) processCommittedEntry(groupID roachpb.RangeID, g *group, entry raftpb.Entry) string {
	var commandID string
	switch entry.Type {
	case raftpb.EntryNormal:
		// NOTE(review): unlike an earlier revision, there is no guard for nil
		// entry.Data here (raft appends empty entries e.g. on election);
		// presumably decodeCommand tolerates empty data — confirm.
		var command []byte
		commandID, command = decodeCommand(entry.Data)
		s.sendEvent(&EventCommandCommitted{
			GroupID:   groupID,
			CommandID: commandID,
			Command:   command,
			Index:     entry.Index,
		})

	case raftpb.EntryConfChange:
		cc := raftpb.ConfChange{}
		if err := cc.Unmarshal(entry.Data); err != nil {
			log.Fatalf("invalid ConfChange data: %s", err)
		}
		var payload []byte
		if len(cc.Context) > 0 {
			var ctx ConfChangeContext
			if err := ctx.Unmarshal(cc.Context); err != nil {
				log.Fatalf("invalid ConfChangeContext: %s", err)
			}
			commandID = ctx.CommandID
			payload = ctx.Payload
			// Cache the descriptor carried in the message so the lookup
			// below can succeed before anything is on disk.
			s.CacheReplicaDescriptor(groupID, ctx.Replica)
		}
		replica, err := s.ReplicaDescriptor(groupID, roachpb.ReplicaID(cc.NodeID))
		if err != nil {
			// TODO(bdarnell): stash Replica information somewhere so we can have it here
			// with no chance of failure.
			log.Fatalf("could not look up replica info (node %s, group %d, replica %d): %s",
				s.nodeID, groupID, cc.NodeID, err)
		}
		s.sendEvent(&EventMembershipChangeCommitted{
			GroupID:    groupID,
			CommandID:  commandID,
			Index:      entry.Index,
			Replica:    replica,
			ChangeType: cc.Type,
			Payload:    payload,
			Callback: func(err error) {
				select {
				case s.callbackChan <- func() {
					// Re-fetch the group: it may have been deleted or replaced
					// between the commit and this deferred application.
					gInner, ok := s.groups[groupID]
					if !ok {
						log.Infof("group %d no longer exists, aborting configuration change", groupID)
					} else if gInner != g {
						log.Infof("passed in group and fetched group objects do not match\noriginal:%+v\nfetched:%+v\n, aborting configuration change", g, gInner)
					} else if err == nil {
						if log.V(3) {
							log.Infof("node %v applying configuration change %v", s.nodeID, cc)
						}
						// TODO(bdarnell): dedupe by keeping a record of recently-applied commandIDs
						switch cc.Type {
						case raftpb.ConfChangeAddNode:
							err = s.addNode(replica.NodeID, g)
						case raftpb.ConfChangeRemoveNode:
							err = s.removeNode(replica.NodeID, g)
						case raftpb.ConfChangeUpdateNode:
							// Updates don't concern multiraft, they are simply passed through.
						}
						if err != nil {
							log.Errorf("error applying configuration change %v: %s", cc, err)
						}
						g.raftGroup.ApplyConfChange(cc)
					} else {
						// An empty ConfChange tells raft to abandon the pending change.
						log.Warningf("aborting configuration change: %s", err)
						g.raftGroup.ApplyConfChange(raftpb.ConfChange{})
					}
				}:
				case <-s.stopper.ShouldStop():
				}
			},
		})
	}
	return commandID
}
// TestSendAndReceive starts several servers, each hosting multiple stores,
// sends one message between every pair of stores, and verifies that each
// channel receives exactly the expected number of messages.
func TestSendAndReceive(t *testing.T) {
	defer leaktest.AfterTest(t)
	stopper := stop.NewStopper()
	defer stopper.Stop()
	nodeRPCContext := rpc.NewContext(nodeTestBaseContext, hlc.NewClock(hlc.UnixNano), stopper)
	g := gossip.New(nodeRPCContext, gossip.TestInterval, gossip.TestBootstrap)

	// Create several servers, each of which has two stores (A multiraft node ID addresses
	// a store).
	const numServers = 3
	const storesPerServer = 2
	const numStores = numServers * storesPerServer
	// servers has length numServers.
	servers := []*rpc.Server{}
	// All the rest have length numStores (note that several stores share a transport).
	nextNodeID := roachpb.NodeID(1)
	nodeIDs := []roachpb.NodeID{}
	transports := []multiraft.Transport{}
	channels := []channelServer{}
	for serverIndex := 0; serverIndex < numServers; serverIndex++ {
		server := rpc.NewServer(util.CreateTestAddr("tcp"), nodeRPCContext)
		if err := server.Start(); err != nil {
			t.Fatal(err)
		}
		defer server.Close()
		transport, err := newRPCTransport(g, server, nodeRPCContext)
		if err != nil {
			t.Fatalf("Unexpected error creating transport, Error: %s", err)
		}
		defer transport.Close()

		for store := 0; store < storesPerServer; store++ {
			nodeID := nextNodeID
			nextNodeID++

			channel := newChannelServer(10, 0)
			if err := transport.Listen(roachpb.StoreID(nodeID), channel); err != nil {
				t.Fatal(err)
			}

			// Advertise the server's address for this node ID via gossip so
			// the transports can route to it.
			addr := server.Addr()
			if err := g.AddInfoProto(gossip.MakeNodeIDKey(nodeID),
				&roachpb.NodeDescriptor{
					Address: util.MakeUnresolvedAddr(addr.Network(), addr.String()),
				},
				time.Hour); err != nil {
				t.Fatal(err)
			}

			nodeIDs = append(nodeIDs, nodeID)
			transports = append(transports, transport)
			channels = append(channels, channel)
		}

		servers = append(servers, server)
	}

	// Each store sends one message to each store.
	for from := 0; from < numStores; from++ {
		for to := 0; to < numStores; to++ {
			req := &multiraft.RaftMessageRequest{
				GroupID: 1,
				Message: raftpb.Message{
					From: uint64(nodeIDs[from]),
					To:   uint64(nodeIDs[to]),
					Type: raftpb.MsgHeartbeat,
				},
				FromReplica: roachpb.ReplicaDescriptor{
					NodeID:    nodeIDs[from],
					StoreID:   roachpb.StoreID(nodeIDs[from]),
					ReplicaID: roachpb.ReplicaID(nodeIDs[from]),
				},
				ToReplica: roachpb.ReplicaDescriptor{
					NodeID:    nodeIDs[to],
					StoreID:   roachpb.StoreID(nodeIDs[to]),
					ReplicaID: roachpb.ReplicaID(nodeIDs[to]),
				},
			}

			if err := transports[from].Send(req); err != nil {
				t.Errorf("Unable to send message from %d to %d: %s", nodeIDs[from], nodeIDs[to], err)
			}
		}
	}

	// Read all the messages from the channels. Note that the transport
	// does not guarantee in-order delivery between independent
	// transports, so we just verify that the right number of messages
	// end up in each channel.
	for to := 0; to < numStores; to++ {
		for from := 0; from < numStores; from++ {
			select {
			case req := <-channels[to].ch:
				if req.Message.To != uint64(nodeIDs[to]) {
					t.Errorf("invalid message received on channel %d (expected from %d): %+v",
						nodeIDs[to], nodeIDs[from], req)
				}
			case <-time.After(5 * time.Second):
				t.Fatal("timed out waiting for message")
			}
		}

		// Verify there are no stray extra messages on this channel.
		select {
		case req := <-channels[to].ch:
			t.Errorf("got unexpected message %+v on channel %d", req, nodeIDs[to])
		default:
		}
	}
}
const (
	// noGroup is the group ID used for messages that address a node rather
	// than a group (coalesced heartbeats).
	noGroup = roachpb.RangeID(0)

	// reqBufferSize bounds the request buffer channel.
	reqBufferSize = 100

	// TODO(bdarnell): Determine the right size for this cache. Should
	// the cache be partitioned so that replica descriptors from the
	// range descriptors (which are the bulk of the data and can be
	// reloaded from disk as needed) don't crowd out the
	// message/snapshot descriptors (whose necessity is short-lived but
	// cannot be recovered through other means if evicted)?
	maxReplicaDescCacheSize = 1000

	// InvalidReplicaID is passed to the GroupStorage method when a
	// replica ID cannot be determined.
	InvalidReplicaID = roachpb.ReplicaID(-1)
)

// An ErrGroupDeleted is returned for commands which are pending while their
// group is deleted.
var ErrGroupDeleted = errors.New("raft group deleted")

// ErrStopped is returned for commands that could not be completed before the
// node was stopped.
var ErrStopped = errors.New("raft processing stopped")

// Config contains the parameters necessary to construct a MultiRaft object.
type Config struct {
	Storage   Storage
	Transport Transport
	// Ticker may be nil to use real time and TickInterval.