// TestHeartbeatResponseFanout verifies that when two raft groups on the same
// set of nodes have different terms, a coalesced heartbeat response from one
// group does not disturb the other group's term or leadership.
func TestHeartbeatResponseFanout(t *testing.T) {
	defer leaktest.AfterTest(t)
	stopper := stop.NewStopper()
	defer stopper.Stop()
	cluster := newTestCluster(nil, 3, stopper, t)
	groupID1 := proto.RaftID(1)
	cluster.createGroup(groupID1, 0, 3 /* replicas */)

	groupID2 := proto.RaftID(2)
	cluster.createGroup(groupID2, 0, 3 /* replicas */)

	leaderIndex := 0
	cluster.elect(leaderIndex, groupID1)

	// groupID2 goes through three rounds of elections, so it ends up with a
	// different term than groupID1, but both leaders live on the same node.
	for i := 2; i >= 0; i-- {
		leaderIndex = i
		cluster.elect(leaderIndex, groupID2)
	}

	// Send a coalesced heartbeat. The heartbeat response from groupID2 will
	// carry a higher term than the one from groupID1.
	cluster.nodes[0].coalescedHeartbeat()

	// Submit a command to verify that groupID1's leadership is unchanged.
	cluster.nodes[0].SubmitCommand(groupID1, makeCommandID(), []byte("command"))

	select {
	case <-cluster.events[0].CommandCommitted:
		log.Infof("SubmitCommand succeeded after heartbeat response fanout")
	case <-time.After(500 * time.Millisecond):
		t.Fatalf("no leader after heartbeat response fanout")
	}
}
func TestSlowStorage(t *testing.T) {
	defer leaktest.AfterTest(t)
	stopper := stop.NewStopper()
	cluster := newTestCluster(nil, 3, stopper, t)
	defer stopper.Stop()
	groupID := proto.RaftID(1)
	cluster.createGroup(groupID, 0, 3)

	cluster.triggerElection(0, groupID)
	cluster.waitForElection(0)

	// Block the storage on the last node.
	// TODO(bdarnell): there appear to still be issues if the storage is
	// blocked during the election.
	cluster.storages[2].Block()

	// Submit a command to the leader.
	cluster.nodes[0].SubmitCommand(groupID, makeCommandID(), []byte("command"))

	// Even with the third node blocked, the other nodes can make progress.
	for i := 0; i < 2; i++ {
		events := cluster.events[i]
		log.Infof("waiting for event to be committed on node %v", i)
		commit := <-events.CommandCommitted
		if string(commit.Command) != "command" {
			t.Errorf("unexpected value in committed command: %v", commit.Command)
		}
	}

	// Ensure that node 2 is in fact blocked.
	time.Sleep(time.Millisecond)
	select {
	case commit := <-cluster.events[2].CommandCommitted:
		t.Errorf("didn't expect commits on node 2 but got %v", commit)
	default:
	}

	// After unblocking the third node, it will catch up.
	cluster.storages[2].Unblock()
	log.Infof("waiting for event to be committed on node 2")
	// When we unblock, the backlog is not guaranteed to be processed in order,
	// and in some cases the leader may need to retransmit some messages.
	for i := 0; i < 3; i++ {
		select {
		case commit := <-cluster.events[2].CommandCommitted:
			if string(commit.Command) != "command" {
				t.Errorf("unexpected value in committed command: %v", commit.Command)
			}
			return
		case <-time.After(5 * time.Millisecond):
			// Tick both nodes' clocks. The ticks on the follower node don't
			// really do anything, but they do ensure that the goroutine is
			// getting scheduled (and the real-time delay allows rpc responses
			// to pass between the nodes).
			cluster.tickers[0].Tick()
			cluster.tickers[2].Tick()
		}
	}
}
func TestLeaderCache(t *testing.T) {
	defer leaktest.AfterTest(t)
	lc := newLeaderCache(3)
	if r := lc.Lookup(12); r.StoreID != 0 {
		t.Fatalf("lookup of missing key returned replica: %v", r)
	}
	replica := proto.Replica{StoreID: 1}
	lc.Update(5, replica)
	if r := lc.Lookup(5); r.StoreID != 1 {
		t.Errorf("expected %v, got %v", replica, r)
	}
	newReplica := proto.Replica{StoreID: 7}
	lc.Update(5, newReplica)
	r := lc.Lookup(5)
	if r.StoreID != 7 {
		t.Errorf("expected %v, got %v", newReplica, r)
	}
	lc.Update(5, proto.Replica{})
	r = lc.Lookup(5)
	if r.StoreID != 0 {
		t.Fatalf("evicted leader returned: %v", r)
	}

	for i := 10; i < 20; i++ {
		lc.Update(proto.RaftID(i), replica)
	}
	if lc.Lookup(16).StoreID != 0 || lc.Lookup(17).StoreID == 0 {
		t.Errorf("unexpected policy used in cache")
	}
}
// sendAttempt is invoked by Send. It temporarily truncates the arguments to
// match the descriptor's EndKey (if necessary) and gathers and rearranges the
// replicas before making a single attempt at sending the request. It returns
// the result of sending the RPC; a potential error contained in the reply has
// to be handled separately by the caller.
func (ds *DistSender) sendAttempt(trace *tracer.Trace, args proto.Request, reply proto.Response, desc *proto.RangeDescriptor) error {
	defer trace.Epoch("sending RPC")()

	// Truncate the request to our current range, making sure not to
	// touch it unless we have to (it is illegal to send EndKey on
	// commands which do not operate on ranges).
	if endKey := args.Header().EndKey; endKey != nil && !endKey.Less(desc.EndKey) {
		defer func(k proto.Key) { args.Header().EndKey = k }(endKey)
		args.Header().EndKey = desc.EndKey
	}

	leader := ds.leaderCache.Lookup(proto.RaftID(desc.RaftID))

	// Try to send the call.
	replicas := newReplicaSlice(ds.gossip, desc)

	// Rearrange the replicas so that those replicas with long common
	// prefix of attributes end up first. If there's no prefix, this is a
	// no-op.
	order := ds.optimizeReplicaOrder(replicas)

	// If this request needs to go to a leader and we know who that is, move
	// it to the front.
	if !(proto.IsRead(args) && args.Header().ReadConsistency == proto.INCONSISTENT) &&
		leader.StoreID > 0 {
		if i := replicas.FindReplica(leader.StoreID); i >= 0 {
			replicas.MoveToFront(i)
			order = rpc.OrderStable
		}
	}
	return ds.sendRPC(trace, desc.RaftID, replicas, order, args, reply)
}
// handleWriteReady converts a set of raft.Ready structs into a writeRequest
// to be persisted, marks the group as writing and sends it to the writeTask.
func (s *state) handleWriteReady(readyGroups map[uint64]raft.Ready) {
	if log.V(6) {
		log.Infof("node %v write ready, preparing request", s.nodeID)
	}
	writeRequest := newWriteRequest()
	for groupID, ready := range readyGroups {
		raftGroupID := proto.RaftID(groupID)
		g, ok := s.groups[raftGroupID]
		if !ok {
			if log.V(6) {
				log.Infof("dropping write request to group %d", groupID)
			}
			continue
		}
		g.writing = true

		gwr := &groupWriteRequest{}
		if !raft.IsEmptyHardState(ready.HardState) {
			gwr.state = ready.HardState
		}
		if !raft.IsEmptySnap(ready.Snapshot) {
			gwr.snapshot = ready.Snapshot
		}
		if len(ready.Entries) > 0 {
			gwr.entries = ready.Entries
		}
		writeRequest.groups[raftGroupID] = gwr
	}
	s.writeTask.in <- writeRequest
}
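// For orientation, hedged sketches of the write-pipeline types used above,
// with field sets inferred from their usage in handleWriteReady. The real
// definitions live with the writeTask implementation; the "Sketch" names are
// hypothetical and only illustrate the shape this function assumes.
type groupWriteRequestSketch struct {
	state    raftpb.HardState // non-empty HardState to persist, if any
	entries  []raftpb.Entry   // new log entries to append
	snapshot raftpb.Snapshot  // non-empty snapshot to apply, if any
}

type writeRequestSketch struct {
	// One pending write per raft group, keyed by group ID.
	groups map[proto.RaftID]*groupWriteRequestSketch
}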
// newTestRangeSet creates a new range set that has the count number of ranges.
func newTestRangeSet(count int, t *testing.T) *testRangeSet {
	rs := &testRangeSet{rangesByKey: btree.New(64 /* degree */)}
	for i := 0; i < count; i++ {
		desc := &proto.RangeDescriptor{
			RaftID:   proto.RaftID(i),
			StartKey: proto.Key(fmt.Sprintf("%03d", i)),
			EndKey:   proto.Key(fmt.Sprintf("%03d", i+1)),
		}
		// Initialize the range stats so the scanner can use them.
		rng := &Range{
			stats: &rangeStats{
				raftID: desc.RaftID,
				MVCCStats: engine.MVCCStats{
					KeyBytes:  1,
					ValBytes:  2,
					KeyCount:  1,
					LiveCount: 1,
				},
			},
		}
		if err := rng.setDesc(desc); err != nil {
			t.Fatal(err)
		}
		if exRngItem := rs.rangesByKey.ReplaceOrInsert(rng); exRngItem != nil {
			t.Fatalf("failed to insert range %s", rng)
		}
	}
	return rs
}
// TestRaftAfterRemoveRange verifies that the MultiRaft state removes
// a remote node correctly after the Replica was removed from the Store.
func TestRaftAfterRemoveRange(t *testing.T) {
	defer leaktest.AfterTest(t)
	mtc := startMultiTestContext(t, 3)
	defer mtc.Stop()

	// Make the split.
	splitArgs := adminSplitArgs(proto.KeyMin, []byte("b"), proto.RaftID(1), mtc.stores[0].StoreID())
	if _, err := mtc.stores[0].ExecuteCmd(context.Background(), &splitArgs); err != nil {
		t.Fatal(err)
	}

	raftID := proto.RaftID(2)
	mtc.replicateRange(raftID, 0, 1, 2)

	mtc.unreplicateRange(raftID, 0, 2)
	mtc.unreplicateRange(raftID, 0, 1)

	rng, err := mtc.stores[1].GetRange(raftID)
	if err != nil {
		t.Fatal(err)
	}

	// If the range removal happens before the range applies the replica
	// config change, the group will be re-created when MultiRaft receives
	// a MsgApp.
	if err := util.IsTrueWithin(func() bool {
		return len(rng.Desc().Replicas) == 1
	}, 1*time.Second); err != nil {
		t.Fatal(err)
	}

	// Remove the range from the second Store.
	if err := mtc.stores[1].RemoveRange(rng); err != nil {
		t.Fatal(err)
	}

	if err := mtc.transport.Send(&multiraft.RaftMessageRequest{
		GroupID: proto.RaftID(0),
		Message: raftpb.Message{
			From: uint64(mtc.stores[2].RaftNodeID()),
			To:   uint64(mtc.stores[1].RaftNodeID()),
			Type: raftpb.MsgHeartbeat,
		}}); err != nil {
		t.Fatal(err)
	}

	// Execute another replica change to ensure that MultiRaft has processed
	// the heartbeat just sent.
	mtc.replicateRange(proto.RaftID(1), 0, 1)
}
func TestLocalSenderLookupReplica(t *testing.T) {
	defer leaktest.AfterTest(t)
	stopper := stop.NewStopper()
	defer stopper.Stop()
	ctx := storage.TestStoreContext
	manualClock := hlc.NewManualClock(0)
	ctx.Clock = hlc.NewClock(manualClock.UnixNano)
	ls := NewLocalSender()

	// Create two new stores with ranges we care about.
	var e [2]engine.Engine
	var s [2]*storage.Store
	ranges := []struct {
		storeID    proto.StoreID
		start, end proto.Key
	}{
		{2, proto.Key("a"), proto.Key("c")},
		{3, proto.Key("x"), proto.Key("z")},
	}
	for i, rng := range ranges {
		e[i] = engine.NewInMem(proto.Attributes{}, 1<<20)
		ctx.Transport = multiraft.NewLocalRPCTransport(stopper)
		defer ctx.Transport.Close()
		s[i] = storage.NewStore(ctx, e[i], &proto.NodeDescriptor{NodeID: 1})
		s[i].Ident.StoreID = rng.storeID

		desc := &proto.RangeDescriptor{
			RaftID:   proto.RaftID(i),
			StartKey: rng.start,
			EndKey:   rng.end,
			Replicas: []proto.Replica{{StoreID: rng.storeID}},
		}
		newRng, err := storage.NewRange(desc, s[i])
		if err != nil {
			t.Fatal(err)
		}
		if err := s[i].AddRangeTest(newRng); err != nil {
			t.Error(err)
		}
		ls.AddStore(s[i])
	}

	if _, r, err := ls.lookupReplica(proto.Key("a"), proto.Key("c")); r.StoreID != s[0].Ident.StoreID || err != nil {
		t.Errorf("expected store %d; got %d: %v", s[0].Ident.StoreID, r.StoreID, err)
	}
	if _, r, err := ls.lookupReplica(proto.Key("b"), nil); r.StoreID != s[0].Ident.StoreID || err != nil {
		t.Errorf("expected store %d; got %d: %v", s[0].Ident.StoreID, r.StoreID, err)
	}
	if _, r, err := ls.lookupReplica(proto.Key("b"), proto.Key("d")); r != nil || err == nil {
		t.Errorf("expected store 0 and error got %d", r.StoreID)
	}
	if _, r, err := ls.lookupReplica(proto.Key("x"), proto.Key("z")); r.StoreID != s[1].Ident.StoreID {
		t.Errorf("expected store %d; got %d: %v", s[1].Ident.StoreID, r.StoreID, err)
	}
	if _, r, err := ls.lookupReplica(proto.Key("y"), nil); r.StoreID != s[1].Ident.StoreID || err != nil {
		t.Errorf("expected store %d; got %d: %v", s[1].Ident.StoreID, r.StoreID, err)
	}
}
// DecodeRaftStateKey extracts the Raft ID from a RaftStateKey.
func DecodeRaftStateKey(key proto.Key) proto.RaftID {
	if !bytes.HasPrefix(key, LocalRangeIDPrefix) {
		panic(fmt.Sprintf("key %q does not have %q prefix", key, LocalRangeIDPrefix))
	}
	// Cut the prefix and decode the Raft ID.
	b := key[len(LocalRangeIDPrefix):]
	_, raftID := encoding.DecodeUvarint(b)
	return proto.RaftID(raftID)
}
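// For illustration, a hedged sketch of the inverse operation. The name
// exampleRaftStateKey is hypothetical (the real key builders live elsewhere
// in the keys package), and it assumes util/encoding's EncodeUvarint appends
// a uvarint to the supplied buffer; it only shows the layout that
// DecodeRaftStateKey expects: LocalRangeIDPrefix followed by a
// uvarint-encoded Raft ID.
func exampleRaftStateKey(raftID proto.RaftID) proto.Key {
	// Copy the prefix so we don't mutate LocalRangeIDPrefix's backing array.
	key := append([]byte(nil), LocalRangeIDPrefix...)
	return proto.Key(encoding.EncodeUvarint(key, uint64(raftID)))
}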
func TestMembershipChange(t *testing.T) {
	defer leaktest.AfterTest(t)
	stopper := stop.NewStopper()
	cluster := newTestCluster(nil, 4, stopper, t)
	defer stopper.Stop()

	// Create a group with a single member, cluster.nodes[0].
	groupID := proto.RaftID(1)
	cluster.createGroup(groupID, 0, 1)
	// An automatic election is triggered since this is a single-node Raft group,
	// so we don't need to call triggerElection.

	// Consume and apply the membership change events.
	for i := 0; i < 4; i++ {
		go func(i int) {
			for {
				e, ok := <-cluster.events[i].MembershipChangeCommitted
				if !ok {
					return
				}
				e.Callback(nil)
			}
		}(i)
	}

	// Add each of the other three nodes to the cluster.
	for i := 1; i < 4; i++ {
		ch := cluster.nodes[0].ChangeGroupMembership(groupID, makeCommandID(),
			raftpb.ConfChangeAddNode, cluster.nodes[i].nodeID, nil)
		<-ch
	}

	// TODO(bdarnell): verify that the channel events are sent out correctly.
	/*
		for i := 0; i < 10; i++ {
			log.Infof("tick %d", i)
			cluster.tickers[0].Tick()
			time.Sleep(5 * time.Millisecond)
		}

		// Each node is notified of each other node's joining.
		for i := 0; i < 4; i++ {
			for j := 1; j < 4; j++ {
				select {
				case e := <-cluster.events[i].MembershipChangeCommitted:
					if e.NodeID != cluster.nodes[j].nodeID {
						t.Errorf("node %d expected event for %d, got %d", i, j, e.NodeID)
					}
				default:
					t.Errorf("node %d did not get expected event for %d", i, j)
				}
			}
		}
	*/
}
func TestLeaderElectionEvent(t *testing.T) {
	defer leaktest.AfterTest(t)
	// Leader election events are fired when the leader commits an entry, not
	// when it issues a call for votes.
	stopper := stop.NewStopper()
	cluster := newTestCluster(nil, 3, stopper, t)
	defer stopper.Stop()
	groupID := proto.RaftID(1)
	cluster.createGroup(groupID, 0, 3)

	// Process a Ready with a new leader but no new commits.
	// This happens while an election is in progress.
	cluster.nodes[1].maybeSendLeaderEvent(groupID, cluster.nodes[1].groups[groupID],
		&raft.Ready{
			SoftState: &raft.SoftState{
				Lead: 3,
			},
		})

	// No events are sent.
	select {
	case e := <-cluster.events[1].LeaderElection:
		t.Fatalf("got unexpected event %v", e)
	case <-time.After(time.Millisecond):
	}

	// Now there are new committed entries. A new leader always commits an entry
	// to conclude the election.
	entry := raftpb.Entry{
		Index: 42,
		Term:  42,
	}
	cluster.nodes[1].maybeSendLeaderEvent(groupID, cluster.nodes[1].groups[groupID],
		&raft.Ready{
			Entries:          []raftpb.Entry{entry},
			CommittedEntries: []raftpb.Entry{entry},
		})

	// Now we get an event.
	select {
	case e := <-cluster.events[1].LeaderElection:
		if !reflect.DeepEqual(e, &EventLeaderElection{
			GroupID: groupID,
			NodeID:  3,
			Term:    42,
		}) {
			t.Errorf("election event did not match expectations: %+v", e)
		}
	case <-time.After(time.Millisecond):
		t.Fatal("didn't get expected event")
	}
}
// TestReplicateAfterSplit verifies that a new replica whose start key
// is not KeyMin replicating to a fresh store can apply snapshots correctly.
func TestReplicateAfterSplit(t *testing.T) {
	defer leaktest.AfterTest(t)
	mtc := startMultiTestContext(t, 2)
	defer mtc.Stop()

	raftID := proto.RaftID(1)
	splitKey := proto.Key("m")
	key := proto.Key("z")

	store0 := mtc.stores[0]
	// Make the split.
	splitArgs := adminSplitArgs(proto.KeyMin, splitKey, raftID, store0.StoreID())
	if _, err := store0.ExecuteCmd(context.Background(), &splitArgs); err != nil {
		t.Fatal(err)
	}

	raftID2 := store0.LookupRange(key, nil).Desc().RaftID
	if raftID2 == raftID {
		t.Errorf("got same raft id after split")
	}

	// Issue an increment for later check.
	incArgs := incrementArgs(key, 11, raftID2, store0.StoreID())
	if _, err := store0.ExecuteCmd(context.Background(), &incArgs); err != nil {
		t.Fatal(err)
	}

	// Now add the second replica.
	mtc.replicateRange(raftID2, 0, 1)

	if mtc.stores[1].LookupRange(key, nil).GetMaxBytes() == 0 {
		t.Error("Range MaxBytes is not set after snapshot applied")
	}

	// Once it catches up, the effects of increment commands can be seen.
	if err := util.IsTrueWithin(func() bool {
		getArgs := getArgs(key, raftID2, mtc.stores[1].StoreID())
		// Reads on a non-leader replica must use INCONSISTENT consistency.
		getArgs.ReadConsistency = proto.INCONSISTENT
		reply, err := mtc.stores[1].ExecuteCmd(context.Background(), &getArgs)
		if err != nil {
			return false
		}
		getResp := reply.(*proto.GetResponse)
		if log.V(1) {
			log.Infof("read value %d", mustGetInteger(getResp.Value))
		}
		return mustGetInteger(getResp.Value) == 11
	}, 1*time.Second); err != nil {
		t.Fatal(err)
	}
}
func TestInitialLeaderElection(t *testing.T) {
	defer leaktest.AfterTest(t)
	// Run the test three times, each time triggering a different node's
	// election clock. The node that requests an election first should win.
	for leaderIndex := 0; leaderIndex < 3; leaderIndex++ {
		log.Infof("testing leader election for node %v", leaderIndex)
		stopper := stop.NewStopper()
		cluster := newTestCluster(nil, 3, stopper, t)
		groupID := proto.RaftID(1)
		cluster.createGroup(groupID, 0, 3)

		cluster.elect(leaderIndex, groupID)
		stopper.Stop()
	}
}
// TestProgressWithDownNode verifies that a surviving quorum can make progress
// with a downed node.
func TestProgressWithDownNode(t *testing.T) {
	defer leaktest.AfterTest(t)
	mtc := startMultiTestContext(t, 3)
	defer mtc.Stop()

	raftID := proto.RaftID(1)
	mtc.replicateRange(raftID, 0, 1, 2)

	incArgs, incResp := incrementArgs([]byte("a"), 5, raftID, mtc.stores[0].StoreID())
	if err := mtc.stores[0].ExecuteCmd(context.Background(), proto.Call{Args: incArgs, Reply: incResp}); err != nil {
		t.Fatal(err)
	}

	// Verify that the first increment propagates to all the engines.
	verify := func(expected []int64) {
		util.SucceedsWithin(t, time.Second, func() error {
			values := []int64{}
			for _, eng := range mtc.engines {
				val, _, err := engine.MVCCGet(eng, proto.Key("a"), mtc.clock.Now(), true, nil)
				if err != nil {
					return err
				}
				values = append(values, mustGetInteger(val))
			}
			if !reflect.DeepEqual(expected, values) {
				return util.Errorf("expected %v, got %v", expected, values)
			}
			return nil
		})
	}
	verify([]int64{5, 5, 5})

	// Stop one of the replicas and issue a new increment.
	mtc.stopStore(1)
	incArgs, incResp = incrementArgs([]byte("a"), 11, raftID, mtc.stores[0].StoreID())
	if err := mtc.stores[0].ExecuteCmd(context.Background(), proto.Call{Args: incArgs, Reply: incResp}); err != nil {
		t.Fatal(err)
	}

	// The new increment can be seen on both live replicas.
	verify([]int64{16, 5, 16})

	// Once the downed node is restarted, it will catch up.
	mtc.restartStore(1)
	verify([]int64{16, 16, 16})
}
// TestStoreRaftIDAllocation verifies that raft IDs are
// allocated in successive blocks.
func TestStoreRaftIDAllocation(t *testing.T) {
	defer leaktest.AfterTest(t)
	store, _, stopper := createTestStore(t)
	defer stopper.Stop()

	// Raft IDs should be allocated from ID 2 (first alloc'd range)
	// to raftIDAllocCount * 3 + 1.
	for i := 0; i < raftIDAllocCount*3; i++ {
		replicas := []proto.Replica{{StoreID: store.StoreID()}}
		desc, err := store.NewRangeDescriptor(proto.Key(fmt.Sprintf("%03d", i)), proto.Key(fmt.Sprintf("%03d", i+1)), replicas)
		if err != nil {
			t.Fatal(err)
		}
		if desc.RaftID != proto.RaftID(2+i) {
			t.Errorf("expected Raft id %d; got %d", 2+i, desc.RaftID)
		}
	}
}
// TestRangeGCQueueDropReplica verifies that the range GC queue
// removes a range from a store that no longer should have a replica.
func TestRangeGCQueueDropReplica(t *testing.T) {
	defer leaktest.AfterTest(t)
	mtc := startMultiTestContext(t, 3)
	defer mtc.Stop()

	raftID := proto.RaftID(1)
	mtc.replicateRange(raftID, 0, 1, 2)
	mtc.unreplicateRange(raftID, 0, 1)

	// Increment the clock's timestamp to expire the leader lease.
	mtc.manualClock.Increment(int64(storage.DefaultLeaderLeaseDuration) + 1)

	// Make sure the range is not yet removed from the store.
	numTrials := 3
	for i := 0; i < numTrials; i++ {
		store := mtc.stores[1]
		store.ForceRangeGCScan(t)
		if _, err := store.GetRange(raftID); err != nil {
			t.Error("unexpected range removal")
		}
		time.Sleep(10 * time.Millisecond)
	}

	// Increment the clock's timestamp to make the range GC queue process the range.
	mtc.manualClock.Increment(int64(storage.RangeGCQueueUnleasedDuration))

	// Make sure the range is removed from the store.
	util.SucceedsWithin(t, time.Second, func() error {
		store := mtc.stores[1]
		store.ForceRangeGCScan(t)
		if _, err := store.GetRange(raftID); err == nil {
			return util.Error("expected range removal")
		}
		return nil
	})

	// Restart the store to tear down the test cleanly.
	mtc.stopStore(1)
	mtc.restartStore(1)
}
func TestCommand(t *testing.T) {
	defer leaktest.AfterTest(t)
	stopper := stop.NewStopper()
	cluster := newTestCluster(nil, 3, stopper, t)
	defer stopper.Stop()
	groupID := proto.RaftID(1)
	cluster.createGroup(groupID, 0, 3)
	cluster.triggerElection(0, groupID)

	// Submit a command to the leader.
	cluster.nodes[0].SubmitCommand(groupID, makeCommandID(), []byte("command"))

	// The command will be committed on each node.
	for i, events := range cluster.events {
		log.Infof("waiting for event to be committed on node %v", i)
		commit := <-events.CommandCommitted
		if string(commit.Command) != "command" {
			t.Errorf("unexpected value in committed command: %v", commit.Command)
		}
	}
}
// handleWriteResponse updates the state machine and sends messages for a raft Ready batch.
func (s *state) handleWriteResponse(response *writeResponse, readyGroups map[uint64]raft.Ready) {
	if log.V(6) {
		log.Infof("node %v got write response: %#v", s.nodeID, *response)
	}
	// Everything has been written to disk; now we can apply updates to the
	// state machine and send outgoing messages.
	for groupID, ready := range readyGroups {
		raftGroupID := proto.RaftID(groupID)
		g, ok := s.groups[raftGroupID]
		if !ok {
			if log.V(4) {
				log.Infof("dropping stale write to group %v", groupID)
			}
			continue
		} else if !g.writing {
			if log.V(4) {
				log.Infof("dropping stale write to reincarnation of group %v", groupID)
			}
			delete(readyGroups, groupID) // they must not make it to Advance.
			continue
		}
		g.writing = false

		// Process committed entries.
		for _, entry := range ready.CommittedEntries {
			commandID := s.processCommittedEntry(raftGroupID, g, entry)
			// TODO(bdarnell): the command is now committed, but not applied until the
			// application consumes EventCommandCommitted. Is returning via the channel
			// at this point useful or do we need to wait for the command to be
			// applied too? This could be done with a Callback as in
			// EventMembershipChangeCommitted or perhaps we should move away from a
			// channel to a callback-based system.
			s.removePending(g, g.pending[commandID], nil /* err */)
		}

		if !raft.IsEmptySnap(ready.Snapshot) {
			// Sync the group/node mapping with the information contained in the snapshot.
			for _, nodeID := range ready.Snapshot.Metadata.ConfState.Nodes {
				// TODO(bdarnell): if we had any information that predated this snapshot
				// we must remove those nodes.
				if err := s.addNode(proto.RaftNodeID(nodeID), raftGroupID); err != nil {
					log.Errorf("node %v: error adding node %v", s.nodeID, nodeID)
				}
			}
		}

		// Process SoftState and leader changes.
		s.maybeSendLeaderEvent(raftGroupID, g, &ready)

		// Send all messages. Individual heartbeats and heartbeat responses are
		// dropped here; they are delivered via the coalesced-heartbeat path
		// instead.
		for _, msg := range ready.Messages {
			switch msg.Type {
			case raftpb.MsgHeartbeat:
				if log.V(8) {
					log.Infof("node %v dropped individual heartbeat to node %v",
						s.nodeID, msg.To)
				}
			case raftpb.MsgHeartbeatResp:
				if log.V(8) {
					log.Infof("node %v dropped individual heartbeat response to node %v",
						s.nodeID, msg.To)
				}
			default:
				s.sendMessage(raftGroupID, msg)
			}
		}
	}
}
// processCommittedEntry tells the application that a command was committed.
// Returns the commandID, or an empty string if the given entry was not a command.
func (s *state) processCommittedEntry(groupID proto.RaftID, g *group, entry raftpb.Entry) string {
	var commandID string
	switch entry.Type {
	case raftpb.EntryNormal:
		// etcd raft occasionally adds a nil entry (e.g. upon election); ignore these.
		if entry.Data != nil {
			var command []byte
			commandID, command = decodeCommand(entry.Data)
			s.sendEvent(&EventCommandCommitted{
				GroupID:   groupID,
				CommandID: commandID,
				Command:   command,
				Index:     entry.Index,
			})
		}

	case raftpb.EntryConfChange:
		cc := raftpb.ConfChange{}
		if err := cc.Unmarshal(entry.Data); err != nil {
			log.Fatalf("invalid ConfChange data: %s", err)
		}
		var payload []byte
		if len(cc.Context) > 0 {
			commandID, payload = decodeCommand(cc.Context)
		}
		s.sendEvent(&EventMembershipChangeCommitted{
			GroupID:    groupID,
			CommandID:  commandID,
			Index:      entry.Index,
			NodeID:     proto.RaftNodeID(cc.NodeID),
			ChangeType: cc.Type,
			Payload:    payload,
			Callback: func(err error) {
				select {
				case s.callbackChan <- func() {
					if err == nil {
						if log.V(3) {
							log.Infof("node %v applying configuration change %v", s.nodeID, cc)
						}
						// TODO(bdarnell): dedupe by keeping a record of recently-applied commandIDs
						switch cc.Type {
						case raftpb.ConfChangeAddNode:
							err = s.addNode(proto.RaftNodeID(cc.NodeID), groupID)
						case raftpb.ConfChangeRemoveNode:
							// TODO(bdarnell): support removing nodes; fix double-application of initial entries
						case raftpb.ConfChangeUpdateNode:
							// Updates don't concern multiraft, they are simply passed through.
						}
						if err != nil {
							log.Errorf("error applying configuration change %v: %s", cc, err)
						}
						s.multiNode.ApplyConfChange(uint64(groupID), cc)
					} else {
						log.Warningf("aborting configuration change: %s", err)
						s.multiNode.ApplyConfChange(uint64(groupID), raftpb.ConfChange{})
					}

					// Re-submit all pending proposals, in case any of them were config changes
					// that were dropped due to the one-at-a-time rule. This is a little
					// redundant since most pending proposals won't benefit from this but
					// config changes should be rare enough (and the size of the pending queue
					// small enough) that it doesn't really matter.
					for _, prop := range g.pending {
						s.propose(prop)
					}
				}:
				case <-s.stopper.ShouldStop():
				}
			},
		})
	}
	return commandID
}
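// decodeCommand (used above) is assumed to split a fixed-width command ID off
// the front of an encoded entry, matching the fixed-width cmdIDFormat that
// the tests build from commandIDLen. A minimal sketch under that assumption;
// decodeCommandSketch is a hypothetical name, not the real implementation,
// and it assumes len(data) >= commandIDLen.
func decodeCommandSketch(data []byte) (commandID string, command []byte) {
	return string(data[:commandIDLen]), data[commandIDLen:]
}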
func TestReplicateAddAndRemove(t *testing.T) {
	defer leaktest.AfterTest(t)

	// Run the test twice, once adding the replacement before removing
	// the downed node, and once removing the downed node first.
	for _, addFirst := range []bool{true, false} {
		mtc := startMultiTestContext(t, 4)
		defer mtc.Stop()

		// Replicate the initial range to three of the four nodes.
		raftID := proto.RaftID(1)
		mtc.replicateRange(raftID, 0, 3, 1)

		incArgs, incResp := incrementArgs([]byte("a"), 5, raftID, mtc.stores[0].StoreID())
		if err := mtc.stores[0].ExecuteCmd(context.Background(), proto.Call{Args: incArgs, Reply: incResp}); err != nil {
			t.Fatal(err)
		}

		verify := func(expected []int64) {
			util.SucceedsWithin(t, time.Second, func() error {
				values := []int64{}
				for _, eng := range mtc.engines {
					val, _, err := engine.MVCCGet(eng, proto.Key("a"), mtc.clock.Now(), true, nil)
					if err != nil {
						return err
					}
					values = append(values, mustGetInteger(val))
				}
				if !reflect.DeepEqual(expected, values) {
					return util.Errorf("expected %v, got %v", expected, values)
				}
				return nil
			})
		}

		// The first increment is visible on all three replicas.
		verify([]int64{5, 5, 0, 5})

		// Stop a store and replace it.
		mtc.stopStore(1)
		if addFirst {
			mtc.replicateRange(raftID, 0, 2)
			mtc.unreplicateRange(raftID, 0, 1)
		} else {
			mtc.unreplicateRange(raftID, 0, 1)
			mtc.replicateRange(raftID, 0, 2)
		}
		verify([]int64{5, 5, 5, 5})

		// Ensure that the rest of the group can make progress.
		incArgs, incResp = incrementArgs([]byte("a"), 11, raftID, mtc.stores[0].StoreID())
		if err := mtc.stores[0].ExecuteCmd(context.Background(), proto.Call{Args: incArgs, Reply: incResp}); err != nil {
			t.Fatal(err)
		}
		verify([]int64{16, 5, 16, 16})

		// Bring the downed store back up (required for a clean shutdown).
		mtc.restartStore(1)

		// Node 1 never sees the increment that was added while it was
		// down. Perform another increment on the live nodes to verify.
		incArgs, incResp = incrementArgs([]byte("a"), 23, raftID, mtc.stores[0].StoreID())
		if err := mtc.stores[0].ExecuteCmd(context.Background(), proto.Call{Args: incArgs, Reply: incResp}); err != nil {
			t.Fatal(err)
		}
		verify([]int64{39, 5, 39, 39})

		// TODO(bdarnell): when we have GC of removed ranges, verify that
		// the downed node removes the data from this range after coming
		// back up.

		// Wait out the leader lease and the unleased duration to make the range GC'able.
		mtc.manualClock.Increment(int64(storage.DefaultLeaderLeaseDuration) +
			int64(storage.RangeGCQueueUnleasedDuration) + 1)
		mtc.stores[1].ForceRangeGCScan(t)

		// The removed store no longer has any of the data from the range.
		verify([]int64{39, 0, 39, 39})
	}
}
import (
	"errors"
	"fmt"
	"time"

	"github.com/cockroachdb/cockroach/proto"
	"github.com/cockroachdb/cockroach/util"
	"github.com/cockroachdb/cockroach/util/log"
	"github.com/coreos/etcd/raft"
	"github.com/coreos/etcd/raft/raftpb"
	"golang.org/x/net/context"
)

const (
	noGroup = proto.RaftID(0)
)

// An ErrGroupDeleted is returned for commands which are pending while their
// group is deleted.
var ErrGroupDeleted = errors.New("group deleted")

// ErrStopped is returned for commands that could not be completed before the
// node was stopped.
var ErrStopped = errors.New("stopped")

// Config contains the parameters necessary to construct a MultiRaft object.
type Config struct {
	Storage   Storage
	Transport Transport
	// Ticker may be nil to use real time and TickInterval.
// TestMultiStoreEventFeed verifies that events on multiple stores are properly
// received by a single event reader.
func TestMultiStoreEventFeed(t *testing.T) {
	defer leaktest.AfterTest(t)
	t.Skip("disabled until #1531 is fixed")

	// Create a multiTestContext which publishes all store events to the given
	// feed.
	feed := &util.Feed{}
	mtc := &multiTestContext{
		feed: feed,
	}

	// Start reading events from the feed before starting the stores.
	ser := &storeEventReader{
		recordUpdateDetail: false,
	}
	readStopper := stop.NewStopper()
	sub := feed.Subscribe()
	readStopper.RunWorker(func() {
		ser.readEvents(sub)
	})

	mtc.Start(t, 3)
	defer mtc.Stop()

	// Replicate the default range.
	raftID := proto.RaftID(1)
	mtc.replicateRange(raftID, 0, 1, 2)

	// Add some data in a transaction.
	err := mtc.db.Txn(func(txn *client.Txn) error {
		b := &client.Batch{}
		b.Put("a", "asdf")
		b.Put("c", "jkl;")
		return txn.Commit(b)
	})
	if err != nil {
		t.Fatalf("error putting data to db: %s", err)
	}

	// AdminSplit in between the two ranges.
	if err := mtc.db.AdminSplit("b"); err != nil {
		t.Fatalf("error splitting initial: %s", err)
	}

	// AdminSplit an empty range at the end of the second range.
	if err := mtc.db.AdminSplit("z"); err != nil {
		t.Fatalf("error splitting second range: %s", err)
	}

	// AdminMerge the empty range back into the second range.
	if err := mtc.db.AdminMerge("c"); err != nil {
		t.Fatalf("error merging final range: %s", err)
	}

	// Add an additional put through the system and wait for all
	// replicas to receive it.
	if _, err := mtc.db.Inc("aa", 5); err != nil {
		t.Fatalf("error putting data to db: %s", err)
	}
	util.SucceedsWithin(t, time.Second, func() error {
		for _, eng := range mtc.engines {
			val, _, err := engine.MVCCGet(eng, proto.Key("aa"), mtc.clock.Now(), true, nil)
			if err != nil {
				return err
			}
			if a, e := mustGetInteger(val), int64(5); a != e {
				return util.Errorf("expected aa = %d, got %d", e, a)
			}
		}
		return nil
	})

	// Close feed and wait for reader to receive all events.
	feed.Close()
	readStopper.Stop()

	// Compare events to expected values.
	expected := map[proto.StoreID][]string{
		proto.StoreID(1): {
			"StartStore",
			"BeginScanRanges",
			"RegisterRange scan=true, rid=1, live=.*",
			"EndScanRanges",
			"SplitRange origId=1, newId=2, origKey=336, newKey=15",
			"SplitRange origId=2, newId=3, origKey=15, newKey=0",
			"MergeRange rid=2, subId=3, key=15, subKey=0",
		},
		proto.StoreID(2): {
			"StartStore",
			"BeginScanRanges",
			"EndScanRanges",
			"RegisterRange scan=false, rid=1, live=.*",
			"SplitRange origId=1, newId=2, origKey=336, newKey=15",
			"SplitRange origId=2, newId=3, origKey=15, newKey=0",
			"MergeRange rid=2, subId=3, key=15, subKey=0",
		},
		proto.StoreID(3): {
			"StartStore",
			"BeginScanRanges",
			"EndScanRanges",
			"RegisterRange scan=false, rid=1, live=.*",
			"SplitRange origId=1, newId=2, origKey=336, newKey=15",
			"SplitRange origId=2, newId=3, origKey=15, newKey=0",
			"MergeRange rid=2, subId=3, key=15, subKey=0",
		},
	}
	if a, e := ser.perStoreFeeds, expected; !checkMatch(e, a) {
		t.Errorf("event feed did not match expected value. Actual values have been printed to compare with above expectation.")
		t.Logf("Event feed information:\n%s", ser.eventFeedString())
	}

	// Expected count of update events on a per-method basis.
	expectedUpdateCount := map[proto.StoreID]map[proto.Method]int{
		proto.StoreID(1): {
			proto.Put:                 18,
			proto.ConditionalPut:      7,
			proto.Increment:           2,
			proto.Delete:              2,
			proto.EndTransaction:      6,
			proto.InternalLeaderLease: 3,
		},
		proto.StoreID(2): {
			proto.Put:                 16,
			proto.ConditionalPut:      6,
			proto.Increment:           2,
			proto.Delete:              2,
			proto.EndTransaction:      5,
			proto.InternalLeaderLease: 2,
		},
		proto.StoreID(3): {
			proto.Put:                 14,
			proto.ConditionalPut:      5,
			proto.Increment:           2,
			proto.Delete:              2,
			proto.EndTransaction:      4,
			proto.InternalLeaderLease: 2,
		},
	}
	if a, e := ser.perStoreUpdateCount, expectedUpdateCount; !reflect.DeepEqual(a, e) {
		t.Errorf("update counts did not match expected value. Actual values have been printed to compare with above expectation.")
		t.Logf("Update count information:\n%s", ser.updateCountString())
	}
}
// Send implements the client.Sender interface. It verifies
// permissions and looks up the appropriate range based on the
// supplied key and sends the RPC according to the specified options.
//
// If the request spans multiple ranges (which is possible for Scan or
// DeleteRange requests), Send sends requests to the individual ranges
// sequentially and combines the results transparently.
//
// This may temporarily adjust the request headers, so the proto.Call
// must not be used concurrently until Send has returned.
func (ds *DistSender) Send(_ context.Context, call proto.Call) {
	args := call.Args
	finalReply := call.Reply

	// Verify permissions.
	if err := ds.verifyPermissions(call.Args); err != nil {
		call.Reply.Header().SetGoError(err)
		return
	}

	// In the event that timestamp isn't set and read consistency isn't
	// required, set the timestamp using the local clock.
	if args.Header().ReadConsistency == proto.INCONSISTENT && args.Header().Timestamp.Equal(proto.ZeroTimestamp) {
		// Make sure that after the call, args hasn't changed.
		defer func(timestamp proto.Timestamp) {
			args.Header().Timestamp = timestamp
		}(args.Header().Timestamp)
		args.Header().Timestamp = ds.clock.Now()
	}

	// If this is a bounded request, we will change its bound as we receive
	// replies. This undoes that when we return.
	boundedArgs, argsBounded := args.(proto.Bounded)
	if argsBounded {
		defer func(bound int64) {
			boundedArgs.SetBound(bound)
		}(boundedArgs.GetBound())
	}

	defer func(key proto.Key) {
		args.Header().Key = key
	}(args.Header().Key)

	// Retry logic for lookup of range by key and RPCs to range replicas.
	curReply := finalReply
	for {
		call.Reply = curReply
		curReply.Header().Reset()

		var desc, descNext *proto.RangeDescriptor
		var err error
		for r := retry.Start(ds.rpcRetryOptions); r.Next(); {
			// Get range descriptor (or, when spanning range, descriptors).
			// sendAttempt below may clear them on certain errors, so we
			// refresh (likely from the cache) on every retry.
			desc, descNext, err = ds.getDescriptors(call)
			// getDescriptors may fail retryably if the first range isn't
			// available via Gossip.
			if err != nil {
				if rErr, ok := err.(util.Retryable); ok && rErr.CanRetry() {
					if log.V(1) {
						log.Warning(err)
					}
					continue
				}
				break
			}
			err = func() error {
				// Truncate the request to our current range, making sure not to
				// touch it unless we have to (it is illegal to send EndKey on
				// commands which do not operate on ranges).
				if descNext != nil {
					defer func(endKey proto.Key) {
						args.Header().EndKey = endKey
					}(args.Header().EndKey)
					args.Header().EndKey = desc.EndKey
				}
				leader := ds.leaderCache.Lookup(proto.RaftID(desc.RaftID))

				// Try to send the call.
				replicas := newReplicaSlice(ds.gossip, desc)

				// Rearrange the replicas so that those replicas with long common
				// prefix of attributes end up first. If there's no prefix, this is a
				// no-op.
				order := ds.optimizeReplicaOrder(replicas)

				// If this request needs to go to a leader and we know who that is, move
				// it to the front.
				if !(proto.IsRead(args) && args.Header().ReadConsistency == proto.INCONSISTENT) &&
					leader.StoreID > 0 {
					if i := replicas.FindReplica(leader.StoreID); i >= 0 {
						replicas.MoveToFront(i)
						order = rpc.OrderStable
					}
				}
				return ds.sendRPC(desc.RaftID, replicas, order, args, curReply)
			}()
			if err != nil {
				// For an RPC error to occur, we must've been unable to contact any
				// replicas. In this case, likely all nodes are down (or not getting back
				// to us within a reasonable amount of time).
				// We may simply not be trying to talk to the up-to-date replicas, so
				// clearing the descriptor here should be a good idea.
				// TODO(tschottdorf): If a replica group goes dead, this will cause clients
				// to put high read pressure on the first range, so there should be some
				// rate limiting here.
				ds.rangeCache.EvictCachedRangeDescriptor(args.Header().Key, desc)
			} else {
				err = curReply.Header().GoError()
			}

			if err != nil {
				if log.V(1) {
					log.Warningf("failed to invoke %s: %s", call.Method(), err)
				}

				// If retryable, allow retry. For range not found or range
				// key mismatch errors, we don't backoff on the retry,
				// but reset the backoff loop so we can retry immediately.
				switch tErr := err.(type) {
				case *proto.RangeNotFoundError, *proto.RangeKeyMismatchError:
					// Range descriptor might be out of date - evict it.
					ds.rangeCache.EvictCachedRangeDescriptor(args.Header().Key, desc)
					// On addressing errors, don't backoff; retry immediately.
					r.Reset()
					if log.V(1) {
						log.Warning(err)
					}
					continue
				case *proto.NotLeaderError:
					newLeader := tErr.GetLeader()
					// Verify that leader is a known replica according to the
					// descriptor. If not, we've got a stale replica; evict cache.
					// Next, cache the new leader.
					if newLeader != nil {
						if i, _ := desc.FindReplica(newLeader.StoreID); i == -1 {
							if log.V(1) {
								log.Infof("error indicates unknown leader %s, expunging descriptor %s", newLeader, desc)
							}
							ds.rangeCache.EvictCachedRangeDescriptor(args.Header().Key, desc)
						}
					} else {
						newLeader = &proto.Replica{}
					}
					ds.updateLeaderCache(proto.RaftID(desc.RaftID), *newLeader)
					if log.V(1) {
						log.Warning(err)
					}
					r.Reset()
					continue
				case util.Retryable:
					if tErr.CanRetry() {
						if log.V(1) {
							log.Warning(err)
						}
						continue
					}
				}
			}
			break
		}

		// Immediately return if querying a range failed non-retryably.
		// For multi-range requests, we return the failing range's reply.
		if err != nil {
			call.Reply.Header().SetGoError(err)
			return
		}

		if finalReply != curReply {
			// This was the second or later call in a multi-range request.
			// Combine the new response with the existing one.
			if cFinalReply, ok := finalReply.(proto.Combinable); ok {
				cFinalReply.Combine(curReply)
			} else {
				// This should never apply in practice, as we'll only end up here
				// for range-spanning requests.
				call.Reply.Header().SetGoError(util.Errorf("multi-range request with non-combinable response type"))
				return
			}
		}

		// If this request has a bound, such as MaxResults in
		// ScanRequest, check whether enough rows have been retrieved.
		if argsBounded {
			if prevBound := boundedArgs.GetBound(); prevBound > 0 {
				if cReply, ok := curReply.(proto.Countable); ok {
					if nextBound := prevBound - cReply.Count(); nextBound > 0 {
						// Update bound for the next round.
						// We've deferred restoring the original bound earlier.
						boundedArgs.SetBound(nextBound)
					} else {
						// Set flag to break the loop.
						descNext = nil
					}
				}
			}
		}

		// If this was the last range accessed by this call, exit loop.
		if descNext == nil {
			break
		}

		// In the next iteration, query the next range.
		// It's important that we use the EndKey of the current descriptor
		// as opposed to the StartKey of the next one: if the former is stale,
		// it's possible that the next range has since merged the subsequent
		// one, and unless both descriptors are stale, the next descriptor's
		// StartKey would move us to the beginning of the current range,
		// resulting in a duplicate scan.
		args.Header().Key = desc.EndKey

		// This is a multi-range request, make a new reply object for
		// subsequent iterations of the loop.
		curReply = args.CreateReply()
	}
	call.Reply = finalReply
}
func TestRapidMembershipChange(t *testing.T) {
	defer leaktest.AfterTest(t)
	stopper := stop.NewStopper()
	defer stopper.Stop()

	var wg sync.WaitGroup
	proposers := 5

	numCommit := int32(200)

	cluster := newTestCluster(nil, 1, stopper, t)
	groupID := proto.RaftID(1)

	cluster.createGroup(groupID, 0, 1 /* replicas */)
	startSeq := int32(0) // updated atomically from now on

	cmdIDFormat := "%0" + fmt.Sprintf("%d", commandIDLen) + "d"
	teardown := make(chan struct{})

	proposerFn := func(i int) {
		defer wg.Done()

		var seq int32
		for {
			seq = atomic.AddInt32(&startSeq, 1)
			if seq > numCommit {
				break
			}
			cmdID := fmt.Sprintf(cmdIDFormat, seq)
		retry:
			for {
				if err := cluster.nodes[0].CreateGroup(groupID); err != nil {
					t.Fatal(err)
				}
				if log.V(1) {
					log.Infof("%-3d: try %s", i, cmdID)
				}

				select {
				case err := <-cluster.nodes[0].SubmitCommand(groupID, cmdID, []byte("command")):
					if err == nil {
						log.Infof("%-3d: ok %s", i, cmdID)
						break retry
					}
					log.Infof("%-3d: err %s %s", i, cmdID, err)
				case <-teardown:
					return
				}
			}
			if err := cluster.nodes[0].RemoveGroup(groupID); err != nil {
				t.Fatal(err)
			}
		}
	}

	for i := 0; i < proposers; i++ {
		wg.Add(1)
		go proposerFn(i)
	}

	for e := range cluster.events[0].CommandCommitted {
		if log.V(1) {
			log.Infof(" : recv %s", e.CommandID)
		}
		if fmt.Sprintf(cmdIDFormat, numCommit) == e.CommandID {
			log.Infof("received everything we asked for, ending test")
			break
		}
	}
	close(teardown)

	// Because ending the test case is racy with the test itself, we wait until
	// all our goroutines have finished their work before we allow the test to
	// forcibly terminate. This solves a race condition on `t`, which is
	// otherwise subject to concurrent access from our goroutine and the go
	// testing machinery.
	wg.Wait()
}
// sendAttempt is invoked by Send and handles retry logic and cache eviction
// for a call sent to a single range. It returns a retry status: Break on
// success or unrecoverable failure, Continue to retry after backing off, or
// Reset to retry immediately. This method is expected to be invoked from
// within a backoff / retry loop which retries the send repeatedly (e.g. to
// continue processing after a critical node becomes available after downtime
// or the range descriptor is refreshed via lookup).
func (ds *DistSender) sendAttempt(desc *proto.RangeDescriptor, call proto.Call) (retry.Status, error) {
	leader := ds.leaderCache.Lookup(proto.RaftID(desc.RaftID))

	// Try to send the call.
	replicas := newReplicaSlice(ds.gossip, desc)

	// Rearrange the replicas so that those replicas with long common
	// prefix of attributes end up first. If there's no prefix, this is a
	// no-op.
	order := ds.optimizeReplicaOrder(replicas)

	args := call.Args
	reply := call.Reply

	// If this request needs to go to a leader and we know who that is, move
	// it to the front.
	if !(proto.IsRead(args) && args.Header().ReadConsistency == proto.INCONSISTENT) &&
		leader.StoreID > 0 {
		if i := replicas.FindReplica(leader.StoreID); i >= 0 {
			replicas.MoveToFront(i)
			order = rpc.OrderStable
		}
	}

	err := ds.sendRPC(desc.RaftID, replicas, order, args, reply)
	if err != nil {
		// For an RPC error to occur, we must've been unable to contact any
		// replicas. In this case, likely all nodes are down (or not getting back
		// to us within a reasonable amount of time).
		// We may simply not be trying to talk to the up-to-date replicas, so
		// clearing the descriptor here should be a good idea.
		// TODO(tschottdorf): If a replica group goes dead, this will cause clients
		// to put high read pressure on the first range, so there should be some
		// rate limiting here.
		ds.rangeCache.EvictCachedRangeDescriptor(args.Header().Key, desc)
	} else {
		err = reply.Header().GoError()
	}

	if err != nil {
		if log.V(1) {
			log.Warningf("failed to invoke %s: %s", call.Method(), err)
		}

		// If retryable, allow retry. For range not found or range
		// key mismatch errors, we don't backoff on the retry,
		// but reset the backoff loop so we can retry immediately.
		switch tErr := err.(type) {
		case *proto.RangeNotFoundError, *proto.RangeKeyMismatchError:
			// Range descriptor might be out of date - evict it.
			ds.rangeCache.EvictCachedRangeDescriptor(args.Header().Key, desc)
			// On addressing errors, don't backoff; retry immediately.
			return retry.Reset, err
		case *proto.NotLeaderError:
			newLeader := tErr.GetLeader()
			// Verify that leader is a known replica according to the
			// descriptor. If not, we've got a stale replica; evict cache.
			// Next, cache the new leader.
			if newLeader != nil {
				if i, _ := desc.FindReplica(newLeader.StoreID); i == -1 {
					if log.V(1) {
						log.Infof("error indicates unknown leader %s, expunging descriptor %s", newLeader, desc)
					}
					ds.rangeCache.EvictCachedRangeDescriptor(args.Header().Key, desc)
				}
			} else {
				newLeader = &proto.Replica{}
			}
			ds.updateLeaderCache(proto.RaftID(desc.RaftID), *newLeader)
			return retry.Reset, err
		case util.Retryable:
			if tErr.CanRetry() {
				return retry.Continue, err
			}
		}
		return retry.Break, err
	}
	return retry.Break, nil
}
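// A hedged sketch of how a caller might drive sendAttempt from a backoff
// loop. sendOneRangeSketch is a hypothetical helper, not the actual Send
// implementation; descriptor lookup and multi-range bookkeeping, which the
// real Send owns, are elided here.
func (ds *DistSender) sendOneRangeSketch(desc *proto.RangeDescriptor, call proto.Call) {
	for r := retry.Start(ds.rpcRetryOptions); r.Next(); {
		status, err := ds.sendAttempt(desc, call)
		switch status {
		case retry.Reset:
			// Addressing or leadership error: reset the backoff so the next
			// attempt happens immediately.
			r.Reset()
		case retry.Continue:
			// Transient failure: fall through and let r.Next() back off
			// before the next attempt.
		case retry.Break:
			// Done, either successfully (err == nil) or with a permanent
			// error, which is placed on the reply.
			if err != nil {
				call.Reply.Header().SetGoError(err)
			}
			return
		}
	}
}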
func (m *LogEntry) Unmarshal(data []byte) error {
	l := len(data)
	iNdEx := 0
	for iNdEx < l {
		var wire uint64
		for shift := uint(0); ; shift += 7 {
			if iNdEx >= l {
				return io.ErrUnexpectedEOF
			}
			b := data[iNdEx]
			iNdEx++
			wire |= (uint64(b) & 0x7F) << shift
			if b < 0x80 {
				break
			}
		}
		fieldNum := int32(wire >> 3)
		wireType := int(wire & 0x7)
		switch fieldNum {
		case 1:
			if wireType != 0 {
				return fmt.Errorf("proto: wrong wireType = %d for field Severity", wireType)
			}
			for shift := uint(0); ; shift += 7 {
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := data[iNdEx]
				iNdEx++
				m.Severity |= (int32(b) & 0x7F) << shift
				if b < 0x80 {
					break
				}
			}
		case 2:
			if wireType != 0 {
				return fmt.Errorf("proto: wrong wireType = %d for field Time", wireType)
			}
			for shift := uint(0); ; shift += 7 {
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := data[iNdEx]
				iNdEx++
				m.Time |= (int64(b) & 0x7F) << shift
				if b < 0x80 {
					break
				}
			}
		case 3:
			if wireType != 0 {
				return fmt.Errorf("proto: wrong wireType = %d for field ThreadID", wireType)
			}
			for shift := uint(0); ; shift += 7 {
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := data[iNdEx]
				iNdEx++
				m.ThreadID |= (int32(b) & 0x7F) << shift
				if b < 0x80 {
					break
				}
			}
		case 4:
			if wireType != 2 {
				return fmt.Errorf("proto: wrong wireType = %d for field File", wireType)
			}
			var stringLen uint64
			for shift := uint(0); ; shift += 7 {
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := data[iNdEx]
				iNdEx++
				stringLen |= (uint64(b) & 0x7F) << shift
				if b < 0x80 {
					break
				}
			}
			postIndex := iNdEx + int(stringLen)
			if postIndex > l {
				return io.ErrUnexpectedEOF
			}
			m.File = string(data[iNdEx:postIndex])
			iNdEx = postIndex
		case 5:
			if wireType != 0 {
				return fmt.Errorf("proto: wrong wireType = %d for field Line", wireType)
			}
			for shift := uint(0); ; shift += 7 {
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := data[iNdEx]
				iNdEx++
				m.Line |= (int32(b) & 0x7F) << shift
				if b < 0x80 {
					break
				}
			}
		case 6:
			if wireType != 2 {
				return fmt.Errorf("proto: wrong wireType = %d for field Format", wireType)
			}
			var stringLen uint64
			for shift := uint(0); ; shift += 7 {
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := data[iNdEx]
				iNdEx++
				stringLen |= (uint64(b) & 0x7F) << shift
				if b < 0x80 {
					break
				}
			}
			postIndex := iNdEx + int(stringLen)
			if postIndex > l {
				return io.ErrUnexpectedEOF
			}
			m.Format = string(data[iNdEx:postIndex])
			iNdEx = postIndex
		case 7:
			if wireType != 2 {
				return fmt.Errorf("proto: wrong wireType = %d for field Args", wireType)
			}
			var msglen int
			for shift := uint(0); ; shift += 7 {
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := data[iNdEx]
				iNdEx++
				msglen |= (int(b) & 0x7F) << shift
				if b < 0x80 {
					break
				}
			}
			postIndex := iNdEx + msglen
			if postIndex > l {
				return io.ErrUnexpectedEOF
			}
			m.Args = append(m.Args, LogEntry_Arg{})
			if err := m.Args[len(m.Args)-1].Unmarshal(data[iNdEx:postIndex]); err != nil {
				return err
			}
			iNdEx = postIndex
		case 8:
			if wireType != 0 {
				return fmt.Errorf("proto: wrong wireType = %d for field NodeID", wireType)
			}
			var v github_com_cockroachdb_cockroach_proto.NodeID
			for shift := uint(0); ; shift += 7 {
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := data[iNdEx]
				iNdEx++
				v |= (github_com_cockroachdb_cockroach_proto.NodeID(b) & 0x7F) << shift
				if b < 0x80 {
					break
				}
			}
			m.NodeID = &v
		case 9:
			if wireType != 0 {
				return fmt.Errorf("proto: wrong wireType = %d for field StoreID", wireType)
			}
			var v github_com_cockroachdb_cockroach_proto.StoreID
			for shift := uint(0); ; shift += 7 {
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := data[iNdEx]
				iNdEx++
				v |= (github_com_cockroachdb_cockroach_proto.StoreID(b) & 0x7F) << shift
				if b < 0x80 {
					break
				}
			}
			m.StoreID = &v
		case 10:
			if wireType != 0 {
				return fmt.Errorf("proto: wrong wireType = %d for field RaftID", wireType)
			}
			var v github_com_cockroachdb_cockroach_proto.RaftID
			for shift := uint(0); ; shift += 7 {
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := data[iNdEx]
				iNdEx++
				v |= (github_com_cockroachdb_cockroach_proto.RaftID(b) & 0x7F) << shift
				if b < 0x80 {
					break
				}
			}
			m.RaftID = &v
		case 11:
			if wireType != 0 {
				return fmt.Errorf("proto: wrong wireType = %d for field Method", wireType)
			}
			var v github_com_cockroachdb_cockroach_proto.Method
			for shift := uint(0); ; shift += 7 {
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := data[iNdEx]
				iNdEx++
				v |= (github_com_cockroachdb_cockroach_proto.Method(b) & 0x7F) << shift
				if b < 0x80 {
					break
				}
			}
			m.Method = &v
		case 12:
			if wireType != 2 {
				return fmt.Errorf("proto: wrong wireType = %d for field Key", wireType)
			}
			var byteLen int
			for shift := uint(0); ; shift += 7 {
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := data[iNdEx]
				iNdEx++
				byteLen |= (int(b) & 0x7F) << shift
				if b < 0x80 {
					break
				}
			}
			postIndex := iNdEx + byteLen
			if postIndex > l {
				return io.ErrUnexpectedEOF
			}
			m.Key = append([]byte{}, data[iNdEx:postIndex]...)
			iNdEx = postIndex
		case 13:
			if wireType != 2 {
				return fmt.Errorf("proto: wrong wireType = %d for field Stacks", wireType)
			}
			var byteLen int
			for shift := uint(0); ; shift += 7 {
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := data[iNdEx]
				iNdEx++
				byteLen |= (int(b) & 0x7F) << shift
				if b < 0x80 {
					break
				}
			}
			postIndex := iNdEx + byteLen
			if postIndex > l {
				return io.ErrUnexpectedEOF
			}
			m.Stacks = append([]byte{}, data[iNdEx:postIndex]...)
			iNdEx = postIndex
		default:
			var sizeOfWire int
			for {
				sizeOfWire++
				wire >>= 7
				if wire == 0 {
					break
				}
			}
			iNdEx -= sizeOfWire
			skippy, err := skipLog(data[iNdEx:])
			if err != nil {
				return err
			}
			if (iNdEx + skippy) > l {
				return io.ErrUnexpectedEOF
			}
			m.XXX_unrecognized = append(m.XXX_unrecognized, data[iNdEx:iNdEx+skippy]...)
			iNdEx += skippy
		}
	}
	return nil
}
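// The shift-accumulate loops in Unmarshal above inline protobuf base-128
// varint decoding. For reference, a standalone sketch of that decoding step;
// decodeUvarintSketch is illustrative and not part of the generated code.
func decodeUvarintSketch(data []byte) (v uint64, n int, err error) {
	for shift := uint(0); n < len(data); shift += 7 {
		b := data[n]
		n++
		// The low 7 bits of each byte carry the value, least significant
		// group first.
		v |= (uint64(b) & 0x7F) << shift
		if b < 0x80 { // high bit clear: this is the final byte
			return v, n, nil
		}
	}
	return 0, n, io.ErrUnexpectedEOF
}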
func TestStoreRangeSet(t *testing.T) {
	defer leaktest.AfterTest(t)
	store, _, stopper := createTestStore(t)
	defer stopper.Stop()

	// Remove range 1.
	rng1, err := store.GetRange(1)
	if err != nil {
		t.Error(err)
	}
	if err := store.RemoveRange(rng1); err != nil {
		t.Error(err)
	}
	// Add 10 new ranges.
	const newCount = 10
	for i := 0; i < newCount; i++ {
		rng := createRange(store, proto.RaftID(i+1), proto.Key(fmt.Sprintf("a%02d", i)), proto.Key(fmt.Sprintf("a%02d", i+1)))
		if err := store.AddRangeTest(rng); err != nil {
			t.Fatal(err)
		}
	}

	// Verify two passes of the visit.
	ranges := newStoreRangeSet(store)
	for pass := 0; pass < 2; pass++ {
		if ec := ranges.EstimatedCount(); ec != 10 {
			t.Errorf("expected 10 remaining; got %d", ec)
		}
		i := 1
		ranges.Visit(func(rng *Range) bool {
			if rng.Desc().RaftID != proto.RaftID(i) {
				t.Errorf("expected range with Raft ID %d; got %v", i, rng)
			}
			if ec := ranges.EstimatedCount(); ec != 10-i {
				t.Errorf("expected %d remaining; got %d", 10-i, ec)
			}
			i++
			return true
		})
		if ec := ranges.EstimatedCount(); ec != 10 {
			t.Errorf("expected 10 remaining; got %d", ec)
		}
	}

	// Try visiting with an addition and a removal.
	visited := make(chan struct{})
	updated := make(chan struct{})
	done := make(chan struct{})
	go func() {
		i := 1
		ranges.Visit(func(rng *Range) bool {
			if i == 1 {
				if rng.Desc().RaftID != proto.RaftID(i) {
					t.Errorf("expected range with Raft ID %d; got %v", i, rng)
				}
				close(visited)
				<-updated
			} else {
				// The second range will be removed and skipped.
				if rng.Desc().RaftID != proto.RaftID(i+1) {
					t.Errorf("expected range with Raft ID %d; got %v", i+1, rng)
				}
			}
			i++
			return true
		})
		if i != 10 {
			t.Errorf("expected visit of 9 ranges, but got %v", i-1)
		}
		close(done)
	}()

	<-visited
	if ec := ranges.EstimatedCount(); ec != 9 {
		t.Errorf("expected 9 remaining; got %d", ec)
	}

	// Split the first range to insert a new range as second range.
	// The range is never visited with this iteration.
	rng := createRange(store, 11, proto.Key("a000"), proto.Key("a01"))
	if err = store.SplitRange(store.LookupRange(proto.Key("a00"), nil), rng); err != nil {
		t.Fatal(err)
	}
	// Estimated count will still be 9, as it's cached.
	if ec := ranges.EstimatedCount(); ec != 9 {
		t.Errorf("expected 9 remaining; got %d", ec)
	}

	// Now, remove the next range in the iteration and verify we skip the removed range.
	rng = store.LookupRange(proto.Key("a01"), nil)
	if rng.Desc().RaftID != 2 {
		t.Errorf("expected fetch of raftID=2; got %d", rng.Desc().RaftID)
	}
	if err := store.RemoveRange(rng); err != nil {
		t.Error(err)
	}
	if ec := ranges.EstimatedCount(); ec != 9 {
		t.Errorf("expected 9 remaining; got %d", ec)
	}

	close(updated)
	<-done
}