// TestHeartbeatResponseFanout verifies that when two raft groups share the
// same node distribution but have different terms, a heartbeat response
// fanned out from one group does not disturb the other group's term or
// leadership.
func TestHeartbeatResponseFanout(t *testing.T) {
	defer leaktest.AfterTest(t)
	stopper := stop.NewStopper()
	defer stopper.Stop()

	cluster := newTestCluster(nil, 3, stopper, t)
	groupID1 := proto.RangeID(1)
	cluster.createGroup(groupID1, 0, 3 /* replicas */)

	groupID2 := proto.RangeID(2)
	cluster.createGroup(groupID2, 0, 3 /* replicas */)

	leaderIndex := 0
	cluster.elect(leaderIndex, groupID1)
	// Group 2 goes through three rounds of elections, so it ends up with a
	// different term than group 1, but both leaders are on the same node.
	for i := 2; i >= 0; i-- {
		leaderIndex = i
		cluster.elect(leaderIndex, groupID2)
	}

	// Send a coalesced heartbeat. The heartbeat response from group 2 will
	// carry a higher term than the one from group 1.
	cluster.nodes[0].coalescedHeartbeat()
	// Submit a command to check whether group 1's leadership survived.
	cluster.nodes[0].SubmitCommand(groupID1, makeCommandID(), []byte("command"))
	select {
	case <-cluster.events[0].CommandCommitted:
		log.Infof("SubmitCommand succeeded after heartbeat response fanout")
	case <-time.After(500 * time.Millisecond):
		t.Fatalf("no leader after heartbeat response fanout")
	}
}
// sendAttempt gathers and rearranges the replicas, and makes an RPC call.
func (ds *DistSender) sendAttempt(trace *tracer.Trace, ba proto.BatchRequest, desc *proto.RangeDescriptor) (*proto.BatchResponse, error) {
	defer trace.Epoch("sending RPC")()

	leader := ds.leaderCache.Lookup(proto.RangeID(desc.RangeID))

	// Try to send the call.
	replicas := newReplicaSlice(ds.gossip, desc)

	// Rearrange the replicas so that those replicas with long common
	// prefix of attributes end up first. If there's no prefix, this is a
	// no-op.
	order := ds.optimizeReplicaOrder(replicas)

	// If this request needs to go to a leader and we know who that is, move
	// it to the front.
	if !(proto.IsReadOnly(&ba) && ba.ReadConsistency == proto.INCONSISTENT) &&
		leader.StoreID > 0 {
		if i := replicas.FindReplica(leader.StoreID); i >= 0 {
			replicas.MoveToFront(i)
			order = rpc.OrderStable
		}
	}

	// TODO(tschottdorf) &ba -> ba
	resp, err := ds.sendRPC(trace, desc.RangeID, replicas, order, &ba)
	if err != nil {
		return nil, err
	}
	// Untangle the error from the received response.
	br := resp.(*proto.BatchResponse)
	err = br.GoError()
	br.Error = nil
	return br, err
}
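// For illustration only: a minimal, self-contained sketch of the
// move-the-leader-to-the-front idea used above. The replica type and the
// findReplica/moveToFront helpers below are simplified stand-ins, not the
// real replicaSlice implementation.
package main

import "fmt"

type replica struct{ storeID int }

type replicaSlice []replica

// findReplica returns the index of the replica on the given store, or -1.
func (rs replicaSlice) findReplica(storeID int) int {
	for i := range rs {
		if rs[i].storeID == storeID {
			return i
		}
	}
	return -1
}

// moveToFront shifts the replica at index i to the front, preserving the
// relative order of the others (so any attribute-based ordering of the
// remaining replicas survives).
func (rs replicaSlice) moveToFront(i int) {
	front := rs[i]
	copy(rs[1:i+1], rs[:i])
	rs[0] = front
}

func main() {
	replicas := replicaSlice{{storeID: 2}, {storeID: 3}, {storeID: 5}}
	leaderStoreID := 5 // assume the leader cache points at store 5
	if i := replicas.findReplica(leaderStoreID); i >= 0 {
		replicas.moveToFront(i)
	}
	fmt.Println(replicas) // [{5} {2} {3}]: the leader's replica is tried first
}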
// newTestRangeSet creates a new range set that has the count number of ranges.
func newTestRangeSet(count int, t *testing.T) *testRangeSet {
	rs := &testRangeSet{rangesByKey: btree.New(64 /* degree */)}
	for i := 0; i < count; i++ {
		desc := &proto.RangeDescriptor{
			RangeID:  proto.RangeID(i),
			StartKey: proto.Key(fmt.Sprintf("%03d", i)),
			EndKey:   proto.Key(fmt.Sprintf("%03d", i+1)),
		}
		// Initialize the range stats so the scanner can use them.
		rng := &Replica{
			stats: &rangeStats{
				raftID: desc.RangeID,
				MVCCStats: engine.MVCCStats{
					KeyBytes:  1,
					ValBytes:  2,
					KeyCount:  1,
					LiveCount: 1,
				},
			},
		}
		if err := rng.setDesc(desc); err != nil {
			t.Fatal(err)
		}
		if exRngItem := rs.rangesByKey.ReplaceOrInsert(rng); exRngItem != nil {
			t.Fatalf("failed to insert range %s", rng)
		}
	}
	return rs
}
// handleWriteReady converts a set of raft.Ready structs into a writeRequest
// to be persisted, marks the group as writing and sends it to the writeTask.
func (s *state) handleWriteReady(readyGroups map[uint64]raft.Ready) {
	if log.V(6) {
		log.Infof("node %v write ready, preparing request", s.nodeID)
	}
	writeRequest := newWriteRequest()
	for groupID, ready := range readyGroups {
		raftGroupID := proto.RangeID(groupID)
		g, ok := s.groups[raftGroupID]
		if !ok {
			if log.V(6) {
				log.Infof("dropping write request to group %d", groupID)
			}
			continue
		}
		g.writing = true

		gwr := &groupWriteRequest{}
		if !raft.IsEmptyHardState(ready.HardState) {
			gwr.state = ready.HardState
		}
		if !raft.IsEmptySnap(ready.Snapshot) {
			gwr.snapshot = ready.Snapshot
		}
		if len(ready.Entries) > 0 {
			gwr.entries = ready.Entries
		}
		writeRequest.groups[raftGroupID] = gwr
	}
	s.writeTask.in <- writeRequest
}
func TestLeaderElectionEvent(t *testing.T) {
	defer leaktest.AfterTest(t)
	// Leader election events are fired when the leader commits an entry, not
	// when it issues a call for votes.
	stopper := stop.NewStopper()
	cluster := newTestCluster(nil, 3, stopper, t)
	defer stopper.Stop()
	groupID := proto.RangeID(1)
	cluster.createGroup(groupID, 0, 3)

	// Process a Ready with a new leader but no new commits.
	// This happens while an election is in progress.
	// This may be dirty, but it seems to be the only way to make testrace pass.
	cluster.nodes[1].callbackChan <- func() {
		cluster.nodes[1].maybeSendLeaderEvent(groupID, cluster.nodes[1].groups[groupID],
			&raft.Ready{
				SoftState: &raft.SoftState{
					Lead: 3,
				},
			})
	}
	// Trigger another round of the multiraft select loop.
	cluster.tickers[1].Tick()

	// No events are sent.
	select {
	case e := <-cluster.events[1].LeaderElection:
		t.Fatalf("got unexpected event %v", e)
	case <-time.After(200 * time.Millisecond):
	}

	// Now there are new committed entries. A new leader always commits an entry
	// to conclude the election.
	entry := raftpb.Entry{
		Index: 42,
		Term:  42,
	}
	// This may be dirty, but it seems to be the only way to make testrace pass.
	cluster.nodes[1].callbackChan <- func() {
		cluster.nodes[1].maybeSendLeaderEvent(groupID, cluster.nodes[1].groups[groupID],
			&raft.Ready{
				Entries:          []raftpb.Entry{entry},
				CommittedEntries: []raftpb.Entry{entry},
			})
	}
	cluster.tickers[1].Tick()

	// Now we get an event.
	select {
	case e := <-cluster.events[1].LeaderElection:
		if !reflect.DeepEqual(e, &EventLeaderElection{
			GroupID: groupID,
			NodeID:  3,
			Term:    42,
		}) {
			t.Errorf("election event did not match expectations: %+v", e)
		}
	case <-time.After(200 * time.Millisecond):
		t.Fatal("didn't get expected event")
	}
}
// sendAttempt is invoked by Send. It temporarily truncates the arguments to
// match the descriptor's EndKey (if necessary) and gathers and rearranges the
// replicas before making a single attempt at sending the request. It returns
// the result of sending the RPC; a potential error contained in the reply has
// to be handled separately by the caller.
func (ds *DistSender) sendAttempt(trace *tracer.Trace, args proto.Request, desc *proto.RangeDescriptor) (proto.Response, error) {
	defer trace.Epoch("sending RPC")()

	// Truncate the request to our current range, making sure not to
	// touch it unless we have to (it is illegal to send EndKey on
	// commands which do not operate on ranges).
	if endKey := args.Header().EndKey; endKey != nil && !endKey.Less(desc.EndKey) {
		defer func(k proto.Key) { args.Header().EndKey = k }(endKey)
		args.Header().EndKey = desc.EndKey
	}

	leader := ds.leaderCache.Lookup(proto.RangeID(desc.RangeID))

	// Try to send the call.
	replicas := newReplicaSlice(ds.gossip, desc)

	// Rearrange the replicas so that those replicas with long common
	// prefix of attributes end up first. If there's no prefix, this is a
	// no-op.
	order := ds.optimizeReplicaOrder(replicas)

	// If this request needs to go to a leader and we know who that is, move
	// it to the front.
	if !(proto.IsRead(args) && args.Header().ReadConsistency == proto.INCONSISTENT) &&
		leader.StoreID > 0 {
		if i := replicas.FindReplica(leader.StoreID); i >= 0 {
			replicas.MoveToFront(i)
			order = rpc.OrderStable
		}
	}
	return ds.sendRPC(trace, desc.RangeID, replicas, order, args)
}
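// The truncation above relies on a small Go idiom: capturing the original
// value as a deferred function's argument so the mutation is undone when the
// attempt returns. A minimal standalone sketch of that save-and-restore
// pattern; the header type here is invented purely for illustration.
package main

import "fmt"

type header struct{ endKey string }

// withTruncatedEndKey temporarily clamps h.endKey, restoring the original
// value when the function returns. The deferred closure receives the old
// value as an argument, so the restore is immune to later mutations of
// h.endKey.
func withTruncatedEndKey(h *header, limit string) {
	if h.endKey > limit {
		defer func(k string) { h.endKey = k }(h.endKey)
		h.endKey = limit
	}
	fmt.Println("during attempt:", h.endKey) // during attempt: m
}

func main() {
	h := &header{endKey: "z"}
	withTruncatedEndKey(h, "m")
	fmt.Println("after attempt:", h.endKey) // after attempt: z
}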
func TestLocalSenderLookupReplica(t *testing.T) {
	defer leaktest.AfterTest(t)
	stopper := stop.NewStopper()
	defer stopper.Stop()
	ctx := storage.TestStoreContext
	manualClock := hlc.NewManualClock(0)
	ctx.Clock = hlc.NewClock(manualClock.UnixNano)
	ls := NewLocalSender()

	// Create two new stores with ranges we care about.
	var e [2]engine.Engine
	var s [2]*storage.Store
	ranges := []struct {
		storeID    proto.StoreID
		start, end proto.Key
	}{
		{2, proto.Key("a"), proto.Key("c")},
		{3, proto.Key("x"), proto.Key("z")},
	}
	for i, rng := range ranges {
		e[i] = engine.NewInMem(proto.Attributes{}, 1<<20)
		ctx.Transport = multiraft.NewLocalRPCTransport(stopper)
		defer ctx.Transport.Close()
		s[i] = storage.NewStore(ctx, e[i], &proto.NodeDescriptor{NodeID: 1})
		s[i].Ident.StoreID = rng.storeID

		desc := &proto.RangeDescriptor{
			RangeID:  proto.RangeID(i),
			StartKey: rng.start,
			EndKey:   rng.end,
			Replicas: []proto.Replica{{StoreID: rng.storeID}},
		}
		newRng, err := storage.NewReplica(desc, s[i])
		if err != nil {
			t.Fatal(err)
		}
		if err := s[i].AddRangeTest(newRng); err != nil {
			t.Error(err)
		}
		ls.AddStore(s[i])
	}

	if _, r, err := ls.lookupReplica(proto.Key("a"), proto.Key("c")); r.StoreID != s[0].Ident.StoreID || err != nil {
		t.Errorf("expected store %d; got %d: %v", s[0].Ident.StoreID, r.StoreID, err)
	}
	if _, r, err := ls.lookupReplica(proto.Key("b"), nil); r.StoreID != s[0].Ident.StoreID || err != nil {
		t.Errorf("expected store %d; got %d: %v", s[0].Ident.StoreID, r.StoreID, err)
	}
	if _, r, err := ls.lookupReplica(proto.Key("b"), proto.Key("d")); r != nil || err == nil {
		t.Errorf("expected nil replica and an error; got %v: %v", r, err)
	}
	if _, r, err := ls.lookupReplica(proto.Key("x"), proto.Key("z")); r.StoreID != s[1].Ident.StoreID {
		t.Errorf("expected store %d; got %d: %v", s[1].Ident.StoreID, r.StoreID, err)
	}
	if _, r, err := ls.lookupReplica(proto.Key("y"), nil); r.StoreID != s[1].Ident.StoreID || err != nil {
		t.Errorf("expected store %d; got %d: %v", s[1].Ident.StoreID, r.StoreID, err)
	}
}
// DecodeRaftStateKey extracts the Range ID from a RaftStateKey.
func DecodeRaftStateKey(key proto.Key) proto.RangeID {
	if !bytes.HasPrefix(key, LocalRangeIDPrefix) {
		panic(fmt.Sprintf("key %q does not have %q prefix", key, LocalRangeIDPrefix))
	}
	// Cut the prefix and decode the Range ID.
	b := key[len(LocalRangeIDPrefix):]
	_, rangeID := encoding.DecodeUvarint(b)
	return proto.RangeID(rangeID)
}
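// A round-trip sketch to make the expected key layout concrete. The
// encodeRaftStateKey helper below is hypothetical (it is not the real encoder
// in this package); it only mirrors the prefix-plus-uvarint layout that
// DecodeRaftStateKey parses, assuming encoding.EncodeUvarint appends the
// encoded value to its first argument.
func encodeRaftStateKey(rangeID proto.RangeID) proto.Key {
	// Range-local prefix, followed by the uvarint-encoded Range ID. The
	// decoder above ignores any suffix that follows the varint.
	key := append([]byte(nil), LocalRangeIDPrefix...)
	key = encoding.EncodeUvarint(key, uint64(rangeID))
	return proto.Key(key)
}

// Round trip: DecodeRaftStateKey(encodeRaftStateKey(42)) yields proto.RangeID(42).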
func TestSlowStorage(t *testing.T) {
	defer leaktest.AfterTest(t)
	stopper := stop.NewStopper()
	cluster := newTestCluster(nil, 3, stopper, t)
	defer stopper.Stop()
	groupID := proto.RangeID(1)
	cluster.createGroup(groupID, 0, 3)
	cluster.triggerElection(0, groupID)

	// Block the storage on the last node.
	cluster.storages[2].Block()

	// Submit a command to the leader.
	cluster.nodes[0].SubmitCommand(groupID, makeCommandID(), []byte("command"))

	// Even with the third node blocked, the other nodes can make progress.
	for i := 0; i < 2; i++ {
		events := cluster.events[i]
		log.Infof("waiting for event to be committed on node %v", i)
		commit := <-events.CommandCommitted
		if string(commit.Command) != "command" {
			t.Errorf("unexpected value in committed command: %v", commit.Command)
		}
	}

	// Ensure that node 2 is in fact blocked.
	time.Sleep(time.Millisecond)
	select {
	case commit := <-cluster.events[2].CommandCommitted:
		t.Errorf("didn't expect commits on node 2 but got %v", commit)
	default:
	}

	// After unblocking the third node, it will catch up.
	cluster.storages[2].Unblock()
	log.Infof("waiting for event to be committed on node 2")
	// When we unblock, the backlog is not guaranteed to be processed in order,
	// and in some cases the leader may need to retransmit some messages.
	for i := 0; i < 3; i++ {
		select {
		case commit := <-cluster.events[2].CommandCommitted:
			if string(commit.Command) != "command" {
				t.Errorf("unexpected value in committed command: %v", commit.Command)
			}
			return
		case <-time.After(5 * time.Millisecond):
			// Tick both nodes' clocks. The ticks on the follower node don't
			// really do anything, but they do ensure that the goroutine is
			// getting scheduled (and the real-time delay allows rpc responses
			// to pass between the nodes).
			cluster.tickers[0].Tick()
			cluster.tickers[2].Tick()
		}
	}
}
func TestMembershipChange(t *testing.T) {
	defer leaktest.AfterTest(t)
	stopper := stop.NewStopper()
	cluster := newTestCluster(nil, 4, stopper, t)
	defer stopper.Stop()

	// Create a group with a single member, cluster.nodes[0].
	groupID := proto.RangeID(1)
	cluster.createGroup(groupID, 0, 1)
	// An automatic election is triggered since this is a single-node Raft group,
	// so we don't need to call triggerElection.

	// Consume and apply the membership change events.
	for i := 0; i < 4; i++ {
		go func(i int) {
			for {
				e, ok := <-cluster.events[i].MembershipChangeCommitted
				if !ok {
					return
				}
				e.Callback(nil)
			}
		}(i)
	}

	// Add each of the other three nodes to the cluster.
	for i := 1; i < 4; i++ {
		ch := cluster.nodes[0].ChangeGroupMembership(groupID, makeCommandID(),
			raftpb.ConfChangeAddNode,
			cluster.nodes[i].nodeID, nil)
		<-ch
	}

	// TODO(bdarnell): verify that the channel events are sent out correctly.
	/*
		for i := 0; i < 10; i++ {
			log.Infof("tick %d", i)
			cluster.tickers[0].Tick()
			time.Sleep(5 * time.Millisecond)
		}

		// Each node is notified of each other node's joining.
		for i := 0; i < 4; i++ {
			for j := 1; j < 4; j++ {
				select {
				case e := <-cluster.events[i].MembershipChangeCommitted:
					if e.NodeID != cluster.nodes[j].nodeID {
						t.Errorf("node %d expected event for %d, got %d", i, j, e.NodeID)
					}
				default:
					t.Errorf("node %d did not get expected event for %d", i, j)
				}
			}
		}
	*/
}
// addRange adds a new range to the cluster but does not attach it to any
// store.
func (c *Cluster) addRange() *Range {
	rangeID := proto.RangeID(len(c.ranges))
	newRng := newRange(rangeID, c.allocator)
	c.ranges[rangeID] = newRng

	// Save a sorted array of range IDs to avoid having to calculate them
	// multiple times.
	c.rangeIDs = append(c.rangeIDs, rangeID)
	sort.Sort(c.rangeIDs)

	return newRng
}
// TestRaftAfterRemoveRange verifies that the MultiRaft state removes
// a remote node correctly after the Replica was removed from the Store.
func TestRaftAfterRemoveRange(t *testing.T) {
	defer leaktest.AfterTest(t)
	mtc := startMultiTestContext(t, 3)
	defer mtc.Stop()

	// Make the split.
	splitArgs := adminSplitArgs(proto.KeyMin, []byte("b"), proto.RangeID(1), mtc.stores[0].StoreID())
	if _, err := mtc.stores[0].ExecuteCmd(context.Background(), &splitArgs); err != nil {
		t.Fatal(err)
	}

	rangeID := proto.RangeID(2)
	mtc.replicateRange(rangeID, 0, 1, 2)

	mtc.unreplicateRange(rangeID, 0, 2)
	mtc.unreplicateRange(rangeID, 0, 1)

	// Wait for the removal to be processed.
	util.SucceedsWithin(t, time.Second, func() error {
		_, err := mtc.stores[1].GetReplica(rangeID)
		if _, ok := err.(*proto.RangeNotFoundError); ok {
			return nil
		} else if err != nil {
			return err
		}
		return util.Errorf("range still exists")
	})

	if err := mtc.transport.Send(&multiraft.RaftMessageRequest{
		GroupID: proto.RangeID(0),
		Message: raftpb.Message{
			From: uint64(mtc.stores[2].RaftNodeID()),
			To:   uint64(mtc.stores[1].RaftNodeID()),
			Type: raftpb.MsgHeartbeat,
		}}); err != nil {
		t.Fatal(err)
	}
	// Execute another replica change to ensure that MultiRaft has processed
	// the heartbeat just sent.
	mtc.replicateRange(proto.RangeID(1), 0, 1)
}
// TestRaftRemoveRace adds and removes a replica repeatedly in an
// attempt to reproduce a race
// (https://github.com/cockroachdb/cockroach/issues/1911). Note that
// 10 repetitions is not enough to reliably reproduce the problem, but
// it's better than any other tests we have for this (increasing the
// number of repetitions adds an unacceptable amount of test runtime).
func TestRaftRemoveRace(t *testing.T) {
	defer leaktest.AfterTest(t)
	mtc := startMultiTestContext(t, 3)
	defer mtc.Stop()

	rangeID := proto.RangeID(1)
	mtc.replicateRange(rangeID, 0, 1, 2)

	for i := 0; i < 10; i++ {
		mtc.unreplicateRange(rangeID, 0, 2)
		mtc.replicateRange(rangeID, 0, 2)
	}
}
// String prints out the current status of the cluster.
func (c *Cluster) String() string {
	storesRangeCounts := make(map[proto.StoreID]int)
	for _, r := range c.ranges {
		for _, storeID := range r.getStoreIDs() {
			storesRangeCounts[storeID]++
		}
	}

	var nodeIDs []int
	for nodeID := range c.nodes {
		nodeIDs = append(nodeIDs, int(nodeID))
	}
	sort.Ints(nodeIDs)

	var buf bytes.Buffer
	buf.WriteString("Node Info:\n")
	for _, nodeID := range nodeIDs {
		n := c.nodes[proto.NodeID(nodeID)]
		buf.WriteString(n.String())
		buf.WriteString("\n")
	}

	var storeIDs []int
	for storeID := range c.stores {
		storeIDs = append(storeIDs, int(storeID))
	}
	sort.Ints(storeIDs)

	buf.WriteString("Store Info:\n")
	for _, storeID := range storeIDs {
		s := c.stores[proto.StoreID(storeID)]
		buf.WriteString(s.String(storesRangeCounts[proto.StoreID(storeID)]))
		buf.WriteString("\n")
	}

	var rangeIDs []int
	for rangeID := range c.ranges {
		rangeIDs = append(rangeIDs, int(rangeID))
	}
	sort.Ints(rangeIDs)

	buf.WriteString("Range Info:\n")
	for _, rangeID := range rangeIDs {
		r := c.ranges[proto.RangeID(rangeID)]
		buf.WriteString(r.String())
		buf.WriteString("\n")
	}

	return buf.String()
}
// TestReplicateAfterSplit verifies that a new replica whose start key
// is not KeyMin, replicating to a fresh store, can apply snapshots correctly.
func TestReplicateAfterSplit(t *testing.T) {
	defer leaktest.AfterTest(t)
	mtc := startMultiTestContext(t, 2)
	defer mtc.Stop()

	rangeID := proto.RangeID(1)
	splitKey := proto.Key("m")
	key := proto.Key("z")

	store0 := mtc.stores[0]
	// Make the split.
	splitArgs := adminSplitArgs(proto.KeyMin, splitKey, rangeID, store0.StoreID())
	if _, err := store0.ExecuteCmd(context.Background(), &splitArgs); err != nil {
		t.Fatal(err)
	}

	rangeID2 := store0.LookupReplica(key, nil).Desc().RangeID
	if rangeID2 == rangeID {
		t.Errorf("got same range id after split")
	}
	// Issue an increment for later check.
	incArgs := incrementArgs(key, 11, rangeID2, store0.StoreID())
	if _, err := store0.ExecuteCmd(context.Background(), &incArgs); err != nil {
		t.Fatal(err)
	}
	// Now add the second replica.
	mtc.replicateRange(rangeID2, 0, 1)

	if mtc.stores[1].LookupReplica(key, nil).GetMaxBytes() == 0 {
		t.Error("Range MaxBytes is not set after snapshot applied")
	}
	// Once it catches up, the effects of increment commands can be seen.
	if err := util.IsTrueWithin(func() bool {
		getArgs := getArgs(key, rangeID2, mtc.stores[1].StoreID())
		// Reads on a non-leader replica must use an inconsistent read.
		getArgs.ReadConsistency = proto.INCONSISTENT
		reply, err := mtc.stores[1].ExecuteCmd(context.Background(), &getArgs)
		if err != nil {
			return false
		}
		getResp := reply.(*proto.GetResponse)
		if log.V(1) {
			log.Infof("read value %d", mustGetInt(getResp.Value))
		}
		return mustGetInt(getResp.Value) == 11
	}, 1*time.Second); err != nil {
		t.Fatal(err)
	}
}
func TestInitialLeaderElection(t *testing.T) {
	defer leaktest.AfterTest(t)
	// Run the test three times, each time triggering a different node's
	// election clock. The node that requests an election first should win.
	for leaderIndex := 0; leaderIndex < 3; leaderIndex++ {
		log.Infof("testing leader election for node %v", leaderIndex)
		stopper := stop.NewStopper()
		cluster := newTestCluster(nil, 3, stopper, t)
		groupID := proto.RangeID(1)
		cluster.createGroup(groupID, 0, 3)

		cluster.elect(leaderIndex, groupID)
		stopper.Stop()
	}
}
// TestProgressWithDownNode verifies that a surviving quorum can make progress
// with a downed node.
func TestProgressWithDownNode(t *testing.T) {
	defer leaktest.AfterTest(t)
	mtc := startMultiTestContext(t, 3)
	defer mtc.Stop()

	rangeID := proto.RangeID(1)
	mtc.replicateRange(rangeID, 0, 1, 2)

	incArgs := incrementArgs([]byte("a"), 5, rangeID, mtc.stores[0].StoreID())
	if _, err := mtc.stores[0].ExecuteCmd(context.Background(), &incArgs); err != nil {
		t.Fatal(err)
	}

	// Verify that the first increment propagates to all the engines.
	verify := func(expected []int64) {
		util.SucceedsWithin(t, time.Second, func() error {
			values := []int64{}
			for _, eng := range mtc.engines {
				val, _, err := engine.MVCCGet(eng, proto.Key("a"), mtc.clock.Now(), true, nil)
				if err != nil {
					return err
				}
				values = append(values, mustGetInt(val))
			}
			if !reflect.DeepEqual(expected, values) {
				return util.Errorf("expected %v, got %v", expected, values)
			}
			return nil
		})
	}
	verify([]int64{5, 5, 5})

	// Stop one of the replicas and issue a new increment.
	mtc.stopStore(1)
	incArgs = incrementArgs([]byte("a"), 11, rangeID, mtc.stores[0].StoreID())
	if _, err := mtc.stores[0].ExecuteCmd(context.Background(), &incArgs); err != nil {
		t.Fatal(err)
	}

	// The new increment can be seen on both live replicas.
	verify([]int64{16, 5, 16})

	// Once the downed node is restarted, it will catch up.
	mtc.restartStore(1)
	verify([]int64{16, 16, 16})
}
// TestStoreRaftIDAllocation verifies that raft IDs are
// allocated in successive blocks.
func TestStoreRaftIDAllocation(t *testing.T) {
	defer leaktest.AfterTest(t)
	store, _, stopper := createTestStore(t)
	defer stopper.Stop()

	// Raft IDs should be allocated from ID 2 (first alloc'd range)
	// to raftIDAllocCount * 3 + 1.
	for i := 0; i < raftIDAllocCount*3; i++ {
		replicas := []proto.Replica{{StoreID: store.StoreID()}}
		desc, err := store.NewRangeDescriptor(proto.Key(fmt.Sprintf("%03d", i)), proto.Key(fmt.Sprintf("%03d", i+1)), replicas)
		if err != nil {
			t.Fatal(err)
		}
		if desc.RangeID != proto.RangeID(2+i) {
			t.Errorf("expected Raft id %d; got %d", 2+i, desc.RangeID)
		}
	}
}
func TestCommand(t *testing.T) {
	defer leaktest.AfterTest(t)
	stopper := stop.NewStopper()
	cluster := newTestCluster(nil, 3, stopper, t)
	defer stopper.Stop()
	groupID := proto.RangeID(1)
	cluster.createGroup(groupID, 0, 3)
	cluster.triggerElection(0, groupID)

	// Submit a command to the leader.
	cluster.nodes[0].SubmitCommand(groupID, makeCommandID(), []byte("command"))

	// The command will be committed on each node.
	for i, events := range cluster.events {
		log.Infof("waiting for event to be committed on node %v", i)
		commit := <-events.CommandCommitted
		if string(commit.Command) != "command" {
			t.Errorf("unexpected value in committed command: %v", commit.Command)
		}
	}
}
// TestRangeGCQueueDropReplica verifies that a removed replica is
// immediately cleaned up.
func TestRangeGCQueueDropReplica(t *testing.T) {
	defer leaktest.AfterTest(t)

	mtc := startMultiTestContext(t, 3)
	defer mtc.Stop()
	rangeID := proto.RangeID(1)
	mtc.replicateRange(rangeID, 0, 1, 2)
	mtc.unreplicateRange(rangeID, 0, 1)

	// Make sure the range is removed from the store.
	util.SucceedsWithin(t, time.Second, func() error {
		if _, err := mtc.stores[1].GetReplica(rangeID); !testutils.IsError(err, "range .* was not found") {
			return util.Errorf("expected range removal")
		}
		return nil
	})

	// Restart the store to tear down the test cleanly.
	mtc.stopStore(1)
	mtc.restartStore(1)
}
// TestRangeGCQueueDropReplicaGCOnScan verifies that the range GC queue
// removes a range from a store that no longer should have a replica.
func TestRangeGCQueueDropReplicaGCOnScan(t *testing.T) {
	defer leaktest.AfterTest(t)

	mtc := startMultiTestContext(t, 3)
	defer mtc.Stop()
	// Disable the range GC queue to prevent direct removal of the range.
	mtc.stores[1].DisableRangeGCQueue(true)

	rangeID := proto.RangeID(1)
	mtc.replicateRange(rangeID, 0, 1, 2)
	mtc.unreplicateRange(rangeID, 0, 1)

	// Wait long enough for the direct range GC to have had a chance and been
	// discarded because the queue is disabled.
	time.Sleep(10 * time.Millisecond)
	if _, err := mtc.stores[1].GetReplica(rangeID); err != nil {
		t.Error("unexpected range removal")
	}

	// Enable the queue.
	mtc.stores[1].DisableRangeGCQueue(false)

	// Increment the clock's timestamp to make the range GC queue process the range.
	mtc.manualClock.Increment(int64(storage.RangeGCQueueInactivityThreshold+storage.DefaultLeaderLeaseDuration) + 1)

	// Make sure the range is removed from the store.
	util.SucceedsWithin(t, time.Second, func() error {
		store := mtc.stores[1]
		store.ForceRangeGCScan(t)
		if _, err := store.GetReplica(rangeID); !testutils.IsError(err, "range .* was not found") {
			return util.Errorf("expected range removal: %s", err)
		}
		return nil
	})

	// Restart the store to tear down the test cleanly.
	mtc.stopStore(1)
	mtc.restartStore(1)
}
func (m *LogEntry) Unmarshal(data []byte) error {
	l := len(data)
	iNdEx := 0
	for iNdEx < l {
		var wire uint64
		for shift := uint(0); ; shift += 7 {
			if iNdEx >= l {
				return io.ErrUnexpectedEOF
			}
			b := data[iNdEx]
			iNdEx++
			wire |= (uint64(b) & 0x7F) << shift
			if b < 0x80 {
				break
			}
		}
		fieldNum := int32(wire >> 3)
		wireType := int(wire & 0x7)
		switch fieldNum {
		case 1:
			if wireType != 0 {
				return fmt.Errorf("proto: wrong wireType = %d for field Severity", wireType)
			}
			for shift := uint(0); ; shift += 7 {
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := data[iNdEx]
				iNdEx++
				m.Severity |= (int32(b) & 0x7F) << shift
				if b < 0x80 {
					break
				}
			}
		case 2:
			if wireType != 0 {
				return fmt.Errorf("proto: wrong wireType = %d for field Time", wireType)
			}
			for shift := uint(0); ; shift += 7 {
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := data[iNdEx]
				iNdEx++
				m.Time |= (int64(b) & 0x7F) << shift
				if b < 0x80 {
					break
				}
			}
		case 3:
			if wireType != 0 {
				return fmt.Errorf("proto: wrong wireType = %d for field ThreadID", wireType)
			}
			for shift := uint(0); ; shift += 7 {
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := data[iNdEx]
				iNdEx++
				m.ThreadID |= (int32(b) & 0x7F) << shift
				if b < 0x80 {
					break
				}
			}
		case 4:
			if wireType != 2 {
				return fmt.Errorf("proto: wrong wireType = %d for field File", wireType)
			}
			var stringLen uint64
			for shift := uint(0); ; shift += 7 {
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := data[iNdEx]
				iNdEx++
				stringLen |= (uint64(b) & 0x7F) << shift
				if b < 0x80 {
					break
				}
			}
			postIndex := iNdEx + int(stringLen)
			if postIndex > l {
				return io.ErrUnexpectedEOF
			}
			m.File = string(data[iNdEx:postIndex])
			iNdEx = postIndex
		case 5:
			if wireType != 0 {
				return fmt.Errorf("proto: wrong wireType = %d for field Line", wireType)
			}
			for shift := uint(0); ; shift += 7 {
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := data[iNdEx]
				iNdEx++
				m.Line |= (int32(b) & 0x7F) << shift
				if b < 0x80 {
					break
				}
			}
		case 6:
			if wireType != 2 {
				return fmt.Errorf("proto: wrong wireType = %d for field Format", wireType)
			}
			var stringLen uint64
			for shift := uint(0); ; shift += 7 {
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := data[iNdEx]
				iNdEx++
				stringLen |= (uint64(b) & 0x7F) << shift
				if b < 0x80 {
					break
				}
			}
			postIndex := iNdEx + int(stringLen)
			if postIndex > l {
				return io.ErrUnexpectedEOF
			}
			m.Format = string(data[iNdEx:postIndex])
			iNdEx = postIndex
		case 7:
			if wireType != 2 {
				return fmt.Errorf("proto: wrong wireType = %d for field Args", wireType)
			}
			var msglen int
			for shift := uint(0); ; shift += 7 {
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := data[iNdEx]
				iNdEx++
				msglen |= (int(b) & 0x7F) << shift
				if b < 0x80 {
					break
				}
			}
			postIndex := iNdEx + msglen
			if postIndex > l {
				return io.ErrUnexpectedEOF
			}
			m.Args = append(m.Args, LogEntry_Arg{})
			if err := m.Args[len(m.Args)-1].Unmarshal(data[iNdEx:postIndex]); err != nil {
				return err
			}
			iNdEx = postIndex
		case 8:
			if wireType != 0 {
				return fmt.Errorf("proto: wrong wireType = %d for field NodeID", wireType)
			}
			var v github_com_cockroachdb_cockroach_proto.NodeID
			for shift := uint(0); ; shift += 7 {
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := data[iNdEx]
				iNdEx++
				v |= (github_com_cockroachdb_cockroach_proto.NodeID(b) & 0x7F) << shift
				if b < 0x80 {
					break
				}
			}
			m.NodeID = &v
		case 9:
			if wireType != 0 {
				return fmt.Errorf("proto: wrong wireType = %d for field StoreID", wireType)
			}
			var v github_com_cockroachdb_cockroach_proto.StoreID
			for shift := uint(0); ; shift += 7 {
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := data[iNdEx]
				iNdEx++
				v |= (github_com_cockroachdb_cockroach_proto.StoreID(b) & 0x7F) << shift
				if b < 0x80 {
					break
				}
			}
			m.StoreID = &v
		case 10:
			if wireType != 0 {
				return fmt.Errorf("proto: wrong wireType = %d for field RangeID", wireType)
			}
			var v github_com_cockroachdb_cockroach_proto.RangeID
			for shift := uint(0); ; shift += 7 {
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := data[iNdEx]
				iNdEx++
				v |= (github_com_cockroachdb_cockroach_proto.RangeID(b) & 0x7F) << shift
				if b < 0x80 {
					break
				}
			}
			m.RangeID = &v
		case 11:
			if wireType != 0 {
				return fmt.Errorf("proto: wrong wireType = %d for field Method", wireType)
			}
			var v github_com_cockroachdb_cockroach_proto.Method
			for shift := uint(0); ; shift += 7 {
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := data[iNdEx]
				iNdEx++
				v |= (github_com_cockroachdb_cockroach_proto.Method(b) & 0x7F) << shift
				if b < 0x80 {
					break
				}
			}
			m.Method = &v
		case 12:
			if wireType != 2 {
				return fmt.Errorf("proto: wrong wireType = %d for field Key", wireType)
			}
			var byteLen int
			for shift := uint(0); ; shift += 7 {
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := data[iNdEx]
				iNdEx++
				byteLen |= (int(b) & 0x7F) << shift
				if b < 0x80 {
					break
				}
			}
			postIndex := iNdEx + byteLen
			if postIndex > l {
				return io.ErrUnexpectedEOF
			}
			m.Key = append([]byte{}, data[iNdEx:postIndex]...)
			iNdEx = postIndex
		case 13:
			if wireType != 2 {
				return fmt.Errorf("proto: wrong wireType = %d for field Stacks", wireType)
			}
			var byteLen int
			for shift := uint(0); ; shift += 7 {
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := data[iNdEx]
				iNdEx++
				byteLen |= (int(b) & 0x7F) << shift
				if b < 0x80 {
					break
				}
			}
			postIndex := iNdEx + byteLen
			if postIndex > l {
				return io.ErrUnexpectedEOF
			}
			m.Stacks = append([]byte{}, data[iNdEx:postIndex]...)
			iNdEx = postIndex
		default:
			var sizeOfWire int
			for {
				sizeOfWire++
				wire >>= 7
				if wire == 0 {
					break
				}
			}
			iNdEx -= sizeOfWire
			skippy, err := skipLog(data[iNdEx:])
			if err != nil {
				return err
			}
			if (iNdEx + skippy) > l {
				return io.ErrUnexpectedEOF
			}
			m.XXX_unrecognized = append(m.XXX_unrecognized, data[iNdEx:iNdEx+skippy]...)
			iNdEx += skippy
		}
	}
	return nil
}
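// Every field decoder in the generated Unmarshal above repeats the same
// base-128 varint loop, and the outer loop splits each decoded tag into a
// field number and a wire type. Below is a self-contained sketch of those two
// steps, independent of the generated code and shown only for illustration.
package main

import "fmt"

// decodeUvarint reads a protobuf base-128 varint: each byte carries 7 value
// bits, and a set high bit means another byte follows. It returns the value
// and the number of bytes consumed (0 if the input is truncated).
func decodeUvarint(data []byte) (uint64, int) {
	var v uint64
	for i, shift := 0, uint(0); i < len(data); i, shift = i+1, shift+7 {
		b := data[i]
		v |= (uint64(b) & 0x7F) << shift
		if b < 0x80 { // high bit clear: last byte of this varint
			return v, i + 1
		}
	}
	return 0, 0 // truncated input
}

func main() {
	// 0xAC 0x02 encodes 300: (0xAC & 0x7F) | (0x02 << 7) = 44 + 256.
	v, n := decodeUvarint([]byte{0xAC, 0x02})
	fmt.Println(v, n) // 300 2

	// A tag varint packs the field number and wire type, exactly as the
	// generated code computes: fieldNum = wire >> 3, wireType = wire & 0x7.
	wire, _ := decodeUvarint([]byte{0x22}) // field 4, wire type 2 (File)
	fmt.Println(wire>>3, wire&0x7)         // 4 2
}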
func (c *Cluster) splitRangeLast() {
	rangeID := proto.RangeID(len(c.ranges) - 1)
	c.splitRange(rangeID)
}
// addRange adds a new range to the cluster but does not attach it to any
// store.
func (c *Cluster) addRange() *Range {
	rangeID := proto.RangeID(len(c.ranges))
	newRng := newRange(rangeID)
	c.ranges[rangeID] = newRng
	return newRng
}
// TestRemoveLeader ensures that a group will recover if a node is
// removed from the group while it is leader. Since visibility into
// the raft state is limited, we create a three-node group in a
// six-node cluster. This group is migrated one node at a time from
// the first three nodes to the last three. In the process the initial
// leader must have removed itself.
func TestRemoveLeader(t *testing.T) {
	defer leaktest.AfterTest(t)
	stopper := stop.NewStopper()
	const clusterSize = 6
	const groupSize = 3
	cluster := newTestCluster(nil, clusterSize, stopper, t)
	defer stopper.Stop()

	// Consume and apply the membership change events.
	for i := 0; i < clusterSize; i++ {
		go func(i int) {
			for {
				if e, ok := <-cluster.events[i].MembershipChangeCommitted; ok {
					e.Callback(nil)
				} else {
					return
				}
			}
		}(i)
	}

	// Tick all the clocks in the background to ensure that all the
	// necessary elections are triggered.
	// TODO(bdarnell): newTestCluster should have an option to use a
	// real clock instead of a manual one.
	stopper.RunWorker(func() {
		ticker := time.NewTicker(10 * time.Millisecond)
		defer ticker.Stop()
		for {
			select {
			case <-stopper.ShouldStop():
				return
			case <-ticker.C:
				for _, t := range cluster.tickers {
					t.NonBlockingTick()
				}
			}
		}
	})

	// Create a group with three members.
	groupID := proto.RangeID(1)
	cluster.createGroup(groupID, 0, groupSize)

	// Move the group one node at a time from the first three nodes to
	// the last three. In the process, we necessarily remove the leader
	// and trigger at least one new election among the new nodes.
	for i := 0; i < groupSize; i++ {
		log.Infof("adding node %d", i+groupSize)
		ch := cluster.nodes[i].ChangeGroupMembership(groupID, makeCommandID(),
			raftpb.ConfChangeAddNode,
			cluster.nodes[i+groupSize].nodeID, nil)
		if err := <-ch; err != nil {
			t.Fatal(err)
		}

		log.Infof("removing node %d", i)
		ch = cluster.nodes[i].ChangeGroupMembership(groupID, makeCommandID(),
			raftpb.ConfChangeRemoveNode,
			cluster.nodes[i].nodeID, nil)
		if err := <-ch; err != nil {
			t.Fatal(err)
		}
	}
}
func testContext() context.Context {
	ctx := context.Background()
	return Add(ctx,
		NodeID, proto.NodeID(1),
		StoreID, proto.StoreID(2),
		RangeID, proto.RangeID(3),
		Method, proto.Get,
		Key, proto.Key("key"))
}
import (
	"errors"
	"fmt"
	"time"

	"github.com/cockroachdb/cockroach/proto"
	"github.com/cockroachdb/cockroach/util"
	"github.com/cockroachdb/cockroach/util/log"
	"github.com/cockroachdb/cockroach/util/stop"
	"github.com/coreos/etcd/raft"
	"github.com/coreos/etcd/raft/raftpb"
	"golang.org/x/net/context"
)

const (
	noGroup = proto.RangeID(0)

	reqBufferSize = 100
)

// An ErrGroupDeleted is returned for commands which are pending while their
// group is deleted.
var ErrGroupDeleted = errors.New("raft group deleted")

// ErrStopped is returned for commands that could not be completed before the
// node was stopped.
var ErrStopped = errors.New("raft processing stopped")

// Config contains the parameters necessary to construct a MultiRaft object.
type Config struct {
	Storage Storage
func (c *Cluster) splitRangeRandom() {
	rangeID := proto.RangeID(c.rand.Int63n(int64(len(c.ranges))))
	c.splitRange(rangeID)
}
func TestRapidMembershipChange(t *testing.T) {
	defer leaktest.AfterTest(t)
	stopper := stop.NewStopper()
	defer stopper.Stop()

	var wg sync.WaitGroup
	proposers := 5

	numCommit := int32(200)

	cluster := newTestCluster(nil, 1, stopper, t)
	groupID := proto.RangeID(1)

	cluster.createGroup(groupID, 0, 1 /* replicas */)
	startSeq := int32(0) // updated atomically from now on

	cmdIDFormat := "%0" + fmt.Sprintf("%d", commandIDLen) + "d"
	teardown := make(chan struct{})

	proposerFn := func(i int) {
		defer wg.Done()

		var seq int32
		for {
			seq = atomic.AddInt32(&startSeq, 1)
			if seq > numCommit {
				break
			}
			cmdID := fmt.Sprintf(cmdIDFormat, seq)
		retry:
			for {
				if err := cluster.nodes[0].CreateGroup(groupID); err != nil {
					t.Fatal(err)
				}
				if log.V(1) {
					log.Infof("%-3d: try %s", i, cmdID)
				}

				select {
				case err := <-cluster.nodes[0].SubmitCommand(groupID,
					cmdID, []byte("command")):
					if err == nil {
						log.Infof("%-3d: ok %s", i, cmdID)
						break retry
					}
					log.Infof("%-3d: err %s %s", i, cmdID, err)
				case <-teardown:
					return
				}
			}
			if err := cluster.nodes[0].RemoveGroup(groupID); err != nil {
				t.Fatal(err)
			}
		}
	}

	for i := 0; i < proposers; i++ {
		wg.Add(1)
		go proposerFn(i)
	}

	for e := range cluster.events[0].CommandCommitted {
		if log.V(1) {
			log.Infof(" : recv %s", e.CommandID)
		}
		if fmt.Sprintf(cmdIDFormat, numCommit) == e.CommandID {
			log.Infof("received everything we asked for, ending test")
			break
		}
	}
	close(teardown)
	// Because ending the test case is racy with the test itself, we wait until
	// all our goroutines have finished their work before we allow the test to
	// forcibly terminate. This solves a race condition on `t`, which is
	// otherwise subject to concurrent access from our goroutine and the go
	// testing machinery.
	wg.Wait()
}
// handleWriteResponse updates the state machine and sends messages for a raft Ready batch.
func (s *state) handleWriteResponse(response *writeResponse, readyGroups map[uint64]raft.Ready) {
	if log.V(6) {
		log.Infof("node %v got write response: %#v", s.nodeID, *response)
	}
	// Everything has been written to disk; now we can apply updates to the
	// state machine and send outgoing messages.
	for groupID, ready := range readyGroups {
		raftGroupID := proto.RangeID(groupID)
		g, ok := s.groups[raftGroupID]
		if !ok {
			if log.V(4) {
				log.Infof("dropping stale write to group %v", groupID)
			}
			continue
		} else if !g.writing {
			if log.V(4) {
				log.Infof("dropping stale write to reincarnation of group %v", groupID)
			}
			delete(readyGroups, groupID) // they must not make it to Advance.
			continue
		}
		g.writing = false

		// Process committed entries.
		for _, entry := range ready.CommittedEntries {
			commandID := s.processCommittedEntry(raftGroupID, g, entry)
			// TODO(bdarnell): the command is now committed, but not applied until the
			// application consumes EventCommandCommitted. Is returning via the channel
			// at this point useful or do we need to wait for the command to be
			// applied too? This could be done with a Callback as in
			// EventMembershipChangeCommitted or perhaps we should move away from a
			// channel to a callback-based system.
			s.removePending(g, g.pending[commandID], nil /* err */)
		}

		if !raft.IsEmptySnap(ready.Snapshot) {
			// Sync the group/node mapping with the information contained in the snapshot.
			for _, nodeID := range ready.Snapshot.Metadata.ConfState.Nodes {
				// TODO(bdarnell): if we had any information that predated this snapshot
				// we must remove those nodes.
				if err := s.addNode(proto.RaftNodeID(nodeID), g); err != nil {
					log.Errorf("node %v: error adding node %v", s.nodeID, nodeID)
				}
			}
		}

		// Process SoftState and leader changes.
		s.maybeSendLeaderEvent(raftGroupID, g, &ready)

		// Send all messages. Individual heartbeats and heartbeat responses are
		// dropped here; they are handled by the coalesced heartbeat mechanism.
		for _, msg := range ready.Messages {
			switch msg.Type {
			case raftpb.MsgHeartbeat:
				if log.V(8) {
					log.Infof("node %v dropped individual heartbeat to node %v",
						s.nodeID, msg.To)
				}
			case raftpb.MsgHeartbeatResp:
				if log.V(8) {
					log.Infof("node %v dropped individual heartbeat response to node %v",
						s.nodeID, msg.To)
				}
			default:
				s.sendMessage(g, msg)
			}
		}
	}
}