예제 #1
// TestHeartbeatResponseFanout checks two raft groups that are replicated on
// the same set of nodes but have different terms. A heartbeat response from
// one group must not disturb the other group's term or leadership.
func TestHeartbeatResponseFanout(t *testing.T) {
	defer leaktest.AfterTest(t)
	stopper := stop.NewStopper()
	defer stopper.Stop()

	// Both groups are created with three replicas on the same cluster.
	cluster := newTestCluster(nil, 3, stopper, t)
	groupID1 := proto.RangeID(1)
	cluster.createGroup(groupID1, 0, 3 /* replicas */)

	groupID2 := proto.RangeID(2)
	cluster.createGroup(groupID2, 0, 3 /* replicas */)

	leaderIndex := 0

	cluster.elect(leaderIndex, groupID1)
	// groupID2 goes through three rounds of election (node 2, then 1, then 0),
	// so it ends up with a different term than groupID1 while both groups have
	// their leader on the same node.
	for i := 2; i >= 0; i-- {
		leaderIndex = i
		cluster.elect(leaderIndex, groupID2)
	// Send a coalesced heartbeat.
	// The heartbeat response from groupID2 will carry a larger term than the
	// one from groupID1.
	// Submit a command to check whether groupID1's leader has changed.
	cluster.nodes[0].SubmitCommand(groupID1, makeCommandID(), []byte("command"))

	// The command must be committed on node 0 within 500ms; otherwise the
	// test fails.
	select {
	case _ = <-cluster.events[0].CommandCommitted:
		log.Infof("SubmitCommand succeed after Heartbeat Response fanout")
	case <-time.After(500 * time.Millisecond):
		t.Fatalf("No leader after Heartbeat Response fanout")
예제 #2
// sendAttempt gathers and rearranges the replicas of the range described by
// desc and makes an RPC call for the batch ba.
//
// If sendRPC fails, a nil response and that error are returned. Otherwise
// the error carried inside the batch response is extracted, cleared from the
// response, and returned alongside it.
func (ds *DistSender) sendAttempt(trace *tracer.Trace, ba proto.BatchRequest, desc *proto.RangeDescriptor) (*proto.BatchResponse, error) {
	defer trace.Epoch("sending RPC")()

	// Look up the cached leader (if any) for this range.
	leader := ds.leaderCache.Lookup(proto.RangeID(desc.RangeID))

	// Try to send the call.
	replicas := newReplicaSlice(ds.gossip, desc)

	// Rearrange the replicas so that those replicas with long common
	// prefix of attributes end up first. If there's no prefix, this is a
	// no-op.
	order := ds.optimizeReplicaOrder(replicas)

	// If this request needs to go to a leader and we know who that is, move
	// it to the front. Only inconsistent read-only batches are exempt.
	if !(proto.IsReadOnly(&ba) && ba.ReadConsistency == proto.INCONSISTENT) &&
		leader.StoreID > 0 {
		if i := replicas.FindReplica(leader.StoreID); i >= 0 {
			// The leader was found among the replicas; switch to the stable
			// ordering (presumably so the chosen order is kept — confirm
			// against the rpc package).
			order = rpc.OrderStable

	// TODO(tschottdorf) &ba -> ba
	resp, err := ds.sendRPC(trace, desc.RangeID, replicas, order, &ba)
	if err != nil {
		return nil, err
	// Untangle the error from the received response: it is returned
	// separately and removed from the response itself.
	br := resp.(*proto.BatchResponse)
	err = br.GoError()
	br.Error = nil
	return br, err
예제 #3
// newTestRangeSet creates a new range set that has the count number of ranges.
//
// Range i gets RangeID i and covers the keys from the zero-padded
// three-digit string for i up to the one for i+1. Each replica is inserted
// into the set's btree; the test fails via t if an insert replaces an
// existing item.
func newTestRangeSet(count int, t *testing.T) *testRangeSet {
	rs := &testRangeSet{rangesByKey: btree.New(64 /* degree */)}
	for i := 0; i < count; i++ {
		desc := &proto.RangeDescriptor{
			RangeID:  proto.RangeID(i),
			StartKey: proto.Key(fmt.Sprintf("%03d", i)),
			EndKey:   proto.Key(fmt.Sprintf("%03d", i+1)),
		// Initialize the range stat so the scanner can use it.
		rng := &Replica{
			stats: &rangeStats{
				raftID: desc.RangeID,
				MVCCStats: engine.MVCCStats{
					KeyBytes:  1,
					ValBytes:  2,
					KeyCount:  1,
					LiveCount: 1,
		if err := rng.setDesc(desc); err != nil {
		// ReplaceOrInsert returning a non-nil item means a range with the
		// same key was already present, which is treated as a failure.
		if exRngItem := rs.rangesByKey.ReplaceOrInsert(rng); exRngItem != nil {
			t.Fatalf("failed to insert range %s", rng)
	return rs
예제 #4
// handleWriteReady converts a set of raft.Ready structs into a writeRequest
// to be persisted, marks the group as writing and sends it to the writeTask.
//
// readyGroups maps a group ID to the raft.Ready produced for that group.
// Groups that are not present in s.groups are reported (at verbosity 6) as
// dropped.
func (s *state) handleWriteReady(readyGroups map[uint64]raft.Ready) {
	if log.V(6) {
		log.Infof("node %v write ready, preparing request", s.nodeID)
	writeRequest := newWriteRequest()
	for groupID, ready := range readyGroups {
		raftGroupID := proto.RangeID(groupID)
		g, ok := s.groups[raftGroupID]
		if !ok {
			if log.V(6) {
				log.Infof("dropping write request to group %d", groupID)
		// Flag the group as having a write in flight.
		g.writing = true

		// Copy only the non-empty parts of the Ready into the per-group
		// request.
		gwr := &groupWriteRequest{}
		if !raft.IsEmptyHardState(ready.HardState) {
			gwr.state = ready.HardState
		if !raft.IsEmptySnap(ready.Snapshot) {
			gwr.snapshot = ready.Snapshot
		if len(ready.Entries) > 0 {
			gwr.entries = ready.Entries
		writeRequest.groups[raftGroupID] = gwr
	// Hand the assembled request to the write task through its input channel.
	s.writeTask.in <- writeRequest
예제 #5
// TestLeaderElectionEvent verifies that no LeaderElection event is emitted
// for a Ready that only names a new leader, and that the event is emitted
// once a Ready carries committed entries.
func TestLeaderElectionEvent(t *testing.T) {
	defer leaktest.AfterTest(t)
	// Leader election events are fired when the leader commits an entry, not when it
	// issues a call for votes.
	stopper := stop.NewStopper()
	cluster := newTestCluster(nil, 3, stopper, t)
	defer stopper.Stop()
	groupID := proto.RangeID(1)
	cluster.createGroup(groupID, 0, 3)

	// Process a Ready with a new leader but no new commits.
	// This happens while an election is in progress.
	// This may be dirty, but it seems this is the only way to make testrace pass.
	// The call is wrapped in a func sent over node 1's callbackChan rather
	// than being invoked directly from the test goroutine.
	cluster.nodes[1].callbackChan <- func() {
		cluster.nodes[1].maybeSendLeaderEvent(groupID, cluster.nodes[1].groups[groupID],
				SoftState: &raft.SoftState{
					Lead: 3,
	// Trigger multiraft another round select
	// No events are sent.
	select {
	case e := <-cluster.events[1].LeaderElection:
		t.Fatalf("got unexpected event %v", e)
	case <-time.After(200 * time.Millisecond):

	// Now there are new committed entries. A new leader always commits an entry
	// to conclude the election.
	entry := raftpb.Entry{
		Index: 42,
		Term:  42,
	// This may be dirty, but it seems this is the only way to make testrace pass.
	cluster.nodes[1].callbackChan <- func() {
		cluster.nodes[1].maybeSendLeaderEvent(groupID, cluster.nodes[1].groups[groupID],
				Entries:          []raftpb.Entry{entry},
				CommittedEntries: []raftpb.Entry{entry},

	// Now we get an event. It must name the leader from the earlier Ready
	// (node 3) and the term of the committed entry (42).
	select {
	case e := <-cluster.events[1].LeaderElection:
		if !reflect.DeepEqual(e, &EventLeaderElection{
			GroupID: groupID,
			NodeID:  3,
			Term:    42,
		}) {
			t.Errorf("election event did not match expectations: %+v", e)
	case <-time.After(200 * time.Millisecond):
		t.Fatal("didn't get expected event")
예제 #6
// sendAttempt is invoked by Send. It temporarily truncates the arguments to
// match the descriptor's EndKey (if necessary) and gathers and rearranges the
// replicas before making a single attempt at sending the request. It returns
// the result of sending the RPC; a potential error contained in the reply has
// to be handled separately by the caller.
func (ds *DistSender) sendAttempt(trace *tracer.Trace, args proto.Request, desc *proto.RangeDescriptor) (proto.Response, error) {
	defer trace.Epoch("sending RPC")()
	// Truncate the request to our current range, making sure not to
	// touch it unless we have to (it is illegal to send EndKey on
	// commands which do not operate on ranges).
	if endKey := args.Header().EndKey; endKey != nil && !endKey.Less(desc.EndKey) {
		// The deferred func restores the caller's original EndKey when
		// sendAttempt returns.
		defer func(k proto.Key) { args.Header().EndKey = k }(endKey)
		args.Header().EndKey = desc.EndKey
	// Look up the cached leader (if any) for this range.
	leader := ds.leaderCache.Lookup(proto.RangeID(desc.RangeID))

	// Try to send the call.
	replicas := newReplicaSlice(ds.gossip, desc)

	// Rearrange the replicas so that those replicas with long common
	// prefix of attributes end up first. If there's no prefix, this is a
	// no-op.
	order := ds.optimizeReplicaOrder(replicas)

	// If this request needs to go to a leader and we know who that is, move
	// it to the front. Only inconsistent reads are exempt.
	if !(proto.IsRead(args) && args.Header().ReadConsistency == proto.INCONSISTENT) &&
		leader.StoreID > 0 {
		if i := replicas.FindReplica(leader.StoreID); i >= 0 {
			// The leader was found among the replicas; switch to the stable
			// ordering (presumably so the chosen order is kept — confirm
			// against the rpc package).
			order = rpc.OrderStable

	return ds.sendRPC(trace, desc.RangeID, replicas, order, args)
예제 #7
// TestLocalSenderLookupReplica creates two stores, each holding one range,
// and checks that LocalSender.lookupReplica resolves key spans to the store
// holding the matching range and reports an error for a span ("b".."d")
// that is not contained in a single range.
func TestLocalSenderLookupReplica(t *testing.T) {
	defer leaktest.AfterTest(t)
	stopper := stop.NewStopper()
	defer stopper.Stop()
	ctx := storage.TestStoreContext
	manualClock := hlc.NewManualClock(0)
	ctx.Clock = hlc.NewClock(manualClock.UnixNano)
	ls := NewLocalSender()

	// Create two new stores with ranges we care about.
	var e [2]engine.Engine
	var s [2]*storage.Store
	ranges := []struct {
		storeID    proto.StoreID
		start, end proto.Key
		{2, proto.Key("a"), proto.Key("c")},
		{3, proto.Key("x"), proto.Key("z")},
	for i, rng := range ranges {
		e[i] = engine.NewInMem(proto.Attributes{}, 1<<20)
		ctx.Transport = multiraft.NewLocalRPCTransport(stopper)
		// Deferred inside the loop: each transport is closed when the test
		// function returns, not at the end of the iteration.
		defer ctx.Transport.Close()
		s[i] = storage.NewStore(ctx, e[i], &proto.NodeDescriptor{NodeID: 1})
		s[i].Ident.StoreID = rng.storeID

		desc := &proto.RangeDescriptor{
			RangeID:  proto.RangeID(i),
			StartKey: rng.start,
			EndKey:   rng.end,
			Replicas: []proto.Replica{{StoreID: rng.storeID}},
		newRng, err := storage.NewReplica(desc, s[i])
		if err != nil {
		if err := s[i].AddRangeTest(newRng); err != nil {

	// NOTE(review): in the checks below r.StoreID is read before err is
	// examined; if lookupReplica returns a nil replica together with an
	// error this would dereference nil — confirm lookupReplica's contract.
	if _, r, err := ls.lookupReplica(proto.Key("a"), proto.Key("c")); r.StoreID != s[0].Ident.StoreID || err != nil {
		t.Errorf("expected store %d; got %d: %v", s[0].Ident.StoreID, r.StoreID, err)
	if _, r, err := ls.lookupReplica(proto.Key("b"), nil); r.StoreID != s[0].Ident.StoreID || err != nil {
		t.Errorf("expected store %d; got %d: %v", s[0].Ident.StoreID, r.StoreID, err)
	// The span "b".."d" extends past the end of the first range, so a nil
	// replica and an error are expected.
	// NOTE(review): when r is nil and err is nil the failure message below
	// dereferences r.
	if _, r, err := ls.lookupReplica(proto.Key("b"), proto.Key("d")); r != nil || err == nil {
		t.Errorf("expected store 0 and error got %d", r.StoreID)
	// NOTE(review): unlike the other positive checks, this one does not
	// test err.
	if _, r, err := ls.lookupReplica(proto.Key("x"), proto.Key("z")); r.StoreID != s[1].Ident.StoreID {
		t.Errorf("expected store %d; got %d: %v", s[1].Ident.StoreID, r.StoreID, err)
	if _, r, err := ls.lookupReplica(proto.Key("y"), nil); r.StoreID != s[1].Ident.StoreID || err != nil {
		t.Errorf("expected store %d; got %d: %v", s[1].Ident.StoreID, r.StoreID, err)
예제 #8
// DecodeRaftStateKey extracts the Range ID from a RaftStateKey.
//
// It panics if key does not start with LocalRangeIDPrefix. The Range ID is
// read as a uvarint from the bytes immediately following that prefix; any
// bytes after the Range ID are ignored.
func DecodeRaftStateKey(key proto.Key) proto.RangeID {
	if !bytes.HasPrefix(key, LocalRangeIDPrefix) {
		panic(fmt.Sprintf("key %q does not have %q prefix", key, LocalRangeIDPrefix))
	// Cut the prefix and the Range ID.
	b := key[len(LocalRangeIDPrefix):]
	// The first return value of DecodeUvarint is discarded; only the decoded
	// number is needed.
	_, rangeID := encoding.DecodeUvarint(b)
	return proto.RangeID(rangeID)
예제 #9
// TestSlowStorage verifies that a three-node group keeps committing commands
// on two nodes while the third node's storage is blocked, and that the third
// node catches up once it is unblocked.
func TestSlowStorage(t *testing.T) {
	defer leaktest.AfterTest(t)
	stopper := stop.NewStopper()
	cluster := newTestCluster(nil, 3, stopper, t)
	defer stopper.Stop()
	groupID := proto.RangeID(1)
	cluster.createGroup(groupID, 0, 3)
	cluster.triggerElection(0, groupID)

	// Block the storage on the last node.

	// Submit a command to the leader
	cluster.nodes[0].SubmitCommand(groupID, makeCommandID(), []byte("command"))

	// Even with the third node blocked, the other nodes can make progress.
	for i := 0; i < 2; i++ {
		events := cluster.events[i]
		log.Infof("waiting for event to be commited on node %v", i)
		commit := <-events.CommandCommitted
		if string(commit.Command) != "command" {
			t.Errorf("unexpected value in committed command: %v", commit.Command)

	// Ensure that node 2 is in fact blocked.
	select {
	case commit := <-cluster.events[2].CommandCommitted:
		t.Errorf("didn't expect commits on node 2 but got %v", commit)

	// After unblocking the third node, it will catch up.
	log.Infof("waiting for event to be commited on node 2")
	// When we unblock, the backlog is not guaranteed to be processed in order,
	// and in some cases the leader may need to retransmit some messages.
	// Hence up to three attempts are made to observe the commit on node 2.
	for i := 0; i < 3; i++ {
		select {
		case commit := <-cluster.events[2].CommandCommitted:
			if string(commit.Command) != "command" {
				t.Errorf("unexpected value in committed command: %v", commit.Command)

		case <-time.After(5 * time.Millisecond):
			// Tick both nodes' clocks. The ticks on the follower node don't
			// really do anything, but they do ensure that the goroutine is
			// getting scheduled (and the real-time delay allows rpc responses
			// to pass between the nodes)
예제 #10
// TestMembershipChange starts with a single-member group on node 0 of a
// four-node cluster, adds the other three nodes one at a time, and checks
// that every node observes a MembershipChangeCommitted event for each node
// that joined.
func TestMembershipChange(t *testing.T) {
	defer leaktest.AfterTest(t)
	stopper := stop.NewStopper()
	cluster := newTestCluster(nil, 4, stopper, t)
	defer stopper.Stop()

	// Create a group with a single member, cluster.nodes[0].
	groupID := proto.RangeID(1)
	cluster.createGroup(groupID, 0, 1)
	// An automatic election is triggered since this is a single-node Raft group,
	// so we don't need to call triggerElection.

	// Consume and apply the membership change events.
	// One goroutine per node reads that node's event channel until the
	// channel is closed (ok == false).
	for i := 0; i < 4; i++ {
		go func(i int) {
			for {
				e, ok := <-cluster.events[i].MembershipChangeCommitted
				if !ok {

	// Add each of the other three nodes to the cluster.
	for i := 1; i < 4; i++ {
		ch := cluster.nodes[0].ChangeGroupMembership(groupID, makeCommandID(),
			cluster.nodes[i].nodeID, nil)

	// TODO(bdarnell): verify that the channel events are sent out correctly.
		for i := 0; i < 10; i++ {
			log.Infof("tick %d", i)
			time.Sleep(5 * time.Millisecond)

		// Each node is notified of each other node's joining.
		// The events are expected in join order: nodes 1, 2, then 3.
		for i := 0; i < 4; i++ {
			for j := 1; j < 4; j++ {
				select {
				case e := <-cluster.events[i].MembershipChangeCommitted:
					if e.NodeID != cluster.nodes[j].nodeID {
						t.Errorf("node %d expected event for %d, got %d", i, j, e.NodeID)
					t.Errorf("node %d did not get expected event for %d", i, j)
예제 #11
// addRange adds a new range to the cluster but does not attach it to any
// store. The new range's ID is the number of ranges currently in c.ranges,
// and it is created with the cluster's allocator. The new range is
// returned.
func (c *Cluster) addRange() *Range {
	rangeID := proto.RangeID(len(c.ranges))
	newRng := newRange(rangeID, c.allocator)
	c.ranges[rangeID] = newRng

	// Save a sorted array of range IDs to avoid having to calculate them
	// multiple times. IDs are handed out in increasing order, so appending
	// keeps the slice sorted.
	c.rangeIDs = append(c.rangeIDs, rangeID)

	return newRng
예제 #12
// TestRaftAfterRemoveRange verifies that the MultiRaft state removes
// a remote node correctly after the Replica was removed from the Store.
//
// The test splits the first range, replicates range 2 to stores 1 and 2,
// removes both of those replicas again, and then sends a heartbeat from
// store 2 to store 1 before issuing one more replica change.
func TestRaftAfterRemoveRange(t *testing.T) {
	defer leaktest.AfterTest(t)
	mtc := startMultiTestContext(t, 3)
	defer mtc.Stop()

	// Make the split.
	splitArgs := adminSplitArgs(proto.KeyMin, []byte("b"), proto.RangeID(1), mtc.stores[0].StoreID())
	if _, err := mtc.stores[0].ExecuteCmd(context.Background(), &splitArgs); err != nil {

	rangeID := proto.RangeID(2)
	mtc.replicateRange(rangeID, 0, 1, 2)

	mtc.unreplicateRange(rangeID, 0, 2)
	mtc.unreplicateRange(rangeID, 0, 1)

	// Wait for the removal to be processed.
	// Success is store 1 reporting a RangeNotFoundError for the range; any
	// other error is propagated, and a nil error means the range is still
	// there.
	util.SucceedsWithin(t, time.Second, func() error {
		_, err := mtc.stores[1].GetReplica(rangeID)
		if _, ok := err.(*proto.RangeNotFoundError); ok {
			return nil
		} else if err != nil {
			return err
		return util.Errorf("range still exists")

	// Send a heartbeat, addressed to group 0, from store 2's raft node to
	// store 1's raft node.
	if err := mtc.transport.Send(&multiraft.RaftMessageRequest{
		GroupID: proto.RangeID(0),
		Message: raftpb.Message{
			From: uint64(mtc.stores[2].RaftNodeID()),
			To:   uint64(mtc.stores[1].RaftNodeID()),
			Type: raftpb.MsgHeartbeat,
		}}); err != nil {
	// Execute another replica change to ensure that MultiRaft has processed the heartbeat just sent.
	mtc.replicateRange(proto.RangeID(1), 0, 1)
예제 #13
// TestRaftRemoveRace adds and removes a replica repeatedly in an
// attempt to reproduce a race
// (https://github.com/cockroachdb/cockroach/issues/1911). Note that
// 10 repetitions is not enough to reliably reproduce the problem, but
// it's better than any other tests we have for this (increasing the
// number of repetitions adds an unacceptable amount of test runtime).
func TestRaftRemoveRace(t *testing.T) {
	defer leaktest.AfterTest(t)
	mtc := startMultiTestContext(t, 3)
	defer mtc.Stop()

	// Start with range 1 replicated from store 0 to stores 1 and 2.
	rangeID := proto.RangeID(1)
	mtc.replicateRange(rangeID, 0, 1, 2)

	// Repeatedly remove the replica on store 2 and add it back.
	for i := 0; i < 10; i++ {
		mtc.unreplicateRange(rangeID, 0, 2)
		mtc.replicateRange(rangeID, 0, 2)
예제 #14
// String prints out the current status of the cluster.
//
// The returned text has three sections, in this order: "Node Info:",
// "Store Info:" and "Range Info:". For each section the IDs are first
// collected from the corresponding map into an int slice, which is then
// iterated to look the entries up again.
func (c *Cluster) String() string {
	// Count of ranges per store, built by walking the store IDs of every
	// range.
	storesRangeCounts := make(map[proto.StoreID]int)
	for _, r := range c.ranges {
		for _, storeID := range r.getStoreIDs() {

	// IDs are converted to int when collected (presumably so they can be
	// sorted for deterministic output — confirm against the full body).
	var nodeIDs []int
	for nodeID := range c.nodes {
		nodeIDs = append(nodeIDs, int(nodeID))

	var buf bytes.Buffer
	buf.WriteString("Node Info:\n")
	for _, nodeID := range nodeIDs {
		n := c.nodes[proto.NodeID(nodeID)]

	var storeIDs []int
	for storeID := range c.stores {
		storeIDs = append(storeIDs, int(storeID))

	buf.WriteString("Store Info:\n")
	for _, storeID := range storeIDs {
		s := c.stores[proto.StoreID(storeID)]

	var rangeIDs []int
	for rangeID := range c.ranges {
		rangeIDs = append(rangeIDs, int(rangeID))

	buf.WriteString("Range Info:\n")
	for _, rangeID := range rangeIDs {
		r := c.ranges[proto.RangeID(rangeID)]

	return buf.String()
예제 #15
// TestReplicateAfterSplit verifies that a new replica whose start key
// is not KeyMin replicating to a fresh store can apply snapshots correctly.
//
// The first range is split at "m", an increment is applied to key "z" in
// the right-hand range, and that range is then replicated to store 1, where
// the incremented value must become readable.
func TestReplicateAfterSplit(t *testing.T) {
	defer leaktest.AfterTest(t)
	mtc := startMultiTestContext(t, 2)
	defer mtc.Stop()

	rangeID := proto.RangeID(1)
	splitKey := proto.Key("m")
	key := proto.Key("z")

	store0 := mtc.stores[0]
	// Make the split
	splitArgs := adminSplitArgs(proto.KeyMin, splitKey, rangeID, store0.StoreID())
	if _, err := store0.ExecuteCmd(context.Background(), &splitArgs); err != nil {

	// After the split, key "z" must live in a range other than the original.
	rangeID2 := store0.LookupReplica(key, nil).Desc().RangeID
	if rangeID2 == rangeID {
		t.Errorf("got same range id after split")
	// Issue an increment for later check.
	incArgs := incrementArgs(key, 11, rangeID2, store0.StoreID())
	if _, err := store0.ExecuteCmd(context.Background(), &incArgs); err != nil {
	// Now add the second replica.
	mtc.replicateRange(rangeID2, 0, 1)

	if mtc.stores[1].LookupReplica(key, nil).GetMaxBytes() == 0 {
		t.Error("Range MaxBytes is not set after snapshot applied")
	// Once it catches up, the effects of increment commands can be seen.
	// The read is retried for up to one second; a failed read counts as
	// "not yet".
	if err := util.IsTrueWithin(func() bool {
		getArgs := getArgs(key, rangeID2, mtc.stores[1].StoreID())
		// Reading on non-leader replica should use inconsistent read
		getArgs.ReadConsistency = proto.INCONSISTENT
		reply, err := mtc.stores[1].ExecuteCmd(context.Background(), &getArgs)
		if err != nil {
			return false
		getResp := reply.(*proto.GetResponse)
		if log.V(1) {
			log.Infof("read value %d", mustGetInt(getResp.Value))
		return mustGetInt(getResp.Value) == 11
	}, 1*time.Second); err != nil {
예제 #16
// TestInitialLeaderElection checks that, in a freshly created three-node
// group, whichever node is asked to start an election becomes the leader.
func TestInitialLeaderElection(t *testing.T) {
	defer leaktest.AfterTest(t)
	// Run the test three times, each time triggering a different node's election clock.
	// The node that requests an election first should win.
	for leaderIndex := 0; leaderIndex < 3; leaderIndex++ {
		log.Infof("testing leader election for node %v", leaderIndex)
		// A fresh stopper and cluster are created for every iteration.
		stopper := stop.NewStopper()
		cluster := newTestCluster(nil, 3, stopper, t)
		groupID := proto.RangeID(1)
		cluster.createGroup(groupID, 0, 3)

		cluster.elect(leaderIndex, groupID)
예제 #17
// TestProgressWithDownNode verifies that a surviving quorum can make progress
// with a downed node.
func TestProgressWithDownNode(t *testing.T) {
	defer leaktest.AfterTest(t)
	mtc := startMultiTestContext(t, 3)
	defer mtc.Stop()

	rangeID := proto.RangeID(1)
	mtc.replicateRange(rangeID, 0, 1, 2)

	incArgs := incrementArgs([]byte("a"), 5, rangeID, mtc.stores[0].StoreID())
	if _, err := mtc.stores[0].ExecuteCmd(context.Background(), &incArgs); err != nil {

	// Verify that the first increment propagates to all the engines.
	// verify reads key "a" directly from every engine, in engine order, and
	// retries for up to one second until the values equal expected.
	verify := func(expected []int64) {
		util.SucceedsWithin(t, time.Second, func() error {
			values := []int64{}
			for _, eng := range mtc.engines {
				val, _, err := engine.MVCCGet(eng, proto.Key("a"), mtc.clock.Now(), true, nil)
				if err != nil {
					return err
				values = append(values, mustGetInt(val))
			if !reflect.DeepEqual(expected, values) {
				return util.Errorf("expected %v, got %v", expected, values)
			return nil
	verify([]int64{5, 5, 5})

	// Stop one of the replicas and issue a new increment.
	incArgs = incrementArgs([]byte("a"), 11, rangeID, mtc.stores[0].StoreID())
	if _, err := mtc.stores[0].ExecuteCmd(context.Background(), &incArgs); err != nil {

	// The new increment can be seen on both live replicas.
	// The second engine still holds the old value 5.
	verify([]int64{16, 5, 16})

	// Once the downed node is restarted, it will catch up.
	verify([]int64{16, 16, 16})
예제 #18
// TestStoreRaftIDAllocation verifies that raft IDs are
// allocated in successive blocks.
func TestStoreRaftIDAllocation(t *testing.T) {
	defer leaktest.AfterTest(t)
	store, _, stopper := createTestStore(t)
	defer stopper.Stop()

	// Raft IDs should be allocated from ID 2 (first alloc'd range)
	// to raftIDAllocCount * 3 + 1.
	// Iterating over three blocks' worth of IDs exercises the transition
	// from one allocation block to the next.
	for i := 0; i < raftIDAllocCount*3; i++ {
		replicas := []proto.Replica{{StoreID: store.StoreID()}}
		desc, err := store.NewRangeDescriptor(proto.Key(fmt.Sprintf("%03d", i)), proto.Key(fmt.Sprintf("%03d", i+1)), replicas)
		if err != nil {
		// IDs must be consecutive: the i-th descriptor gets ID 2+i.
		if desc.RangeID != proto.RangeID(2+i) {
			t.Errorf("expected Raft id %d; got %d", 2+i, desc.RangeID)
예제 #19
// TestCommand submits a single command to the leader of a three-node group
// and checks that it is committed, with an unchanged payload, on every node.
func TestCommand(t *testing.T) {
	defer leaktest.AfterTest(t)
	stopper := stop.NewStopper()
	cluster := newTestCluster(nil, 3, stopper, t)
	defer stopper.Stop()
	groupID := proto.RangeID(1)
	cluster.createGroup(groupID, 0, 3)
	cluster.triggerElection(0, groupID)

	// Submit a command to the leader
	cluster.nodes[0].SubmitCommand(groupID, makeCommandID(), []byte("command"))

	// The command will be committed on each node.
	// The receive below blocks until the node reports a commit.
	for i, events := range cluster.events {
		log.Infof("waiting for event to be committed on node %v", i)
		commit := <-events.CommandCommitted
		if string(commit.Command) != "command" {
			t.Errorf("unexpected value in committed command: %v", commit.Command)
예제 #20
// TestRangeGCQueueDropReplica verifies that a removed replica is
// immediately cleaned up.
func TestRangeGCQueueDropReplica(t *testing.T) {
	defer leaktest.AfterTest(t)

	mtc := startMultiTestContext(t, 3)
	defer mtc.Stop()

	// Replicate range 1 to stores 1 and 2, then remove the replica on
	// store 1 again.
	rangeID := proto.RangeID(1)
	mtc.replicateRange(rangeID, 0, 1, 2)
	mtc.unreplicateRange(rangeID, 0, 1)

	// Make sure the range is removed from the store.
	// Retries for up to one second until GetReplica on store 1 fails with a
	// "range ... was not found" error.
	util.SucceedsWithin(t, time.Second, func() error {
		if _, err := mtc.stores[1].GetReplica(rangeID); !testutils.IsError(err, "range .* was not found") {
			return util.Errorf("expected range removal")
		return nil

	// Restart the store to tear down the test cleanly.
예제 #21
// TestRangeGCQueueDropReplicaGCOnScan verifies that the range GC queue
// removes a range from a store that no longer should have a replica.
func TestRangeGCQueueDropReplicaGCOnScan(t *testing.T) {
	defer leaktest.AfterTest(t)

	mtc := startMultiTestContext(t, 3)
	defer mtc.Stop()
	// Disable the range gc queue to prevent direct removal of range.

	rangeID := proto.RangeID(1)
	mtc.replicateRange(rangeID, 0, 1, 2)
	mtc.unreplicateRange(rangeID, 0, 1)

	// Wait long enough for the direct range GC to have had a chance and been
	// discarded because the queue is disabled.
	time.Sleep(10 * time.Millisecond)
	// The replica must still be present on store 1 at this point.
	if _, err := mtc.stores[1].GetReplica(rangeID); err != nil {
		t.Error("unexpected range removal")

	// Enable the queue.

	// Increment the clock's timestamp to make the range GC queue process the range.
	// The clock is advanced by one more than the sum of the queue's
	// inactivity threshold and the default leader lease duration.
	mtc.manualClock.Increment(int64(storage.RangeGCQueueInactivityThreshold+storage.DefaultLeaderLeaseDuration) + 1)

	// Make sure the range is removed from the store.
	util.SucceedsWithin(t, time.Second, func() error {
		store := mtc.stores[1]
		if _, err := store.GetReplica(rangeID); !testutils.IsError(err, "range .* was not found") {
			return util.Errorf("expected range removal: %s", err)
		return nil

	// Restart the store to tear down the test cleanly.
예제 #22
// Unmarshal decodes the protobuf-encoded bytes in data into m.
//
// The input is processed as a sequence of fields. For each field a varint
// key is read (7 payload bits per byte; a byte below 0x80 ends the varint),
// from which the field number (key >> 3) and wire type (key & 0x7) are
// derived. Known fields 1-13 are decoded into the matching member of m;
// data that is not recognized is appended to m.XXX_unrecognized.
//
// It returns io.ErrUnexpectedEOF when data ends in the middle of a value or
// a declared length runs past the end of data, and a "wrong wireType" error
// when a known field arrives with an unexpected wire type.
func (m *LogEntry) Unmarshal(data []byte) error {
	l := len(data)
	iNdEx := 0
	for iNdEx < l {
		// Read the field key.
		var wire uint64
		for shift := uint(0); ; shift += 7 {
			if iNdEx >= l {
				return io.ErrUnexpectedEOF
			b := data[iNdEx]
			wire |= (uint64(b) & 0x7F) << shift
			if b < 0x80 {
		fieldNum := int32(wire >> 3)
		wireType := int(wire & 0x7)
		switch fieldNum {
		case 1:
			// Severity: varint, OR-ed into the existing value.
			if wireType != 0 {
				return fmt.Errorf("proto: wrong wireType = %d for field Severity", wireType)
			for shift := uint(0); ; shift += 7 {
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				b := data[iNdEx]
				m.Severity |= (int32(b) & 0x7F) << shift
				if b < 0x80 {
		case 2:
			// Time: varint.
			if wireType != 0 {
				return fmt.Errorf("proto: wrong wireType = %d for field Time", wireType)
			for shift := uint(0); ; shift += 7 {
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				b := data[iNdEx]
				m.Time |= (int64(b) & 0x7F) << shift
				if b < 0x80 {
		case 3:
			// ThreadID: varint.
			if wireType != 0 {
				return fmt.Errorf("proto: wrong wireType = %d for field ThreadID", wireType)
			for shift := uint(0); ; shift += 7 {
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				b := data[iNdEx]
				m.ThreadID |= (int32(b) & 0x7F) << shift
				if b < 0x80 {
		case 4:
			// File: length-prefixed string.
			if wireType != 2 {
				return fmt.Errorf("proto: wrong wireType = %d for field File", wireType)
			var stringLen uint64
			for shift := uint(0); ; shift += 7 {
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				b := data[iNdEx]
				stringLen |= (uint64(b) & 0x7F) << shift
				if b < 0x80 {
			postIndex := iNdEx + int(stringLen)
			if postIndex > l {
				return io.ErrUnexpectedEOF
			m.File = string(data[iNdEx:postIndex])
			iNdEx = postIndex
		case 5:
			// Line: varint.
			if wireType != 0 {
				return fmt.Errorf("proto: wrong wireType = %d for field Line", wireType)
			for shift := uint(0); ; shift += 7 {
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				b := data[iNdEx]
				m.Line |= (int32(b) & 0x7F) << shift
				if b < 0x80 {
		case 6:
			// Format: length-prefixed string.
			if wireType != 2 {
				return fmt.Errorf("proto: wrong wireType = %d for field Format", wireType)
			var stringLen uint64
			for shift := uint(0); ; shift += 7 {
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				b := data[iNdEx]
				stringLen |= (uint64(b) & 0x7F) << shift
				if b < 0x80 {
			postIndex := iNdEx + int(stringLen)
			if postIndex > l {
				return io.ErrUnexpectedEOF
			m.Format = string(data[iNdEx:postIndex])
			iNdEx = postIndex
		case 7:
			// Args: repeated length-prefixed nested message. A new
			// LogEntry_Arg is appended and then decoded in place from the
			// sub-slice.
			if wireType != 2 {
				return fmt.Errorf("proto: wrong wireType = %d for field Args", wireType)
			var msglen int
			for shift := uint(0); ; shift += 7 {
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				b := data[iNdEx]
				msglen |= (int(b) & 0x7F) << shift
				if b < 0x80 {
			postIndex := iNdEx + msglen
			if postIndex > l {
				return io.ErrUnexpectedEOF
			m.Args = append(m.Args, LogEntry_Arg{})
			if err := m.Args[len(m.Args)-1].Unmarshal(data[iNdEx:postIndex]); err != nil {
				return err
			iNdEx = postIndex
		case 8:
			// NodeID: varint decoded into a local and stored as a pointer.
			if wireType != 0 {
				return fmt.Errorf("proto: wrong wireType = %d for field NodeID", wireType)
			var v github_com_cockroachdb_cockroach_proto.NodeID
			for shift := uint(0); ; shift += 7 {
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				b := data[iNdEx]
				v |= (github_com_cockroachdb_cockroach_proto.NodeID(b) & 0x7F) << shift
				if b < 0x80 {
			m.NodeID = &v
		case 9:
			// StoreID: varint decoded into a local and stored as a pointer.
			if wireType != 0 {
				return fmt.Errorf("proto: wrong wireType = %d for field StoreID", wireType)
			var v github_com_cockroachdb_cockroach_proto.StoreID
			for shift := uint(0); ; shift += 7 {
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				b := data[iNdEx]
				v |= (github_com_cockroachdb_cockroach_proto.StoreID(b) & 0x7F) << shift
				if b < 0x80 {
			m.StoreID = &v
		case 10:
			// RangeID: varint decoded into a local and stored as a pointer.
			if wireType != 0 {
				return fmt.Errorf("proto: wrong wireType = %d for field RangeID", wireType)
			var v github_com_cockroachdb_cockroach_proto.RangeID
			for shift := uint(0); ; shift += 7 {
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				b := data[iNdEx]
				v |= (github_com_cockroachdb_cockroach_proto.RangeID(b) & 0x7F) << shift
				if b < 0x80 {
			m.RangeID = &v
		case 11:
			// Method: varint decoded into a local and stored as a pointer.
			if wireType != 0 {
				return fmt.Errorf("proto: wrong wireType = %d for field Method", wireType)
			var v github_com_cockroachdb_cockroach_proto.Method
			for shift := uint(0); ; shift += 7 {
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				b := data[iNdEx]
				v |= (github_com_cockroachdb_cockroach_proto.Method(b) & 0x7F) << shift
				if b < 0x80 {
			m.Method = &v
		case 12:
			// Key: length-prefixed bytes, copied into a fresh slice so m does
			// not alias data.
			if wireType != 2 {
				return fmt.Errorf("proto: wrong wireType = %d for field Key", wireType)
			var byteLen int
			for shift := uint(0); ; shift += 7 {
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				b := data[iNdEx]
				byteLen |= (int(b) & 0x7F) << shift
				if b < 0x80 {
			postIndex := iNdEx + byteLen
			if postIndex > l {
				return io.ErrUnexpectedEOF
			m.Key = append([]byte{}, data[iNdEx:postIndex]...)
			iNdEx = postIndex
		case 13:
			// Stacks: length-prefixed bytes, copied into a fresh slice.
			if wireType != 2 {
				return fmt.Errorf("proto: wrong wireType = %d for field Stacks", wireType)
			var byteLen int
			for shift := uint(0); ; shift += 7 {
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				b := data[iNdEx]
				byteLen |= (int(b) & 0x7F) << shift
				if b < 0x80 {
			postIndex := iNdEx + byteLen
			if postIndex > l {
				return io.ErrUnexpectedEOF
			m.Stacks = append([]byte{}, data[iNdEx:postIndex]...)
			iNdEx = postIndex
			// NOTE(review): the lines below look like the handling of unknown
			// field numbers (the switch's default branch) — confirm. The
			// index is moved back by sizeOfWire (presumably the encoded
			// length of the field key) so that skipLog sees the key again,
			// and the skipped bytes are preserved in XXX_unrecognized.
			var sizeOfWire int
			for {
				wire >>= 7
				if wire == 0 {
			iNdEx -= sizeOfWire
			skippy, err := skipLog(data[iNdEx:])
			if err != nil {
				return err
			if (iNdEx + skippy) > l {
				return io.ErrUnexpectedEOF
			m.XXX_unrecognized = append(m.XXX_unrecognized, data[iNdEx:iNdEx+skippy]...)
			iNdEx += skippy

	return nil
예제 #23
// splitRangeLast operates on the range with the highest ID, i.e.
// len(c.ranges)-1.
// NOTE(review): presumably it splits that range — confirm against the full
// body.
func (c *Cluster) splitRangeLast() {
	rangeID := proto.RangeID(len(c.ranges) - 1)
예제 #24
// addRange adds a new range to the cluster but does not attach it to any
// store. The new range's ID is the number of ranges currently in c.ranges.
// The new range is returned.
func (c *Cluster) addRange() *Range {
	rangeID := proto.RangeID(len(c.ranges))
	newRng := newRange(rangeID)
	c.ranges[rangeID] = newRng
	return newRng
예제 #25
// TestRemoveLeader ensures that a group will recover if a node is
// removed from the group while it is leader. Since visibility into
// the raft state is limited, we create a three-node group in a
// six-node cluster. This group is migrated one node at a time from
// the first three nodes to the last three. In the process the initial
// leader must have removed itself.
func TestRemoveLeader(t *testing.T) {
	defer leaktest.AfterTest(t)
	stopper := stop.NewStopper()
	const clusterSize = 6
	const groupSize = 3
	cluster := newTestCluster(nil, clusterSize, stopper, t)
	defer stopper.Stop()

	// Consume and apply the membership change events.
	// One goroutine per node drains that node's event channel; the else
	// branch is taken once the channel is closed.
	for i := 0; i < clusterSize; i++ {
		go func(i int) {
			for {
				if e, ok := <-cluster.events[i].MembershipChangeCommitted; ok {
				} else {

	// Tick all the clocks in the background to ensure that all the
	// necessary elections are triggered.
	// TODO(bdarnell): newTestCluster should have an option to use a
	// real clock instead of a manual one.
	stopper.RunWorker(func() {
		ticker := time.NewTicker(10 * time.Millisecond)
		defer ticker.Stop()
		for {
			select {
			case <-stopper.ShouldStop():
			case <-ticker.C:
				// NOTE(review): the loop variable t shadows the *testing.T
				// parameter inside this loop.
				for _, t := range cluster.tickers {

	// Create a group with three members.
	groupID := proto.RangeID(1)
	cluster.createGroup(groupID, 0, groupSize)

	// Move the group one node at a time from the first three nodes to
	// the last three. In the process, we necessarily remove the leader
	// and trigger at least one new election among the new nodes.
	for i := 0; i < groupSize; i++ {
		log.Infof("adding node %d", i+groupSize)
		ch := cluster.nodes[i].ChangeGroupMembership(groupID, makeCommandID(),
			cluster.nodes[i+groupSize].nodeID, nil)
		// Wait for the membership change to complete before continuing.
		if err := <-ch; err != nil {

		log.Infof("removing node %d", i)
		ch = cluster.nodes[i].ChangeGroupMembership(groupID, makeCommandID(),
			cluster.nodes[i].nodeID, nil)
		if err := <-ch; err != nil {
예제 #26
// testContext returns a background context to which a fixed set of
// key/value pairs has been attached via Add: NodeID 1, StoreID 2, RangeID 3,
// Method proto.Get and Key "key".
func testContext() context.Context {
	ctx := context.Background()
	return Add(ctx, NodeID, proto.NodeID(1), StoreID, proto.StoreID(2), RangeID, proto.RangeID(3), Method, proto.Get, Key, proto.Key("key"))
예제 #27
import (


const (
	// noGroup is the zero RangeID (presumably a sentinel meaning "no
	// group" — confirm against its uses).
	noGroup = proto.RangeID(0)

	// reqBufferSize is presumably the capacity of buffered request
	// channels — TODO confirm where it is used.
	reqBufferSize = 100

// ErrGroupDeleted is returned for commands which are pending while their
// group is deleted.
var ErrGroupDeleted = errors.New("raft group deleted")

// ErrStopped is returned for commands that could not be completed before the
// node was stopped.
var ErrStopped = errors.New("raft processing stopped")

// Config contains the parameters necessary to construct a MultiRaft object.
type Config struct {
	Storage   Storage
예제 #28
// splitRangeRandom operates on a range whose ID is drawn from c.rand in the
// interval [0, len(c.ranges)). Int63n is called with len(c.ranges), so
// c.ranges must not be empty.
// NOTE(review): presumably it splits that range — confirm against the full
// body.
func (c *Cluster) splitRangeRandom() {
	rangeID := proto.RangeID(c.rand.Int63n(int64(len(c.ranges))))
예제 #29
// TestRapidMembershipChange runs several concurrent proposers against a
// single-node cluster. Each proposer repeatedly claims the next sequence
// number, (re)creates the group, submits a command whose ID is derived from
// that number, and removes the group again. The test ends once the command
// with the highest sequence number (numCommit) has been committed.
func TestRapidMembershipChange(t *testing.T) {
	defer leaktest.AfterTest(t)
	stopper := stop.NewStopper()
	defer stopper.Stop()

	var wg sync.WaitGroup
	proposers := 5

	numCommit := int32(200)

	cluster := newTestCluster(nil, 1, stopper, t)
	groupID := proto.RangeID(1)

	cluster.createGroup(groupID, 0, 1 /* replicas */)
	startSeq := int32(0) // updated atomically from now on

	// Command IDs are the sequence number zero-padded to commandIDLen digits.
	cmdIDFormat := "%0" + fmt.Sprintf("%d", commandIDLen) + "d"
	teardown := make(chan struct{})

	proposerFn := func(i int) {
		defer wg.Done()

		var seq int32
		for {
			// Claim the next sequence number; numbers beyond numCommit are
			// not proposed.
			seq = atomic.AddInt32(&startSeq, 1)
			if seq > numCommit {
			cmdID := fmt.Sprintf(cmdIDFormat, seq)
			for {
				if err := cluster.nodes[0].CreateGroup(groupID); err != nil {
				if log.V(1) {
					log.Infof("%-3d: try    %s", i, cmdID)

				// Wait for either the result of the submission or the
				// teardown signal.
				select {
				case err := <-cluster.nodes[0].SubmitCommand(groupID,
					cmdID, []byte("command")):
					if err == nil {
						log.Infof("%-3d: ok   %s", i, cmdID)
						break retry
					log.Infof("%-3d: err  %s %s", i, cmdID, err)
				case <-teardown:
			if err := cluster.nodes[0].RemoveGroup(groupID); err != nil {

	for i := 0; i < proposers; i++ {
		go proposerFn(i)

	// Watch the committed commands until the one with ID numCommit shows up.
	for e := range cluster.events[0].CommandCommitted {
		if log.V(1) {
			log.Infof("   : recv %s", e.CommandID)
		if fmt.Sprintf(cmdIDFormat, numCommit) == e.CommandID {
			log.Infof("received everything we asked for, ending test")

	// Because ending the test case is racy with the test itself, we wait until
	// all our goroutines have finished their work before we allow the test to
	// forcibly terminate. This solves a race condition on `t`, which is
	// otherwise subject to concurrent access from our goroutine and the go
	// testing machinery.
예제 #30
// handleWriteResponse updates the state machine and sends messages for a raft Ready batch.
//
// response is the result of the preceding write; in the visible code it is
// only logged. readyGroups maps a group ID to the raft.Ready that was
// written for it. Entries for groups that are no longer marked as writing
// are deleted from readyGroups, so the map is modified for the caller.
func (s *state) handleWriteResponse(response *writeResponse, readyGroups map[uint64]raft.Ready) {
	if log.V(6) {
		log.Infof("node %v got write response: %#v", s.nodeID, *response)
	// Everything has been written to disk; now we can apply updates to the state machine
	// and send outgoing messages.
	for groupID, ready := range readyGroups {
		raftGroupID := proto.RangeID(groupID)
		g, ok := s.groups[raftGroupID]
		if !ok {
			// The group no longer exists.
			if log.V(4) {
				log.Infof("dropping stale write to group %v", groupID)
		} else if !g.writing {
			// A group with this ID exists but has no write in flight, so it
			// is treated as a newer incarnation of the group.
			if log.V(4) {
				log.Infof("dropping stale write to reincarnation of group %v", groupID)
			delete(readyGroups, groupID) // they must not make it to Advance.
		g.writing = false

		// Process committed entries.
		for _, entry := range ready.CommittedEntries {
			commandID := s.processCommittedEntry(raftGroupID, g, entry)
			// TODO(bdarnell): the command is now committed, but not applied until the
			// application consumes EventCommandCommitted. Is returning via the channel
			// at this point useful or do we need to wait for the command to be
			// applied too?
			// This could be done with a Callback as in EventMembershipChangeCommitted
			// or perhaps we should move away from a channel to a callback-based system.
			s.removePending(g, g.pending[commandID], nil /* err */)

		if !raft.IsEmptySnap(ready.Snapshot) {
			// Sync the group/node mapping with the information contained in the snapshot.
			for _, nodeID := range ready.Snapshot.Metadata.ConfState.Nodes {
				// TODO(bdarnell): if we had any information that predated this snapshot
				// we must remove those nodes.
				// NOTE(review): the message logged on failure does not
				// include err itself.
				if err := s.addNode(proto.RaftNodeID(nodeID), g); err != nil {
					log.Errorf("node %v: error adding node %v", s.nodeID, nodeID)

		// Process SoftState and leader changes.
		s.maybeSendLeaderEvent(raftGroupID, g, &ready)

		// Send all messages.
		// Individual heartbeats and heartbeat responses are logged as
		// dropped rather than sent one by one.
		for _, msg := range ready.Messages {
			switch msg.Type {
			case raftpb.MsgHeartbeat:
				if log.V(8) {
					log.Infof("node %v dropped individual heartbeat to node %v",
						s.nodeID, msg.To)
			case raftpb.MsgHeartbeatResp:
				if log.V(8) {
					log.Infof("node %v dropped individual heartbeat response to node %v",
						s.nodeID, msg.To)
				s.sendMessage(g, msg)