Example #1
// TestHeartbeatResponseFanout verifies that when two raft groups share the
// same node but have different terms, a heartbeat response from one group
// does not disturb the other group's term or leadership.
func TestHeartbeatResponseFanout(t *testing.T) {
	defer leaktest.AfterTest(t)
	stopper := stop.NewStopper()
	defer stopper.Stop()

	cluster := newTestCluster(nil, 3, stopper, t)
	groupID1 := proto.RaftID(1)
	cluster.createGroup(groupID1, 0, 3 /* replicas */)

	groupID2 := proto.RaftID(2)
	cluster.createGroup(groupID2, 0, 3 /* replicas */)

	leaderIndex := 0

	cluster.elect(leaderIndex, groupID1)
	// groupID2 goes through three rounds of elections, so it ends up with a
	// different term than groupID1, but both leaders are on the same node.
	for i := 2; i >= 0; i-- {
		leaderIndex = i
		cluster.elect(leaderIndex, groupID2)
	}
	// Send a coalesced heartbeat.
	// The heartbeat response from groupID2 will carry a higher term than the
	// one from groupID1.
	cluster.nodes[0].coalescedHeartbeat()
	// Submit a command to check whether groupID1's leader has changed.
	cluster.nodes[0].SubmitCommand(groupID1, makeCommandID(), []byte("command"))

	select {
	case <-cluster.events[0].CommandCommitted:
		log.Infof("SubmitCommand succeeded after heartbeat response fanout")
	case <-time.After(500 * time.Millisecond):
		t.Fatalf("No leader after Heartbeat Response fanout")
	}
}
Example #2
// TestHeartbeatResponseFanout verifies that when two raft groups share the
// same node but have different terms, a heartbeat response from one group
// does not disturb the other group's term or leadership.
func TestHeartbeatResponseFanout(t *testing.T) {
	defer leaktest.AfterTest(t)
	stopper := util.NewStopper()
	defer stopper.Stop()

	cluster := newTestCluster(nil, 3, stopper, t)
	groupID1 := proto.RaftID(1)
	cluster.createGroup(groupID1, 0, 3 /* replicas */)

	groupID2 := proto.RaftID(2)
	cluster.createGroup(groupID2, 0, 3 /* replicas */)

	leaderIndex := 0

	cluster.triggerElection(leaderIndex, groupID1)
	event := cluster.waitForElection(leaderIndex)
	// Drain the election events from the other two nodes.
	_ = cluster.waitForElection((leaderIndex + 1) % 3)
	_ = cluster.waitForElection((leaderIndex + 2) % 3)

	if event.GroupID != groupID1 {
		t.Fatalf("election event had incorrect groupid %v", event.GroupID)
	}
	if event.NodeID != cluster.nodes[leaderIndex].nodeID {
		t.Fatalf("expected %v to win election, but was %v", cluster.nodes[leaderIndex].nodeID, event.NodeID)
	}
	// groupID2 goes through three rounds of elections, so it ends up with a
	// different term than groupID1, but both leaders are on the same node.
	for i := 2; i >= 0; i-- {
		leaderIndex = i
		cluster.triggerElection(leaderIndex, groupID2)
		event = cluster.waitForElection(leaderIndex)
		_ = cluster.waitForElection((leaderIndex + 1) % 3)
		_ = cluster.waitForElection((leaderIndex + 2) % 3)

		if event.GroupID != groupID2 {
			t.Fatalf("election event had incorrect groupid %v", event.GroupID)
		}
		if event.NodeID != cluster.nodes[leaderIndex].nodeID {
			t.Fatalf("expected %v to win election, but was %v", cluster.nodes[leaderIndex].nodeID, event.NodeID)
		}
	}
	// Send a coalesced heartbeat.
	// The heartbeat response from groupID2 will carry a higher term than the
	// one from groupID1.
	cluster.nodes[0].coalescedHeartbeat()
	// Submit a command to check whether groupID1's leader has changed.
	cluster.nodes[0].SubmitCommand(groupID1, makeCommandID(), []byte("command"))

	select {
	case <-cluster.events[0].CommandCommitted:
		log.Infof("SubmitCommand succeeded after heartbeat response fanout")
	case <-time.After(500 * time.Millisecond):
		t.Fatalf("No leader after Heartbeat Response fanout")
	}
}
Example #3
func TestSlowStorage(t *testing.T) {
	defer leaktest.AfterTest(t)
	stopper := util.NewStopper()
	cluster := newTestCluster(nil, 3, stopper, t)
	defer stopper.Stop()
	groupID := proto.RaftID(1)
	cluster.createGroup(groupID, 0, 3)

	cluster.triggerElection(0, groupID)
	cluster.waitForElection(0)

	// Block the storage on the last node.
	// TODO(bdarnell): there appear to still be issues if the storage is blocked during
	// the election.
	cluster.storages[2].Block()

	// Submit a command to the leader
	cluster.nodes[0].SubmitCommand(groupID, makeCommandID(), []byte("command"))

	// Even with the third node blocked, the other nodes can make progress.
	for i := 0; i < 2; i++ {
		events := cluster.events[i]
		log.Infof("waiting for event to be commited on node %v", i)
		commit := <-events.CommandCommitted
		if string(commit.Command) != "command" {
			t.Errorf("unexpected value in committed command: %v", commit.Command)
		}
	}

	// Ensure that node 2 is in fact blocked.
	time.Sleep(time.Millisecond)
	select {
	case commit := <-cluster.events[2].CommandCommitted:
		t.Errorf("didn't expect commits on node 2 but got %v", commit)
	default:
	}

	// After unblocking the third node, it will catch up.
	cluster.storages[2].Unblock()
	log.Infof("waiting for event to be commited on node 2")
	// When we unblock, the backlog is not guaranteed to be processed in order,
	// and in some cases the leader may need to retransmit some messages.
	for i := 0; i < 3; i++ {
		select {
		case commit := <-cluster.events[2].CommandCommitted:
			if string(commit.Command) != "command" {
				t.Errorf("unexpected value in committed command: %v", commit.Command)
			}
			return

		case <-time.After(5 * time.Millisecond):
			// Tick both nodes' clocks. The ticks on the follower node don't
			// really do anything, but they ensure that its goroutine is
			// getting scheduled (and the real-time delay allows RPC responses
			// to pass between the nodes).
			cluster.tickers[0].Tick()
			cluster.tickers[2].Tick()
		}
	}
	t.Errorf("node 2 never committed the command after unblocking")
}
Example #4
func TestLeaderCache(t *testing.T) {
	defer leaktest.AfterTest(t)
	lc := newLeaderCache(3)
	if r := lc.Lookup(12); r.StoreID != 0 {
		t.Fatalf("lookup of missing key returned replica: %v", r)
	}
	replica := proto.Replica{StoreID: 1}
	lc.Update(5, replica)
	if r := lc.Lookup(5); r.StoreID != 1 {
		t.Errorf("expected %v, got %v", replica, r)
	}
	newReplica := proto.Replica{StoreID: 7}
	lc.Update(5, newReplica)
	r := lc.Lookup(5)
	if r.StoreID != 7 {
		t.Errorf("expected %v, got %v", newReplica, r)
	}
	lc.Update(5, proto.Replica{})
	r = lc.Lookup(5)
	if r.StoreID != 0 {
		t.Fatalf("evicted leader returned: %v", r)
	}

	for i := 10; i < 20; i++ {
		lc.Update(proto.RaftID(i), replica)
	}
	if lc.Lookup(16).StoreID != 0 || lc.Lookup(17).StoreID == 0 {
		t.Errorf("unexpected policy used in cache")
	}
}
Example #5
// sendAttempt is invoked by Send. It temporarily truncates the arguments to
// match the descriptor's EndKey (if necessary) and gathers and rearranges the
// replicas before making a single attempt at sending the request. It returns
// the result of sending the RPC; a potential error contained in the reply has
// to be handled separately by the caller.
func (ds *DistSender) sendAttempt(trace *tracer.Trace, args proto.Request, reply proto.Response, desc *proto.RangeDescriptor) error {
	defer trace.Epoch("sending RPC")()
	// Truncate the request to our current range, making sure not to
	// touch it unless we have to (it is illegal to send EndKey on
	// commands which do not operate on ranges).
	if endKey := args.Header().EndKey; endKey != nil && !endKey.Less(desc.EndKey) {
		defer func(k proto.Key) { args.Header().EndKey = k }(endKey)
		args.Header().EndKey = desc.EndKey
	}
	leader := ds.leaderCache.Lookup(proto.RaftID(desc.RaftID))

	// Try to send the call.
	replicas := newReplicaSlice(ds.gossip, desc)

	// Rearrange the replicas so that those with a long common prefix of
	// attributes end up first. If there's no prefix, this is a no-op.
	order := ds.optimizeReplicaOrder(replicas)

	// If this request needs to go to a leader and we know who that is, move
	// it to the front.
	if !(proto.IsRead(args) && args.Header().ReadConsistency == proto.INCONSISTENT) &&
		leader.StoreID > 0 {
		if i := replicas.FindReplica(leader.StoreID); i >= 0 {
			replicas.MoveToFront(i)
			order = rpc.OrderStable
		}
	}

	return ds.sendRPC(trace, desc.RaftID, replicas, order, args, reply)
}
Example #6
// handleWriteReady converts a set of raft.Ready structs into a writeRequest
// to be persisted, marks the affected groups as writing, and sends it to the
// writeTask.
func (s *state) handleWriteReady(readyGroups map[uint64]raft.Ready) {
	if log.V(6) {
		log.Infof("node %v write ready, preparing request", s.nodeID)
	}
	writeRequest := newWriteRequest()
	for groupID, ready := range readyGroups {
		raftGroupID := proto.RaftID(groupID)
		g, ok := s.groups[raftGroupID]
		if !ok {
			if log.V(6) {
				log.Infof("dropping write request to group %d", groupID)
			}
			continue
		}
		g.writing = true

		gwr := &groupWriteRequest{}
		if !raft.IsEmptyHardState(ready.HardState) {
			gwr.state = ready.HardState
		}
		if !raft.IsEmptySnap(ready.Snapshot) {
			gwr.snapshot = ready.Snapshot
		}
		if len(ready.Entries) > 0 {
			gwr.entries = ready.Entries
		}
		writeRequest.groups[raftGroupID] = gwr
	}
	s.writeTask.in <- writeRequest
}
Example #7
// newTestRangeSet creates a new range set containing count ranges.
func newTestRangeSet(count int, t *testing.T) *testRangeSet {
	rs := &testRangeSet{rangesByKey: btree.New(64 /* degree */)}
	for i := 0; i < count; i++ {
		desc := &proto.RangeDescriptor{
			RaftID:   proto.RaftID(i),
			StartKey: proto.Key(fmt.Sprintf("%03d", i)),
			EndKey:   proto.Key(fmt.Sprintf("%03d", i+1)),
		}
		// Initialize the range stats so the scanner can use them.
		rng := &Range{
			stats: &rangeStats{
				raftID: desc.RaftID,
				MVCCStats: engine.MVCCStats{
					KeyBytes:  1,
					ValBytes:  2,
					KeyCount:  1,
					LiveCount: 1,
				},
			},
		}
		if err := rng.setDesc(desc); err != nil {
			t.Fatal(err)
		}
		if exRngItem := rs.rangesByKey.ReplaceOrInsert(rng); exRngItem != nil {
			t.Fatalf("failed to insert range %s", rng)
		}
	}
	return rs
}
Example #8
// TestRaftAfterRemoveRange verifies that the MultiRaft state removes
// a remote node correctly after the Replica was removed from the Store.
func TestRaftAfterRemoveRange(t *testing.T) {
	defer leaktest.AfterTest(t)
	mtc := startMultiTestContext(t, 3)
	defer mtc.Stop()

	// Make the split.
	splitArgs := adminSplitArgs(proto.KeyMin, []byte("b"), proto.RaftID(1), mtc.stores[0].StoreID())
	if _, err := mtc.stores[0].ExecuteCmd(context.Background(), &splitArgs); err != nil {
		t.Fatal(err)
	}

	raftID := proto.RaftID(2)
	mtc.replicateRange(raftID, 0, 1, 2)

	mtc.unreplicateRange(raftID, 0, 2)
	mtc.unreplicateRange(raftID, 0, 1)

	rng, err := mtc.stores[1].GetRange(raftID)
	if err != nil {
		t.Fatal(err)
	}

	// If the range removal happens before the range applies the replica config change, the group
	// will be re-created when MultiRaft receives a MsgApp.
	if err := util.IsTrueWithin(func() bool {
		return len(rng.Desc().Replicas) == 1
	}, 1*time.Second); err != nil {
		t.Fatal(err)
	}

	// Remove the range from the second Store.
	if err := mtc.stores[1].RemoveRange(rng); err != nil {
		t.Fatal(err)
	}

	if err := mtc.transport.Send(&multiraft.RaftMessageRequest{
		GroupID: proto.RaftID(0),
		Message: raftpb.Message{
			From: uint64(mtc.stores[2].RaftNodeID()),
			To:   uint64(mtc.stores[1].RaftNodeID()),
			Type: raftpb.MsgHeartbeat,
		}}); err != nil {
		t.Fatal(err)
	}
	// Execute another replica change to ensure that MultiRaft has processed the heartbeat just sent.
	mtc.replicateRange(proto.RaftID(1), 0, 1)
}
Example #9
func TestLocalSenderLookupReplica(t *testing.T) {
	defer leaktest.AfterTest(t)
	stopper := stop.NewStopper()
	defer stopper.Stop()
	ctx := storage.TestStoreContext
	manualClock := hlc.NewManualClock(0)
	ctx.Clock = hlc.NewClock(manualClock.UnixNano)
	ls := NewLocalSender()

	// Create two new stores with ranges we care about.
	var e [2]engine.Engine
	var s [2]*storage.Store
	ranges := []struct {
		storeID    proto.StoreID
		start, end proto.Key
	}{
		{2, proto.Key("a"), proto.Key("c")},
		{3, proto.Key("x"), proto.Key("z")},
	}
	for i, rng := range ranges {
		e[i] = engine.NewInMem(proto.Attributes{}, 1<<20)
		ctx.Transport = multiraft.NewLocalRPCTransport(stopper)
		defer ctx.Transport.Close()
		s[i] = storage.NewStore(ctx, e[i], &proto.NodeDescriptor{NodeID: 1})
		s[i].Ident.StoreID = rng.storeID

		desc := &proto.RangeDescriptor{
			RaftID:   proto.RaftID(i),
			StartKey: rng.start,
			EndKey:   rng.end,
			Replicas: []proto.Replica{{StoreID: rng.storeID}},
		}
		newRng, err := storage.NewRange(desc, s[i])
		if err != nil {
			t.Fatal(err)
		}
		if err := s[i].AddRangeTest(newRng); err != nil {
			t.Error(err)
		}
		ls.AddStore(s[i])
	}

	if _, r, err := ls.lookupReplica(proto.Key("a"), proto.Key("c")); r.StoreID != s[0].Ident.StoreID || err != nil {
		t.Errorf("expected store %d; got %d: %v", s[0].Ident.StoreID, r.StoreID, err)
	}
	if _, r, err := ls.lookupReplica(proto.Key("b"), nil); r.StoreID != s[0].Ident.StoreID || err != nil {
		t.Errorf("expected store %d; got %d: %v", s[0].Ident.StoreID, r.StoreID, err)
	}
	if _, r, err := ls.lookupReplica(proto.Key("b"), proto.Key("d")); r != nil || err == nil {
		t.Errorf("expected store 0 and error got %d", r.StoreID)
	}
	if _, r, err := ls.lookupReplica(proto.Key("x"), proto.Key("z")); r.StoreID != s[1].Ident.StoreID {
		t.Errorf("expected store %d; got %d: %v", s[1].Ident.StoreID, r.StoreID, err)
	}
	if _, r, err := ls.lookupReplica(proto.Key("y"), nil); r.StoreID != s[1].Ident.StoreID || err != nil {
		t.Errorf("expected store %d; got %d: %v", s[1].Ident.StoreID, r.StoreID, err)
	}
}
Example #10
// DecodeRaftStateKey extracts the Raft ID from a RaftStateKey.
func DecodeRaftStateKey(key proto.Key) proto.RaftID {
	if !bytes.HasPrefix(key, LocalRangeIDPrefix) {
		panic(fmt.Sprintf("key %q does not have %q prefix", key, LocalRangeIDPrefix))
	}
	// Strip the prefix and decode the Raft ID.
	b := key[len(LocalRangeIDPrefix):]
	_, raftID := encoding.DecodeUvarint(b)
	return proto.RaftID(raftID)
}
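
For orientation, here is a round-trip sketch of the decoding above. It assumes encoding.EncodeUvarint is the appending inverse of the encoding.DecodeUvarint call used by DecodeRaftStateKey; that inverse is an assumption, not shown in this example.

// exampleDecodeRaftStateKey demonstrates a hypothetical round trip: build a
// key as the LocalRangeIDPrefix followed by a varint-encoded Raft ID, then
// decode it back.
func exampleDecodeRaftStateKey() {
	b := append([]byte(nil), LocalRangeIDPrefix...)
	b = encoding.EncodeUvarint(b, 7) // assumed inverse of DecodeUvarint
	if id := DecodeRaftStateKey(proto.Key(b)); id != proto.RaftID(7) {
		panic(fmt.Sprintf("round trip failed: got %d", id))
	}
}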
Example #11
func TestMembershipChange(t *testing.T) {
	defer leaktest.AfterTest(t)
	stopper := stop.NewStopper()
	cluster := newTestCluster(nil, 4, stopper, t)
	defer stopper.Stop()

	// Create a group with a single member, cluster.nodes[0].
	groupID := proto.RaftID(1)
	cluster.createGroup(groupID, 0, 1)
	// An automatic election is triggered since this is a single-node Raft group,
	// so we don't need to call triggerElection.

	// Consume and apply the membership change events.
	for i := 0; i < 4; i++ {
		go func(i int) {
			for {
				e, ok := <-cluster.events[i].MembershipChangeCommitted
				if !ok {
					return
				}
				e.Callback(nil)
			}
		}(i)
	}

	// Add each of the other three nodes to the cluster.
	for i := 1; i < 4; i++ {
		ch := cluster.nodes[0].ChangeGroupMembership(groupID, makeCommandID(),
			raftpb.ConfChangeAddNode,
			cluster.nodes[i].nodeID, nil)
		<-ch
	}

	// TODO(bdarnell): verify that the channel events are sent out correctly.
	/*
		for i := 0; i < 10; i++ {
			log.Infof("tick %d", i)
			cluster.tickers[0].Tick()
			time.Sleep(5 * time.Millisecond)
		}

		// Each node is notified of each other node's joining.
		for i := 0; i < 4; i++ {
			for j := 1; j < 4; j++ {
				select {
				case e := <-cluster.events[i].MembershipChangeCommitted:
					if e.NodeID != cluster.nodes[j].nodeID {
						t.Errorf("node %d expected event for %d, got %d", i, j, e.NodeID)
					}
				default:
					t.Errorf("node %d did not get expected event for %d", i, j)
				}
			}
		}*/
}
Example #12
func TestLeaderElectionEvent(t *testing.T) {
	defer leaktest.AfterTest(t)
	// Leader election events are fired when the leader commits an entry, not when it
	// issues a call for votes.
	stopper := stop.NewStopper()
	cluster := newTestCluster(nil, 3, stopper, t)
	defer stopper.Stop()
	groupID := proto.RaftID(1)
	cluster.createGroup(groupID, 0, 3)

	// Process a Ready with a new leader but no new commits.
	// This happens while an election is in progress.
	cluster.nodes[1].maybeSendLeaderEvent(groupID, cluster.nodes[1].groups[groupID],
		&raft.Ready{
			SoftState: &raft.SoftState{
				Lead: 3,
			},
		})

	// No events are sent.
	select {
	case e := <-cluster.events[1].LeaderElection:
		t.Fatalf("got unexpected event %v", e)
	case <-time.After(time.Millisecond):
	}

	// Now there are new committed entries. A new leader always commits an entry
	// to conclude the election.
	entry := raftpb.Entry{
		Index: 42,
		Term:  42,
	}
	cluster.nodes[1].maybeSendLeaderEvent(groupID, cluster.nodes[1].groups[groupID],
		&raft.Ready{
			Entries:          []raftpb.Entry{entry},
			CommittedEntries: []raftpb.Entry{entry},
		})

	// Now we get an event.
	select {
	case e := <-cluster.events[1].LeaderElection:
		if !reflect.DeepEqual(e, &EventLeaderElection{
			GroupID: groupID,
			NodeID:  3,
			Term:    42,
		}) {
			t.Errorf("election event did not match expectations: %+v", e)
		}
	case <-time.After(time.Millisecond):
		t.Fatal("didn't get expected event")
	}
}
Example #13
// TestReplicateAfterSplit verifies that a new replica whose start key
// is not KeyMin, when replicated to a fresh store, can apply snapshots correctly.
func TestReplicateAfterSplit(t *testing.T) {
	defer leaktest.AfterTest(t)
	mtc := startMultiTestContext(t, 2)
	defer mtc.Stop()

	raftID := proto.RaftID(1)
	splitKey := proto.Key("m")
	key := proto.Key("z")

	store0 := mtc.stores[0]
	// Make the split.
	splitArgs := adminSplitArgs(proto.KeyMin, splitKey, raftID, store0.StoreID())
	if _, err := store0.ExecuteCmd(context.Background(), &splitArgs); err != nil {
		t.Fatal(err)
	}

	raftID2 := store0.LookupRange(key, nil).Desc().RaftID
	if raftID2 == raftID {
		t.Errorf("got same raft id after split")
	}
	// Issue an increment for later check.
	incArgs := incrementArgs(key, 11, raftID2, store0.StoreID())
	if _, err := store0.ExecuteCmd(context.Background(), &incArgs); err != nil {
		t.Fatal(err)
	}
	// Now add the second replica.
	mtc.replicateRange(raftID2, 0, 1)

	if mtc.stores[1].LookupRange(key, nil).GetMaxBytes() == 0 {
		t.Error("Range MaxBytes is not set after snapshot applied")
	}
	// Once it catches up, the effects of increment commands can be seen.
	if err := util.IsTrueWithin(func() bool {
		getArgs := getArgs(key, raftID2, mtc.stores[1].StoreID())
		// Reads on a non-leader replica should use an inconsistent read.
		getArgs.ReadConsistency = proto.INCONSISTENT
		reply, err := mtc.stores[1].ExecuteCmd(context.Background(), &getArgs)
		if err != nil {
			return false
		}
		getResp := reply.(*proto.GetResponse)
		if log.V(1) {
			log.Infof("read value %d", mustGetInteger(getResp.Value))
		}
		return mustGetInteger(getResp.Value) == 11
	}, 1*time.Second); err != nil {
		t.Fatal(err)
	}
}
Example #14
func TestInitialLeaderElection(t *testing.T) {
	defer leaktest.AfterTest(t)
	// Run the test three times, each time triggering a different node's election clock.
	// The node that requests an election first should win.
	for leaderIndex := 0; leaderIndex < 3; leaderIndex++ {
		log.Infof("testing leader election for node %v", leaderIndex)
		stopper := stop.NewStopper()
		cluster := newTestCluster(nil, 3, stopper, t)
		groupID := proto.RaftID(1)
		cluster.createGroup(groupID, 0, 3)

		cluster.elect(leaderIndex, groupID)
		stopper.Stop()
	}
}
Example #15
// TestProgressWithDownNode verifies that a surviving quorum can make progress
// with a downed node.
func TestProgressWithDownNode(t *testing.T) {
	defer leaktest.AfterTest(t)
	mtc := startMultiTestContext(t, 3)
	defer mtc.Stop()

	raftID := proto.RaftID(1)
	mtc.replicateRange(raftID, 0, 1, 2)

	incArgs, incResp := incrementArgs([]byte("a"), 5, raftID, mtc.stores[0].StoreID())
	if err := mtc.stores[0].ExecuteCmd(context.Background(), proto.Call{Args: incArgs, Reply: incResp}); err != nil {
		t.Fatal(err)
	}

	// Verify that the first increment propagates to all the engines.
	verify := func(expected []int64) {
		util.SucceedsWithin(t, time.Second, func() error {
			values := []int64{}
			for _, eng := range mtc.engines {
				val, _, err := engine.MVCCGet(eng, proto.Key("a"), mtc.clock.Now(), true, nil)
				if err != nil {
					return err
				}
				values = append(values, mustGetInteger(val))
			}
			if !reflect.DeepEqual(expected, values) {
				return util.Errorf("expected %v, got %v", expected, values)
			}
			return nil
		})
	}
	verify([]int64{5, 5, 5})

	// Stop one of the replicas and issue a new increment.
	mtc.stopStore(1)
	incArgs, incResp = incrementArgs([]byte("a"), 11, raftID, mtc.stores[0].StoreID())
	if err := mtc.stores[0].ExecuteCmd(context.Background(), proto.Call{Args: incArgs, Reply: incResp}); err != nil {
		t.Fatal(err)
	}

	// The new increment can be seen on both live replicas.
	verify([]int64{16, 5, 16})

	// Once the downed node is restarted, it will catch up.
	mtc.restartStore(1)
	verify([]int64{16, 16, 16})
}
Example #16
func TestSlowStorage(t *testing.T) {
	defer leaktest.AfterTest(t)
	stopper := util.NewStopper()
	cluster := newTestCluster(nil, 3, stopper, t)
	defer stopper.Stop()
	groupID := proto.RaftID(1)
	cluster.createGroup(groupID, 0, 3)

	cluster.triggerElection(0, groupID)
	cluster.waitForElection(0)

	// Block the storage on the last node.
	// TODO(bdarnell): there appear to still be issues if the storage is blocked during
	// the election.
	cluster.storages[2].Block()

	// Submit a command to the leader
	cluster.nodes[0].SubmitCommand(groupID, makeCommandID(), []byte("command"))

	// Even with the third node blocked, the other nodes can make progress.
	for i := 0; i < 2; i++ {
		events := cluster.events[i]
		log.Infof("waiting for event to be commited on node %v", i)
		commit := <-events.CommandCommitted
		if string(commit.Command) != "command" {
			t.Errorf("unexpected value in committed command: %v", commit.Command)
		}
	}

	// Ensure that node 2 is in fact blocked.
	time.Sleep(time.Millisecond)
	select {
	case commit := <-cluster.events[2].CommandCommitted:
		t.Errorf("didn't expect commits on node 2 but got %v", commit)
	default:
	}

	// After unblocking the third node, it will catch up.
	cluster.storages[2].Unblock()
	cluster.tickers[0].Tick()
	log.Infof("waiting for event to be commited on node 2")
	commit := <-cluster.events[2].CommandCommitted
	if string(commit.Command) != "command" {
		t.Errorf("unexpected value in committed command: %v", commit.Command)
	}
}
Example #17
// TestStoreRaftIDAllocation verifies that raft IDs are
// allocated in successive blocks.
func TestStoreRaftIDAllocation(t *testing.T) {
	defer leaktest.AfterTest(t)
	store, _, stopper := createTestStore(t)
	defer stopper.Stop()

	// Raft IDs should be allocated from ID 2 (first alloc'd range)
	// to raftIDAllocCount * 3 + 1.
	for i := 0; i < raftIDAllocCount*3; i++ {
		replicas := []proto.Replica{{StoreID: store.StoreID()}}
		desc, err := store.NewRangeDescriptor(proto.Key(fmt.Sprintf("%03d", i)), proto.Key(fmt.Sprintf("%03d", i+1)), replicas)
		if err != nil {
			t.Fatal(err)
		}
		if desc.RaftID != proto.RaftID(2+i) {
			t.Errorf("expected Raft id %d; got %d", 2+i, desc.RaftID)
		}
	}
}
Example #18
// TestRangeGCQueueDropReplica verifies that the range GC queue
// removes a range from a store that no longer should have a replica.
func TestRangeGCQueueDropReplica(t *testing.T) {
	defer leaktest.AfterTest(t)

	mtc := startMultiTestContext(t, 3)
	defer mtc.Stop()
	raftID := proto.RaftID(1)
	mtc.replicateRange(raftID, 0, 1, 2)

	mtc.unreplicateRange(raftID, 0, 1)

	// Increment the clock's timestamp to expire the leader lease.
	mtc.manualClock.Increment(int64(storage.DefaultLeaderLeaseDuration) + 1)

	// Make sure the range is not yet removed from the store.
	numTrials := 3
	for i := 0; i < numTrials; i++ {
		store := mtc.stores[1]
		store.ForceRangeGCScan(t)
		if _, err := store.GetRange(raftID); err != nil {
			t.Error("unexpected range removal")
		}
		time.Sleep(10 * time.Millisecond)
	}

	// Increment the clock's timestamp to make the range GC queue process the range.
	mtc.manualClock.Increment(int64(storage.RangeGCQueueUnleasedDuration))

	// Make sure the range is removed from the store.
	util.SucceedsWithin(t, time.Second, func() error {
		store := mtc.stores[1]
		store.ForceRangeGCScan(t)
		if _, err := store.GetRange(raftID); err == nil {
			return util.Error("expected range removal")
		}
		return nil
	})

	// Restart the store to tear down the test cleanly.
	mtc.stopStore(1)
	mtc.restartStore(1)
}
Example #19
func TestCommand(t *testing.T) {
	defer leaktest.AfterTest(t)
	stopper := stop.NewStopper()
	cluster := newTestCluster(nil, 3, stopper, t)
	defer stopper.Stop()
	groupID := proto.RaftID(1)
	cluster.createGroup(groupID, 0, 3)
	cluster.triggerElection(0, groupID)

	// Submit a command to the leader
	cluster.nodes[0].SubmitCommand(groupID, makeCommandID(), []byte("command"))

	// The command will be committed on each node.
	for i, events := range cluster.events {
		log.Infof("waiting for event to be committed on node %v", i)
		commit := <-events.CommandCommitted
		if string(commit.Command) != "command" {
			t.Errorf("unexpected value in committed command: %v", commit.Command)
		}
	}
}
Example #20
func TestInitialLeaderElection(t *testing.T) {
	defer leaktest.AfterTest(t)
	// Run the test three times, each time triggering a different node's election clock.
	// The node that requests an election first should win.
	for leaderIndex := 0; leaderIndex < 3; leaderIndex++ {
		log.Infof("testing leader election for node %v", leaderIndex)
		stopper := util.NewStopper()
		cluster := newTestCluster(nil, 3, stopper, t)
		groupID := proto.RaftID(1)
		cluster.createGroup(groupID, 0, 3)

		cluster.triggerElection(leaderIndex, groupID)
		event := cluster.waitForElection(leaderIndex)
		if event.GroupID != groupID {
			t.Fatalf("election event had incorrect group id %v", event.GroupID)
		}
		if event.NodeID != cluster.nodes[leaderIndex].nodeID {
			t.Fatalf("expected %v to win election, but was %v", cluster.nodes[leaderIndex].nodeID,
				event.NodeID)
		}
		stopper.Stop()
	}
}
Example #21
// handleWriteResponse updates the state machine and sends messages for a raft Ready batch.
func (s *state) handleWriteResponse(response *writeResponse, readyGroups map[uint64]raft.Ready) {
	if log.V(6) {
		log.Infof("node %v got write response: %#v", s.nodeID, *response)
	}
	// Everything has been written to disk; now we can apply updates to the state machine
	// and send outgoing messages.
	for groupID, ready := range readyGroups {
		raftGroupID := proto.RaftID(groupID)
		g, ok := s.groups[raftGroupID]
		if !ok {
			if log.V(4) {
				log.Infof("dropping stale write to group %v", groupID)
			}
			continue
		} else if !g.writing {
			if log.V(4) {
				log.Infof("dropping stale write to reincarnation of group %v", groupID)
			}
			delete(readyGroups, groupID) // they must not make it to Advance.
			continue
		}
		g.writing = false

		// Process committed entries.
		for _, entry := range ready.CommittedEntries {
			commandID := s.processCommittedEntry(raftGroupID, g, entry)
			// TODO(bdarnell): the command is now committed, but not applied until the
			// application consumes EventCommandCommitted. Is returning via the channel
			// at this point useful or do we need to wait for the command to be
			// applied too?
			// This could be done with a Callback as in EventMembershipChangeCommitted
			// or perhaps we should move away from a channel to a callback-based system.
			s.removePending(g, g.pending[commandID], nil /* err */)
		}

		if !raft.IsEmptySnap(ready.Snapshot) {
			// Sync the group/node mapping with the information contained in the snapshot.
			for _, nodeID := range ready.Snapshot.Metadata.ConfState.Nodes {
				// TODO(bdarnell): if we had any information that predated this snapshot
				// we must remove those nodes.
				if err := s.addNode(proto.RaftNodeID(nodeID), raftGroupID); err != nil {
					log.Errorf("node %v: error adding node %v", s.nodeID, nodeID)
				}
			}
		}

		// Process SoftState and leader changes.
		s.maybeSendLeaderEvent(raftGroupID, g, &ready)

		// Send all messages.
		for _, msg := range ready.Messages {
			switch msg.Type {
			case raftpb.MsgHeartbeat:
				if log.V(8) {
					log.Infof("node %v dropped individual heartbeat to node %v",
						s.nodeID, msg.To)
				}
			case raftpb.MsgHeartbeatResp:
				if log.V(8) {
					log.Infof("node %v dropped individual heartbeat response to node %v",
						s.nodeID, msg.To)
				}
			default:
				s.sendMessage(raftGroupID, msg)
			}
		}
	}
}
Example #22
// processCommittedEntry tells the application that a command was committed.
// Returns the commandID, or an empty string if the given entry was not a command.
func (s *state) processCommittedEntry(groupID proto.RaftID, g *group, entry raftpb.Entry) string {
	var commandID string
	switch entry.Type {
	case raftpb.EntryNormal:
		// etcd raft occasionally adds a nil entry (e.g. upon election); ignore these.
		if entry.Data != nil {
			var command []byte
			commandID, command = decodeCommand(entry.Data)
			s.sendEvent(&EventCommandCommitted{
				GroupID:   groupID,
				CommandID: commandID,
				Command:   command,
				Index:     entry.Index,
			})
		}

	case raftpb.EntryConfChange:
		cc := raftpb.ConfChange{}
		if err := cc.Unmarshal(entry.Data); err != nil {
			log.Fatalf("invalid ConfChange data: %s", err)
		}
		var payload []byte
		if len(cc.Context) > 0 {
			commandID, payload = decodeCommand(cc.Context)
		}
		s.sendEvent(&EventMembershipChangeCommitted{
			GroupID:    groupID,
			CommandID:  commandID,
			Index:      entry.Index,
			NodeID:     proto.RaftNodeID(cc.NodeID),
			ChangeType: cc.Type,
			Payload:    payload,
			Callback: func(err error) {
				select {
				case s.callbackChan <- func() {
					if err == nil {
						if log.V(3) {
							log.Infof("node %v applying configuration change %v", s.nodeID, cc)
						}
						// TODO(bdarnell): dedupe by keeping a record of recently-applied commandIDs
						switch cc.Type {
						case raftpb.ConfChangeAddNode:
							err = s.addNode(proto.RaftNodeID(cc.NodeID), proto.RaftID(groupID))
						case raftpb.ConfChangeRemoveNode:
							// TODO(bdarnell): support removing nodes; fix double-application of initial entries
						case raftpb.ConfChangeUpdateNode:
							// Updates don't concern multiraft, they are simply passed through.
						}
						if err != nil {
							log.Errorf("error applying configuration change %v: %s", cc, err)
						}
						s.multiNode.ApplyConfChange(uint64(groupID), cc)
					} else {
						log.Warningf("aborting configuration change: %s", err)
						s.multiNode.ApplyConfChange(uint64(groupID),
							raftpb.ConfChange{})
					}

					// Re-submit all pending proposals, in case any of them were config changes
					// that were dropped due to the one-at-a-time rule. This is a little
					// redundant since most pending proposals won't benefit from this but
					// config changes should be rare enough (and the size of the pending queue
					// small enough) that it doesn't really matter.
					for _, prop := range g.pending {
						s.propose(prop)
					}
				}:
				case <-s.stopper.ShouldStop():
				}
			},
		})
	}
	return commandID
}
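
decodeCommand itself does not appear in these examples. The following is a hypothetical sketch of the framing it implies, assuming a command is encoded as a fixed-length command ID of commandIDLen bytes (consistent with the cmdIDFormat used in a later example) followed by the raw payload.

// encodeCommandSketch and decodeCommandSketch illustrate the assumed framing;
// they are not the actual multiraft implementation.
func encodeCommandSketch(commandID string, command []byte) []byte {
	if len(commandID) != commandIDLen {
		panic("command ID must be exactly commandIDLen bytes")
	}
	return append([]byte(commandID), command...)
}

func decodeCommandSketch(data []byte) (commandID string, command []byte) {
	return string(data[:commandIDLen]), data[commandIDLen:]
}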
Example #23
func TestReplicateAddAndRemove(t *testing.T) {
	defer leaktest.AfterTest(t)

	// Run the test twice, once adding the replacement before removing
	// the downed node, and once removing the downed node first.
	for _, addFirst := range []bool{true, false} {
		mtc := startMultiTestContext(t, 4)
		defer mtc.Stop()

		// Replicate the initial range to three of the four nodes.
		raftID := proto.RaftID(1)
		mtc.replicateRange(raftID, 0, 3, 1)

		incArgs, incResp := incrementArgs([]byte("a"), 5, raftID, mtc.stores[0].StoreID())
		if err := mtc.stores[0].ExecuteCmd(context.Background(), proto.Call{Args: incArgs, Reply: incResp}); err != nil {
			t.Fatal(err)
		}

		verify := func(expected []int64) {
			util.SucceedsWithin(t, time.Second, func() error {
				values := []int64{}
				for _, eng := range mtc.engines {
					val, _, err := engine.MVCCGet(eng, proto.Key("a"), mtc.clock.Now(), true, nil)
					if err != nil {
						return err
					}
					values = append(values, mustGetInteger(val))
				}
				if !reflect.DeepEqual(expected, values) {
					return util.Errorf("expected %v, got %v", expected, values)
				}
				return nil
			})
		}

		// The first increment is visible on all three replicas.
		verify([]int64{5, 5, 0, 5})

		// Stop a store and replace it.
		mtc.stopStore(1)
		if addFirst {
			mtc.replicateRange(raftID, 0, 2)
			mtc.unreplicateRange(raftID, 0, 1)
		} else {
			mtc.unreplicateRange(raftID, 0, 1)
			mtc.replicateRange(raftID, 0, 2)
		}
		verify([]int64{5, 5, 5, 5})

		// Ensure that the rest of the group can make progress.
		incArgs, incResp = incrementArgs([]byte("a"), 11, raftID, mtc.stores[0].StoreID())
		if err := mtc.stores[0].ExecuteCmd(context.Background(), proto.Call{Args: incArgs, Reply: incResp}); err != nil {
			t.Fatal(err)
		}
		verify([]int64{16, 5, 16, 16})

		// Bring the downed store back up (required for a clean shutdown).
		mtc.restartStore(1)

		// Node 1 never sees the increment that was added while it was
		// down. Perform another increment on the live nodes to verify.
		incArgs, incResp = incrementArgs([]byte("a"), 23, raftID, mtc.stores[0].StoreID())
		if err := mtc.stores[0].ExecuteCmd(context.Background(), proto.Call{Args: incArgs, Reply: incResp}); err != nil {
			t.Fatal(err)
		}
		verify([]int64{39, 5, 39, 39})

		// TODO(bdarnell): when we have GC of removed ranges, verify that
		// the downed node removes the data from this range after coming
		// back up.

		// Wait out the leader lease and the unleased duration to make the range GC'able.
		mtc.manualClock.Increment(int64(storage.DefaultLeaderLeaseDuration) +
			int64(storage.RangeGCQueueUnleasedDuration) + 1)
		mtc.stores[1].ForceRangeGCScan(t)

		// The removed store no longer has any of the data from the range.
		verify([]int64{39, 0, 39, 39})
	}
}
Example #24
import (
	"errors"
	"fmt"
	"time"

	"github.com/cockroachdb/cockroach/proto"
	"github.com/cockroachdb/cockroach/util"
	"github.com/cockroachdb/cockroach/util/log"
	"github.com/coreos/etcd/raft"
	"github.com/coreos/etcd/raft/raftpb"
	"golang.org/x/net/context"
)

const (
	noGroup = proto.RaftID(0)
)

// ErrGroupDeleted is returned for commands that are pending while their
// group is deleted.
var ErrGroupDeleted = errors.New("group deleted")

// ErrStopped is returned for commands that could not be completed before the
// node was stopped.
var ErrStopped = errors.New("stopped")

// Config contains the parameters necessary to construct a MultiRaft object.
type Config struct {
	Storage   Storage
	Transport Transport
	// Ticker may be nil to use real time and TickInterval.
Example #25
// TestMultiStoreEventFeed verifies that events on multiple stores are properly
// received by a single event reader.
func TestMultiStoreEventFeed(t *testing.T) {
	defer leaktest.AfterTest(t)
	t.Skip("disabled until #1531 is fixed")

	// Create a multiTestContext which publishes all store events to the given
	// feed.
	feed := &util.Feed{}
	mtc := &multiTestContext{
		feed: feed,
	}

	// Start reading events from the feed before starting the stores.
	ser := &storeEventReader{
		recordUpdateDetail: false,
	}
	readStopper := stop.NewStopper()
	sub := feed.Subscribe()
	readStopper.RunWorker(func() {
		ser.readEvents(sub)
	})

	mtc.Start(t, 3)
	defer mtc.Stop()

	// Replicate the default range.
	raftID := proto.RaftID(1)
	mtc.replicateRange(raftID, 0, 1, 2)

	// Add some data in a transaction
	err := mtc.db.Txn(func(txn *client.Txn) error {
		b := &client.Batch{}
		b.Put("a", "asdf")
		b.Put("c", "jkl;")
		return txn.Commit(b)
	})
	if err != nil {
		t.Fatalf("error putting data to db: %s", err)
	}

	// AdminSplit in between the two ranges.
	if err := mtc.db.AdminSplit("b"); err != nil {
		t.Fatalf("error splitting initial: %s", err)
	}

	// AdminSplit an empty range at the end of the second range.
	if err := mtc.db.AdminSplit("z"); err != nil {
		t.Fatalf("error splitting second range: %s", err)
	}

	// AdminMerge the empty range back into the second range.
	if err := mtc.db.AdminMerge("c"); err != nil {
		t.Fatalf("error merging final range: %s", err)
	}

	// Add an additional put through the system and wait for all
	// replicas to receive it.
	if _, err := mtc.db.Inc("aa", 5); err != nil {
		t.Fatalf("error putting data to db: %s", err)
	}
	util.SucceedsWithin(t, time.Second, func() error {
		for _, eng := range mtc.engines {
			val, _, err := engine.MVCCGet(eng, proto.Key("aa"), mtc.clock.Now(), true, nil)
			if err != nil {
				return err
			}
			if a, e := mustGetInteger(val), int64(5); a != e {
				return util.Errorf("expected aa = %d, got %d", e, a)
			}
		}
		return nil
	})

	// Close feed and wait for reader to receive all events.
	feed.Close()
	readStopper.Stop()

	// Compare events to expected values.
	expected := map[proto.StoreID][]string{
		proto.StoreID(1): {
			"StartStore",
			"BeginScanRanges",
			"RegisterRange scan=true, rid=1, live=.*",
			"EndScanRanges",
			"SplitRange origId=1, newId=2, origKey=336, newKey=15",
			"SplitRange origId=2, newId=3, origKey=15, newKey=0",
			"MergeRange rid=2, subId=3, key=15, subKey=0",
		},
		proto.StoreID(2): {
			"StartStore",
			"BeginScanRanges",
			"EndScanRanges",
			"RegisterRange scan=false, rid=1, live=.*",
			"SplitRange origId=1, newId=2, origKey=336, newKey=15",
			"SplitRange origId=2, newId=3, origKey=15, newKey=0",
			"MergeRange rid=2, subId=3, key=15, subKey=0",
		},
		proto.StoreID(3): {
			"StartStore",
			"BeginScanRanges",
			"EndScanRanges",
			"RegisterRange scan=false, rid=1, live=.*",
			"SplitRange origId=1, newId=2, origKey=336, newKey=15",
			"SplitRange origId=2, newId=3, origKey=15, newKey=0",
			"MergeRange rid=2, subId=3, key=15, subKey=0",
		},
	}
	if a, e := ser.perStoreFeeds, expected; !checkMatch(e, a) {
		t.Errorf("event feed did not match expected value. Actual values have been printed to compare with above expectation.\n")
		t.Logf("Event feed information:\n%s", ser.eventFeedString())
	}

	// Expected count of update events on a per-method basis.
	expectedUpdateCount := map[proto.StoreID]map[proto.Method]int{
		proto.StoreID(1): {
			proto.Put:                 18,
			proto.ConditionalPut:      7,
			proto.Increment:           2,
			proto.Delete:              2,
			proto.EndTransaction:      6,
			proto.InternalLeaderLease: 3,
		},
		proto.StoreID(2): {
			proto.Put:                 16,
			proto.ConditionalPut:      6,
			proto.Increment:           2,
			proto.Delete:              2,
			proto.EndTransaction:      5,
			proto.InternalLeaderLease: 2,
		},
		proto.StoreID(3): {
			proto.Put:                 14,
			proto.ConditionalPut:      5,
			proto.Increment:           2,
			proto.Delete:              2,
			proto.EndTransaction:      4,
			proto.InternalLeaderLease: 2,
		},
	}
	if a, e := ser.perStoreUpdateCount, expectedUpdateCount; !reflect.DeepEqual(a, e) {
		t.Errorf("update counts did not match expected value. Actual values have been printed to compare with above expectation.\n")
		t.Logf("Update count information:\n%s", ser.updateCountString())
	}
}
Example #26
// Send implements the client.Sender interface. It verifies
// permissions and looks up the appropriate range based on the
// supplied key and sends the RPC according to the specified options.
//
// If the request spans multiple ranges (which is possible for Scan or
// DeleteRange requests), Send sends requests to the individual ranges
// sequentially and combines the results transparently.
//
// This may temporarily adjust the request headers, so the proto.Call
// must not be used concurrently until Send has returned.
func (ds *DistSender) Send(_ context.Context, call proto.Call) {
	args := call.Args
	finalReply := call.Reply

	// Verify permissions.
	if err := ds.verifyPermissions(call.Args); err != nil {
		call.Reply.Header().SetGoError(err)
		return
	}

	// In the event that timestamp isn't set and read consistency isn't
	// required, set the timestamp using the local clock.
	if args.Header().ReadConsistency == proto.INCONSISTENT && args.Header().Timestamp.Equal(proto.ZeroTimestamp) {
		// Make sure that after the call, args hasn't changed.
		defer func(timestamp proto.Timestamp) {
			args.Header().Timestamp = timestamp
		}(args.Header().Timestamp)
		args.Header().Timestamp = ds.clock.Now()
	}

	// If this is a bounded request, we will change its bound as we receive
	// replies. This undoes that when we return.
	boundedArgs, argsBounded := args.(proto.Bounded)

	if argsBounded {
		defer func(bound int64) {
			boundedArgs.SetBound(bound)
		}(boundedArgs.GetBound())
	}

	defer func(key proto.Key) {
		args.Header().Key = key
	}(args.Header().Key)

	// Retry logic for lookup of range by key and RPCs to range replicas.
	curReply := finalReply
	for {
		call.Reply = curReply
		curReply.Header().Reset()

		var desc, descNext *proto.RangeDescriptor
		var err error
		for r := retry.Start(ds.rpcRetryOptions); r.Next(); {
			// Get range descriptor (or, when spanning ranges, descriptors).
			// sendAttempt below may clear them on certain errors, so we
			// refresh (likely from the cache) on every retry.
			desc, descNext, err = ds.getDescriptors(call)
			// getDescriptors may fail retryably if the first range isn't
			// available via Gossip.
			if err != nil {
				if rErr, ok := err.(util.Retryable); ok && rErr.CanRetry() {
					if log.V(1) {
						log.Warning(err)
					}
					continue
				}
				break
			}
			err = func() error {
				// Truncate the request to our current range, making sure not to
				// touch it unless we have to (it is illegal to send EndKey on
				// commands which do not operate on ranges).
				if descNext != nil {
					defer func(endKey proto.Key) {
						args.Header().EndKey = endKey
					}(args.Header().EndKey)
					args.Header().EndKey = desc.EndKey
				}
				leader := ds.leaderCache.Lookup(proto.RaftID(desc.RaftID))

				// Try to send the call.
				replicas := newReplicaSlice(ds.gossip, desc)

				// Rearrange the replicas so that those with a long common prefix
				// of attributes end up first. If there's no prefix, this is a
				// no-op.
				order := ds.optimizeReplicaOrder(replicas)

				// If this request needs to go to a leader and we know who that is, move
				// it to the front.
				if !(proto.IsRead(args) && args.Header().ReadConsistency == proto.INCONSISTENT) &&
					leader.StoreID > 0 {
					if i := replicas.FindReplica(leader.StoreID); i >= 0 {
						replicas.MoveToFront(i)
						order = rpc.OrderStable
					}
				}

				return ds.sendRPC(desc.RaftID, replicas, order, args, curReply)
			}()
			if err != nil {
				// For an RPC error to occur, we must've been unable to contact any
				// replicas. In this case, likely all nodes are down (or not getting back
				// to us within a reasonable amount of time).
				// We may simply not be trying to talk to the up-to-date replicas, so
				// clearing the descriptor here should be a good idea.
				// TODO(tschottdorf): If a replica group goes dead, this will cause clients
				// to put high read pressure on the first range, so there should be some
				// rate limiting here.
				ds.rangeCache.EvictCachedRangeDescriptor(args.Header().Key, desc)
			} else {
				err = curReply.Header().GoError()
			}

			if err != nil {
				if log.V(1) {
					log.Warningf("failed to invoke %s: %s", call.Method(), err)
				}

				// If retryable, allow retry. For range not found or range
				// key mismatch errors, we don't backoff on the retry,
				// but reset the backoff loop so we can retry immediately.
				switch tErr := err.(type) {
				case *proto.RangeNotFoundError, *proto.RangeKeyMismatchError:
					// Range descriptor might be out of date - evict it.
					ds.rangeCache.EvictCachedRangeDescriptor(args.Header().Key, desc)
					// On addressing errors, don't backoff; retry immediately.
					r.Reset()
					if log.V(1) {
						log.Warning(err)
					}
					continue
				case *proto.NotLeaderError:
					newLeader := tErr.GetLeader()
					// Verify that leader is a known replica according to the
					// descriptor. If not, we've got a stale replica; evict cache.
					// Next, cache the new leader.
					if newLeader != nil {
						if i, _ := desc.FindReplica(newLeader.StoreID); i == -1 {
							if log.V(1) {
								log.Infof("error indicates unknown leader %s, expunging descriptor %s", newLeader, desc)
							}
							ds.rangeCache.EvictCachedRangeDescriptor(args.Header().Key, desc)
						}
					} else {
						newLeader = &proto.Replica{}
					}
					ds.updateLeaderCache(proto.RaftID(desc.RaftID), *newLeader)
					if log.V(1) {
						log.Warning(err)
					}
					r.Reset()
					continue
				case util.Retryable:
					if tErr.CanRetry() {
						if log.V(1) {
							log.Warning(err)
						}
						continue
					}
				}
			}
			break
		}

		// Immediately return if querying a range failed non-retryably.
		// For multi-range requests, we return the failing range's reply.
		if err != nil {
			call.Reply.Header().SetGoError(err)
			return
		}

		if finalReply != curReply {
			// This was the second or later call in a multi-range request.
			// Combine the new response with the existing one.
			if cFinalReply, ok := finalReply.(proto.Combinable); ok {
				cFinalReply.Combine(curReply)
			} else {
				// This should never apply in practice, as we'll only end up here
				// for range-spanning requests.
				call.Reply.Header().SetGoError(util.Errorf("multi-range request with non-combinable response type"))
				return
			}
		}

		// If this request has a bound, such as MaxResults in
		// ScanRequest, check whether enough rows have been retrieved.
		if argsBounded {
			if prevBound := boundedArgs.GetBound(); prevBound > 0 {
				if cReply, ok := curReply.(proto.Countable); ok {
					if nextBound := prevBound - cReply.Count(); nextBound > 0 {
						// Update bound for the next round.
						// We've deferred restoring the original bound earlier.
						boundedArgs.SetBound(nextBound)
					} else {
						// Set flag to break the loop.
						descNext = nil
					}
				}
			}
		}

		// If this was the last range accessed by this call, exit loop.
		if descNext == nil {
			break
		}

		// In next iteration, query next range.
		// It's important that we use the EndKey of the current descriptor
		// as opposed to the StartKey of the next one: if the former is stale,
		// it's possible that the next range has since merged the subsequent
		// one, and unless both descriptors are stale, the next descriptor's
		// StartKey would move us to the beginning of the current range,
		// resulting in a duplicate scan.
		args.Header().Key = desc.EndKey

		// This is a multi-range request, make a new reply object for
		// subsequent iterations of the loop.
		curReply = args.CreateReply()
	}
	call.Reply = finalReply
}
Example #27
func TestRapidMembershipChange(t *testing.T) {
	defer leaktest.AfterTest(t)
	stopper := stop.NewStopper()
	defer stopper.Stop()

	var wg sync.WaitGroup
	proposers := 5

	numCommit := int32(200)

	cluster := newTestCluster(nil, 1, stopper, t)
	groupID := proto.RaftID(1)

	cluster.createGroup(groupID, 0, 1 /* replicas */)
	startSeq := int32(0) // updated atomically from now on

	cmdIDFormat := "%0" + fmt.Sprintf("%d", commandIDLen) + "d"
	teardown := make(chan struct{})

	proposerFn := func(i int) {
		defer wg.Done()

		var seq int32
		for {
			seq = atomic.AddInt32(&startSeq, 1)
			if seq > numCommit {
				break
			}
			cmdID := fmt.Sprintf(cmdIDFormat, seq)
		retry:
			for {
				if err := cluster.nodes[0].CreateGroup(groupID); err != nil {
					t.Fatal(err)
				}
				if log.V(1) {
					log.Infof("%-3d: try    %s", i, cmdID)
				}

				select {
				case err := <-cluster.nodes[0].SubmitCommand(groupID,
					cmdID, []byte("command")):
					if err == nil {
						log.Infof("%-3d: ok   %s", i, cmdID)
						break retry
					}
					log.Infof("%-3d: err  %s %s", i, cmdID, err)
				case <-teardown:
					return
				}
			}
			if err := cluster.nodes[0].RemoveGroup(groupID); err != nil {
				t.Fatal(err)
			}
		}
	}

	for i := 0; i < proposers; i++ {
		wg.Add(1)
		go proposerFn(i)
	}

	for e := range cluster.events[0].CommandCommitted {
		if log.V(1) {
			log.Infof("   : recv %s", e.CommandID)
		}
		if fmt.Sprintf(cmdIDFormat, numCommit) == e.CommandID {
			log.Infof("received everything we asked for, ending test")
			break
		}

	}
	close(teardown)
	// Because ending the test case is racy with the test itself, we wait until
	// all our goroutines have finished their work before we allow the test to
	// forcibly terminate. This solves a race condition on `t`, which is
	// otherwise subject to concurrent access from our goroutines and the go
	// testing machinery.
	wg.Wait()
}
Example #28
// sendAttempt is invoked by Send and handles retry logic and cache eviction
// for a call sent to a single range. It returns a retry status: Break on
// success, and Break, Continue, or Reset depending on the error condition.
// This method is expected to be invoked from within a backoff / retry loop to
// retry the send repeatedly (e.g. to continue processing after a critical node
// becomes available after downtime or the range descriptor is refreshed via
// lookup).
func (ds *DistSender) sendAttempt(desc *proto.RangeDescriptor, call proto.Call) (retry.Status, error) {
	leader := ds.leaderCache.Lookup(proto.RaftID(desc.RaftID))

	// Try to send the call.
	replicas := newReplicaSlice(ds.gossip, desc)

	// Rearrange the replicas so that those with a long common prefix of
	// attributes end up first. If there's no prefix, this is a no-op.
	order := ds.optimizeReplicaOrder(replicas)

	args := call.Args
	reply := call.Reply

	// If this request needs to go to a leader and we know who that is, move
	// it to the front.
	if !(proto.IsRead(args) && args.Header().ReadConsistency == proto.INCONSISTENT) &&
		leader.StoreID > 0 {
		if i := replicas.FindReplica(leader.StoreID); i >= 0 {
			replicas.MoveToFront(i)
			order = rpc.OrderStable
		}
	}

	err := ds.sendRPC(desc.RaftID, replicas, order, args, reply)
	if err != nil {
		// For an RPC error to occur, we must've been unable to contact any
		// replicas. In this case, likely all nodes are down (or not getting back
		// to us within a reasonable amount of time).
		// We may simply not be trying to talk to the up-to-date replicas, so
		// clearing the descriptor here should be a good idea.
		// TODO(tschottdorf): If a replica group goes dead, this will cause clients
		// to put high read pressure on the first range, so there should be some
		// rate limiting here.
		ds.rangeCache.EvictCachedRangeDescriptor(args.Header().Key, desc)
	} else {
		err = reply.Header().GoError()
	}

	if err != nil {
		if log.V(1) {
			log.Warningf("failed to invoke %s: %s", call.Method(), err)
		}

		// If retryable, allow retry. For range not found or range
		// key mismatch errors, we don't backoff on the retry,
		// but reset the backoff loop so we can retry immediately.
		switch tErr := err.(type) {
		case *proto.RangeNotFoundError, *proto.RangeKeyMismatchError:
			// Range descriptor might be out of date - evict it.
			ds.rangeCache.EvictCachedRangeDescriptor(args.Header().Key, desc)
			// On addressing errors, don't backoff; retry immediately.
			return retry.Reset, err
		case *proto.NotLeaderError:
			newLeader := tErr.GetLeader()
			// Verify that leader is a known replica according to the
			// descriptor. If not, we've got a stale replica; evict cache.
			// Next, cache the new leader.
			if newLeader != nil {
				if i, _ := desc.FindReplica(newLeader.StoreID); i == -1 {
					if log.V(1) {
						log.Infof("error indicates unknown leader %s, expunging descriptor %s", newLeader, desc)
					}
					ds.rangeCache.EvictCachedRangeDescriptor(args.Header().Key, desc)
				}
			} else {
				newLeader = &proto.Replica{}
			}
			ds.updateLeaderCache(proto.RaftID(desc.RaftID), *newLeader)
			return retry.Reset, err
		case util.Retryable:
			if tErr.CanRetry() {
				return retry.Continue, err
			}
		}
		return retry.Break, err
	}
	return retry.Break, nil
}
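
As the comment above notes, sendAttempt is expected to run inside a backoff/retry loop. Here is a minimal sketch of such a caller, using only the retry.Start/Next/Reset API and the Break/Continue/Reset statuses seen in this example; the real Send wrapper is not shown here and may differ.

// sendWithRetrySketch is a hypothetical caller of sendAttempt.
func (ds *DistSender) sendWithRetrySketch(desc *proto.RangeDescriptor, call proto.Call) error {
	var err error
	for r := retry.Start(ds.rpcRetryOptions); r.Next(); {
		var status retry.Status
		status, err = ds.sendAttempt(desc, call)
		switch status {
		case retry.Reset:
			// Addressing error (stale descriptor or leader): retry
			// immediately without backing off.
			r.Reset()
		case retry.Continue:
			// Retryable error: let the loop back off and try again.
		case retry.Break:
			// Success or a permanent failure.
			return err
		}
	}
	return err
}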
Example #29
func (m *LogEntry) Unmarshal(data []byte) error {
	l := len(data)
	iNdEx := 0
	for iNdEx < l {
		var wire uint64
		for shift := uint(0); ; shift += 7 {
			if iNdEx >= l {
				return io.ErrUnexpectedEOF
			}
			b := data[iNdEx]
			iNdEx++
			wire |= (uint64(b) & 0x7F) << shift
			if b < 0x80 {
				break
			}
		}
		fieldNum := int32(wire >> 3)
		wireType := int(wire & 0x7)
		switch fieldNum {
		case 1:
			if wireType != 0 {
				return fmt.Errorf("proto: wrong wireType = %d for field Severity", wireType)
			}
			for shift := uint(0); ; shift += 7 {
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := data[iNdEx]
				iNdEx++
				m.Severity |= (int32(b) & 0x7F) << shift
				if b < 0x80 {
					break
				}
			}
		case 2:
			if wireType != 0 {
				return fmt.Errorf("proto: wrong wireType = %d for field Time", wireType)
			}
			for shift := uint(0); ; shift += 7 {
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := data[iNdEx]
				iNdEx++
				m.Time |= (int64(b) & 0x7F) << shift
				if b < 0x80 {
					break
				}
			}
		case 3:
			if wireType != 0 {
				return fmt.Errorf("proto: wrong wireType = %d for field ThreadID", wireType)
			}
			for shift := uint(0); ; shift += 7 {
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := data[iNdEx]
				iNdEx++
				m.ThreadID |= (int32(b) & 0x7F) << shift
				if b < 0x80 {
					break
				}
			}
		case 4:
			if wireType != 2 {
				return fmt.Errorf("proto: wrong wireType = %d for field File", wireType)
			}
			var stringLen uint64
			for shift := uint(0); ; shift += 7 {
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := data[iNdEx]
				iNdEx++
				stringLen |= (uint64(b) & 0x7F) << shift
				if b < 0x80 {
					break
				}
			}
			postIndex := iNdEx + int(stringLen)
			if postIndex > l {
				return io.ErrUnexpectedEOF
			}
			m.File = string(data[iNdEx:postIndex])
			iNdEx = postIndex
		case 5:
			if wireType != 0 {
				return fmt.Errorf("proto: wrong wireType = %d for field Line", wireType)
			}
			for shift := uint(0); ; shift += 7 {
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := data[iNdEx]
				iNdEx++
				m.Line |= (int32(b) & 0x7F) << shift
				if b < 0x80 {
					break
				}
			}
		case 6:
			if wireType != 2 {
				return fmt.Errorf("proto: wrong wireType = %d for field Format", wireType)
			}
			var stringLen uint64
			for shift := uint(0); ; shift += 7 {
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := data[iNdEx]
				iNdEx++
				stringLen |= (uint64(b) & 0x7F) << shift
				if b < 0x80 {
					break
				}
			}
			postIndex := iNdEx + int(stringLen)
			if postIndex > l {
				return io.ErrUnexpectedEOF
			}
			m.Format = string(data[iNdEx:postIndex])
			iNdEx = postIndex
		case 7:
			if wireType != 2 {
				return fmt.Errorf("proto: wrong wireType = %d for field Args", wireType)
			}
			var msglen int
			for shift := uint(0); ; shift += 7 {
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := data[iNdEx]
				iNdEx++
				msglen |= (int(b) & 0x7F) << shift
				if b < 0x80 {
					break
				}
			}
			postIndex := iNdEx + msglen
			if postIndex > l {
				return io.ErrUnexpectedEOF
			}
			m.Args = append(m.Args, LogEntry_Arg{})
			if err := m.Args[len(m.Args)-1].Unmarshal(data[iNdEx:postIndex]); err != nil {
				return err
			}
			iNdEx = postIndex
		case 8:
			if wireType != 0 {
				return fmt.Errorf("proto: wrong wireType = %d for field NodeID", wireType)
			}
			var v github_com_cockroachdb_cockroach_proto.NodeID
			for shift := uint(0); ; shift += 7 {
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := data[iNdEx]
				iNdEx++
				v |= (github_com_cockroachdb_cockroach_proto.NodeID(b) & 0x7F) << shift
				if b < 0x80 {
					break
				}
			}
			m.NodeID = &v
		case 9:
			if wireType != 0 {
				return fmt.Errorf("proto: wrong wireType = %d for field StoreID", wireType)
			}
			var v github_com_cockroachdb_cockroach_proto.StoreID
			for shift := uint(0); ; shift += 7 {
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := data[iNdEx]
				iNdEx++
				v |= (github_com_cockroachdb_cockroach_proto.StoreID(b) & 0x7F) << shift
				if b < 0x80 {
					break
				}
			}
			m.StoreID = &v
		case 10:
			if wireType != 0 {
				return fmt.Errorf("proto: wrong wireType = %d for field RaftID", wireType)
			}
			var v github_com_cockroachdb_cockroach_proto.RaftID
			for shift := uint(0); ; shift += 7 {
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := data[iNdEx]
				iNdEx++
				v |= (github_com_cockroachdb_cockroach_proto.RaftID(b) & 0x7F) << shift
				if b < 0x80 {
					break
				}
			}
			m.RaftID = &v
		case 11:
			if wireType != 0 {
				return fmt.Errorf("proto: wrong wireType = %d for field Method", wireType)
			}
			var v github_com_cockroachdb_cockroach_proto.Method
			for shift := uint(0); ; shift += 7 {
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := data[iNdEx]
				iNdEx++
				v |= (github_com_cockroachdb_cockroach_proto.Method(b) & 0x7F) << shift
				if b < 0x80 {
					break
				}
			}
			m.Method = &v
		case 12:
			if wireType != 2 {
				return fmt.Errorf("proto: wrong wireType = %d for field Key", wireType)
			}
			var byteLen int
			for shift := uint(0); ; shift += 7 {
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := data[iNdEx]
				iNdEx++
				byteLen |= (int(b) & 0x7F) << shift
				if b < 0x80 {
					break
				}
			}
			postIndex := iNdEx + byteLen
			if postIndex > l {
				return io.ErrUnexpectedEOF
			}
			m.Key = append([]byte{}, data[iNdEx:postIndex]...)
			iNdEx = postIndex
		case 13:
			if wireType != 2 {
				return fmt.Errorf("proto: wrong wireType = %d for field Stacks", wireType)
			}
			var byteLen int
			for shift := uint(0); ; shift += 7 {
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := data[iNdEx]
				iNdEx++
				byteLen |= (int(b) & 0x7F) << shift
				if b < 0x80 {
					break
				}
			}
			postIndex := iNdEx + byteLen
			if postIndex > l {
				return io.ErrUnexpectedEOF
			}
			m.Stacks = append([]byte{}, data[iNdEx:postIndex]...)
			iNdEx = postIndex
		default:
			var sizeOfWire int
			for {
				sizeOfWire++
				wire >>= 7
				if wire == 0 {
					break
				}
			}
			iNdEx -= sizeOfWire
			skippy, err := skipLog(data[iNdEx:])
			if err != nil {
				return err
			}
			if (iNdEx + skippy) > l {
				return io.ErrUnexpectedEOF
			}
			m.XXX_unrecognized = append(m.XXX_unrecognized, data[iNdEx:iNdEx+skippy]...)
			iNdEx += skippy
		}
	}

	return nil
}
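
Every field decode above repeats the same base-128 varint loop: each byte contributes its low seven bits, least-significant group first, and a set high bit marks continuation. As a self-contained reference (not part of the generated code), the loop factors out like this:

// decodeUvarintSketch mirrors the inline varint loops in Unmarshal above. For
// example, the bytes {0xAC, 0x02} decode as (0xAC & 0x7F) | (0x02 << 7),
// i.e. 44 + 256 = 300.
func decodeUvarintSketch(data []byte) (uint64, int, error) {
	var x uint64
	for i := 0; i < len(data); i++ {
		b := data[i]
		x |= (uint64(b) & 0x7F) << (7 * uint(i))
		if b < 0x80 { // high bit clear: last byte of this varint
			return x, i + 1, nil
		}
	}
	return 0, 0, io.ErrUnexpectedEOF
}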
Example #30
func TestStoreRangeSet(t *testing.T) {
	defer leaktest.AfterTest(t)
	store, _, stopper := createTestStore(t)
	defer stopper.Stop()

	// Remove range 1.
	rng1, err := store.GetRange(1)
	if err != nil {
		t.Error(err)
	}
	if err := store.RemoveRange(rng1); err != nil {
		t.Error(err)
	}
	// Add 10 new ranges.
	const newCount = 10
	for i := 0; i < newCount; i++ {
		rng := createRange(store, proto.RaftID(i+1), proto.Key(fmt.Sprintf("a%02d", i)), proto.Key(fmt.Sprintf("a%02d", i+1)))
		if err := store.AddRangeTest(rng); err != nil {
			t.Fatal(err)
		}
	}

	// Verify two passes of the visit.
	ranges := newStoreRangeSet(store)
	for pass := 0; pass < 2; pass++ {
		if ec := ranges.EstimatedCount(); ec != 10 {
			t.Errorf("expected 10 remaining; got %d", ec)
		}
		i := 1
		ranges.Visit(func(rng *Range) bool {
			if rng.Desc().RaftID != proto.RaftID(i) {
				t.Errorf("expected range with Raft ID %d; got %v", i, rng)
			}
			if ec := ranges.EstimatedCount(); ec != 10-i {
				t.Errorf("expected %d remaining; got %d", 10-i, ec)
			}
			i++
			return true
		})
		if ec := ranges.EstimatedCount(); ec != 10 {
			t.Errorf("expected 10 remaining; got %d", ec)
		}
	}

	// Try visiting with an addition and a removal.
	visited := make(chan struct{})
	updated := make(chan struct{})
	done := make(chan struct{})
	go func() {
		i := 1
		ranges.Visit(func(rng *Range) bool {
			if i == 1 {
				if rng.Desc().RaftID != proto.RaftID(i) {
					t.Errorf("expected range with Raft ID %d; got %v", i, rng)
				}
				close(visited)
				<-updated
			} else {
				// The second range will be removed and skipped.
				if rng.Desc().RaftID != proto.RaftID(i+1) {
					t.Errorf("expected range with Raft ID %d; got %v", i+1, rng)
				}
			}
			i++
			return true
		})
		if i != 10 {
			t.Errorf("expected visit of 9 ranges, but got %v", i-1)
		}
		close(done)
	}()

	<-visited
	if ec := ranges.EstimatedCount(); ec != 9 {
		t.Errorf("expected 9 remaining; got %d", ec)
	}

	// Split the first range to insert a new range as the second range.
	// The new range is never visited by this iteration.
	rng := createRange(store, 11, proto.Key("a000"), proto.Key("a01"))
	if err = store.SplitRange(store.LookupRange(proto.Key("a00"), nil), rng); err != nil {
		t.Fatal(err)
	}
	// Estimated count will still be 9, as it's cached.
	if ec := ranges.EstimatedCount(); ec != 9 {
		t.Errorf("expected 9 remaining; got %d", ec)
	}

	// Now, remove the next range in the iteration and verify we skip the removed range.
	rng = store.LookupRange(proto.Key("a01"), nil)
	if rng.Desc().RaftID != 2 {
		t.Errorf("expected fetch of raftID=2; got %d", rng.Desc().RaftID)
	}
	if err := store.RemoveRange(rng); err != nil {
		t.Error(err)
	}
	if ec := ranges.EstimatedCount(); ec != 9 {
		t.Errorf("expected 9 remaining; got %d", ec)
	}

	close(updated)
	<-done
}