Example #1
// maybeSendLeaderEvent processes a raft.Ready to send events in response to leadership
// changes (this includes both sending an event to the app and retrying any pending
// proposals).
// It may call into Storage so it must be called with the storage lock held.
func (s *state) maybeSendLeaderEvent(groupID roachpb.RangeID, g *group, ready *raft.Ready) {
	s.assertStorageLocked()
	term := g.committedTerm
	if ready.SoftState != nil {
		// Always save the leader whenever it changes.
		if roachpb.ReplicaID(ready.SoftState.Lead) != g.leader.ReplicaID {
			if ready.SoftState.Lead == 0 {
				g.leader = roachpb.ReplicaDescriptor{}
			} else {
				if repl, err := s.ReplicaDescriptor(g.groupID, roachpb.ReplicaID(ready.SoftState.Lead)); err != nil {
					log.Warningf("node %s: failed to look up address of replica %d in group %d: %s",
						s.nodeID, ready.SoftState.Lead, g.groupID, err)
					g.leader = roachpb.ReplicaDescriptor{}
				} else {
					g.leader = repl
				}
			}
		}
	}
	if len(ready.CommittedEntries) > 0 {
		term = ready.CommittedEntries[len(ready.CommittedEntries)-1].Term
	}
	if term != g.committedTerm && g.leader.ReplicaID != 0 {
		// Whenever the committed term has advanced and we know our leader,
		// emit an event.
		g.committedTerm = term
		s.sendEvent(&EventLeaderElection{
			GroupID:   groupID,
			ReplicaID: g.leader.ReplicaID,
			Term:      g.committedTerm,
		})
	}
}
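
The EventLeaderElection emitted above is consumed by the application from the MultiRaft event stream. A minimal sketch of that consumer side, assuming events arrive as interface values on a single channel (eventsCh and the logging are illustrative, not part of the package):

func consumeLeaderEvents(eventsCh <-chan interface{}) {
	for e := range eventsCh {
		switch ev := e.(type) {
		case *EventLeaderElection:
			// GroupID, ReplicaID, and Term are the fields populated by
			// maybeSendLeaderEvent above.
			log.Infof("group %d: replica %d elected leader at term %d",
				ev.GroupID, ev.ReplicaID, ev.Term)
		}
	}
}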
Example #2
// TestRaftAfterRemoveRange verifies that the raft state correctly removes
// a remote node after the Replica has been removed from the Store.
func TestRaftAfterRemoveRange(t *testing.T) {
	defer leaktest.AfterTest(t)
	mtc := startMultiTestContext(t, 3)
	defer mtc.Stop()

	// Make the split.
	splitArgs := adminSplitArgs(roachpb.KeyMin, []byte("b"))
	if _, err := client.SendWrapped(rg1(mtc.stores[0]), nil, &splitArgs); err != nil {
		t.Fatal(err)
	}

	rangeID := roachpb.RangeID(2)
	mtc.replicateRange(rangeID, 1, 2)

	mtc.unreplicateRange(rangeID, 2)
	mtc.unreplicateRange(rangeID, 1)

	// Wait for the removal to be processed.
	util.SucceedsWithin(t, time.Second, func() error {
		_, err := mtc.stores[1].GetReplica(rangeID)
		if _, ok := err.(*roachpb.RangeNotFoundError); ok {
			return nil
		} else if err != nil {
			return err
		}
		return util.Errorf("range still exists")
	})

	replica1 := roachpb.ReplicaDescriptor{
		ReplicaID: roachpb.ReplicaID(mtc.stores[1].StoreID()),
		NodeID:    roachpb.NodeID(mtc.stores[1].StoreID()),
		StoreID:   mtc.stores[1].StoreID(),
	}
	replica2 := roachpb.ReplicaDescriptor{
		ReplicaID: roachpb.ReplicaID(mtc.stores[2].StoreID()),
		NodeID:    roachpb.NodeID(mtc.stores[2].StoreID()),
		StoreID:   mtc.stores[2].StoreID(),
	}
	if err := mtc.transport.Send(&storage.RaftMessageRequest{
		GroupID:     0,
		ToReplica:   replica1,
		FromReplica: replica2,
		Message: raftpb.Message{
			From: uint64(replica2.ReplicaID),
			To:   uint64(replica1.ReplicaID),
			Type: raftpb.MsgHeartbeat,
		}}); err != nil {
		t.Fatal(err)
	}
	// Execute another replica change to ensure that raft has processed
	// the heartbeat just sent.
	mtc.replicateRange(roachpb.RangeID(1), 1)

	// Expire leases to ensure any remaining intent resolutions can complete.
	// TODO(bdarnell): understand why some tests need this.
	mtc.expireLeaderLeases()
}
Example #3
func TestMembershipChange(t *testing.T) {
	defer leaktest.AfterTest(t)
	stopper := stop.NewStopper()
	cluster := newTestCluster(nil, 4, stopper, t)
	defer stopper.Stop()

	// Create a group with a single member, cluster.nodes[0].
	groupID := roachpb.RangeID(1)
	cluster.createGroup(groupID, 0, 1)
	// An automatic election is triggered since this is a single-node Raft group,
	// so we don't need to call triggerElection.

	// Consume and apply the membership change events.
	for i := 0; i < 4; i++ {
		go func(i int) {
			for {
				e, ok := <-cluster.events[i].MembershipChangeCommitted
				if !ok {
					return
				}
				e.Callback(nil)
			}
		}(i)
	}

	// Add each of the other three nodes to the cluster.
	for i := 1; i < 4; i++ {
		ch := cluster.nodes[0].ChangeGroupMembership(groupID, makeCommandID(),
			raftpb.ConfChangeAddNode,
			roachpb.ReplicaDescriptor{
				NodeID:    cluster.nodes[i].nodeID,
				StoreID:   roachpb.StoreID(cluster.nodes[i].nodeID),
				ReplicaID: roachpb.ReplicaID(cluster.nodes[i].nodeID),
			}, nil)
		<-ch
	}

	// TODO(bdarnell): verify that the channel events are sent out correctly.
	/*
		for i := 0; i < 10; i++ {
			log.Infof("tick %d", i)
			cluster.tickers[0].Tick()
			time.Sleep(5 * time.Millisecond)
		}

		// Each node is notified of each other node's joining.
		for i := 0; i < 4; i++ {
			for j := 1; j < 4; j++ {
				select {
				case e := <-cluster.events[i].MembershipChangeCommitted:
					if e.NodeID != cluster.nodes[j].nodeID {
						t.Errorf("node %d expected event for %d, got %d", i, j, e.NodeID)
					}
				default:
					t.Errorf("node %d did not get expected event for %d", i, j)
				}
			}
		}*/
}
Example #4
// elect is a simplified wrapper around triggerElection and waitForElection which
// waits for the election to complete on all members of a group.
// TODO(bdarnell): make this work when membership has been changed after creation.
func (c *testCluster) elect(leaderIndex int, groupID roachpb.RangeID) {
	c.triggerElection(leaderIndex, groupID)
	for _, i := range c.groups[groupID] {
		el := c.waitForElection(i)
		// With the in-memory storage used in these tests, replica and node IDs are interchangeable.
		if el.ReplicaID != roachpb.ReplicaID(c.nodes[leaderIndex].nodeID) {
			c.t.Fatalf("wrong leader elected; wanted node %d but got event %v", leaderIndex, el)
		}
		if el.GroupID != groupID {
			c.t.Fatalf("expected election event for group %d but got %d", groupID, el.GroupID)
		}
	}
}
Example #5
// TestReproposeConfigChange verifies the behavior when multiple
// configuration changes are in flight at once. Raft prohibits this,
// but any configuration changes that are dropped by this rule should
// be reproposed when the previous change completes.
func TestReproposeConfigChange(t *testing.T) {
	defer leaktest.AfterTest(t)
	stopper := stop.NewStopper()
	defer stopper.Stop()
	const clusterSize = 4
	const groupSize = 3
	cluster := newTestCluster(nil, clusterSize, stopper, t)

	const groupID = roachpb.RangeID(1)
	const leader = 0
	const proposer = 1
	cluster.createGroup(groupID, leader, groupSize)
	cluster.elect(leader, groupID)

	targetDesc := roachpb.ReplicaDescriptor{
		NodeID:    cluster.nodes[groupSize].nodeID,
		StoreID:   roachpb.StoreID(cluster.nodes[groupSize].nodeID),
		ReplicaID: roachpb.ReplicaID(cluster.nodes[groupSize].nodeID),
	}
	// Add a node and immediately remove it without waiting for the
	// first change to commit.
	addErrCh := cluster.nodes[proposer].ChangeGroupMembership(groupID, makeCommandID(),
		raftpb.ConfChangeAddNode, targetDesc, nil)
	removeErrCh := cluster.nodes[proposer].ChangeGroupMembership(groupID, makeCommandID(),
		raftpb.ConfChangeRemoveNode, targetDesc, nil)

	// The add command will commit first; then it needs to be applied.
	// Apply it on the proposer node before the leader.
	e := <-cluster.events[proposer].MembershipChangeCommitted
	e.Callback(nil)
	e = <-cluster.events[leader].MembershipChangeCommitted
	e.Callback(nil)

	// Now wait for both commands to commit.
	select {
	case err := <-addErrCh:
		if err != nil {
			t.Errorf("add failed: %s", err)
		}
	case <-time.After(time.Second):
		t.Errorf("add timed out")
	}
	select {
	case err := <-removeErrCh:
		if err != nil {
			t.Errorf("remove failed: %s", err)
		}
	case <-time.After(time.Second):
		t.Errorf("remove timed out")
	}
}
Example #6
File: storage.go Project: ruo91/cockroach
// ReplicaIDForStore implements the Storage interface.
func (m *MemoryStorage) ReplicaIDForStore(groupID roachpb.RangeID, storeID roachpb.StoreID) (roachpb.ReplicaID, error) {
	return roachpb.ReplicaID(storeID), nil
}
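
Because MemoryStorage maps a store ID directly onto a replica ID, the tests in these examples build descriptors whose node, store, and replica IDs all share one value. A hedged helper capturing that convention (makeTestReplicaDescriptor is a hypothetical name, not part of the package):

// makeTestReplicaDescriptor builds a descriptor for in-memory tests, where
// node, store, and replica IDs are interchangeable (see ReplicaIDForStore).
func makeTestReplicaDescriptor(storeID roachpb.StoreID) roachpb.ReplicaDescriptor {
	return roachpb.ReplicaDescriptor{
		NodeID:    roachpb.NodeID(storeID),
		StoreID:   storeID,
		ReplicaID: roachpb.ReplicaID(storeID),
	}
}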
Example #7
// TestInOrderDelivery verifies that for a given pair of nodes, raft
// messages are delivered in order.
func TestInOrderDelivery(t *testing.T) {
	defer leaktest.AfterTest(t)
	stopper := stop.NewStopper()
	defer stopper.Stop()
	nodeRPCContext := rpc.NewContext(nodeTestBaseContext, hlc.NewClock(hlc.UnixNano), stopper)
	g := gossip.New(nodeRPCContext, gossip.TestBootstrap, stopper)
	g.SetNodeID(roachpb.NodeID(1))

	rpcServer := rpc.NewServer(nodeRPCContext)
	grpcServer := grpc.NewServer()
	tlsConfig, err := nodeRPCContext.GetServerTLSConfig()
	if err != nil {
		t.Fatal(err)
	}
	ln, err := util.ListenAndServe(stopper, grpcutil.GRPCHandlerFunc(grpcServer, rpcServer), util.CreateTestAddr("tcp"), tlsConfig)
	if err != nil {
		t.Fatal(err)
	}

	const numMessages = 100
	nodeID := roachpb.NodeID(2)
	serverTransport := newRPCTransport(g, grpcServer, nodeRPCContext)
	defer serverTransport.Close()
	serverChannel := newChannelServer(numMessages, 10*time.Millisecond)
	if err := serverTransport.Listen(roachpb.StoreID(nodeID), serverChannel.RaftMessage); err != nil {
		t.Fatal(err)
	}
	addr := ln.Addr()
	// The gossip node ID must be set (via SetNodeID) before calling the
	// gossip.AddInfo* methods.
	g.SetNodeID(nodeID)
	if err := g.AddInfoProto(gossip.MakeNodeIDKey(nodeID),
		&roachpb.NodeDescriptor{
			Address: util.MakeUnresolvedAddr(addr.Network(), addr.String()),
		},
		time.Hour); err != nil {
		t.Fatal(err)
	}

	clientNodeID := roachpb.NodeID(2)
	clientTransport := newRPCTransport(g, nil, nodeRPCContext)
	defer clientTransport.Close()

	for i := 0; i < numMessages; i++ {
		req := &storage.RaftMessageRequest{
			GroupID: 1,
			Message: raftpb.Message{
				To:     uint64(nodeID),
				From:   uint64(clientNodeID),
				Commit: uint64(i),
			},
			ToReplica: roachpb.ReplicaDescriptor{
				NodeID:    nodeID,
				StoreID:   roachpb.StoreID(nodeID),
				ReplicaID: roachpb.ReplicaID(nodeID),
			},
			FromReplica: roachpb.ReplicaDescriptor{
				NodeID:    clientNodeID,
				StoreID:   roachpb.StoreID(clientNodeID),
				ReplicaID: roachpb.ReplicaID(clientNodeID),
			},
		}
		if err := clientTransport.Send(req); err != nil {
			t.Errorf("failed to send message %d: %s", i, err)
		}
	}

	for i := 0; i < numMessages; i++ {
		req := <-serverChannel.ch
		if req.Message.Commit != uint64(i) {
			t.Errorf("messages out of order: got %d while expecting %d", req.Message.Commit, i)
		}
	}
}
Example #8
// TestRemoveLeader ensures that a group will recover if a node is
// removed from the group while it is leader. Since visibility into
// the raft state is limited, we create a three-node group in a
// six-node cluster. This group is migrated one node at a time from
// the first three nodes to the last three. In the process the initial
// leader must have removed itself.
func TestRemoveLeader(t *testing.T) {
	defer leaktest.AfterTest(t)
	stopper := stop.NewStopper()
	const clusterSize = 6
	const groupSize = 3
	cluster := newTestCluster(nil, clusterSize, stopper, t)
	defer stopper.Stop()

	// Consume and apply the membership change events.
	for i := 0; i < clusterSize; i++ {
		go func(i int) {
			for {
				if e, ok := <-cluster.events[i].MembershipChangeCommitted; ok {
					e.Callback(nil)
				} else {
					return
				}
			}
		}(i)
	}

	// Tick all the clocks in the background to ensure that all the
	// necessary elections are triggered.
	// TODO(bdarnell): newTestCluster should have an option to use a
	// real clock instead of a manual one.
	stopper.RunWorker(func() {
		ticker := time.NewTicker(10 * time.Millisecond)
		defer ticker.Stop()
		for {
			select {
			case <-stopper.ShouldStop():
				return
			case <-ticker.C:
				for _, t := range cluster.tickers {
					t.NonBlockingTick()
				}
			}
		}
	})

	// Create a group with three members.
	groupID := roachpb.RangeID(1)
	cluster.createGroup(groupID, 0, groupSize)

	// Move the group one node at a time from the first three nodes to
	// the last three. In the process, we necessarily remove the leader
	// and trigger at least one new election among the new nodes.
	for i := 0; i < groupSize; i++ {
		log.Infof("adding node %d", i+groupSize)
		ch := cluster.nodes[i].ChangeGroupMembership(groupID, makeCommandID(),
			raftpb.ConfChangeAddNode,
			roachpb.ReplicaDescriptor{
				NodeID:    cluster.nodes[i+groupSize].nodeID,
				StoreID:   roachpb.StoreID(cluster.nodes[i+groupSize].nodeID),
				ReplicaID: roachpb.ReplicaID(cluster.nodes[i+groupSize].nodeID),
			}, nil)
		if err := <-ch; err != nil {
			t.Fatal(err)
		}

		log.Infof("removing node %d", i)
		ch = cluster.nodes[i].ChangeGroupMembership(groupID, makeCommandID(),
			raftpb.ConfChangeRemoveNode,
			roachpb.ReplicaDescriptor{
				NodeID:    cluster.nodes[i].nodeID,
				StoreID:   roachpb.StoreID(cluster.nodes[i].nodeID),
				ReplicaID: roachpb.ReplicaID(cluster.nodes[i].nodeID),
			}, nil)
		if err := <-ch; err != nil {
			t.Fatal(err)
		}
	}
}
Example #9
// createGroup is called in two situations: by the application at
// startup (in which case the replicaID argument is zero and the
// replicaID will be loaded from storage), and in response to incoming
// messages (in which case the replicaID comes from the incoming
// message, since nothing is on disk yet).
func (s *state) createGroup(groupID roachpb.RangeID, replicaID roachpb.ReplicaID) error {
	if _, ok := s.groups[groupID]; ok {
		return nil
	}
	if log.V(3) {
		log.Infof("node %v creating group %v", s.nodeID, groupID)
	}

	gs, err := s.Storage.GroupStorage(groupID, replicaID)
	if err != nil {
		return err
	}
	_, cs, err := gs.InitialState()
	if err != nil {
		return err
	}

	// Find our store ID in the replicas list.
	for _, r := range cs.Nodes {
		repDesc, err := s.ReplicaDescriptor(groupID, roachpb.ReplicaID(r))
		if err != nil {
			return err
		}
		if repDesc.StoreID == s.storeID {
			if replicaID == 0 {
				replicaID = repDesc.ReplicaID
			} else if replicaID != repDesc.ReplicaID {
				return util.Errorf("inconsistent replica ID: passed %d, but found %s by scanning ConfState for store %s",
					replicaID, repDesc.ReplicaID, s.storeID)
			}
			replicaID = repDesc.ReplicaID
			break
		}
	}
	if replicaID == 0 {
		return util.Errorf("couldn't find replica ID for this store (%s) in range %d",
			s.storeID, groupID)
	}
	s.CacheReplicaDescriptor(groupID, roachpb.ReplicaDescriptor{
		ReplicaID: replicaID,
		NodeID:    s.nodeID,
		StoreID:   s.storeID,
	})

	var appliedIndex uint64
	if s.StateMachine != nil {
		appliedIndex, err = s.StateMachine.AppliedIndex(groupID)
		if err != nil {
			return err
		}
	}

	raftCfg := &raft.Config{
		ID:            uint64(replicaID),
		Applied:       appliedIndex,
		ElectionTick:  s.ElectionTimeoutTicks,
		HeartbeatTick: s.HeartbeatIntervalTicks,
		Storage:       gs,
		// TODO(bdarnell): make these configurable; evaluate defaults.
		MaxSizePerMsg:   1024 * 1024,
		MaxInflightMsgs: 256,
		Logger:          &raftLogger{group: uint64(groupID)},
	}
	if err := s.multiNode.CreateGroup(uint64(groupID), raftCfg, nil); err != nil {
		return err
	}
	g := &group{
		id:      groupID,
		pending: map[string]*proposal{},
	}
	s.groups[groupID] = g

	for _, id := range cs.Nodes {
		replicaID := roachpb.ReplicaID(id)
		replica, err := s.ReplicaDescriptor(groupID, replicaID)
		if err != nil {
			return err
		}

		if err := s.addNode(replica.NodeID, g); err != nil {
			return err
		}
	}

	// Automatically campaign and elect a leader for this group if there's
	// exactly one known node for this group.
	//
	// A grey area for this being correct happens in the case when we're
	// currently in the progress of adding a second node to the group,
	// with the change committed but not applied.
	// Upon restarting, the node would immediately elect itself and only
	// then apply the config change, where really it should be applying
	// first and then waiting for the majority (which would now require
	// two votes, not only its own).
	// However, in that special case, the second node has no chance to
	// be elected master while this node restarts (as it's aware of the
	// configuration and knows it needs two votes), so the worst that
	// could happen is both nodes ending up in candidate state, timing
	// out and then voting again. This is expected to be an extremely
	// rare event.
	if len(cs.Nodes) == 1 {
		replica, err := s.ReplicaDescriptor(groupID, roachpb.ReplicaID(cs.Nodes[0]))
		if err != nil {
			return err
		}
		if replica.StoreID == s.storeID {
			log.Infof("node %s campaigning because initial confstate is %v", s.nodeID, cs.Nodes)
			if err := s.multiNode.Campaign(context.Background(), uint64(groupID)); err != nil {
				return err
			}
		}
	}
	return nil
}
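
A minimal sketch of the second call path described in the comment above: when a message arrives for a group that does not yet exist, the replica ID is taken from the incoming request rather than from storage (handleRaftMessage is a hypothetical wrapper; the RaftMessageRequest fields match those used elsewhere in these examples):

func (s *state) handleRaftMessage(req *RaftMessageRequest) error {
	if _, ok := s.groups[req.GroupID]; !ok {
		// Nothing is on disk yet for this group, so the replica ID must
		// come from the incoming message.
		if err := s.createGroup(req.GroupID, req.ToReplica.ReplicaID); err != nil {
			return err
		}
	}
	// Stepping the underlying raft group with req.Message is omitted here.
	return nil
}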
Example #10
// sendMessage sends a raft message on the given group. Coalesced heartbeats
// address nodes, not groups; they will use the noGroup constant as groupID.
func (s *state) sendMessage(g *group, msg raftpb.Message) {
	if log.V(6) {
		log.Infof("node %v sending message %.200s to %v", s.nodeID,
			raft.DescribeMessage(msg, s.EntryFormatter), msg.To)
	}
	groupID := noGroup
	var toReplica roachpb.ReplicaDescriptor
	var fromReplica roachpb.ReplicaDescriptor
	if g == nil {
		// No group (a coalesced heartbeat): To/From fields are NodeIDs.
		// TODO(bdarnell): test transports route by store ID, not node ID.
		// In tests they're always the same, so we can hack it here but
		// it would be better to fix the transports.
		// I think we need to fix this before we can support a range
		// with two replicas on different stores of the same node.
		toReplica.NodeID = roachpb.NodeID(msg.To)
		toReplica.StoreID = roachpb.StoreID(msg.To)
		fromReplica.NodeID = roachpb.NodeID(msg.From)
		fromReplica.StoreID = roachpb.StoreID(msg.From)
	} else {
		// Regular message: To/From fields are replica IDs.
		groupID = g.id
		var err error
		toReplica, err = s.ReplicaDescriptor(groupID, roachpb.ReplicaID(msg.To))
		if err != nil {
			log.Warningf("failed to lookup recipient replica %d in group %d: %s", msg.To, groupID, err)
			return
		}
		fromReplica, err = s.ReplicaDescriptor(groupID, roachpb.ReplicaID(msg.From))
		if err != nil {
			log.Warningf("failed to lookup sender replica %d in group %d: %s", msg.From, groupID, err)
			return
		}
	}
	if _, ok := s.nodes[toReplica.NodeID]; !ok {
		if log.V(4) {
			log.Infof("node %v: connecting to new node %v", s.nodeID, toReplica.NodeID)
		}
		if err := s.addNode(toReplica.NodeID, g); err != nil {
			log.Errorf("node %v: error adding group %v to node %v: %v",
				s.nodeID, groupID, toReplica.NodeID, err)
		}
	}
	err := s.Transport.Send(&RaftMessageRequest{
		GroupID:     groupID,
		ToReplica:   toReplica,
		FromReplica: fromReplica,
		Message:     msg,
	})
	snapStatus := raft.SnapshotFinish
	if err != nil {
		log.Warningf("node %v failed to send message to %v: %s", s.nodeID, toReplica.NodeID, err)
		if groupID != noGroup {
			s.multiNode.ReportUnreachable(msg.To, uint64(groupID))
		}
		snapStatus = raft.SnapshotFailure
	}
	if msg.Type == raftpb.MsgSnap {
		// TODO(bdarnell): add an ack for snapshots and don't report status until
		// ack, error, or timeout.
		if groupID != noGroup {
			s.multiNode.ReportSnapshot(msg.To, uint64(groupID), snapStatus)
		}
	}
}
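
A hedged sketch of the coalesced-heartbeat path described above: passing a nil group makes sendMessage treat the To/From fields as node IDs and send with the noGroup group ID (sendCoalescedHeartbeat is hypothetical):

func (s *state) sendCoalescedHeartbeat(to roachpb.NodeID) {
	// No group: To/From are node IDs and the request carries noGroup.
	s.sendMessage(nil, raftpb.Message{
		Type: raftpb.MsgHeartbeat,
		To:   uint64(to),
		From: uint64(s.nodeID),
	})
}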
Example #11
// processCommittedEntry tells the application that a command was committed.
// Returns the commandID, or an empty string if the given entry was not a command.
func (s *state) processCommittedEntry(groupID roachpb.RangeID, g *group, entry raftpb.Entry) string {
	var commandID string
	switch entry.Type {
	case raftpb.EntryNormal:
		// etcd raft occasionally adds a nil entry (e.g. upon election); ignore these.
		if entry.Data != nil {
			var command []byte
			commandID, command = decodeCommand(entry.Data)
			s.sendEvent(&EventCommandCommitted{
				GroupID:   groupID,
				CommandID: commandID,
				Command:   command,
				Index:     entry.Index,
			})
		}

	case raftpb.EntryConfChange:
		cc := raftpb.ConfChange{}
		if err := cc.Unmarshal(entry.Data); err != nil {
			log.Fatalf("invalid ConfChange data: %s", err)
		}
		var payload []byte
		if len(cc.Context) > 0 {
			var ctx ConfChangeContext
			if err := ctx.Unmarshal(cc.Context); err != nil {
				log.Fatalf("invalid ConfChangeContext: %s", err)
			}
			commandID = ctx.CommandID
			payload = ctx.Payload
			s.CacheReplicaDescriptor(groupID, ctx.Replica)
		}
		replica, err := s.ReplicaDescriptor(groupID, roachpb.ReplicaID(cc.NodeID))
		if err != nil {
			// TODO(bdarnell): stash Replica information somewhere so we can have it here
			// with no chance of failure.
			log.Fatalf("could not look up replica info (node %s, group %d, replica %d): %s",
				s.nodeID, groupID, cc.NodeID, err)
		}
		g.waitForCallback++
		s.sendEvent(&EventMembershipChangeCommitted{
			GroupID:    groupID,
			CommandID:  commandID,
			Index:      entry.Index,
			Replica:    replica,
			ChangeType: cc.Type,
			Payload:    payload,
			Callback: func(err error) {
				var errStr string
				if err != nil {
					errStr = err.Error() // can't leak err into the callback
				}
				select {
				case s.callbackChan <- func() {
					if errStr == "" {
						if log.V(3) {
							log.Infof("node %v applying configuration change %v", s.nodeID, cc)
						}
						// TODO(bdarnell): dedupe by keeping a record of recently-applied commandIDs
						var err error
						switch cc.Type {
						case raftpb.ConfChangeAddNode:
							err = s.addNode(replica.NodeID, g)
						case raftpb.ConfChangeRemoveNode:
							err = s.removeNode(replica.NodeID, g)
						case raftpb.ConfChangeUpdateNode:
							// Updates don't concern multiraft, they are simply passed through.
						}
						if err != nil {
							log.Errorf("error applying configuration change %v: %s", cc, err)
						}
						s.multiNode.ApplyConfChange(uint64(groupID), cc)
					} else {
						log.Warningf("aborting configuration change: %s", errStr)
						s.multiNode.ApplyConfChange(uint64(groupID),
							raftpb.ConfChange{})
					}

					// Re-submit all pending proposals that were held back
					// while the config change was pending.
					g.waitForCallback--
					if g.waitForCallback <= 0 {
						for _, prop := range g.pending {
							s.propose(prop)
						}
					}
				}:
				case <-s.stopper.ShouldStop():
				}
			},
		})
	}
	return commandID
}
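
A hedged sketch of the application side of EventCommandCommitted: apply the command at the reported index and, if this node proposed it, report the outcome by command ID (the apply function and the pending map are hypothetical application state, not part of the package):

func processCommitted(e *EventCommandCommitted,
	apply func(groupID roachpb.RangeID, index uint64, cmd []byte) error,
	pending map[string]chan<- error) {
	err := apply(e.GroupID, e.Index, e.Command)
	// Only the proposing node has a waiter registered for this command ID.
	if ch, ok := pending[e.CommandID]; ok {
		ch <- err
		delete(pending, e.CommandID)
	}
}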
Example #12
// TestInOrderDelivery verifies that for a given pair of nodes, raft
// messages are delivered in order.
func TestInOrderDelivery(t *testing.T) {
	defer leaktest.AfterTest(t)()
	stopper := stop.NewStopper()
	defer stopper.Stop()
	nodeRPCContext := rpc.NewContext(testutils.NewNodeTestBaseContext(), nil, stopper)
	g := gossip.New(nodeRPCContext, nil, stopper)

	grpcServer := rpc.NewServer(nodeRPCContext)
	ln, err := util.ListenAndServeGRPC(stopper, grpcServer, util.TestAddr)
	if err != nil {
		t.Fatal(err)
	}

	const numMessages = 100
	nodeID := roachpb.NodeID(2)
	serverTransport := storage.NewRaftTransport(storage.GossipAddressResolver(g), grpcServer, nodeRPCContext)
	serverChannel := newChannelServer(numMessages, 10*time.Millisecond)
	serverTransport.Listen(roachpb.StoreID(nodeID), serverChannel.RaftMessage)
	addr := ln.Addr()
	// The gossip node ID must be set (via SetNodeID) before calling the
	// gossip.AddInfo* methods.
	g.SetNodeID(nodeID)
	if err := g.AddInfoProto(gossip.MakeNodeIDKey(nodeID),
		&roachpb.NodeDescriptor{
			Address: util.MakeUnresolvedAddr(addr.Network(), addr.String()),
		},
		time.Hour); err != nil {
		t.Fatal(err)
	}

	clientNodeID := roachpb.NodeID(2)
	clientTransport := storage.NewRaftTransport(storage.GossipAddressResolver(g), nil, nodeRPCContext)

	for i := 0; i < numMessages; i++ {
		req := &storage.RaftMessageRequest{
			GroupID: 1,
			Message: raftpb.Message{
				To:     uint64(nodeID),
				From:   uint64(clientNodeID),
				Commit: uint64(i),
			},
			ToReplica: roachpb.ReplicaDescriptor{
				NodeID:    nodeID,
				StoreID:   roachpb.StoreID(nodeID),
				ReplicaID: roachpb.ReplicaID(nodeID),
			},
			FromReplica: roachpb.ReplicaDescriptor{
				NodeID:    clientNodeID,
				StoreID:   roachpb.StoreID(clientNodeID),
				ReplicaID: roachpb.ReplicaID(clientNodeID),
			},
		}
		if err := clientTransport.Send(req); err != nil {
			t.Errorf("failed to send message %d: %s", i, err)
		}
	}

	for i := 0; i < numMessages; i++ {
		req := <-serverChannel.ch
		if req.Message.Commit != uint64(i) {
			t.Errorf("messages out of order: got %d while expecting %d", req.Message.Commit, i)
		}
	}
}
Example #13
// processCommittedEntry tells the application that a command was committed.
// Returns the commandID, or an empty string if the given entry was not a command.
func (s *state) processCommittedEntry(groupID roachpb.RangeID, g *group, entry raftpb.Entry) string {
	var commandID string
	switch entry.Type {
	case raftpb.EntryNormal:
		var command []byte
		commandID, command = decodeCommand(entry.Data)
		s.sendEvent(&EventCommandCommitted{
			GroupID:   groupID,
			CommandID: commandID,
			Command:   command,
			Index:     entry.Index,
		})

	case raftpb.EntryConfChange:
		cc := raftpb.ConfChange{}
		if err := cc.Unmarshal(entry.Data); err != nil {
			log.Fatalf("invalid ConfChange data: %s", err)
		}
		var payload []byte
		if len(cc.Context) > 0 {
			var ctx ConfChangeContext
			if err := ctx.Unmarshal(cc.Context); err != nil {
				log.Fatalf("invalid ConfChangeContext: %s", err)
			}
			commandID = ctx.CommandID
			payload = ctx.Payload
			s.CacheReplicaDescriptor(groupID, ctx.Replica)
		}
		replica, err := s.ReplicaDescriptor(groupID, roachpb.ReplicaID(cc.NodeID))
		if err != nil {
			// TODO(bdarnell): stash Replica information somewhere so we can have it here
			// with no chance of failure.
			log.Fatalf("could not look up replica info (node %s, group %d, replica %d): %s",
				s.nodeID, groupID, cc.NodeID, err)
		}
		s.sendEvent(&EventMembershipChangeCommitted{
			GroupID:    groupID,
			CommandID:  commandID,
			Index:      entry.Index,
			Replica:    replica,
			ChangeType: cc.Type,
			Payload:    payload,
			Callback: func(err error) {
				select {
				case s.callbackChan <- func() {
					gInner, ok := s.groups[groupID]
					if !ok {
						log.Infof("group %d no longer exists, aborting configuration change", groupID)
					} else if gInner != g {
						log.Infof("passed in group and fetched group objects do not match\noriginal:%+v\nfetched:%+v\n, aborting configuration change",
							g, gInner)
					} else if err == nil {
						if log.V(3) {
							log.Infof("node %v applying configuration change %v", s.nodeID, cc)
						}
						// TODO(bdarnell): dedupe by keeping a record of recently-applied commandIDs
						switch cc.Type {
						case raftpb.ConfChangeAddNode:
							err = s.addNode(replica.NodeID, g)
						case raftpb.ConfChangeRemoveNode:
							err = s.removeNode(replica.NodeID, g)
						case raftpb.ConfChangeUpdateNode:
							// Updates don't concern multiraft, they are simply passed through.
						}
						if err != nil {
							log.Errorf("error applying configuration change %v: %s", cc, err)
						}
						g.raftGroup.ApplyConfChange(cc)
					} else {
						log.Warningf("aborting configuration change: %s", err)
						g.raftGroup.ApplyConfChange(raftpb.ConfChange{})
					}
				}:
				case <-s.stopper.ShouldStop():
				}
			},
		})
	}
	return commandID
}
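
A hedged sketch of the application side of the callback contract above, mirroring what the tests in these examples do: apply the change, then invoke Callback; a non-nil error aborts the configuration change with an empty ConfChange (applyChange and the channel element type are assumptions):

func consumeMembershipChanges(events <-chan *EventMembershipChangeCommitted,
	applyChange func(*EventMembershipChangeCommitted) error) {
	for e := range events {
		// Passing nil acknowledges the change; a non-nil error aborts it.
		e.Callback(applyChange(e))
	}
}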
Example #14
func TestSendAndReceive(t *testing.T) {
	defer leaktest.AfterTest(t)
	stopper := stop.NewStopper()
	defer stopper.Stop()
	nodeRPCContext := rpc.NewContext(nodeTestBaseContext, hlc.NewClock(hlc.UnixNano), stopper)
	g := gossip.New(nodeRPCContext, gossip.TestInterval, gossip.TestBootstrap)

	// Create several servers, each of which has two stores (A multiraft node ID addresses
	// a store).
	const numServers = 3
	const storesPerServer = 2
	const numStores = numServers * storesPerServer
	// servers has length numServers.
	servers := []*rpc.Server{}
	// All the rest have length numStores (note that several stores share a transport).
	nextNodeID := roachpb.NodeID(1)
	nodeIDs := []roachpb.NodeID{}
	transports := []multiraft.Transport{}
	channels := []channelServer{}
	for serverIndex := 0; serverIndex < numServers; serverIndex++ {
		server := rpc.NewServer(util.CreateTestAddr("tcp"), nodeRPCContext)
		if err := server.Start(); err != nil {
			t.Fatal(err)
		}
		defer server.Close()

		transport, err := newRPCTransport(g, server, nodeRPCContext)
		if err != nil {
			t.Fatalf("Unexpected error creating transport, Error: %s", err)
		}
		defer transport.Close()

		for store := 0; store < storesPerServer; store++ {
			nodeID := nextNodeID
			nextNodeID++

			channel := newChannelServer(10, 0)
			if err := transport.Listen(roachpb.StoreID(nodeID), channel); err != nil {
				t.Fatal(err)
			}

			addr := server.Addr()
			if err := g.AddInfoProto(gossip.MakeNodeIDKey(nodeID),
				&roachpb.NodeDescriptor{
					Address: util.MakeUnresolvedAddr(addr.Network(), addr.String()),
				},
				time.Hour); err != nil {
				t.Fatal(err)
			}

			nodeIDs = append(nodeIDs, nodeID)
			transports = append(transports, transport)
			channels = append(channels, channel)
		}

		servers = append(servers, server)
	}

	// Each store sends one message to each store.
	for from := 0; from < numStores; from++ {
		for to := 0; to < numStores; to++ {
			req := &multiraft.RaftMessageRequest{
				GroupID: 1,
				Message: raftpb.Message{
					From: uint64(nodeIDs[from]),
					To:   uint64(nodeIDs[to]),
					Type: raftpb.MsgHeartbeat,
				},
				FromReplica: roachpb.ReplicaDescriptor{
					NodeID:    nodeIDs[from],
					StoreID:   roachpb.StoreID(nodeIDs[from]),
					ReplicaID: roachpb.ReplicaID(nodeIDs[from]),
				},
				ToReplica: roachpb.ReplicaDescriptor{
					NodeID:    nodeIDs[to],
					StoreID:   roachpb.StoreID(nodeIDs[to]),
					ReplicaID: roachpb.ReplicaID(nodeIDs[to]),
				},
			}

			if err := transports[from].Send(req); err != nil {
				t.Errorf("Unable to send message from %d to %d: %s", nodeIDs[from], nodeIDs[to], err)
			}
		}
	}

	// Read all the messages from the channels. Note that the transport
	// does not guarantee in-order delivery between independent
	// transports, so we just verify that the right number of messages
	// end up in each channel.
	for to := 0; to < numStores; to++ {
		for from := 0; from < numStores; from++ {
			select {
			case req := <-channels[to].ch:
				if req.Message.To != uint64(nodeIDs[to]) {
					t.Errorf("invalid message received on channel %d (expected from %d): %+v",
						nodeIDs[to], nodeIDs[from], req)
				}
			case <-time.After(5 * time.Second):
				t.Fatal("timed out waiting for message")
			}
		}

		select {
		case req := <-channels[to].ch:
			t.Errorf("got unexpected message %+v on channel %d", req, nodeIDs[to])
		default:
		}
	}
}
Example #15
const (
	noGroup = roachpb.RangeID(0)

	reqBufferSize = 100

	// TODO(bdarnell): Determine the right size for this cache. Should
	// the cache be partitioned so that replica descriptors from the
	// range descriptors (which are the bulk of the data and can be
	// reloaded from disk as needed) don't crowd out the
	// message/snapshot descriptors (whose necessity is short-lived but
	// cannot be recovered through other means if evicted)?
	maxReplicaDescCacheSize = 1000

	// InvalidReplicaID is passed to the GroupStorage method when a
	// replica ID cannot be determined.
	InvalidReplicaID = roachpb.ReplicaID(-1)
)

// An ErrGroupDeleted is returned for commands which are pending while their
// group is deleted.
var ErrGroupDeleted = errors.New("raft group deleted")

// ErrStopped is returned for commands that could not be completed before the
// node was stopped.
var ErrStopped = errors.New("raft processing stopped")

// Config contains the parameters necessary to construct a MultiRaft object.
type Config struct {
	Storage   Storage
	Transport Transport
	// Ticker may be nil to use real time and TickInterval.