Example 1
// maybeSendLeaderEvent processes a raft.Ready to send events in response to leadership
// changes (this includes both sending an event to the app and retrying any pending
// proposals).
func (s *state) maybeSendLeaderEvent(groupID proto.RaftID, g *group, ready *raft.Ready) {
	term := g.committedTerm
	if ready.SoftState != nil {
		// Always save the leader whenever we get a SoftState.
		g.leader = proto.RaftNodeID(ready.SoftState.Lead)
	}
	if len(ready.CommittedEntries) > 0 {
		term = ready.CommittedEntries[len(ready.CommittedEntries)-1].Term
	}
	if term != g.committedTerm && g.leader != 0 {
		// Whenever the committed term has advanced and we know our leader,
		// emit an event.
		g.committedTerm = term
		s.sendEvent(&EventLeaderElection{
			GroupID: groupID,
			NodeID:  proto.RaftNodeID(g.leader),
			Term:    g.committedTerm,
		})

		// Re-submit all pending proposals
		for _, prop := range g.pending {
			s.propose(prop)
		}
	}
}
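A minimal sketch of how an application might consume the EventLeaderElection emitted above, assuming a simplified events channel; the consumer loop and channel are hypothetical, but the event fields match those set in maybeSendLeaderEvent.

// Hypothetical consumer; EventLeaderElection fields mirror the event
// constructed in maybeSendLeaderEvent above.
func consumeEvents(events <-chan interface{}) {
	for ev := range events {
		switch e := ev.(type) {
		case *EventLeaderElection:
			// A new committed term with a known leader: route future
			// proposals for this group to e.NodeID.
			log.Infof("group %v: node %v leads at term %d",
				e.GroupID, e.NodeID, e.Term)
		}
	}
}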
Example 2
func (lt *localRPCTransport) Send(req *RaftMessageRequest) error {
	client, err := lt.getClient(proto.RaftNodeID(req.Message.To))
	if err != nil {
		return err
	}
	call := client.Go(raftMessageName, req, &RaftMessageResponse{}, nil)
	select {
	case <-call.Done:
		// If the call failed synchronously, report an error.
		return call.Error
	default:
		// Otherwise, fire-and-forget.
		go func() {
			select {
			case <-call.Done:
			case <-lt.closed:
				return
			}
			if call.Error != nil {
				log.Errorf("sending rpc failed: %s", call.Error)
			}
		}()
		return nil
	}
}
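The select-with-default above is what separates the two delivery modes: an error already available on call.Done is returned synchronously, while a call still in flight is merely logged later. A condensed sketch of the asynchronous half, extracted into a hypothetical helper:

// reportAsync logs the outcome of an in-flight net/rpc call without
// blocking the sender; closed mirrors lt.closed above and abandons the
// wait on shutdown. (Hypothetical helper, for illustration only.)
func reportAsync(call *rpc.Call, closed <-chan struct{}) {
	go func() {
		select {
		case <-call.Done:
			if call.Error != nil {
				log.Errorf("sending rpc failed: %s", call.Error)
			}
		case <-closed:
		}
	}()
}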
Example 3
func (lt *localInterceptableTransport) start() {
	lt.stopper.RunWorker(func() {
		for {
			select {
			case msg := <-lt.messages:
				ack := make(chan struct{})
				iMsg := &interceptMessage{
					args: msg,
					ack:  ack,
				}
				// The following channel ops are not protected by a select with ShouldStop
				// since leaving things partially complete here could prevent other components
				// from shutting down cleanly.
				lt.Events <- iMsg
				<-ack
				lt.mu.Lock()
				srv, ok := lt.listeners[proto.RaftNodeID(msg.Message.To)]
				lt.mu.Unlock()
				if !ok {
					continue
				}
				err := srv.RaftMessage(msg, nil)
				if err == ErrStopped {
					return
				} else if err != nil {
					log.Fatal(err)
				}

			case <-lt.stopper.ShouldStop():
				return
			}
		}
	})
}
Example 4
func (lt *localInterceptableTransport) handleMessage(msg *RaftMessageRequest) {
	ack := make(chan struct{})
	iMsg := &interceptMessage{
		args: msg,
		ack:  ack,
	}
	// The following channel ops are not protected by a select with
	// ShouldStop since we are running under a StartTask and leaving
	// things partially complete here could prevent other components
	// from shutting down cleanly.
	lt.Events <- iMsg
	<-ack
	lt.mu.Lock()
	srv, ok := lt.listeners[proto.RaftNodeID(msg.Message.To)]
	lt.mu.Unlock()
	if !ok {
		return
	}
	err := srv.RaftMessage(msg, nil)
	if err == ErrStopped {
		return
	} else if err != nil {
		log.Fatal(err)
	}
}
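Both variants above block on the Events/ack handshake, which is what lets a test observe or delay every message before delivery. A minimal sketch of the consuming side, assuming the interceptMessage type from above:

// Hypothetical test-side consumer: inspect each intercepted message,
// then close ack so the transport can proceed to delivery.
func interceptAll(events <-chan *interceptMessage) {
	for iMsg := range events {
		log.Infof("intercepted message to %v", iMsg.args.Message.To)
		close(iMsg.ack)
	}
}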
Example 5
// RaftMessage proxies the incoming request to the listening server interface.
func (t *rpcTransport) RaftMessage(args gogoproto.Message, callback func(gogoproto.Message, error)) {
	protoReq := args.(*proto.RaftMessageRequest)
	// Convert from proto to internal formats.
	req := &multiraft.RaftMessageRequest{GroupID: protoReq.GroupID}
	if err := req.Message.Unmarshal(protoReq.Msg); err != nil {
		callback(nil, err)
		return
	}

	t.mu.Lock()
	server, ok := t.servers[proto.RaftNodeID(req.Message.To)]
	t.mu.Unlock()

	if !ok {
		callback(nil, util.Errorf("Unable to proxy message to node: %d", req.Message.To))
		return
	}

	// Raft responses are empty so we don't actually need to convert
	// between multiraft's internal struct and the external proto
	// representation. In fact, we don't even need to wait for the
	// message to be processed to invoke the callback. We are just
	// (ab)using the async handler mechanism to get this (synchronous)
	// handler called in the RPC server's goroutine so we can preserve
	// order of incoming messages.
	err := server.RaftMessage(req, &multiraft.RaftMessageResponse{})
	callback(&proto.RaftMessageResponse{}, err)
}
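The ordering argument in the closing comment is worth spelling out: a handler invoked inline on the RPC server's goroutine processes messages in arrival order, whereas spawning a goroutine per message would allow reordering. A toy sketch of that property, with hypothetical names:

// serveOrdered applies handle to each request synchronously, so requests
// reach the server in the order they arrived on the channel.
func serveOrdered(incoming <-chan *multiraft.RaftMessageRequest,
	handle func(*multiraft.RaftMessageRequest) error) {
	for req := range incoming {
		if err := handle(req); err != nil {
			log.Errorf("raft message failed: %s", err)
		}
	}
}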
Example 6
// newNotLeaderError returns a NotLeaderError initialized with the
// replica for the holder (if any) of the given lease.
func (r *Range) newNotLeaderError(l *proto.Lease) error {
	err := &proto.NotLeaderError{}
	if l != nil && l.RaftNodeID != 0 {
		_, err.Replica = r.Desc().FindReplica(r.rm.StoreID())
		_, storeID := proto.DecodeRaftNodeID(proto.RaftNodeID(l.RaftNodeID))
		_, err.Leader = r.Desc().FindReplica(storeID)
	}
	return err
}
Example 7
// newNotLeaderError returns a NotLeaderError initialized with the
// replica for the holder (if any) of the given lease.
func (r *Replica) newNotLeaderError(l *proto.Lease, originNode proto.RaftNodeID) error {
	err := &proto.NotLeaderError{}
	if l != nil && l.RaftNodeID != 0 {
		_, originStoreID := proto.DecodeRaftNodeID(originNode)
		_, err.Replica = r.Desc().FindReplica(originStoreID)
		_, storeID := proto.DecodeRaftNodeID(proto.RaftNodeID(l.RaftNodeID))
		_, err.Leader = r.Desc().FindReplica(storeID)
	}
	return err
}
Example 8
// fanoutHeartbeat sends the given heartbeat to all groups which believe that
// their leader resides on the sending node.
func (s *state) fanoutHeartbeat(req *RaftMessageRequest) {
	// A heartbeat message is expanded into a heartbeat for each group
	// that the remote node is a part of.
	fromID := proto.RaftNodeID(req.Message.From)
	originNode, ok := s.nodes[fromID]
	if !ok {
		// When a leader considers a follower to be down, it doesn't begin recovery
		// until the follower has successfully responded to a heartbeat. If we get a
		// heartbeat from a node we don't know, it must think we are a follower of
		// some group, so we need to respond so it can activate the recovery process.
		log.Warningf("node %v: not fanning out heartbeat from unknown node %v (but responding anyway)",
			s.nodeID, fromID)
		s.sendMessage(noGroup,
			raftpb.Message{
				From: uint64(s.nodeID),
				To:   req.Message.From,
				Type: raftpb.MsgHeartbeatResp,
			})
		return
	}
	cnt := 0
	for groupID := range originNode.groupIDs {
		// If we don't think that the sending node is leading that group, don't
		// propagate.
		if s.groups[groupID].leader != fromID || fromID == s.nodeID {
			if log.V(8) {
				log.Infof("node %v: not fanning out heartbeat to %v, msg is from %d and leader is %d",
					s.nodeID, req.Message.To, fromID, s.groups[groupID].leader)
			}
			continue
		}
		if err := s.multiNode.Step(context.Background(), uint64(groupID), req.Message); err != nil {
			if log.V(4) {
				log.Infof("node %v: coalesced heartbeat step to group %v failed for message %s", s.nodeID, groupID,
					raft.DescribeMessage(req.Message, s.EntryFormatter))
			}
		}
		cnt++
	}
	if cnt > 0 {
		s.sendMessage(noGroup,
			raftpb.Message{
				From: uint64(s.nodeID),
				To:   req.Message.From,
				Type: raftpb.MsgHeartbeatResp,
			})
	}
	if log.V(7) {
		log.Infof("node %v: received coalesced heartbeat from node %v; "+
			"fanned out to %d followers in %d overlapping groups",
			s.nodeID, fromID, cnt, len(originNode.groupIDs))
	}
}
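fanoutHeartbeat is the receiving half of heartbeat coalescing. The sending half (s.coalescedHeartbeat, invoked from the ticker arm in Example 18) is not shown in this section; a plausible sketch, sending one MsgHeartbeat per known remote node under the noGroup address that fanoutHeartbeat expects, might look like this:

// Hypothetical sending side: one heartbeat per remote node instead of
// one per group; the receiver expands it via fanoutHeartbeat.
func (s *state) coalescedHeartbeat() {
	for nodeID := range s.nodes {
		if nodeID == s.nodeID {
			continue
		}
		s.sendMessage(noGroup, raftpb.Message{
			From: uint64(s.nodeID),
			To:   uint64(nodeID),
			Type: raftpb.MsgHeartbeat,
		})
	}
}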
Example 9
// RaftMessage proxies the incoming request to the listening server interface.
func (t *transportRPCServer) RaftMessage(protoReq *proto.RaftMessageRequest,
	resp *proto.RaftMessageResponse) error {
	// Convert from proto to internal formats.
	req := &multiraft.RaftMessageRequest{GroupID: protoReq.GroupID}
	if err := req.Message.Unmarshal(protoReq.Msg); err != nil {
		return err
	}

	t.mu.Lock()
	server, ok := t.servers[proto.RaftNodeID(req.Message.To)]
	t.mu.Unlock()

	if ok {
		return server.RaftMessage(req, &multiraft.RaftMessageResponse{})
	}

	return util.Errorf("Unable to proxy message to node: %d", req.Message.To)
}
Example 10
// Send a message to the recipient specified in the request.
func (t *rpcTransport) Send(req *multiraft.RaftMessageRequest) error {
	raftNodeID := proto.RaftNodeID(req.Message.To)
	t.mu.Lock()
	ch, ok := t.queues[raftNodeID]
	if !ok {
		ch = make(chan *multiraft.RaftMessageRequest, raftSendBufferSize)
		t.queues[raftNodeID] = ch
		go t.processQueue(raftNodeID)
	}
	t.mu.Unlock()

	select {
	case ch <- req:
	default:
		return util.Errorf("queue for node %d is full", req.Message.To)
	}
	return nil
}
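Send only enqueues; the per-node worker started above drains the channel, so a slow peer backs up its own queue without blocking the caller. processQueue is not shown in this section; a rough sketch, with a hypothetical t.send helper doing the actual RPC:

// Hypothetical drain loop for the queue created in Send above; t.send
// (connection setup, marshaling) is elided.
func (t *rpcTransport) processQueue(raftNodeID proto.RaftNodeID) {
	t.mu.Lock()
	ch := t.queues[raftNodeID]
	t.mu.Unlock()
	for req := range ch {
		if err := t.send(raftNodeID, req); err != nil {
			log.Errorf("raft send to node %d failed: %s", raftNodeID, err)
		}
	}
}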
Example 11
// processRaftCommand processes a raft command by unpacking the command
// struct to get args and reply and then applying the command to the
// state machine via applyRaftCommand(). The error result is sent on
// the command's done channel, if available.
func (r *Range) processRaftCommand(idKey cmdIDKey, index uint64, raftCmd proto.InternalRaftCommand) error {
	if index == 0 {
		log.Fatalc(r.context(), "processRaftCommand requires a non-zero index")
	}

	r.Lock()
	cmd := r.pendingCmds[idKey]
	delete(r.pendingCmds, idKey)
	r.Unlock()

	args := raftCmd.Cmd.GetValue().(proto.Request)
	var reply proto.Response
	var ctx context.Context
	if cmd != nil {
		// We initiated this command, so use the caller-supplied reply.
		reply = cmd.Reply
		ctx = cmd.ctx
	} else {
		// This command originated elsewhere so we must create a new reply buffer.
		reply = args.CreateReply()
		// TODO(tschottdorf): consider the Trace situation here.
		ctx = r.context()
	}

	execDone := tracer.FromCtx(ctx).Epoch(fmt.Sprintf("applying %s", args.Method()))
	// applyRaftCommand will return "expected" errors, but may also indicate
	// replica corruption (as of now, signaled by a replicaCorruptionError).
	// We feed its return through maybeSetCorrupt to act when that happens.
	err := r.maybeSetCorrupt(
		r.applyRaftCommand(ctx, index, proto.RaftNodeID(raftCmd.OriginNodeID), args, reply),
	)
	execDone()

	if cmd != nil {
		cmd.done <- err
	} else if err != nil && log.V(1) {
		log.Errorc(r.context(), "error executing raft command %s: %s", args.Method(), err)
	}

	return err
}
Example 12
// sendMessage sends a raft message on the given group. Coalesced heartbeats
// address nodes, not groups; they will use the noGroup constant as groupID.
func (s *state) sendMessage(groupID proto.RaftID, msg raftpb.Message) {
	if log.V(6) {
		log.Infof("node %v sending message %.200s to %v", s.nodeID,
			raft.DescribeMessage(msg, s.EntryFormatter), msg.To)
	}
	nodeID := proto.RaftNodeID(msg.To)
	if _, ok := s.nodes[nodeID]; !ok {
		if log.V(4) {
			log.Infof("node %v: connecting to new node %v", s.nodeID, nodeID)
		}
		var err error
		if groupID != noGroup {
			err = s.addNode(nodeID, groupID)
		} else {
			err = s.addNode(nodeID)
		}
		if err != nil {
			log.Errorf("node %v: error adding group %v to node %v: %v",
				s.nodeID, groupID, nodeID, err)
		}
	}
	err := s.Transport.Send(&RaftMessageRequest{groupID, msg})
	snapStatus := raft.SnapshotFinish
	if err != nil {
		log.Warningf("node %v failed to send message to %v: %s", s.nodeID, nodeID, err)
		if groupID != noGroup {
			s.multiNode.ReportUnreachable(msg.To, uint64(groupID))
		}
		snapStatus = raft.SnapshotFailure
	}
	if msg.Type == raftpb.MsgSnap {
		// TODO(bdarnell): add an ack for snapshots and don't report status until
		// ack, error, or timeout.
		if groupID != noGroup {
			s.multiNode.ReportSnapshot(msg.To, uint64(groupID), snapStatus)
		}
	}
}
Example 13
func newTestCluster(transport Transport, size int, stopper *stop.Stopper, t *testing.T) *testCluster {
	if transport == nil {
		transport = NewLocalRPCTransport(stopper)
	}
	stopper.AddCloser(transport)
	cluster := &testCluster{
		t:         t,
		transport: transport,
		groups:    map[proto.RangeID][]int{},
	}

	for i := 0; i < size; i++ {
		ticker := newManualTicker()
		storage := &BlockableStorage{storage: NewMemoryStorage()}
		config := &Config{
			Transport:              transport,
			Storage:                storage,
			Ticker:                 ticker,
			ElectionTimeoutTicks:   2,
			HeartbeatIntervalTicks: 1,
			TickInterval:           time.Hour, // not in use
		}
		mr, err := NewMultiRaft(proto.RaftNodeID(i+1), config, stopper)
		if err != nil {
			t.Fatal(err)
		}
		state := newState(mr)
		demux := newEventDemux(state.Events)
		demux.start(stopper)
		cluster.nodes = append(cluster.nodes, state)
		cluster.tickers = append(cluster.tickers, ticker)
		cluster.events = append(cluster.events, demux)
		cluster.storages = append(cluster.storages, storage)
	}
	cluster.start()
	return cluster
}
Example 14
func (s *state) removeGroup(op *removeGroupOp, readyGroups map[uint64]raft.Ready) {
	// Group creation is lazy and idempotent; so is removal.
	g, ok := s.groups[op.groupID]
	if !ok {
		op.ch <- nil
		return
	}
	if log.V(3) {
		log.Infof("node %v removing group %v", s.nodeID, op.groupID)
	}

	// Cancel commands which are still in transit.
	for _, prop := range g.pending {
		s.removePending(g, prop, ErrGroupDeleted)
	}

	if err := s.multiNode.RemoveGroup(uint64(op.groupID)); err != nil {
		op.ch <- err
		return
	}
	gs := s.Storage.GroupStorage(op.groupID)
	_, cs, err := gs.InitialState()
	if err != nil {
		op.ch <- err
		return
	}
	for _, nodeID := range cs.Nodes {
		s.nodes[proto.RaftNodeID(nodeID)].unregisterGroup(op.groupID)
	}
	// Delete any entries for this group in readyGroups.
	if readyGroups != nil {
		delete(readyGroups, uint64(op.groupID))
	}

	delete(s.groups, op.groupID)
	op.ch <- nil
}
Example 15
// handleWriteResponse updates the state machine and sends messages for a raft Ready batch.
func (s *state) handleWriteResponse(response *writeResponse, readyGroups map[uint64]raft.Ready) {
	if log.V(6) {
		log.Infof("node %v got write response: %#v", s.nodeID, *response)
	}
	// Everything has been written to disk; now we can apply updates to the state machine
	// and send outgoing messages.
	for groupID, ready := range readyGroups {
		raftGroupID := proto.RaftID(groupID)
		g, ok := s.groups[raftGroupID]
		if !ok {
			if log.V(4) {
				log.Infof("dropping stale write to group %v", groupID)
			}
			continue
		} else if !g.writing {
			if log.V(4) {
				log.Infof("dropping stale write to reincarnation of group %v", groupID)
			}
			delete(readyGroups, groupID) // they must not make it to Advance.
			continue
		}
		g.writing = false

		// Process committed entries.
		for _, entry := range ready.CommittedEntries {
			commandID := s.processCommittedEntry(raftGroupID, g, entry)
			// TODO(bdarnell): the command is now committed, but not applied until the
			// application consumes EventCommandCommitted. Is returning via the channel
			// at this point useful or do we need to wait for the command to be
			// applied too?
			// This could be done with a Callback as in EventMembershipChangeCommitted
			// or perhaps we should move away from a channel to a callback-based system.
			s.removePending(g, g.pending[commandID], nil /* err */)
		}

		if !raft.IsEmptySnap(ready.Snapshot) {
			// Sync the group/node mapping with the information contained in the snapshot.
			for _, nodeID := range ready.Snapshot.Metadata.ConfState.Nodes {
				// TODO(bdarnell): if we had any information that predated this snapshot
				// we must remove those nodes.
				if err := s.addNode(proto.RaftNodeID(nodeID), raftGroupID); err != nil {
					log.Errorf("node %v: error adding node %v", s.nodeID, nodeID)
				}
			}
		}

		// Process SoftState and leader changes.
		s.maybeSendLeaderEvent(raftGroupID, g, &ready)

		// Send all messages.
		for _, msg := range ready.Messages {
			switch msg.Type {
			case raftpb.MsgHeartbeat:
				if log.V(8) {
					log.Infof("node %v dropped individual heartbeat to node %v",
						s.nodeID, msg.To)
				}
			case raftpb.MsgHeartbeatResp:
				if log.V(8) {
					log.Infof("node %v dropped individual heartbeat response to node %v",
						s.nodeID, msg.To)
				}
			default:
				s.sendMessage(raftGroupID, msg)
			}
		}
	}
}
Example 16
// processCommittedEntry tells the application that a command was committed.
// Returns the commandID, or an empty string if the given entry was not a command.
func (s *state) processCommittedEntry(groupID proto.RaftID, g *group, entry raftpb.Entry) string {
	var commandID string
	switch entry.Type {
	case raftpb.EntryNormal:
		// etcd raft occasionally adds a nil entry (e.g. upon election); ignore these.
		if entry.Data != nil {
			var command []byte
			commandID, command = decodeCommand(entry.Data)
			s.sendEvent(&EventCommandCommitted{
				GroupID:   groupID,
				CommandID: commandID,
				Command:   command,
				Index:     entry.Index,
			})
		}

	case raftpb.EntryConfChange:
		cc := raftpb.ConfChange{}
		if err := cc.Unmarshal(entry.Data); err != nil {
			log.Fatalf("invalid ConfChange data: %s", err)
		}
		var payload []byte
		if len(cc.Context) > 0 {
			commandID, payload = decodeCommand(cc.Context)
		}
		s.sendEvent(&EventMembershipChangeCommitted{
			GroupID:    groupID,
			CommandID:  commandID,
			Index:      entry.Index,
			NodeID:     proto.RaftNodeID(cc.NodeID),
			ChangeType: cc.Type,
			Payload:    payload,
			Callback: func(err error) {
				select {
				case s.callbackChan <- func() {
					if err == nil {
						if log.V(3) {
							log.Infof("node %v applying configuration change %v", s.nodeID, cc)
						}
						// TODO(bdarnell): dedupe by keeping a record of recently-applied commandIDs
						switch cc.Type {
						case raftpb.ConfChangeAddNode:
							err = s.addNode(proto.RaftNodeID(cc.NodeID), proto.RaftID(groupID))
						case raftpb.ConfChangeRemoveNode:
							// TODO(bdarnell): support removing nodes; fix double-application of initial entries
						case raftpb.ConfChangeUpdateNode:
							// Updates don't concern multiraft, they are simply passed through.
						}
						if err != nil {
							log.Errorf("error applying configuration change %v: %s", cc, err)
						}
						s.multiNode.ApplyConfChange(uint64(groupID), cc)
					} else {
						log.Warningf("aborting configuration change: %s", err)
						s.multiNode.ApplyConfChange(uint64(groupID),
							raftpb.ConfChange{})
					}

					// Re-submit all pending proposals, in case any of them were config changes
					// that were dropped due to the one-at-a-time rule. This is a little
					// redundant since most pending proposals won't benefit from this but
					// config changes should be rare enough (and the size of the pending queue
					// small enough) that it doesn't really matter.
					for _, prop := range g.pending {
						s.propose(prop)
					}
				}:
				case <-s.stopper.ShouldStop():
				}
			},
		})
	}
	return commandID
}
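Note that the Callback above does not apply the configuration change inline: it wraps the work in a closure and pushes it onto s.callbackChan, which the main loop drains (the case cb := <-s.callbackChan arm in Example 18). A stripped-down sketch of that pattern, with illustrative names:

// Closures sent over cbs execute on the owning goroutine, serializing
// mutations of loop-owned state without locks.
func runLoop(cbs <-chan func(), stop <-chan struct{}) {
	for {
		select {
		case cb := <-cbs:
			cb()
		case <-stop:
			return
		}
	}
}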
Example 17
func (s *state) createGroup(groupID proto.RaftID) error {
	if _, ok := s.groups[groupID]; ok {
		return nil
	}
	if log.V(3) {
		log.Infof("node %v creating group %v", s.nodeID, groupID)
	}

	gs := s.Storage.GroupStorage(groupID)
	_, cs, err := gs.InitialState()
	if err != nil {
		return err
	}

	var appliedIndex uint64
	if s.StateMachine != nil {
		appliedIndex, err = s.StateMachine.AppliedIndex(groupID)
		if err != nil {
			return err
		}
	}

	raftCfg := &raft.Config{
		Applied:       appliedIndex,
		ElectionTick:  s.ElectionTimeoutTicks,
		HeartbeatTick: s.HeartbeatIntervalTicks,
		Storage:       gs,
		// TODO(bdarnell): make these configurable; evaluate defaults.
		MaxSizePerMsg:   1024 * 1024,
		MaxInflightMsgs: 256,
	}
	if err := s.multiNode.CreateGroup(uint64(groupID), raftCfg, nil); err != nil {
		return err
	}
	s.groups[groupID] = &group{
		pending: map[string]*proposal{},
	}

	for _, nodeID := range cs.Nodes {
		if err := s.addNode(proto.RaftNodeID(nodeID), groupID); err != nil {
			return err
		}
	}

	// Automatically campaign and elect a leader for this group if there's
	// exactly one known node for this group.
	//
	// A grey area for this being correct happens in the case when we're
	// currently in the progress of adding a second node to the group,
	// with the change committed but not applied.
	// Upon restarting, the node would immediately elect itself and only
	// then apply the config change, where really it should be applying
	// first and then waiting for the majority (which would now require
	// two votes, not only its own).
	// However, in that special case, the second node has no chance to
	// be elected master while this node restarts (as it's aware of the
	// configuration and knows it needs two votes), so the worst that
	// could happen is both nodes ending up in candidate state, timing
	// out and then voting again. This is expected to be an extremely
	// rare event.
	if len(cs.Nodes) == 1 && s.MultiRaft.nodeID == proto.RaftNodeID(cs.Nodes[0]) {
		return s.multiNode.Campaign(context.Background(), uint64(groupID))
	}
	return nil
}
Example 18
func (s *state) start() {
	s.stopper.RunWorker(func() {
		defer func() {
			if log.V(6) {
				log.Infof("node %v: stopping", s.nodeID)
			}
			s.stop()
		}()
		if log.V(1) {
			log.Infof("node %v starting", s.nodeID)
		}
		s.writeTask.start(s.stopper)
		// These maps form a kind of state machine: We don't want to read from the
		// ready channel until the groups we got from the last read have made their
		// way through the rest of the pipeline.
		var readyGroups map[uint64]raft.Ready
		var writingGroups map[uint64]raft.Ready
		// Counts up to heartbeat interval and is then reset.
		ticks := 0
		for {
			// raftReady signals that the Raft state machine has pending
			// work. That work is supplied over the raftReady channel as a map
			// from group ID to raft.Ready struct.
			var raftReady <-chan map[uint64]raft.Ready
			// writeReady is set to the write task's ready channel, which
			// receives when the write task is prepared to persist ready data
			// from the Raft state machine.
			// The writeReady mechanism is currently disabled as we are testing
			// performing all writes synchronously.
			// TODO(bdarnell): either reinstate writeReady or rip it out completely.
			//var writeReady chan struct{}

			// The order of operations in this loop structure is as follows:
			// start by setting raftReady to the multiNode's Ready()
			// channel. Once a new raftReady has been consumed from the
			// channel, set writeReady to the write task's ready channel and
			// set raftReady back to nil. This advances our read-from-raft /
			// write-to-storage state machine to the next step: wait for the
			// write task to be ready to persist the new data.
			if readyGroups != nil {
				//writeReady = s.writeTask.ready
			} else if writingGroups == nil {
				raftReady = s.multiNode.Ready()
			}

			if log.V(8) {
				log.Infof("node %v: selecting", s.nodeID)
			}
			select {
			case <-s.stopper.ShouldStop():
				return

			case req := <-s.reqChan:
				if log.V(5) {
					log.Infof("node %v: group %v got message %.200s", s.nodeID, req.GroupID,
						raft.DescribeMessage(req.Message, s.EntryFormatter))
				}
				switch req.Message.Type {
				case raftpb.MsgHeartbeat:
					s.fanoutHeartbeat(req)
				case raftpb.MsgHeartbeatResp:
					s.fanoutHeartbeatResponse(proto.RaftNodeID(req.Message.From))
				default:
					// We only want to lazily create the group if it's not heartbeat-related;
					// our heartbeats are coalesced and contain a dummy GroupID.
					// TODO(tschottdorf) still shouldn't hurt to move this part outside,
					// but suddenly tests will start failing. Should investigate.
					if _, ok := s.groups[req.GroupID]; !ok {
						log.Infof("node %v: got message for unknown group %d; creating it", s.nodeID, req.GroupID)
						if err := s.createGroup(req.GroupID); err != nil {
							log.Warningf("Error creating group %d: %s", req.GroupID, err)
							break
						}
					}

					if err := s.multiNode.Step(context.Background(), uint64(req.GroupID), req.Message); err != nil {
						if log.V(4) {
							log.Infof("node %v: multinode step to group %v failed for message %.200s", s.nodeID, req.GroupID,
								raft.DescribeMessage(req.Message, s.EntryFormatter))
						}
					}
				}
			case op := <-s.createGroupChan:
				if log.V(6) {
					log.Infof("node %v: got op %#v", s.nodeID, op)
				}
				op.ch <- s.createGroup(op.groupID)

			case op := <-s.removeGroupChan:
				if log.V(6) {
					log.Infof("node %v: got op %#v", s.nodeID, op)
				}
				s.removeGroup(op, readyGroups)

			case prop := <-s.proposalChan:
				s.propose(prop)

			case readyGroups = <-raftReady:
				// readyGroups are saved in a local variable until they can be sent to
				// the write task (and then the real work happens after the write is
				// complete). All we do for now is log them.
				s.logRaftReady(readyGroups)

				select {
				case s.writeTask.ready <- struct{}{}:
				case <-s.stopper.ShouldStop():
					return
				}
				s.handleWriteReady(readyGroups)
				writingGroups = readyGroups
				readyGroups = nil

				select {
				case resp := <-s.writeTask.out:
					s.handleWriteResponse(resp, writingGroups)
					s.multiNode.Advance(writingGroups)
					writingGroups = nil
				case <-s.stopper.ShouldStop():
					return
				}

			case <-s.Ticker.Chan():
				if log.V(8) {
					log.Infof("node %v: got tick", s.nodeID)
				}
				s.multiNode.Tick()
				ticks++
				if ticks >= s.HeartbeatIntervalTicks {
					ticks = 0
					s.coalescedHeartbeat()
				}

			case cb := <-s.callbackChan:
				cb()
			}
		}
	})
}
Example 19
// processCommittedEntry tells the application that a command was committed.
// Returns the commandID, or an empty string if the given entry was not a command.
func (s *state) processCommittedEntry(groupID proto.RangeID, g *group, entry raftpb.Entry) string {
	var commandID string
	switch entry.Type {
	case raftpb.EntryNormal:
		// etcd raft occasionally adds a nil entry (e.g. upon election); ignore these.
		if entry.Data != nil {
			var command []byte
			commandID, command = decodeCommand(entry.Data)
			s.sendEvent(&EventCommandCommitted{
				GroupID:   groupID,
				CommandID: commandID,
				Command:   command,
				Index:     entry.Index,
			})
		}

	case raftpb.EntryConfChange:
		cc := raftpb.ConfChange{}
		if err := cc.Unmarshal(entry.Data); err != nil {
			log.Fatalf("invalid ConfChange data: %s", err)
		}
		var payload []byte
		if len(cc.Context) > 0 {
			commandID, payload = decodeCommand(cc.Context)
		}
		g.waitForCallback = true
		s.sendEvent(&EventMembershipChangeCommitted{
			GroupID:    groupID,
			CommandID:  commandID,
			Index:      entry.Index,
			NodeID:     proto.RaftNodeID(cc.NodeID),
			ChangeType: cc.Type,
			Payload:    payload,
			Callback: func(err error) {
				select {
				case s.callbackChan <- func() {
					if err == nil {
						if log.V(3) {
							log.Infof("node %v applying configuration change %v", s.nodeID, cc)
						}
						// TODO(bdarnell): dedupe by keeping a record of recently-applied commandIDs
						switch cc.Type {
						case raftpb.ConfChangeAddNode:
							err = s.addNode(proto.RaftNodeID(cc.NodeID), g)
						case raftpb.ConfChangeRemoveNode:
							err = s.removeNode(proto.RaftNodeID(cc.NodeID), g)
						case raftpb.ConfChangeUpdateNode:
							// Updates don't concern multiraft, they are simply passed through.
						}
						if err != nil {
							log.Errorf("error applying configuration change %v: %s", cc, err)
						}
						s.multiNode.ApplyConfChange(uint64(groupID), cc)
					} else {
						log.Warningf("aborting configuration change: %s", err)
						s.multiNode.ApplyConfChange(uint64(groupID),
							raftpb.ConfChange{})
					}

					// Re-submit all pending proposals that were held
					// while the config change was pending
					g.waitForCallback = false
					for _, prop := range g.pending {
						s.propose(prop)
					}
				}:
				case <-s.stopper.ShouldStop():
				}
			},
		})
	}
	return commandID
}