Example #1
func logRaftReady(storeID roachpb.StoreID, groupID roachpb.RangeID, ready raft.Ready) {
	if log.V(5) {
		// Globally synchronize to avoid interleaving different sets of logs in tests.
		logRaftReadyMu.Lock()
		defer logRaftReadyMu.Unlock()
		log.Infof("store %s: group %s raft ready", storeID, groupID)
		if ready.SoftState != nil {
			log.Infof("SoftState updated: %+v", *ready.SoftState)
		}
		if !raft.IsEmptyHardState(ready.HardState) {
			log.Infof("HardState updated: %+v", ready.HardState)
		}
		for i, e := range ready.Entries {
			log.Infof("New Entry[%d]: %.200s", i, raft.DescribeEntry(e, raftEntryFormatter))
		}
		for i, e := range ready.CommittedEntries {
			log.Infof("Committed Entry[%d]: %.200s", i, raft.DescribeEntry(e, raftEntryFormatter))
		}
		if !raft.IsEmptySnap(ready.Snapshot) {
			log.Infof("Snapshot updated: %.200s", ready.Snapshot.String())
		}
		for i, m := range ready.Messages {
			log.Infof("Outgoing Message[%d]: %.200s", i, raft.DescribeMessage(m, raftEntryFormatter))
		}
	}
}
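Both raft.DescribeEntry and raft.DescribeMessage take an EntryFormatter (in etcd's raft package a func([]byte) string) to render the otherwise opaque entry payload; raftEntryFormatter above is such a formatter. A minimal, self-contained sketch follows. It is not the formatter CockroachDB actually uses, and it assumes the old github.com/coreos/etcd/raft import path.

package main

import (
	"fmt"

	"github.com/coreos/etcd/raft"
	"github.com/coreos/etcd/raft/raftpb"
)

// exampleEntryFormatter renders an entry payload as a length-limited quoted
// string so that binary command data stays readable in log output.
func exampleEntryFormatter(data []byte) string {
	const max = 64
	if len(data) > max {
		return fmt.Sprintf("%q... (%d bytes total)", data[:max], len(data))
	}
	return fmt.Sprintf("%q", data)
}

func main() {
	e := raftpb.Entry{Term: 1, Index: 2, Type: raftpb.EntryNormal, Data: []byte("example command")}
	fmt.Println(raft.DescribeEntry(e, exampleEntryFormatter))
}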
Example #2
func (s *state) logRaftReady(readyGroups map[uint64]raft.Ready) {
	for groupID, ready := range readyGroups {
		if log.V(5) {
			log.Infof("node %v: group %v raft ready", s.nodeID, groupID)
			if ready.SoftState != nil {
				log.Infof("SoftState updated: %+v", *ready.SoftState)
			}
			if !raft.IsEmptyHardState(ready.HardState) {
				log.Infof("HardState updated: %+v", ready.HardState)
			}
			for i, e := range ready.Entries {
				log.Infof("New Entry[%d]: %.200s", i, raft.DescribeEntry(e, s.EntryFormatter))
			}
			for i, e := range ready.CommittedEntries {
				log.Infof("Committed Entry[%d]: %.200s", i, raft.DescribeEntry(e, s.EntryFormatter))
			}
			if !raft.IsEmptySnap(ready.Snapshot) {
				log.Infof("Snapshot updated: %.200s", ready.Snapshot.String())
			}
			for i, m := range ready.Messages {
				log.Infof("Outgoing Message[%d]: %.200s", i, raft.DescribeMessage(m, s.EntryFormatter))
			}
		}
	}
}
Example #3
// fanoutHeartbeatResponse sends the given heartbeat response to all groups
// which overlap with the sender's groups and consider themselves leader.
func (s *state) fanoutHeartbeatResponse(req *RaftMessageRequest) {
	fromID := NodeID(req.Message.From)
	originNode, ok := s.nodes[fromID]
	if !ok {
		log.Warningf("node %v: not fanning out heartbeat response from unknown node %v",
			s.nodeID, fromID)
		return
	}
	cnt := 0
	for groupID := range originNode.groupIDs {
		// If we don't think that the local node is leader, don't propagate.
		if s.groups[groupID].leader != s.nodeID || fromID == s.nodeID {
			log.V(8).Infof("node %v: not fanning out heartbeat response to %v, msg is from %d and leader is %d",
				s.nodeID, req.Message.To, fromID, s.groups[groupID].leader)
			continue
		}
		if err := s.multiNode.Step(context.Background(), groupID, req.Message); err != nil {
			log.V(4).Infof("node %v: coalesced heartbeat response step failed for message %s", s.nodeID, groupID,
				raft.DescribeMessage(req.Message, s.EntryFormatter))
		}
		cnt++
	}
	log.V(7).Infof("node %v: received coalesced heartbeat response from node %v; "+
		"fanned out to %d leaders in %d overlapping groups",
		s.nodeID, fromID, cnt, len(originNode.groupIDs))
}
Example #4
func logRaftReady(ctx context.Context, prefix fmt.Stringer, ready raft.Ready) {
	if log.V(5) {
		var buf bytes.Buffer
		if ready.SoftState != nil {
			fmt.Fprintf(&buf, "  SoftState updated: %+v\n", *ready.SoftState)
		}
		if !raft.IsEmptyHardState(ready.HardState) {
			fmt.Fprintf(&buf, "  HardState updated: %+v\n", ready.HardState)
		}
		for i, e := range ready.Entries {
			fmt.Fprintf(&buf, "  New Entry[%d]: %.200s\n",
				i, raft.DescribeEntry(e, raftEntryFormatter))
		}
		for i, e := range ready.CommittedEntries {
			fmt.Fprintf(&buf, "  Committed Entry[%d]: %.200s\n",
				i, raft.DescribeEntry(e, raftEntryFormatter))
		}
		if !raft.IsEmptySnap(ready.Snapshot) {
			fmt.Fprintf(&buf, "  Snapshot updated: %.200s\n", ready.Snapshot.String())
		}
		for i, m := range ready.Messages {
			fmt.Fprintf(&buf, "  Outgoing Message[%d]: %.200s\n",
				i, raft.DescribeMessage(m, raftEntryFormatter))
		}
		log.Infof(ctx, "%s raft ready\n%s", prefix, buf.String())
	}
}
Example #5
func (n *node) send(messages []raftpb.Message) {
	for _, m := range messages {
		log.Println(raft.DescribeMessage(m, nil))

		// send message to other node
		//nodes[int(m.To)].receive(n.ctx, m)
		go sendOverNetwork(n.ctx, m)
	}
}
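sendOverNetwork is not defined in this example. The sketch below shows one way such a helper could be written; the peerURL lookup and the /raft endpoint are assumptions made purely for illustration, and the only real API it relies on is the protobuf-generated Marshal method on raftpb.Message.

package transport

import (
	"bytes"
	"context"
	"fmt"
	"log"
	"net/http"

	"github.com/coreos/etcd/raft/raftpb"
)

// peerURL is a placeholder address lookup used only by this sketch.
func peerURL(to uint64) string {
	return fmt.Sprintf("http://127.0.0.1:%d", 9000+to)
}

// sendOverNetwork marshals the raft message and POSTs it to the peer's
// (hypothetical) /raft endpoint, logging rather than retrying on failure.
func sendOverNetwork(ctx context.Context, m raftpb.Message) {
	data, err := m.Marshal()
	if err != nil {
		log.Printf("marshal message to %d failed: %v", m.To, err)
		return
	}
	req, err := http.NewRequestWithContext(ctx, http.MethodPost, peerURL(m.To)+"/raft", bytes.NewReader(data))
	if err != nil {
		log.Printf("build request to %d failed: %v", m.To, err)
		return
	}
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		log.Printf("send to %d failed: %v", m.To, err)
		return
	}
	resp.Body.Close()
}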
Example #6
func (s *state) handleRaftReady(readyGroups map[uint64]raft.Ready) {
	// Soft state is updated immediately; everything else waits for handleWriteReady.
	for groupID, ready := range readyGroups {
		if log.V(5) {
			log.Infof("node %v: group %v raft ready", s.nodeID, groupID)
			if ready.SoftState != nil {
				log.Infof("SoftState updated: %+v", *ready.SoftState)
			}
			if !raft.IsEmptyHardState(ready.HardState) {
				log.Infof("HardState updated: %+v", ready.HardState)
			}
			for i, e := range ready.Entries {
				log.Infof("New Entry[%d]: %.200s", i, raft.DescribeEntry(e, s.EntryFormatter))
			}
			for i, e := range ready.CommittedEntries {
				log.Infof("Committed Entry[%d]: %.200s", i, raft.DescribeEntry(e, s.EntryFormatter))
			}
			if !raft.IsEmptySnap(ready.Snapshot) {
				log.Infof("Snapshot updated: %.200s", ready.Snapshot.String())
			}
			for i, m := range ready.Messages {
				log.Infof("Outgoing Message[%d]: %.200s", i, raft.DescribeMessage(m, s.EntryFormatter))
			}
		}

		g, ok := s.groups[groupID]
		if !ok {
			// This is a stale message for a removed group
			log.V(4).Infof("node %v: dropping stale ready message for group %v", s.nodeID, groupID)
			continue
		}
		term := g.committedTerm
		if ready.SoftState != nil {
			// Always save the leader whenever we get a SoftState.
			g.leader = NodeID(ready.SoftState.Lead)
		}
		if len(ready.CommittedEntries) > 0 {
			term = ready.CommittedEntries[len(ready.CommittedEntries)-1].Term
		}
		if term != g.committedTerm && g.leader != 0 {
			// Whenever the committed term has advanced and we know our leader,
			// emit an event.
			g.committedTerm = term
			s.sendEvent(&EventLeaderElection{
				GroupID: groupID,
				NodeID:  NodeID(g.leader),
				Term:    g.committedTerm,
			})

			// Re-submit all pending proposals
			for _, prop := range g.pending {
				s.proposalChan <- prop
			}
		}
	}
}
Example #7
// send messages to peers.
func (n *node) send(messages []raftpb.Message) {
	for _, m := range messages {
		log.Println("SEND: ", raft.DescribeMessage(m, nil))

		nodes[int(m.To)].receive(context.TODO(), m)
		if m.Type == raftpb.MsgSnap {
			// 	n.raft.ReportSnapshot(m.To, raft.SnapshotFinish)
		}
	}
}
Example #8
func (s *state) handleMessage(req *RaftMessageRequest) {
	// We only want to lazily create the group if it's not heartbeat-related;
	// our heartbeats are coalesced and contain a dummy GroupID.
	switch req.Message.Type {
	case raftpb.MsgHeartbeat:
		s.fanoutHeartbeat(req)
		return
	case raftpb.MsgHeartbeatResp:
		s.fanoutHeartbeatResponse(req)
		return
	}

	s.CacheReplicaDescriptor(req.GroupID, req.FromReplica)
	s.CacheReplicaDescriptor(req.GroupID, req.ToReplica)
	if g, ok := s.groups[req.GroupID]; ok {
		if g.replicaID > req.ToReplica.ReplicaID {
			log.Warningf("node %v: got message for group %s with stale replica ID %s (expected %s)",
				s.nodeID, req.GroupID, req.ToReplica.ReplicaID, g.replicaID)
			return
		} else if g.replicaID < req.ToReplica.ReplicaID {
			// The message has a newer ReplicaID than we know about. This
			// means that this node has been removed from a group and
			// re-added to it, before our GC process was able to remove the
			// remnants of the old group.
			log.Infof("node %v: got message for group %s with newer replica ID (%s vs %s), recreating group",
				s.nodeID, req.GroupID, req.ToReplica.ReplicaID, g.replicaID)
			if err := s.removeGroup(req.GroupID); err != nil {
				log.Warningf("Error removing group %d (in response to incoming message): %s",
					req.GroupID, err)
				return
			}
			if err := s.createGroup(req.GroupID, req.ToReplica.ReplicaID); err != nil {
				log.Warningf("Error recreating group %d (in response to incoming message): %s",
					req.GroupID, err)
				return
			}
		}
	} else {
		if log.V(1) {
			log.Infof("node %v: got message for unknown group %d; creating it", s.nodeID, req.GroupID)
		}
		if err := s.createGroup(req.GroupID, req.ToReplica.ReplicaID); err != nil {
			log.Warningf("Error creating group %d (in response to incoming message): %s",
				req.GroupID, err)
			return
		}
	}

	if err := s.multiNode.Step(context.Background(), uint64(req.GroupID), req.Message); err != nil {
		if log.V(4) {
			log.Infof("node %v: multinode step to group %v failed for message %.200s", s.nodeID, req.GroupID,
				raft.DescribeMessage(req.Message, s.EntryFormatter))
		}
	}
}
Example #9
// fanoutHeartbeat sends the given heartbeat to all groups which believe that
// their leader resides on the sending node.
func (s *state) fanoutHeartbeat(req *RaftMessageRequest) {
	// A heartbeat message is expanded into a heartbeat for each group
	// that the remote node is a part of.
	fromID := proto.RaftNodeID(req.Message.From)
	originNode, ok := s.nodes[fromID]
	if !ok {
		// When a leader considers a follower to be down, it doesn't begin recovery
		// until the follower has successfully responded to a heartbeat. If we get a
		// heartbeat from a node we don't know, it must think we are a follower of
		// some group, so we need to respond so it can activate the recovery process.
		log.Warningf("node %v: not fanning out heartbeat from unknown node %v (but responding anyway)",
			s.nodeID, fromID)
		s.sendMessage(noGroup,
			raftpb.Message{
				From: uint64(s.nodeID),
				To:   req.Message.From,
				Type: raftpb.MsgHeartbeatResp,
			})
		return
	}
	cnt := 0
	for groupID := range originNode.groupIDs {
		// If we don't think that the sending node is leading that group, don't
		// propagate.
		if s.groups[groupID].leader != fromID || fromID == s.nodeID {
			if log.V(8) {
				log.Infof("node %v: not fanning out heartbeat to %v, msg is from %d and leader is %d",
					s.nodeID, req.Message.To, fromID, s.groups[groupID].leader)
			}
			continue
		}
		if err := s.multiNode.Step(context.Background(), uint64(groupID), req.Message); err != nil {
			if log.V(4) {
				log.Infof("node %v: coalesced heartbeat step to group %v failed for message %s", s.nodeID, groupID,
					raft.DescribeMessage(req.Message, s.EntryFormatter))
			}
		}
		cnt++
	}
	if cnt > 0 {
		s.sendMessage(noGroup,
			raftpb.Message{
				From: uint64(s.nodeID),
				To:   req.Message.From,
				Type: raftpb.MsgHeartbeatResp,
			})
	}
	if log.V(7) {
		log.Infof("node %v: received coalesced heartbeat from node %v; "+
			"fanned out to %d followers in %d overlapping groups",
			s.nodeID, fromID, cnt, len(originNode.groupIDs))
	}
}
Example #10
// sendMessage sends a raft message on the given group. Coalesced heartbeats
// address nodes, not groups; they will use the noGroup constant as groupID.
func (s *state) sendMessage(groupID proto.RaftID, msg raftpb.Message) {
	if log.V(6) {
		log.Infof("node %v sending message %.200s to %v", s.nodeID,
			raft.DescribeMessage(msg, s.EntryFormatter), msg.To)
	}
	nodeID := proto.RaftNodeID(msg.To)
	if _, ok := s.nodes[nodeID]; !ok {
		if log.V(4) {
			log.Infof("node %v: connecting to new node %v", s.nodeID, nodeID)
		}
		var err error
		if groupID != noGroup {
			err = s.addNode(nodeID, groupID)
		} else {
			err = s.addNode(nodeID)
		}
		if err != nil {
			log.Errorf("node %v: error adding group %v to node %v: %v",
				s.nodeID, groupID, nodeID, err)
		}
	}
	err := s.Transport.Send(&RaftMessageRequest{groupID, msg})
	snapStatus := raft.SnapshotFinish
	if err != nil {
		log.Warningf("node %v failed to send message to %v: %s", s.nodeID, nodeID, err)
		if groupID != noGroup {
			s.multiNode.ReportUnreachable(msg.To, uint64(groupID))
		}
		snapStatus = raft.SnapshotFailure
	}
	if msg.Type == raftpb.MsgSnap {
		// TODO(bdarnell): add an ack for snapshots and don't report status until
		// ack, error, or timeout.
		if groupID != noGroup {
			s.multiNode.ReportSnapshot(msg.To, uint64(groupID), snapStatus)
		}
	}
}
Example #11
func (s *state) start() {
	log.V(1).Infof("node %v starting", s.nodeID)
	go s.writeTask.start()
	// These maps form a kind of state machine: We don't want to read from the
	// ready channel until the groups we got from the last read have made their
	// way through the rest of the pipeline.
	var readyGroups map[uint64]raft.Ready
	var writingGroups map[uint64]raft.Ready
	// Counts up to heartbeat interval and is then reset.
	ticks := 0
	for {
		// raftReady signals that the Raft state machine has pending
		// work. That work is supplied over the raftReady channel as a map
		// from group ID to raft.Ready struct.
		var raftReady <-chan map[uint64]raft.Ready
		// writeReady is set to the write task's ready channel, which
		// receives when the write task is prepared to persist ready data
		// from the Raft state machine.
		var writeReady chan struct{}

		// The order of operations in this loop structure is as follows:
		// start by setting raftReady to the multiNode's Ready()
		// channel. Once a new raftReady has been consumed from the
		// channel, set writeReady to the write task's ready channel and
		// set raftReady back to nil. This advances our read-from-raft /
		// write-to-storage state machine to the next step: wait for the
		// write task to be ready to persist the new data.
		if readyGroups != nil {
			writeReady = s.writeTask.ready
		} else if writingGroups == nil {
			raftReady = s.multiNode.Ready()
		}

		log.V(8).Infof("node %v: selecting", s.nodeID)
		select {
		case <-s.stopper.ShouldStop():
			log.V(6).Infof("node %v: stopping", s.nodeID)
			s.stop()
			return

		case req := <-s.reqChan:
			log.V(5).Infof("node %v: group %v got message %.200s", s.nodeID, req.GroupID,
				raft.DescribeMessage(req.Message, s.EntryFormatter))
			switch req.Message.Type {
			case raftpb.MsgHeartbeat:
				s.fanoutHeartbeat(req)
			case raftpb.MsgHeartbeatResp:
				s.fanoutHeartbeatResponse(req)
			default:
				if _, ok := s.groups[req.GroupID]; !ok {
					log.Infof("node %v: got message for unknown group %d; creating it", s.nodeID, req.GroupID)
					if err := s.createGroup(req.GroupID); err != nil {
						log.Warningf("Error creating group %d: %s", req.GroupID, err)
						break
					}
				}

				if err := s.multiNode.Step(context.Background(), req.GroupID, req.Message); err != nil {
					log.V(4).Infof("node %v: multinode step failed for message %s", s.nodeID, req.GroupID,
						raft.DescribeMessage(req.Message, s.EntryFormatter))
				}
			}
		case op := <-s.createGroupChan:
			log.V(6).Infof("node %v: got op %#v", s.nodeID, op)
			op.ch <- s.createGroup(op.groupID)

		case op := <-s.removeGroupChan:
			log.V(6).Infof("node %v: got op %#v", s.nodeID, op)
			s.removeGroup(op)

		case prop := <-s.proposalChan:
			s.propose(prop)

		case readyGroups = <-raftReady:
			s.handleRaftReady(readyGroups)

		case writeReady <- struct{}{}:
			s.handleWriteReady(readyGroups)
			writingGroups = readyGroups
			readyGroups = nil

		case resp := <-s.writeTask.out:
			s.handleWriteResponse(resp, writingGroups)
			s.multiNode.Advance(writingGroups)
			writingGroups = nil

		case <-s.Ticker.Chan():
			log.V(8).Infof("node %v: got tick", s.nodeID)
			s.multiNode.Tick()
			ticks++
			if ticks >= s.HeartbeatIntervalTicks {
				ticks = 0
				s.coalescedHeartbeat()
			}

		case cb := <-s.callbackChan:
			cb()
		}
	}
}
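The raftReady/writeReady juggling above relies on a standard Go idiom: a send or receive on a nil channel blocks forever, so a select case can be switched on or off each iteration by assigning or nil-ing a channel variable. The standalone sketch below shows just that gating pattern, with made-up channels standing in for multiNode.Ready() and the write task's ready channel.

package main

import "fmt"

func main() {
	source := make(chan string) // stands in for multiNode.Ready()
	sink := make(chan string)   // stands in for the write task's ready channel
	done := make(chan struct{})

	go func() {
		source <- "batch-1"
		source <- "batch-2"
	}()
	go func() {
		for i := 0; i < 2; i++ {
			fmt.Println("persisted", <-sink)
		}
		close(done)
	}()

	var pending string
	for {
		// Enable exactly one of the two cases per iteration; a nil channel
		// never becomes ready, so its case is effectively disabled.
		var recv, send chan string
		if pending == "" {
			recv = source // nothing in flight: accept new work
		} else {
			send = sink // work in flight: wait until the writer can take it
		}

		select {
		case pending = <-recv:
		case send <- pending:
			pending = ""
		case <-done:
			return
		}
	}
}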
Example #12
func (s *state) start() {
	s.stopper.RunWorker(func() {
		defer s.stop()
		if log.V(1) {
			log.Infof("node %v starting", s.nodeID)
		}
		s.writeTask.start(s.stopper)
		// Counts up to heartbeat interval and is then reset.
		ticks := 0
		// checkReadyGroupIDs tracks the group IDs that should be checked
		// for pending Ready work on the next pass through the loop.
		checkReadyGroupIDs := map[roachpb.RangeID]struct{}{}
		for {
			// writeReady is set to the write task's ready channel, which
			// receives when the write task is prepared to persist ready data
			// from the Raft state machine.
			var writeReady chan struct{}
			var eventsChan chan []interface{}

			// If there are pending checks, see whether the write task is ready to do its work.
			if len(checkReadyGroupIDs) > 0 {
				writeReady = s.writeTask.ready
			}

			// If there are any pending events, check whether s.Events is
			// free to accept them.
			if len(s.pendingEvents) > 0 {
				eventsChan = s.Events
			}

			if log.V(8) {
				log.Infof("node %v: selecting", s.nodeID)
			}
			select {
			case <-s.stopper.ShouldStop():
				return

			case req := <-s.reqChan:
				if log.V(5) {
					log.Infof("node %v: group %v got message %.200s", s.nodeID, req.GroupID,
						raft.DescribeMessage(req.Message, s.EntryFormatter))
				}
				s.handleMessage(req)
				checkReadyGroupIDs[req.GroupID] = struct{}{}

			case op := <-s.createGroupChan:
				if log.V(6) {
					log.Infof("node %v: got op %#v", s.nodeID, op)
				}
				op.ch <- s.createGroup(op.groupID, 0)
				checkReadyGroupIDs[op.groupID] = struct{}{}

			case op := <-s.removeGroupChan:
				if log.V(6) {
					log.Infof("node %v: got op %#v", s.nodeID, op)
				}
				op.ch <- s.removeGroup(op.groupID, op.replicaID)

			case prop := <-s.proposalChan:
				s.propose(prop)
				checkReadyGroupIDs[prop.groupID] = struct{}{}

			case groupID := <-s.campaignChan:
				if log.V(6) {
					log.Infof("node %v: got campaign for group %#v", s.nodeID, groupID)
				}
				if _, ok := s.groups[groupID]; !ok {
					if err := s.createGroup(groupID, 0); err != nil {
						log.Warningf("node %s failed to create group %s during MultiRaft.Campaign: %s",
							s.nodeID, groupID, err)
						continue
					}
				}
				if err := s.groups[groupID].raftGroup.Campaign(); err != nil {
					log.Warningf("node %s failed to campaign for group %s: %s", s.nodeID, groupID, err)
				}
				checkReadyGroupIDs[groupID] = struct{}{}

			case writeReady <- struct{}{}:
				writingGroups := s.handleWriteReady(checkReadyGroupIDs)
				checkReadyGroupIDs = map[roachpb.RangeID]struct{}{}
				// No Ready for any group
				if len(writingGroups) == 0 {
					break
				}
				s.logRaftReady(writingGroups)

				select {
				case resp := <-s.writeTask.out:
					s.handleWriteResponse(resp, writingGroups)
					// Advance each group past rd and mark it to be checked again:
					// handleWriteResponse may issue new proposals, generating more Ready work.
					for groupID, rd := range writingGroups {
						if g, ok := s.groups[groupID]; ok {
							g.raftGroup.Advance(rd)
							checkReadyGroupIDs[groupID] = struct{}{}
						}
					}
				case <-s.stopper.ShouldStop():
					return
				}

			case <-s.Ticker.Chan():
				if log.V(8) {
					log.Infof("node %v: got tick", s.nodeID)
				}
				for groupID, g := range s.groups {
					g.raftGroup.Tick()
					checkReadyGroupIDs[groupID] = struct{}{}
				}
				ticks++
				if ticks >= s.HeartbeatIntervalTicks {
					ticks = 0
					s.coalescedHeartbeat()
				}

			case cb := <-s.callbackChan:
				if log.V(8) {
					log.Infof("node %v: got callback", s.nodeID)
				}
				cb()

			case eventsChan <- s.pendingEvents:
				if log.V(8) {
					log.Infof("node %v: send pendingEvents len %d", s.nodeID, len(s.pendingEvents))
				}
				s.pendingEvents = nil

			case statusOp := <-s.statusChan:
				if log.V(8) {
					log.Infof("node %v: receive status groupID %d", s.nodeID, statusOp.groupID)
				}
				g := s.groups[statusOp.groupID]
				if g == nil {
					log.Infof("node %v: g is nil for statusOp for groupID %v", s.nodeID, statusOp.groupID)
					statusOp.ch <- nil
					break
				}
				statusOp.ch <- g.raftGroup.Status()
			}
		}
	})
}
Example #13
func (s *state) start() {
	s.stopper.RunWorker(func() {
		defer func() {
			if log.V(6) {
				log.Infof("node %v: stopping", s.nodeID)
			}
			s.stop()
		}()
		if log.V(1) {
			log.Infof("node %v starting", s.nodeID)
		}
		s.writeTask.start(s.stopper)
		// These maps form a kind of state machine: We don't want to read from the
		// ready channel until the groups we got from the last read have made their
		// way through the rest of the pipeline.
		var readyGroups map[uint64]raft.Ready
		var writingGroups map[uint64]raft.Ready
		// Counts up to heartbeat interval and is then reset.
		ticks := 0
		for {
			// raftReady signals that the Raft state machine has pending
			// work. That work is supplied over the raftReady channel as a map
			// from group ID to raft.Ready struct.
			var raftReady <-chan map[uint64]raft.Ready
			// writeReady is set to the write task's ready channel, which
			// receives when the write task is prepared to persist ready data
			// from the Raft state machine.
			// The writeReady mechanism is currently disabled as we are testing
			// performing all writes synchronously.
			// TODO(bdarnell): either reinstate writeReady or rip it out completely.
			//var writeReady chan struct{}

			// The order of operations in this loop structure is as follows:
			// start by setting raftReady to the multiNode's Ready()
			// channel. Once a new raftReady has been consumed from the
			// channel, set writeReady to the write task's ready channel and
			// set raftReady back to nil. This advances our read-from-raft /
			// write-to-storage state machine to the next step: wait for the
			// write task to be ready to persist the new data.
			if readyGroups != nil {
				//writeReady = s.writeTask.ready
			} else if writingGroups == nil {
				raftReady = s.multiNode.Ready()
			}

			if log.V(8) {
				log.Infof("node %v: selecting", s.nodeID)
			}
			select {
			case <-s.stopper.ShouldStop():
				return

			case req := <-s.reqChan:
				if log.V(5) {
					log.Infof("node %v: group %v got message %.200s", s.nodeID, req.GroupID,
						raft.DescribeMessage(req.Message, s.EntryFormatter))
				}
				switch req.Message.Type {
				case raftpb.MsgHeartbeat:
					s.fanoutHeartbeat(req)
				case raftpb.MsgHeartbeatResp:
					s.fanoutHeartbeatResponse(proto.RaftNodeID(req.Message.From))
				default:
					// We only want to lazily create the group if it's not heartbeat-related;
					// our heartbeats are coalesced and contain a dummy GroupID.
					// TODO(tschottdorf) still shouldn't hurt to move this part outside,
					// but suddenly tests will start failing. Should investigate.
					if _, ok := s.groups[req.GroupID]; !ok {
						log.Infof("node %v: got message for unknown group %d; creating it", s.nodeID, req.GroupID)
						if err := s.createGroup(req.GroupID); err != nil {
							log.Warningf("Error creating group %d: %s", req.GroupID, err)
							break
						}
					}

					if err := s.multiNode.Step(context.Background(), uint64(req.GroupID), req.Message); err != nil {
						if log.V(4) {
							log.Infof("node %v: multinode step to group %v failed for message %.200s", s.nodeID, req.GroupID,
								raft.DescribeMessage(req.Message, s.EntryFormatter))
						}
					}
				}
			case op := <-s.createGroupChan:
				if log.V(6) {
					log.Infof("node %v: got op %#v", s.nodeID, op)
				}
				op.ch <- s.createGroup(op.groupID)

			case op := <-s.removeGroupChan:
				if log.V(6) {
					log.Infof("node %v: got op %#v", s.nodeID, op)
				}
				s.removeGroup(op, readyGroups)

			case prop := <-s.proposalChan:
				s.propose(prop)

			case readyGroups = <-raftReady:
				// readyGroups are saved in a local variable until they can be sent to
				// the write task (and then the real work happens after the write is
				// complete). All we do for now is log them.
				s.logRaftReady(readyGroups)

				select {
				case s.writeTask.ready <- struct{}{}:
				case <-s.stopper.ShouldStop():
					return
				}
				s.handleWriteReady(readyGroups)
				writingGroups = readyGroups
				readyGroups = nil

				select {
				case resp := <-s.writeTask.out:
					s.handleWriteResponse(resp, writingGroups)
					s.multiNode.Advance(writingGroups)
					writingGroups = nil
				case <-s.stopper.ShouldStop():
					return
				}

			case <-s.Ticker.Chan():
				if log.V(8) {
					log.Infof("node %v: got tick", s.nodeID)
				}
				s.multiNode.Tick()
				ticks++
				if ticks >= s.HeartbeatIntervalTicks {
					ticks = 0
					s.coalescedHeartbeat()
				}

			case cb := <-s.callbackChan:
				cb()
			}
		}
	})
}
Example #14
func (s *state) handleWriteResponse(response *writeResponse, readyGroups map[uint64]raft.Ready) {
	log.V(6).Infof("node %v got write response: %#v", s.nodeID, *response)
	// Everything has been written to disk; now we can apply updates to the state machine
	// and send outgoing messages.
	for groupID, ready := range readyGroups {
		g, ok := s.groups[groupID]
		if !ok {
			log.V(4).Infof("dropping stale write to group %v", groupID)
			continue
		}
		for _, entry := range ready.CommittedEntries {
			var commandID string
			switch entry.Type {
			case raftpb.EntryNormal:
				// etcd raft occasionally adds a nil entry (e.g. upon election); ignore these.
				if entry.Data != nil {
					var command []byte
					commandID, command = decodeCommand(entry.Data)
					s.sendEvent(&EventCommandCommitted{
						GroupID:   groupID,
						CommandID: commandID,
						Command:   command,
					})
				}

			case raftpb.EntryConfChange:
				cc := raftpb.ConfChange{}
				err := cc.Unmarshal(entry.Data)
				if err != nil {
					log.Fatalf("invalid ConfChange data: %s", err)
				}
				var payload []byte
				if len(cc.Context) > 0 {
					commandID, payload = decodeCommand(cc.Context)
				}
				s.sendEvent(&EventMembershipChangeCommitted{
					GroupID:    groupID,
					CommandID:  commandID,
					NodeID:     NodeID(cc.NodeID),
					ChangeType: cc.Type,
					Payload:    payload,
					Callback: func(err error) {
						s.callbackChan <- func() {
							if err == nil {
								log.V(3).Infof("node %v applying configuration change %v", s.nodeID, cc)
								// TODO(bdarnell): dedupe by keeping a record of recently-applied commandIDs
								switch cc.Type {
								case raftpb.ConfChangeAddNode:
									err = s.addNode(NodeID(cc.NodeID), groupID)
								case raftpb.ConfChangeRemoveNode:
									// TODO(bdarnell): support removing nodes; fix double-application of initial entries
								case raftpb.ConfChangeUpdateNode:
									// Updates don't concern multiraft, they are simply passed through.
								}
								if err != nil {
									log.Errorf("error applying configuration change %v: %s", cc, err)
								}
								s.multiNode.ApplyConfChange(groupID, cc)
							} else {
								log.Warningf("aborting configuration change: %s", err)
								s.multiNode.ApplyConfChange(groupID,
									raftpb.ConfChange{})
							}

							// Re-submit all pending proposals, in case any of them were config changes
							// that were dropped due to the one-at-a-time rule. This is a little
							// redundant since most pending proposals won't benefit from this but
							// config changes should be rare enough (and the size of the pending queue
							// small enough) that it doesn't really matter.
							for _, prop := range g.pending {
								s.proposalChan <- prop
							}
						}
					},
				})
			}
			if p, ok := g.pending[commandID]; ok {
				// TODO(bdarnell): the command is now committed, but not applied until the
				// application consumes EventCommandCommitted. Is closing the channel
				// at this point useful or do we need to wait for the command to be
				// applied too?
				// This could be done with a Callback as in EventMembershipChangeCommitted
				// or perhaps we should move away from a channel to a callback-based system.
				if p.ch != nil {
					// Because of the way we re-queue proposals during leadership
					// changes, we may close the same proposal object twice.
					close(p.ch)
					p.ch = nil
				}
				delete(g.pending, commandID)
			}
		}

		noMoreHeartbeats := make(map[uint64]struct{})
		for _, msg := range ready.Messages {
			switch msg.Type {
			case raftpb.MsgHeartbeat:
				log.V(7).Infof("node %v dropped individual heartbeat to node %v",
					s.nodeID, msg.To)
				continue
			case raftpb.MsgHeartbeatResp:
				if _, ok := noMoreHeartbeats[msg.To]; ok {
					log.V(7).Infof("node %v dropped redundant heartbeat response to node %v",
						s.nodeID, msg.To)
					continue
				}
				noMoreHeartbeats[msg.To] = struct{}{}
			}

			log.V(6).Infof("node %v sending message %.200s to %v", s.nodeID,
				raft.DescribeMessage(msg, s.EntryFormatter), msg.To)
			nodeID := NodeID(msg.To)
			if _, ok := s.nodes[nodeID]; !ok {
				log.V(4).Infof("node %v: connecting to new node %v", s.nodeID, nodeID)
				if err := s.addNode(nodeID, groupID); err != nil {
					log.Errorf("node %v: error adding node %v", s.nodeID, nodeID)
				}
			}
			err := s.Transport.Send(NodeID(msg.To), &RaftMessageRequest{groupID, msg})
			snapStatus := raft.SnapshotFinish
			if err != nil {
				log.Warningf("node %v failed to send message to %v", s.nodeID, nodeID)
				s.multiNode.ReportUnreachable(msg.To, groupID)
				snapStatus = raft.SnapshotFailure
			}
			if msg.Type == raftpb.MsgSnap {
				// TODO(bdarnell): add an ack for snapshots and don't report status until
				// ack, error, or timeout.
				s.multiNode.ReportSnapshot(msg.To, groupID, snapStatus)
			}
		}
	}
}
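decodeCommand and its encoding counterpart are not part of this snippet. As a hedged sketch of one plausible scheme, the command ID can simply be a fixed-length prefix of the entry data; the 16-byte length here is an assumption for illustration, not a statement about CockroachDB's actual encoding.

// commandIDLen is the assumed fixed length of the command ID prefix.
const commandIDLen = 16

// encodeCommand prepends the command ID to the command payload so that both
// travel through raft as a single entry.
func encodeCommand(commandID string, command []byte) []byte {
	if len(commandID) != commandIDLen {
		panic("encodeCommand: unexpected command ID length")
	}
	return append([]byte(commandID), command...)
}

// decodeCommand splits an encoded entry back into its command ID and payload.
func decodeCommand(data []byte) (commandID string, command []byte) {
	return string(data[:commandIDLen]), data[commandIDLen:]
}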
Example #15
func (s *state) start() {
	s.stopper.RunWorker(func() {
		defer s.stop()
		if log.V(1) {
			log.Infof("node %v starting", s.nodeID)
		}
		s.writeTask.start(s.stopper)
		// The maps s.readyGroups and writingGroups form a kind of state
		// machine: We don't want to read from the ready channel until the
		// groups we got from the last read have made their way through
		// the rest of the pipeline.
		var writingGroups map[uint64]raft.Ready
		// Counts up to heartbeat interval and is then reset.
		ticks := 0
		for {
			// raftReady signals that the Raft state machine has pending
			// work. That work is supplied over the raftReady channel as a map
			// from group ID to raft.Ready struct.
			var raftReady <-chan map[uint64]raft.Ready
			// writeReady is set to the write task's ready channel, which
			// receives when the write task is prepared to persist ready data
			// from the Raft state machine.
			// The writeReady mechanism is currently disabled as we are testing
			// performing all writes synchronously.
			// TODO(bdarnell): either reinstate writeReady or rip it out completely.
			//var writeReady chan struct{}
			var eventsChan chan []interface{}

			// The order of operations in this loop structure is as follows:
			// start by setting raftReady to the multiNode's Ready()
			// channel. Once a new raftReady has been consumed from the
			// channel, set writeReady to the write task's ready channel and
			// set raftReady back to nil. This advances our read-from-raft /
			// write-to-storage state machine to the next step: wait for the
			// write task to be ready to persist the new data.
			if s.readyGroups != nil {
				//writeReady = s.writeTask.ready
			} else if writingGroups == nil {
				raftReady = s.multiNode.Ready()
			}

			// If there are any pending events, check whether s.Events is
			// free to accept them.
			if len(s.pendingEvents) > 0 {
				eventsChan = s.Events
			}

			if log.V(8) {
				log.Infof("node %v: selecting", s.nodeID)
			}
			select {
			case <-s.stopper.ShouldStop():
				return

			case req := <-s.reqChan:
				if log.V(5) {
					log.Infof("node %v: group %v got message %.200s", s.nodeID, req.GroupID,
						raft.DescribeMessage(req.Message, s.EntryFormatter))
				}
				s.handleMessage(req)

			case op := <-s.createGroupChan:
				if log.V(6) {
					log.Infof("node %v: got op %#v", s.nodeID, op)
				}
				op.ch <- s.createGroup(op.groupID, 0)

			case op := <-s.removeGroupChan:
				if log.V(6) {
					log.Infof("node %v: got op %#v", s.nodeID, op)
				}
				op.ch <- s.removeGroup(op.groupID)

			case prop := <-s.proposalChan:
				s.propose(prop)

			case s.readyGroups = <-raftReady:
				// readyGroups are saved in a local variable until they can be sent to
				// the write task (and then the real work happens after the write is
				// complete). All we do for now is log them.
				s.logRaftReady()

				select {
				case s.writeTask.ready <- struct{}{}:
				case <-s.stopper.ShouldStop():
					return
				}
				s.handleWriteReady()
				writingGroups = s.readyGroups
				s.readyGroups = nil

				select {
				case resp := <-s.writeTask.out:
					s.handleWriteResponse(resp, writingGroups)
					s.multiNode.Advance(writingGroups)
					writingGroups = nil
				case <-s.stopper.ShouldStop():
					return
				}

			case <-s.Ticker.Chan():
				if log.V(8) {
					log.Infof("node %v: got tick", s.nodeID)
				}
				s.multiNode.Tick()
				ticks++
				if ticks >= s.HeartbeatIntervalTicks {
					ticks = 0
					s.coalescedHeartbeat()
				}

			case cb := <-s.callbackChan:
				if log.V(8) {
					log.Infof("node %v: got callback", s.nodeID)
				}
				cb()

			case eventsChan <- s.pendingEvents:
				if log.V(8) {
					log.Infof("node %v: send pendingEvents len %d", s.nodeID, len(s.pendingEvents))
				}
				s.pendingEvents = nil
			}
		}
	})
}
Example #16
// fanoutHeartbeat sends the given heartbeat to all groups which believe that
// their leader resides on the sending node.
func (s *state) fanoutHeartbeat(req *RaftMessageRequest) {
	// A heartbeat message is expanded into a heartbeat for each group
	// that the remote node is a part of.
	fromID := roachpb.NodeID(req.Message.From)
	groupCount := 0
	followerCount := 0
	if originNode, ok := s.nodes[fromID]; ok {
		for groupID := range originNode.groupIDs {
			groupCount++
			// If we don't think that the sending node is leading that group, don't
			// propagate.
			if s.groups[groupID].leader.NodeID != fromID || fromID == s.nodeID {
				if log.V(8) {
					log.Infof("node %s: not fanning out heartbeat to %s, msg is from %s and leader is %s",
						s.nodeID, groupID, fromID, s.groups[groupID].leader)
				}
				continue
			}

			fromRepID, err := s.Storage.ReplicaIDForStore(groupID, req.FromReplica.StoreID)
			if err != nil {
				if log.V(3) {
					log.Infof("node %s: not fanning out heartbeat to %s, could not find replica id for sending store %s",
						s.nodeID, groupID, req.FromReplica.StoreID)
				}
				continue
			}

			toRepID, err := s.Storage.ReplicaIDForStore(groupID, req.ToReplica.StoreID)
			if err != nil {
				if log.V(3) {
					log.Infof("node %s: not fanning out heartbeat to %s, could not find replica id for receiving store %s",
						s.nodeID, groupID, req.ToReplica.StoreID)
				}
				continue
			}
			followerCount++

			groupMsg := raftpb.Message{
				Type: raftpb.MsgHeartbeat,
				To:   uint64(toRepID),
				From: uint64(fromRepID),
			}

			if err := s.multiNode.Step(context.Background(), uint64(groupID), groupMsg); err != nil {
				if log.V(4) {
					log.Infof("node %v: coalesced heartbeat step to group %v failed for message %s", s.nodeID, groupID,
						raft.DescribeMessage(req.Message, s.EntryFormatter))
				}
			}
		}
	}
	// We must respond whether we forwarded the heartbeat to any groups
	// or not. When a leader considers a follower to be down, it doesn't
	// begin recovery until the follower has successfully responded to a
	// heartbeat. If we get a heartbeat from a node we don't know, it
	// must think we are a follower of some group, so we need to respond
	// so it can activate the recovery process.
	s.sendMessage(nil,
		raftpb.Message{
			From: uint64(s.nodeID),
			To:   req.Message.From,
			Type: raftpb.MsgHeartbeatResp,
		})
	if log.V(7) {
		log.Infof("node %v: received coalesced heartbeat from node %v; "+
			"fanned out to %d followers in %d overlapping groups",
			s.nodeID, fromID, followerCount, groupCount)
	}
}
Example #17
// sendMessage sends a raft message on the given group. Coalesced heartbeats
// address nodes, not groups; they will use the noGroup constant as groupID.
func (s *state) sendMessage(g *group, msg raftpb.Message) {
	if log.V(6) {
		log.Infof("node %v sending message %.200s to %v", s.nodeID,
			raft.DescribeMessage(msg, s.EntryFormatter), msg.To)
	}
	groupID := noGroup
	var toReplica roachpb.ReplicaDescriptor
	var fromReplica roachpb.ReplicaDescriptor
	if g == nil {
		// No group (a coalesced heartbeat): To/From fields are NodeIDs.
		// TODO(bdarnell): test transports route by store ID, not node ID.
		// In tests they're always the same, so we can hack it here but
		// it would be better to fix the transports.
		// I think we need to fix this before we can support a range
		// with two replicas on different stores of the same node.
		toReplica.NodeID = roachpb.NodeID(msg.To)
		toReplica.StoreID = roachpb.StoreID(msg.To)
		fromReplica.NodeID = roachpb.NodeID(msg.From)
		fromReplica.StoreID = roachpb.StoreID(msg.From)
	} else {
		// Regular message: To/From fields are replica IDs.
		groupID = g.id
		var err error
		toReplica, err = s.ReplicaDescriptor(groupID, roachpb.ReplicaID(msg.To))
		if err != nil {
			log.Warningf("failed to lookup recipient replica %d in group %d: %s", msg.To, groupID, err)
			return
		}
		fromReplica, err = s.ReplicaDescriptor(groupID, roachpb.ReplicaID(msg.From))
		if err != nil {
			log.Warningf("failed to lookup sender replica %d in group %d: %s", msg.From, groupID, err)
			return
		}
	}
	if _, ok := s.nodes[toReplica.NodeID]; !ok {
		if log.V(4) {
			log.Infof("node %v: connecting to new node %v", s.nodeID, toReplica.NodeID)
		}
		if err := s.addNode(toReplica.NodeID, g); err != nil {
			log.Errorf("node %v: error adding group %v to node %v: %v",
				s.nodeID, groupID, toReplica.NodeID, err)
		}
	}
	err := s.Transport.Send(&RaftMessageRequest{
		GroupID:     groupID,
		ToReplica:   toReplica,
		FromReplica: fromReplica,
		Message:     msg,
	})
	snapStatus := raft.SnapshotFinish
	if err != nil {
		log.Warningf("node %v failed to send message to %v: %s", s.nodeID, toReplica.NodeID, err)
		if groupID != noGroup {
			s.multiNode.ReportUnreachable(msg.To, uint64(groupID))
		}
		snapStatus = raft.SnapshotFailure
	}
	if msg.Type == raftpb.MsgSnap {
		// TODO(bdarnell): add an ack for snapshots and don't report status until
		// ack, error, or timeout.
		if groupID != noGroup {
			s.multiNode.ReportSnapshot(msg.To, uint64(groupID), snapStatus)
		}
	}
}