func logRaftReady(storeID roachpb.StoreID, groupID roachpb.RangeID, ready raft.Ready) {
	if log.V(5) {
		// Globally synchronize to avoid interleaving different sets of logs in tests.
		logRaftReadyMu.Lock()
		defer logRaftReadyMu.Unlock()
		log.Infof("store %s: group %s raft ready", storeID, groupID)
		if ready.SoftState != nil {
			log.Infof("SoftState updated: %+v", *ready.SoftState)
		}
		if !raft.IsEmptyHardState(ready.HardState) {
			log.Infof("HardState updated: %+v", ready.HardState)
		}
		for i, e := range ready.Entries {
			log.Infof("New Entry[%d]: %.200s", i, raft.DescribeEntry(e, raftEntryFormatter))
		}
		for i, e := range ready.CommittedEntries {
			log.Infof("Committed Entry[%d]: %.200s", i, raft.DescribeEntry(e, raftEntryFormatter))
		}
		if !raft.IsEmptySnap(ready.Snapshot) {
			log.Infof("Snapshot updated: %.200s", ready.Snapshot.String())
		}
		for i, m := range ready.Messages {
			log.Infof("Outgoing Message[%d]: %.200s", i, raft.DescribeMessage(m, raftEntryFormatter))
		}
	}
}
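// The helper above assumes two package-level declarations that are not shown
// in this section: logRaftReadyMu and raftEntryFormatter. A minimal,
// hypothetical sketch of what they might look like (the real package defines
// its own versions; imports are omitted as elsewhere in this section):
var logRaftReadyMu sync.Mutex

// raftEntryFormatter matches etcd raft's EntryFormatter signature
// (func([]byte) string); this placeholder just reports the payload size.
func raftEntryFormatter(data []byte) string {
	if len(data) == 0 {
		return "[empty]"
	}
	return fmt.Sprintf("[%d bytes]", len(data))
}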
func (s *state) logRaftReady(readyGroups map[uint64]raft.Ready) {
	for groupID, ready := range readyGroups {
		if log.V(5) {
			log.Infof("node %v: group %v raft ready", s.nodeID, groupID)
			if ready.SoftState != nil {
				log.Infof("SoftState updated: %+v", *ready.SoftState)
			}
			if !raft.IsEmptyHardState(ready.HardState) {
				log.Infof("HardState updated: %+v", ready.HardState)
			}
			for i, e := range ready.Entries {
				log.Infof("New Entry[%d]: %.200s", i, raft.DescribeEntry(e, s.EntryFormatter))
			}
			for i, e := range ready.CommittedEntries {
				log.Infof("Committed Entry[%d]: %.200s", i, raft.DescribeEntry(e, s.EntryFormatter))
			}
			if !raft.IsEmptySnap(ready.Snapshot) {
				log.Infof("Snapshot updated: %.200s", ready.Snapshot.String())
			}
			for i, m := range ready.Messages {
				log.Infof("Outgoing Message[%d]: %.200s", i, raft.DescribeMessage(m, s.EntryFormatter))
			}
		}
	}
}
// fanoutHeartbeatResponse sends the given heartbeat response to all groups
// which overlap with the sender's groups and consider themselves leader.
func (s *state) fanoutHeartbeatResponse(req *RaftMessageRequest) {
	fromID := NodeID(req.Message.From)
	originNode, ok := s.nodes[fromID]
	if !ok {
		log.Warningf("node %v: not fanning out heartbeat response from unknown node %v",
			s.nodeID, fromID)
		return
	}
	cnt := 0
	for groupID := range originNode.groupIDs {
		// If we don't think that the local node is leader, don't propagate.
		if s.groups[groupID].leader != s.nodeID || fromID == s.nodeID {
			log.V(8).Infof("node %v: not fanning out heartbeat response to %v, msg is from %d and leader is %d",
				s.nodeID, req.Message.To, fromID, s.groups[groupID].leader)
			continue
		}
		if err := s.multiNode.Step(context.Background(), groupID, req.Message); err != nil {
			log.V(4).Infof("node %v: coalesced heartbeat response step to group %v failed for message %s",
				s.nodeID, groupID, raft.DescribeMessage(req.Message, s.EntryFormatter))
		}
		cnt++
	}
	log.V(7).Infof("node %v: received coalesced heartbeat response from node %v; "+
		"fanned out to %d leaders in %d overlapping groups",
		s.nodeID, fromID, cnt, len(originNode.groupIDs))
}
func logRaftReady(ctx context.Context, prefix fmt.Stringer, ready raft.Ready) {
	if log.V(5) {
		var buf bytes.Buffer
		if ready.SoftState != nil {
			fmt.Fprintf(&buf, "  SoftState updated: %+v\n", *ready.SoftState)
		}
		if !raft.IsEmptyHardState(ready.HardState) {
			fmt.Fprintf(&buf, "  HardState updated: %+v\n", ready.HardState)
		}
		for i, e := range ready.Entries {
			fmt.Fprintf(&buf, "  New Entry[%d]: %.200s\n", i, raft.DescribeEntry(e, raftEntryFormatter))
		}
		for i, e := range ready.CommittedEntries {
			fmt.Fprintf(&buf, "  Committed Entry[%d]: %.200s\n", i, raft.DescribeEntry(e, raftEntryFormatter))
		}
		if !raft.IsEmptySnap(ready.Snapshot) {
			fmt.Fprintf(&buf, "  Snapshot updated: %.200s\n", ready.Snapshot.String())
		}
		for i, m := range ready.Messages {
			fmt.Fprintf(&buf, "  Outgoing Message[%d]: %.200s\n", i, raft.DescribeMessage(m, raftEntryFormatter))
		}
		log.Infof(ctx, "%s raft ready\n%s", prefix, buf.String())
	}
}
func (n *node) send(messages []raftpb.Message) {
	for _, m := range messages {
		log.Println(raft.DescribeMessage(m, nil))
		// Send the message to the other node over the network; the direct
		// in-process delivery is left commented out.
		//nodes[int(m.To)].receive(n.ctx, m)
		go sendOverNetwork(n.ctx, m)
	}
}
func (s *state) handleRaftReady(readyGroups map[uint64]raft.Ready) {
	// Soft state is updated immediately; everything else waits for handleWriteReady.
	for groupID, ready := range readyGroups {
		if log.V(5) {
			log.Infof("node %v: group %v raft ready", s.nodeID, groupID)
			if ready.SoftState != nil {
				log.Infof("SoftState updated: %+v", *ready.SoftState)
			}
			if !raft.IsEmptyHardState(ready.HardState) {
				log.Infof("HardState updated: %+v", ready.HardState)
			}
			for i, e := range ready.Entries {
				log.Infof("New Entry[%d]: %.200s", i, raft.DescribeEntry(e, s.EntryFormatter))
			}
			for i, e := range ready.CommittedEntries {
				log.Infof("Committed Entry[%d]: %.200s", i, raft.DescribeEntry(e, s.EntryFormatter))
			}
			if !raft.IsEmptySnap(ready.Snapshot) {
				log.Infof("Snapshot updated: %.200s", ready.Snapshot.String())
			}
			for i, m := range ready.Messages {
				log.Infof("Outgoing Message[%d]: %.200s", i, raft.DescribeMessage(m, s.EntryFormatter))
			}
		}
		g, ok := s.groups[groupID]
		if !ok {
			// This is a stale message for a removed group.
			log.V(4).Infof("node %v: dropping stale ready message for group %v", s.nodeID, groupID)
			continue
		}
		term := g.committedTerm
		if ready.SoftState != nil {
			// Always save the leader whenever we get a SoftState.
			g.leader = NodeID(ready.SoftState.Lead)
		}
		if len(ready.CommittedEntries) > 0 {
			term = ready.CommittedEntries[len(ready.CommittedEntries)-1].Term
		}
		if term != g.committedTerm && g.leader != 0 {
			// Whenever the committed term has advanced and we know our leader,
			// emit an event.
			g.committedTerm = term
			s.sendEvent(&EventLeaderElection{
				GroupID: groupID,
				NodeID:  NodeID(g.leader),
				Term:    g.committedTerm,
			})
			// Re-submit all pending proposals.
			for _, prop := range g.pending {
				s.proposalChan <- prop
			}
		}
	}
}
// send messages to peers.
func (n *node) send(messages []raftpb.Message) {
	for _, m := range messages {
		log.Println("SEND: ", raft.DescribeMessage(m, nil))
		nodes[int(m.To)].receive(context.TODO(), m)
		if m.Type == raftpb.MsgSnap {
			// n.raft.ReportSnapshot(m.To, raft.SnapshotFinish)
		}
	}
}
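// For context, both send variants above are driven from a raft.Node event
// loop. A hedged sketch of such a loop in the style of the etcd raft
// examples; n.raft (a raft.Node), n.storage (a *raft.MemoryStorage), and the
// tick interval are assumptions, and HardState/snapshot persistence is
// omitted for brevity.
func (n *node) run() {
	ticker := time.NewTicker(100 * time.Millisecond)
	defer ticker.Stop()
	for {
		select {
		case <-ticker.C:
			// Drive raft's logical clock for elections and heartbeats.
			n.raft.Tick()
		case rd := <-n.raft.Ready():
			// Persist new entries before sending any messages, then
			// tell raft we are done with this batch of Ready work.
			n.storage.Append(rd.Entries)
			n.send(rd.Messages)
			n.raft.Advance()
		}
	}
}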
func (s *state) handleMessage(req *RaftMessageRequest) {
	// We only want to lazily create the group if it's not heartbeat-related;
	// our heartbeats are coalesced and contain a dummy GroupID.
	switch req.Message.Type {
	case raftpb.MsgHeartbeat:
		s.fanoutHeartbeat(req)
		return
	case raftpb.MsgHeartbeatResp:
		s.fanoutHeartbeatResponse(req)
		return
	}

	s.CacheReplicaDescriptor(req.GroupID, req.FromReplica)
	s.CacheReplicaDescriptor(req.GroupID, req.ToReplica)

	if g, ok := s.groups[req.GroupID]; ok {
		if g.replicaID > req.ToReplica.ReplicaID {
			log.Warningf("node %v: got message for group %s with stale replica ID %s (expected %s)",
				s.nodeID, req.GroupID, req.ToReplica.ReplicaID, g.replicaID)
			return
		} else if g.replicaID < req.ToReplica.ReplicaID {
			// The message has a newer ReplicaID than we know about. This
			// means that this node has been removed from a group and
			// re-added to it, before our GC process was able to remove the
			// remnants of the old group.
			log.Infof("node %v: got message for group %s with newer replica ID (%s vs %s), recreating group",
				s.nodeID, req.GroupID, req.ToReplica.ReplicaID, g.replicaID)
			if err := s.removeGroup(req.GroupID); err != nil {
				log.Warningf("Error removing group %d (in response to incoming message): %s",
					req.GroupID, err)
				return
			}
			if err := s.createGroup(req.GroupID, req.ToReplica.ReplicaID); err != nil {
				log.Warningf("Error recreating group %d (in response to incoming message): %s",
					req.GroupID, err)
				return
			}
		}
	} else {
		if log.V(1) {
			log.Infof("node %v: got message for unknown group %d; creating it", s.nodeID, req.GroupID)
		}
		if err := s.createGroup(req.GroupID, req.ToReplica.ReplicaID); err != nil {
			log.Warningf("Error creating group %d (in response to incoming message): %s",
				req.GroupID, err)
			return
		}
	}

	if err := s.multiNode.Step(context.Background(), uint64(req.GroupID), req.Message); err != nil {
		if log.V(4) {
			log.Infof("node %v: multinode step to group %v failed for message %.200s",
				s.nodeID, req.GroupID, raft.DescribeMessage(req.Message, s.EntryFormatter))
		}
	}
}
// fanoutHeartbeat sends the given heartbeat to all groups which believe that
// their leader resides on the sending node.
func (s *state) fanoutHeartbeat(req *RaftMessageRequest) {
	// A heartbeat message is expanded into a heartbeat for each group
	// that the remote node is a part of.
	fromID := proto.RaftNodeID(req.Message.From)
	originNode, ok := s.nodes[fromID]
	if !ok {
		// When a leader considers a follower to be down, it doesn't begin recovery
		// until the follower has successfully responded to a heartbeat. If we get a
		// heartbeat from a node we don't know, it must think we are a follower of
		// some group, so we need to respond so it can activate the recovery process.
		log.Warningf("node %v: not fanning out heartbeat from unknown node %v (but responding anyway)",
			s.nodeID, fromID)
		s.sendMessage(noGroup, raftpb.Message{
			From: uint64(s.nodeID),
			To:   req.Message.From,
			Type: raftpb.MsgHeartbeatResp,
		})
		return
	}
	cnt := 0
	for groupID := range originNode.groupIDs {
		// If we don't think that the sending node is leading that group, don't
		// propagate.
		if s.groups[groupID].leader != fromID || fromID == s.nodeID {
			if log.V(8) {
				log.Infof("node %v: not fanning out heartbeat to %v, msg is from %d and leader is %d",
					s.nodeID, req.Message.To, fromID, s.groups[groupID].leader)
			}
			continue
		}
		if err := s.multiNode.Step(context.Background(), uint64(groupID), req.Message); err != nil {
			if log.V(4) {
				log.Infof("node %v: coalesced heartbeat step to group %v failed for message %s",
					s.nodeID, groupID, raft.DescribeMessage(req.Message, s.EntryFormatter))
			}
		}
		cnt++
	}
	if cnt > 0 {
		s.sendMessage(noGroup, raftpb.Message{
			From: uint64(s.nodeID),
			To:   req.Message.From,
			Type: raftpb.MsgHeartbeatResp,
		})
	}
	if log.V(7) {
		log.Infof("node %v: received coalesced heartbeat from node %v; "+
			"fanned out to %d followers in %d overlapping groups",
			s.nodeID, fromID, cnt, len(originNode.groupIDs))
	}
}
// sendMessage sends a raft message on the given group. Coalesced heartbeats
// address nodes, not groups; they will use the noGroup constant as groupID.
func (s *state) sendMessage(groupID proto.RaftID, msg raftpb.Message) {
	if log.V(6) {
		log.Infof("node %v sending message %.200s to %v", s.nodeID,
			raft.DescribeMessage(msg, s.EntryFormatter), msg.To)
	}
	nodeID := proto.RaftNodeID(msg.To)
	if _, ok := s.nodes[nodeID]; !ok {
		if log.V(4) {
			log.Infof("node %v: connecting to new node %v", s.nodeID, nodeID)
		}
		var err error
		if groupID != noGroup {
			err = s.addNode(nodeID, groupID)
		} else {
			err = s.addNode(nodeID)
		}
		if err != nil {
			log.Errorf("node %v: error adding group %v to node %v: %v",
				s.nodeID, groupID, nodeID, err)
		}
	}
	err := s.Transport.Send(&RaftMessageRequest{groupID, msg})
	snapStatus := raft.SnapshotFinish
	if err != nil {
		log.Warningf("node %v failed to send message to %v: %s", s.nodeID, nodeID, err)
		if groupID != noGroup {
			s.multiNode.ReportUnreachable(msg.To, uint64(groupID))
		}
		snapStatus = raft.SnapshotFailure
	}
	if msg.Type == raftpb.MsgSnap {
		// TODO(bdarnell): add an ack for snapshots and don't report status until
		// ack, error, or timeout.
		if groupID != noGroup {
			s.multiNode.ReportSnapshot(msg.To, uint64(groupID), snapStatus)
		}
	}
}
func (s *state) start() {
	log.V(1).Infof("node %v starting", s.nodeID)
	go s.writeTask.start()
	// These maps form a kind of state machine: We don't want to read from the
	// ready channel until the groups we got from the last read have made their
	// way through the rest of the pipeline.
	var readyGroups map[uint64]raft.Ready
	var writingGroups map[uint64]raft.Ready
	// Counts up to heartbeat interval and is then reset.
	ticks := 0
	for {
		// raftReady signals that the Raft state machine has pending
		// work. That work is supplied over the raftReady channel as a map
		// from group ID to raft.Ready struct.
		var raftReady <-chan map[uint64]raft.Ready
		// writeReady is set to the write task's ready channel, which
		// receives when the write task is prepared to persist ready data
		// from the Raft state machine.
		var writeReady chan struct{}

		// The order of operations in this loop structure is as follows:
		// start by setting raftReady to the multiNode's Ready()
		// channel. Once a new raftReady has been consumed from the
		// channel, set writeReady to the write task's ready channel and
		// set raftReady back to nil. This advances our read-from-raft /
		// write-to-storage state machine to the next step: wait for the
		// write task to be ready to persist the new data.
		if readyGroups != nil {
			writeReady = s.writeTask.ready
		} else if writingGroups == nil {
			raftReady = s.multiNode.Ready()
		}

		log.V(8).Infof("node %v: selecting", s.nodeID)
		select {
		case <-s.stopper.ShouldStop():
			log.V(6).Infof("node %v: stopping", s.nodeID)
			s.stop()
			return
		case req := <-s.reqChan:
			log.V(5).Infof("node %v: group %v got message %.200s", s.nodeID, req.GroupID,
				raft.DescribeMessage(req.Message, s.EntryFormatter))
			switch req.Message.Type {
			case raftpb.MsgHeartbeat:
				s.fanoutHeartbeat(req)
			case raftpb.MsgHeartbeatResp:
				s.fanoutHeartbeatResponse(req)
			default:
				if _, ok := s.groups[req.GroupID]; !ok {
					log.Infof("node %v: got message for unknown group %d; creating it",
						s.nodeID, req.GroupID)
					if err := s.createGroup(req.GroupID); err != nil {
						log.Warningf("Error creating group %d: %s", req.GroupID, err)
						break
					}
				}
				if err := s.multiNode.Step(context.Background(), req.GroupID, req.Message); err != nil {
					log.V(4).Infof("node %v: multinode step to group %v failed for message %s",
						s.nodeID, req.GroupID, raft.DescribeMessage(req.Message, s.EntryFormatter))
				}
			}
		case op := <-s.createGroupChan:
			log.V(6).Infof("node %v: got op %#v", s.nodeID, op)
			op.ch <- s.createGroup(op.groupID)
		case op := <-s.removeGroupChan:
			log.V(6).Infof("node %v: got op %#v", s.nodeID, op)
			s.removeGroup(op)
		case prop := <-s.proposalChan:
			s.propose(prop)
		case readyGroups = <-raftReady:
			s.handleRaftReady(readyGroups)
		case writeReady <- struct{}{}:
			s.handleWriteReady(readyGroups)
			writingGroups = readyGroups
			readyGroups = nil
		case resp := <-s.writeTask.out:
			s.handleWriteResponse(resp, writingGroups)
			s.multiNode.Advance(writingGroups)
			writingGroups = nil
		case <-s.Ticker.Chan():
			log.V(8).Infof("node %v: got tick", s.nodeID)
			s.multiNode.Tick()
			ticks++
			if ticks >= s.HeartbeatIntervalTicks {
				ticks = 0
				s.coalescedHeartbeat()
			}
		case cb := <-s.callbackChan:
			cb()
		}
	}
}
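// The loops in this section call s.coalescedHeartbeat, which is not shown. A
// hedged sketch of what it plausibly does in this version of the code: send
// one dummy-group (noGroup) heartbeat per known peer node, which the
// receiving side then expands via fanoutHeartbeat above. The exact field
// names are assumptions.
func (s *state) coalescedHeartbeat() {
	for nodeID := range s.nodes {
		if nodeID == s.nodeID {
			// Never heartbeat ourselves.
			continue
		}
		s.sendMessage(noGroup, raftpb.Message{
			From: uint64(s.nodeID),
			To:   uint64(nodeID),
			Type: raftpb.MsgHeartbeat,
		})
	}
}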
func (s *state) start() {
	s.stopper.RunWorker(func() {
		defer s.stop()
		if log.V(1) {
			log.Infof("node %v starting", s.nodeID)
		}
		s.writeTask.start(s.stopper)
		// Counts up to heartbeat interval and is then reset.
		ticks := 0
		// checkReadyGroupIDs keeps track of all the groupIDs which
		// should be checked if there is any pending Ready for that group.
		checkReadyGroupIDs := map[roachpb.RangeID]struct{}{}
		for {
			// writeReady is set to the write task's ready channel, which
			// receives when the write task is prepared to persist ready data
			// from the Raft state machine.
			var writeReady chan struct{}
			var eventsChan chan []interface{}
			// If there are pending checks, see whether the writeTask is ready
			// to do its work.
			if len(checkReadyGroupIDs) > 0 {
				writeReady = s.writeTask.ready
			}
			// If there are any pending events, check whether s.Events is
			// free to accept them.
			if len(s.pendingEvents) > 0 {
				eventsChan = s.Events
			}
			if log.V(8) {
				log.Infof("node %v: selecting", s.nodeID)
			}
			select {
			case <-s.stopper.ShouldStop():
				return
			case req := <-s.reqChan:
				if log.V(5) {
					log.Infof("node %v: group %v got message %.200s", s.nodeID, req.GroupID,
						raft.DescribeMessage(req.Message, s.EntryFormatter))
				}
				s.handleMessage(req)
				checkReadyGroupIDs[req.GroupID] = struct{}{}
			case op := <-s.createGroupChan:
				if log.V(6) {
					log.Infof("node %v: got op %#v", s.nodeID, op)
				}
				op.ch <- s.createGroup(op.groupID, 0)
				checkReadyGroupIDs[op.groupID] = struct{}{}
			case op := <-s.removeGroupChan:
				if log.V(6) {
					log.Infof("node %v: got op %#v", s.nodeID, op)
				}
				op.ch <- s.removeGroup(op.groupID, op.replicaID)
			case prop := <-s.proposalChan:
				s.propose(prop)
				checkReadyGroupIDs[prop.groupID] = struct{}{}
			case groupID := <-s.campaignChan:
				if log.V(6) {
					log.Infof("node %v: got campaign for group %#v", s.nodeID, groupID)
				}
				if _, ok := s.groups[groupID]; !ok {
					if err := s.createGroup(groupID, 0); err != nil {
						log.Warningf("node %s failed to create group %s during MultiRaft.Campaign: %s",
							s.nodeID, groupID, err)
						continue
					}
				}
				if err := s.groups[groupID].raftGroup.Campaign(); err != nil {
					log.Warningf("node %s failed to campaign for group %s: %s", s.nodeID, groupID, err)
				}
				checkReadyGroupIDs[groupID] = struct{}{}
			case writeReady <- struct{}{}:
				writingGroups := s.handleWriteReady(checkReadyGroupIDs)
				checkReadyGroupIDs = map[roachpb.RangeID]struct{}{}
				// No Ready for any group.
				if len(writingGroups) == 0 {
					break
				}
				s.logRaftReady(writingGroups)
				select {
				case resp := <-s.writeTask.out:
					s.handleWriteResponse(resp, writingGroups)
					// Advance each group's Ready and check these groups again;
					// handleWriteResponse may call s.propose to generate new Ready.
					for groupID, rd := range writingGroups {
						if g, ok := s.groups[groupID]; ok {
							g.raftGroup.Advance(rd)
							checkReadyGroupIDs[groupID] = struct{}{}
						}
					}
				case <-s.stopper.ShouldStop():
					return
				}
			case <-s.Ticker.Chan():
				if log.V(8) {
					log.Infof("node %v: got tick", s.nodeID)
				}
				for groupID, g := range s.groups {
					g.raftGroup.Tick()
					checkReadyGroupIDs[groupID] = struct{}{}
				}
				ticks++
				if ticks >= s.HeartbeatIntervalTicks {
					ticks = 0
					s.coalescedHeartbeat()
				}
			case cb := <-s.callbackChan:
				if log.V(8) {
					log.Infof("node %v: got callback", s.nodeID)
				}
				cb()
			case eventsChan <- s.pendingEvents:
				if log.V(8) {
					log.Infof("node %v: send pendingEvents len %d", s.nodeID, len(s.pendingEvents))
				}
				s.pendingEvents = nil
			case statusOp := <-s.statusChan:
				if log.V(8) {
					log.Infof("node %v: receive status groupID %d", s.nodeID, statusOp.groupID)
				}
				g := s.groups[statusOp.groupID]
				if g == nil {
					log.Infof("node %v: g is nil for statusOp for groupID %v", s.nodeID, statusOp.groupID)
					statusOp.ch <- nil
					break
				}
				statusOp.ch <- g.raftGroup.Status()
			}
		}
	})
}
func (s *state) start() {
	s.stopper.RunWorker(func() {
		defer func() {
			if log.V(6) {
				log.Infof("node %v: stopping", s.nodeID)
			}
			s.stop()
		}()
		if log.V(1) {
			log.Infof("node %v starting", s.nodeID)
		}
		s.writeTask.start(s.stopper)
		// These maps form a kind of state machine: We don't want to read from the
		// ready channel until the groups we got from the last read have made their
		// way through the rest of the pipeline.
		var readyGroups map[uint64]raft.Ready
		var writingGroups map[uint64]raft.Ready
		// Counts up to heartbeat interval and is then reset.
		ticks := 0
		for {
			// raftReady signals that the Raft state machine has pending
			// work. That work is supplied over the raftReady channel as a map
			// from group ID to raft.Ready struct.
			var raftReady <-chan map[uint64]raft.Ready
			// writeReady is set to the write task's ready channel, which
			// receives when the write task is prepared to persist ready data
			// from the Raft state machine.
			// The writeReady mechanism is currently disabled as we are testing
			// performing all writes synchronously.
			// TODO(bdarnell): either reinstate writeReady or rip it out completely.
			//var writeReady chan struct{}

			// The order of operations in this loop structure is as follows:
			// start by setting raftReady to the multiNode's Ready()
			// channel. Once a new raftReady has been consumed from the
			// channel, set writeReady to the write task's ready channel and
			// set raftReady back to nil. This advances our read-from-raft /
			// write-to-storage state machine to the next step: wait for the
			// write task to be ready to persist the new data.
			if readyGroups != nil {
				//writeReady = s.writeTask.ready
			} else if writingGroups == nil {
				raftReady = s.multiNode.Ready()
			}

			if log.V(8) {
				log.Infof("node %v: selecting", s.nodeID)
			}
			select {
			case <-s.stopper.ShouldStop():
				return
			case req := <-s.reqChan:
				if log.V(5) {
					log.Infof("node %v: group %v got message %.200s", s.nodeID, req.GroupID,
						raft.DescribeMessage(req.Message, s.EntryFormatter))
				}
				switch req.Message.Type {
				case raftpb.MsgHeartbeat:
					s.fanoutHeartbeat(req)
				case raftpb.MsgHeartbeatResp:
					s.fanoutHeartbeatResponse(proto.RaftNodeID(req.Message.From))
				default:
					// We only want to lazily create the group if it's not heartbeat-related;
					// our heartbeats are coalesced and contain a dummy GroupID.
					// TODO(tschottdorf) still shouldn't hurt to move this part outside,
					// but suddenly tests will start failing. Should investigate.
					if _, ok := s.groups[req.GroupID]; !ok {
						log.Infof("node %v: got message for unknown group %d; creating it",
							s.nodeID, req.GroupID)
						if err := s.createGroup(req.GroupID); err != nil {
							log.Warningf("Error creating group %d: %s", req.GroupID, err)
							break
						}
					}
					if err := s.multiNode.Step(context.Background(), uint64(req.GroupID), req.Message); err != nil {
						if log.V(4) {
							log.Infof("node %v: multinode step to group %v failed for message %.200s",
								s.nodeID, req.GroupID, raft.DescribeMessage(req.Message, s.EntryFormatter))
						}
					}
				}
			case op := <-s.createGroupChan:
				if log.V(6) {
					log.Infof("node %v: got op %#v", s.nodeID, op)
				}
				op.ch <- s.createGroup(op.groupID)
			case op := <-s.removeGroupChan:
				if log.V(6) {
					log.Infof("node %v: got op %#v", s.nodeID, op)
				}
				s.removeGroup(op, readyGroups)
			case prop := <-s.proposalChan:
				s.propose(prop)
			case readyGroups = <-raftReady:
				// readyGroups are saved in a local variable until they can be sent to
				// the write task (and then the real work happens after the write is
				// complete). All we do for now is log them.
				s.logRaftReady(readyGroups)
				select {
				case s.writeTask.ready <- struct{}{}:
				case <-s.stopper.ShouldStop():
					return
				}
				s.handleWriteReady(readyGroups)
				writingGroups = readyGroups
				readyGroups = nil
				select {
				case resp := <-s.writeTask.out:
					s.handleWriteResponse(resp, writingGroups)
					s.multiNode.Advance(writingGroups)
					writingGroups = nil
				case <-s.stopper.ShouldStop():
					return
				}
			case <-s.Ticker.Chan():
				if log.V(8) {
					log.Infof("node %v: got tick", s.nodeID)
				}
				s.multiNode.Tick()
				ticks++
				if ticks >= s.HeartbeatIntervalTicks {
					ticks = 0
					s.coalescedHeartbeat()
				}
			case cb := <-s.callbackChan:
				cb()
			}
		}
	})
}
func (s *state) handleWriteResponse(response *writeResponse, readyGroups map[uint64]raft.Ready) {
	log.V(6).Infof("node %v got write response: %#v", s.nodeID, *response)
	// Everything has been written to disk; now we can apply updates to the state machine
	// and send outgoing messages.
	for groupID, ready := range readyGroups {
		g, ok := s.groups[groupID]
		if !ok {
			log.V(4).Infof("dropping stale write to group %v", groupID)
			continue
		}
		for _, entry := range ready.CommittedEntries {
			var commandID string
			switch entry.Type {
			case raftpb.EntryNormal:
				// etcd raft occasionally adds a nil entry (e.g. upon election); ignore these.
				if entry.Data != nil {
					var command []byte
					commandID, command = decodeCommand(entry.Data)
					s.sendEvent(&EventCommandCommitted{
						GroupID:   groupID,
						CommandID: commandID,
						Command:   command,
					})
				}
			case raftpb.EntryConfChange:
				cc := raftpb.ConfChange{}
				err := cc.Unmarshal(entry.Data)
				if err != nil {
					log.Fatalf("invalid ConfChange data: %s", err)
				}
				var payload []byte
				if len(cc.Context) > 0 {
					commandID, payload = decodeCommand(cc.Context)
				}
				s.sendEvent(&EventMembershipChangeCommitted{
					GroupID:    groupID,
					CommandID:  commandID,
					NodeID:     NodeID(cc.NodeID),
					ChangeType: cc.Type,
					Payload:    payload,
					Callback: func(err error) {
						s.callbackChan <- func() {
							if err == nil {
								log.V(3).Infof("node %v applying configuration change %v", s.nodeID, cc)
								// TODO(bdarnell): dedupe by keeping a record of recently-applied commandIDs
								switch cc.Type {
								case raftpb.ConfChangeAddNode:
									err = s.addNode(NodeID(cc.NodeID), groupID)
								case raftpb.ConfChangeRemoveNode:
									// TODO(bdarnell): support removing nodes; fix double-application of initial entries
								case raftpb.ConfChangeUpdateNode:
									// Updates don't concern multiraft, they are simply passed through.
								}
								if err != nil {
									log.Errorf("error applying configuration change %v: %s", cc, err)
								}
								s.multiNode.ApplyConfChange(groupID, cc)
							} else {
								log.Warningf("aborting configuration change: %s", err)
								s.multiNode.ApplyConfChange(groupID, raftpb.ConfChange{})
							}

							// Re-submit all pending proposals, in case any of them were config changes
							// that were dropped due to the one-at-a-time rule. This is a little
							// redundant since most pending proposals won't benefit from this but
							// config changes should be rare enough (and the size of the pending queue
							// small enough) that it doesn't really matter.
							for _, prop := range g.pending {
								s.proposalChan <- prop
							}
						}
					},
				})
			}
			if p, ok := g.pending[commandID]; ok {
				// TODO(bdarnell): the command is now committed, but not applied until the
				// application consumes EventCommandCommitted. Is closing the channel
				// at this point useful or do we need to wait for the command to be
				// applied too?
				// This could be done with a Callback as in EventMembershipChangeCommitted
				// or perhaps we should move away from a channel to a callback-based system.
				if p.ch != nil {
					// Because of the way we re-queue proposals during leadership
					// changes, we may close the same proposal object twice.
					close(p.ch)
					p.ch = nil
				}
				delete(g.pending, commandID)
			}
		}

		noMoreHeartbeats := make(map[uint64]struct{})
		for _, msg := range ready.Messages {
			switch msg.Type {
			case raftpb.MsgHeartbeat:
				log.V(7).Infof("node %v dropped individual heartbeat to node %v", s.nodeID, msg.To)
				continue
			case raftpb.MsgHeartbeatResp:
				if _, ok := noMoreHeartbeats[msg.To]; ok {
					log.V(7).Infof("node %v dropped redundant heartbeat response to node %v",
						s.nodeID, msg.To)
					continue
				}
				noMoreHeartbeats[msg.To] = struct{}{}
			}

			log.V(6).Infof("node %v sending message %.200s to %v", s.nodeID,
				raft.DescribeMessage(msg, s.EntryFormatter), msg.To)
			nodeID := NodeID(msg.To)
			if _, ok := s.nodes[nodeID]; !ok {
				log.V(4).Infof("node %v: connecting to new node %v", s.nodeID, nodeID)
				if err := s.addNode(nodeID, groupID); err != nil {
					log.Errorf("node %v: error adding node %v", s.nodeID, nodeID)
				}
			}
			err := s.Transport.Send(NodeID(msg.To), &RaftMessageRequest{groupID, msg})
			snapStatus := raft.SnapshotFinish
			if err != nil {
				log.Warningf("node %v failed to send message to %v", s.nodeID, nodeID)
				s.multiNode.ReportUnreachable(msg.To, groupID)
				snapStatus = raft.SnapshotFailure
			}
			if msg.Type == raftpb.MsgSnap {
				// TODO(bdarnell): add an ack for snapshots and don't report status until
				// ack, error, or timeout.
				s.multiNode.ReportSnapshot(msg.To, groupID, snapStatus)
			}
		}
	}
}
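// handleWriteResponse above relies on decodeCommand to split a raft entry's
// payload into a command ID and the command itself. A hypothetical sketch of
// that framing under the assumption of a fixed-width ID prefix; the actual
// length and encoding are assumptions, not taken from this section.
const commandIDLen = 16

// encodeCommand prepends the fixed-width command ID to the payload.
func encodeCommand(commandID string, command []byte) []byte {
	return append([]byte(commandID), command...)
}

// decodeCommand splits an encoded entry back into ID and payload.
func decodeCommand(data []byte) (commandID string, command []byte) {
	return string(data[:commandIDLen]), data[commandIDLen:]
}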
func (s *state) start() {
	s.stopper.RunWorker(func() {
		defer s.stop()
		if log.V(1) {
			log.Infof("node %v starting", s.nodeID)
		}
		s.writeTask.start(s.stopper)
		// The maps s.readyGroups and writingGroups form a kind of state
		// machine: We don't want to read from the ready channel until the
		// groups we got from the last read have made their way through
		// the rest of the pipeline.
		var writingGroups map[uint64]raft.Ready
		// Counts up to heartbeat interval and is then reset.
		ticks := 0
		for {
			// raftReady signals that the Raft state machine has pending
			// work. That work is supplied over the raftReady channel as a map
			// from group ID to raft.Ready struct.
			var raftReady <-chan map[uint64]raft.Ready
			// writeReady is set to the write task's ready channel, which
			// receives when the write task is prepared to persist ready data
			// from the Raft state machine.
			// The writeReady mechanism is currently disabled as we are testing
			// performing all writes synchronously.
			// TODO(bdarnell): either reinstate writeReady or rip it out completely.
			//var writeReady chan struct{}
			var eventsChan chan []interface{}

			// The order of operations in this loop structure is as follows:
			// start by setting raftReady to the multiNode's Ready()
			// channel. Once a new raftReady has been consumed from the
			// channel, set writeReady to the write task's ready channel and
			// set raftReady back to nil. This advances our read-from-raft /
			// write-to-storage state machine to the next step: wait for the
			// write task to be ready to persist the new data.
			if s.readyGroups != nil {
				//writeReady = s.writeTask.ready
			} else if writingGroups == nil {
				raftReady = s.multiNode.Ready()
			}
			// If there are any pending events, check whether s.Events is
			// free to accept them.
			if len(s.pendingEvents) > 0 {
				eventsChan = s.Events
			}

			if log.V(8) {
				log.Infof("node %v: selecting", s.nodeID)
			}
			select {
			case <-s.stopper.ShouldStop():
				return
			case req := <-s.reqChan:
				if log.V(5) {
					log.Infof("node %v: group %v got message %.200s", s.nodeID, req.GroupID,
						raft.DescribeMessage(req.Message, s.EntryFormatter))
				}
				s.handleMessage(req)
			case op := <-s.createGroupChan:
				if log.V(6) {
					log.Infof("node %v: got op %#v", s.nodeID, op)
				}
				op.ch <- s.createGroup(op.groupID, 0)
			case op := <-s.removeGroupChan:
				if log.V(6) {
					log.Infof("node %v: got op %#v", s.nodeID, op)
				}
				op.ch <- s.removeGroup(op.groupID)
			case prop := <-s.proposalChan:
				s.propose(prop)
			case s.readyGroups = <-raftReady:
				// readyGroups are saved in s.readyGroups until they can be sent to
				// the write task (and then the real work happens after the write is
				// complete). All we do for now is log them.
				s.logRaftReady()
				select {
				case s.writeTask.ready <- struct{}{}:
				case <-s.stopper.ShouldStop():
					return
				}
				s.handleWriteReady()
				writingGroups = s.readyGroups
				s.readyGroups = nil
				select {
				case resp := <-s.writeTask.out:
					s.handleWriteResponse(resp, writingGroups)
					s.multiNode.Advance(writingGroups)
					writingGroups = nil
				case <-s.stopper.ShouldStop():
					return
				}
			case <-s.Ticker.Chan():
				if log.V(8) {
					log.Infof("node %v: got tick", s.nodeID)
				}
				s.multiNode.Tick()
				ticks++
				if ticks >= s.HeartbeatIntervalTicks {
					ticks = 0
					s.coalescedHeartbeat()
				}
			case cb := <-s.callbackChan:
				if log.V(8) {
					log.Infof("node %v: got callback", s.nodeID)
				}
				cb()
			case eventsChan <- s.pendingEvents:
				if log.V(8) {
					log.Infof("node %v: send pendingEvents len %d", s.nodeID, len(s.pendingEvents))
				}
				s.pendingEvents = nil
			}
		}
	})
}
// fanoutHeartbeat sends the given heartbeat to all groups which believe that
// their leader resides on the sending node.
func (s *state) fanoutHeartbeat(req *RaftMessageRequest) {
	// A heartbeat message is expanded into a heartbeat for each group
	// that the remote node is a part of.
	fromID := roachpb.NodeID(req.Message.From)
	groupCount := 0
	followerCount := 0
	if originNode, ok := s.nodes[fromID]; ok {
		for groupID := range originNode.groupIDs {
			groupCount++
			// If we don't think that the sending node is leading that group, don't
			// propagate.
			if s.groups[groupID].leader.NodeID != fromID || fromID == s.nodeID {
				if log.V(8) {
					log.Infof("node %s: not fanning out heartbeat to %s, msg is from %s and leader is %s",
						s.nodeID, groupID, fromID, s.groups[groupID].leader)
				}
				continue
			}
			fromRepID, err := s.Storage.ReplicaIDForStore(groupID, req.FromReplica.StoreID)
			if err != nil {
				if log.V(3) {
					log.Infof("node %s: not fanning out heartbeat to %s, could not find replica id for sending store %s",
						s.nodeID, groupID, req.FromReplica.StoreID)
				}
				continue
			}
			toRepID, err := s.Storage.ReplicaIDForStore(groupID, req.ToReplica.StoreID)
			if err != nil {
				if log.V(3) {
					log.Infof("node %s: not fanning out heartbeat to %s, could not find replica id for receiving store %s",
						s.nodeID, groupID, req.ToReplica.StoreID)
				}
				continue
			}
			followerCount++
			groupMsg := raftpb.Message{
				Type: raftpb.MsgHeartbeat,
				To:   uint64(toRepID),
				From: uint64(fromRepID),
			}
			if err := s.multiNode.Step(context.Background(), uint64(groupID), groupMsg); err != nil {
				if log.V(4) {
					log.Infof("node %v: coalesced heartbeat step to group %v failed for message %s",
						s.nodeID, groupID, raft.DescribeMessage(req.Message, s.EntryFormatter))
				}
			}
		}
	}
	// We must respond whether we forwarded the heartbeat to any groups
	// or not. When a leader considers a follower to be down, it doesn't
	// begin recovery until the follower has successfully responded to a
	// heartbeat. If we get a heartbeat from a node we don't know, it
	// must think we are a follower of some group, so we need to respond
	// so it can activate the recovery process.
	s.sendMessage(nil, raftpb.Message{
		From: uint64(s.nodeID),
		To:   req.Message.From,
		Type: raftpb.MsgHeartbeatResp,
	})
	if log.V(7) {
		log.Infof("node %v: received coalesced heartbeat from node %v; "+
			"fanned out to %d followers in %d overlapping groups",
			s.nodeID, fromID, followerCount, groupCount)
	}
}
// sendMessage sends a raft message on the given group. Coalesced heartbeats
// address nodes, not groups; they will use the noGroup constant as groupID.
func (s *state) sendMessage(g *group, msg raftpb.Message) {
	if log.V(6) {
		log.Infof("node %v sending message %.200s to %v", s.nodeID,
			raft.DescribeMessage(msg, s.EntryFormatter), msg.To)
	}
	groupID := noGroup
	var toReplica roachpb.ReplicaDescriptor
	var fromReplica roachpb.ReplicaDescriptor
	if g == nil {
		// No group (a coalesced heartbeat): To/From fields are NodeIDs.
		// TODO(bdarnell): test transports route by store ID, not node ID.
		// In tests they're always the same, so we can hack it here but
		// it would be better to fix the transports.
		// I think we need to fix this before we can support a range
		// with two replicas on different stores of the same node.
		toReplica.NodeID = roachpb.NodeID(msg.To)
		toReplica.StoreID = roachpb.StoreID(msg.To)
		fromReplica.NodeID = roachpb.NodeID(msg.From)
		fromReplica.StoreID = roachpb.StoreID(msg.From)
	} else {
		// Regular message: To/From fields are replica IDs.
		groupID = g.id
		var err error
		toReplica, err = s.ReplicaDescriptor(groupID, roachpb.ReplicaID(msg.To))
		if err != nil {
			log.Warningf("failed to lookup recipient replica %d in group %d: %s", msg.To, groupID, err)
			return
		}
		fromReplica, err = s.ReplicaDescriptor(groupID, roachpb.ReplicaID(msg.From))
		if err != nil {
			log.Warningf("failed to lookup sender replica %d in group %d: %s", msg.From, groupID, err)
			return
		}
	}
	if _, ok := s.nodes[toReplica.NodeID]; !ok {
		if log.V(4) {
			log.Infof("node %v: connecting to new node %v", s.nodeID, toReplica.NodeID)
		}
		if err := s.addNode(toReplica.NodeID, g); err != nil {
			log.Errorf("node %v: error adding group %v to node %v: %v",
				s.nodeID, groupID, toReplica.NodeID, err)
		}
	}
	err := s.Transport.Send(&RaftMessageRequest{
		GroupID:     groupID,
		ToReplica:   toReplica,
		FromReplica: fromReplica,
		Message:     msg,
	})
	snapStatus := raft.SnapshotFinish
	if err != nil {
		log.Warningf("node %v failed to send message to %v: %s", s.nodeID, toReplica.NodeID, err)
		if groupID != noGroup {
			s.multiNode.ReportUnreachable(msg.To, uint64(groupID))
		}
		snapStatus = raft.SnapshotFailure
	}
	if msg.Type == raftpb.MsgSnap {
		// TODO(bdarnell): add an ack for snapshots and don't report status until
		// ack, error, or timeout.
		if groupID != noGroup {
			s.multiNode.ReportSnapshot(msg.To, uint64(groupID), snapStatus)
		}
	}
}
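// The sendMessage variants above hand their requests to s.Transport, whose
// definition is not part of this section. A minimal sketch of the interface
// shape implied by the final version's call site; this is an assumption, and
// the real interface presumably also covers listening and shutdown (an
// earlier version above passes a destination NodeID to Send as well).
type Transport interface {
	// Send delivers a raft message, wrapped with its routing metadata, to
	// the node addressed by the request's ToReplica.
	Send(req *RaftMessageRequest) error
}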