func (n *node) processCommitCh() {
	pending := make(chan struct{}, numPendingMutations)

	for e := range n.commitCh {
		if e.Data == nil {
			continue
		}

		if e.Type == raftpb.EntryConfChange {
			var cc raftpb.ConfChange
			cc.Unmarshal(e.Data)

			if len(cc.Context) > 0 {
				var rc task.RaftContext
				x.Check(rc.Unmarshal(cc.Context))
				n.Connect(rc.Id, rc.Addr)
			}

			n.raft.ApplyConfChange(cc)
		} else {
			go n.process(e, pending)
		}
	}
}
func (n *node) run() {
	for {
		select {
		case <-n.ticker:
			n.raft.Tick()
		case rd := <-n.raft.Ready():
			n.saveToStorage(rd.HardState, rd.Entries, rd.Snapshot)
			n.send(rd.Messages)
			if !raft.IsEmptySnap(rd.Snapshot) {
				n.processSnapshot(rd.Snapshot)
			}
			for _, entry := range rd.CommittedEntries {
				n.process(entry)
				if entry.Type == raftpb.EntryConfChange {
					var cc raftpb.ConfChange
					cc.Unmarshal(entry.Data)
					n.raft.ApplyConfChange(cc)
				}
			}
			n.raft.Advance()
		case <-n.done:
			return
		}
	}
}
func tryRaftLogEntry(kv engine.MVCCKeyValue) (string, error) {
	var ent raftpb.Entry
	if err := maybeUnmarshalInline(kv.Value, &ent); err != nil {
		return "", err
	}
	if ent.Type == raftpb.EntryNormal {
		if len(ent.Data) > 0 {
			_, cmdData := storage.DecodeRaftCommand(ent.Data)
			var cmd storagebase.RaftCommand
			if err := cmd.Unmarshal(cmdData); err != nil {
				return "", err
			}
			ent.Data = nil
			return fmt.Sprintf("%s by %v\n%s\n%s\n", &ent, cmd.OriginReplica, cmd.BatchRequest, &cmd), nil
		}
		return fmt.Sprintf("%s: EMPTY\n", &ent), nil
	} else if ent.Type == raftpb.EntryConfChange {
		var cc raftpb.ConfChange
		if err := cc.Unmarshal(ent.Data); err != nil {
			return "", err
		}
		var ctx storage.ConfChangeContext
		if err := ctx.Unmarshal(cc.Context); err != nil {
			return "", err
		}
		var cmd storagebase.ReplicatedEvalResult
		if err := cmd.Unmarshal(ctx.Payload); err != nil {
			return "", err
		}
		ent.Data = nil
		return fmt.Sprintf("%s\n%s\n", &ent, &cmd), nil
	}
	return "", fmt.Errorf("unknown log entry type: %s", &ent)
}
// getIDs returns an ordered set of IDs included in the given snapshot and
// the entries. The given snapshot/entries can contain two kinds of
// ID-related entry:
//   - ConfChangeAddNode, in which case the contained ID will be added into the set.
//   - ConfChangeRemoveNode, in which case the contained ID will be removed from the set.
func getIDs(snap *raftpb.Snapshot, ents []raftpb.Entry) []uint64 {
	ids := make(map[uint64]bool)
	if snap != nil {
		for _, id := range snap.Metadata.ConfState.Nodes {
			ids[id] = true
		}
	}
	for _, e := range ents {
		if e.Type != raftpb.EntryConfChange {
			continue
		}
		if snap != nil && e.Index < snap.Metadata.Index {
			continue
		}
		var cc raftpb.ConfChange
		if err := cc.Unmarshal(e.Data); err != nil {
			log.L.WithError(err).Panic("unmarshal configuration change should never fail")
		}
		switch cc.Type {
		case raftpb.ConfChangeAddNode:
			ids[cc.NodeID] = true
		case raftpb.ConfChangeRemoveNode:
			delete(ids, cc.NodeID)
		case raftpb.ConfChangeUpdateNode:
			// do nothing
		default:
			log.L.Panic("ConfChange Type should be either ConfChangeAddNode or ConfChangeRemoveNode!")
		}
	}
	var sids []uint64
	for id := range ids {
		sids = append(sids, id)
	}
	return sids
}
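A hedged sketch of how getIDs is typically paired with createConfigChangeEnts (the same pairing appears in the force-new-cluster WAL-loading functions further down); the helper name and parameters here are illustrative only, not part of the sources above.

// forceSingleNodeMembership is an illustrative helper (not from the sources
// above): it derives the surviving member set with getIDs, then appends the
// synthetic ConfChange entries produced by createConfigChangeEnts, mirroring
// the force-new-cluster handling shown in readWAL/loadAndStart below.
func forceSingleNodeMembership(snap *raftpb.Snapshot, ents []raftpb.Entry, selfID uint64, st raftpb.HardState) []raftpb.Entry {
	ids := getIDs(snap, ents)
	toAppEnts := createConfigChangeEnts(ids, selfID, st.Term, st.Commit)
	return append(ents, toAppEnts...)
}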
// Start is the main loop for a Raft node; it walks the state machine,
// acting on the messages received from other Raft nodes in the cluster.
func (n *Node) Start() {
	for {
		select {
		case <-n.ticker.C:
			n.Tick()
		case rd := <-n.Ready():
			n.saveToStorage(rd.HardState, rd.Entries, rd.Snapshot)
			n.send(rd.Messages)
			if !raft.IsEmptySnap(rd.Snapshot) {
				n.processSnapshot(rd.Snapshot)
			}
			for _, entry := range rd.CommittedEntries {
				n.process(entry)
				if entry.Type == raftpb.EntryConfChange {
					var cc raftpb.ConfChange
					err := cc.Unmarshal(entry.Data)
					if err != nil {
						log.Fatal("raft: Can't unmarshal configuration change")
					}
					switch cc.Type {
					case raftpb.ConfChangeAddNode:
						n.applyAddNode(cc)
					case raftpb.ConfChangeRemoveNode:
						n.applyRemoveNode(cc)
					}
					n.ApplyConfChange(cc)
				}
			}
			n.Advance()
		case <-n.stopChan:
			n.Stop()
			n.Node = nil
			close(n.stopChan)
			return
		case pause := <-n.pauseChan:
			// FIXME lock hell
			n.SetPaused(pause)
			for n.pause {
				select {
				case pause = <-n.pauseChan:
					n.SetPaused(pause)
				}
			}
			n.pauseLock.Lock()
			// process pending messages
			for _, m := range n.rcvmsg {
				err := n.Step(n.Ctx, m)
				if err != nil {
					log.Fatal("Something went wrong when unpausing the node")
				}
			}
			n.rcvmsg = nil
			n.pauseLock.Unlock()
		}
	}
}
// publishEntries writes committed log entries to commit channel and returns
// whether all entries could be published.
func (rc *raftNode) publishEntries(ents []raftpb.Entry) bool {
	for i := range ents {
		switch ents[i].Type {
		case raftpb.EntryNormal:
			if len(ents[i].Data) == 0 {
				// ignore empty messages
				break
			}
			s := string(ents[i].Data)
			select {
			case rc.commitC <- &s:
			case <-rc.stopc:
				return false
			}
		case raftpb.EntryConfChange:
			var cc raftpb.ConfChange
			cc.Unmarshal(ents[i].Data)
			rc.node.ApplyConfChange(cc)
			switch cc.Type {
			case raftpb.ConfChangeAddNode:
				if len(cc.Context) > 0 {
					rc.transport.AddPeer(types.ID(cc.NodeID), []string{string(cc.Context)})
				}
			case raftpb.ConfChangeRemoveNode:
				if cc.NodeID == uint64(rc.id) {
					log.Println("I've been removed from the cluster! Shutting down.")
					return false
				}
				rc.transport.RemovePeer(types.ID(cc.NodeID))
			}
		}

		// after commit, update appliedIndex
		rc.appliedIndex = ents[i].Index

		// special nil commit to signal replay has finished
		if ents[i].Index == rc.lastIndex {
			select {
			case rc.commitC <- nil:
			case <-rc.stopc:
				return false
			}
		}
	}
	return true
}
// TestProposeAfterRemoveLeader ensures that we gracefully handle
// proposals that are attempted after a leader has been removed from
// the active configuration, but before that leader has called
// MultiNode.RemoveGroup.
func TestProposeAfterRemoveLeader(t *testing.T) {
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	mn := newMultiNode(1)
	go mn.run()
	defer mn.Stop()

	storage := NewMemoryStorage()
	if err := mn.CreateGroup(1, newTestConfig(1, nil, 10, 1, storage), []Peer{{ID: 1}}); err != nil {
		t.Fatal(err)
	}
	if err := mn.Campaign(ctx, 1); err != nil {
		t.Fatal(err)
	}

	if err := mn.ProposeConfChange(ctx, 1, raftpb.ConfChange{
		Type:   raftpb.ConfChangeRemoveNode,
		NodeID: 1,
	}); err != nil {
		t.Fatal(err)
	}
	gs := <-mn.Ready()
	g := gs[1]
	if err := storage.Append(g.Entries); err != nil {
		t.Fatal(err)
	}
	for _, e := range g.CommittedEntries {
		if e.Type == raftpb.EntryConfChange {
			var cc raftpb.ConfChange
			if err := cc.Unmarshal(e.Data); err != nil {
				t.Fatal(err)
			}
			mn.ApplyConfChange(1, cc)
		}
	}
	mn.Advance(gs)

	if err := mn.Propose(ctx, 1, []byte("somedata")); err != nil {
		t.Errorf("err = %v, want nil", err)
	}
}
func (n *node) process(entry raftpb.Entry) {
	fmt.Printf("node %v: processing entry", n.id)
	if entry.Data == nil {
		return
	}
	if entry.Type == raftpb.EntryConfChange {
		fmt.Printf("Configuration change\n")
		var cc raftpb.ConfChange
		cc.Unmarshal(entry.Data)
		n.raft.ApplyConfChange(cc)
		return
	}
	if entry.Type == raftpb.EntryNormal {
		parts := bytes.SplitN(entry.Data, []byte(":"), 2)
		k := string(parts[0])
		v := string(parts[1])
		n.data[k] = v
		fmt.Printf(" Key: %v Val: %v\n", k, v)
	}
}
func (c *ctrl) readyApply(snapshot raftpb.Snapshot, committedEntries []raftpb.Entry) error {
	c.snapshotc <- snapshot

	for _, committedEntry := range committedEntries {
		c.entryc <- committedEntry

		if committedEntry.Type == raftpb.EntryConfChange {
			// See raftexample raftNode.publishEntries
			var cc raftpb.ConfChange
			if err := cc.Unmarshal(committedEntry.Data); err != nil {
				return fmt.Errorf("unmarshal ConfChange: %v", err)
			}
			c.node.ApplyConfChange(cc)
			if cc.Type == raftpb.ConfChangeRemoveNode && cc.NodeID == c.self.ID {
				return errors.New("got ConfChange that removed me from the cluster; terminating")
			}
		}
	}

	return nil
}
// publishEntries writes committed log entries to commit channel and returns
// whether all entries could be published.
func (rc *raftNode) publishEntries(ents []raftpb.Entry) bool {
	for i := range ents {
		switch ents[i].Type {
		case raftpb.EntryNormal:
			if len(ents[i].Data) == 0 {
				// ignore conf changes and empty messages
				continue
			}
			s := string(ents[i].Data)
			select {
			case rc.commitC <- &s:
			case <-rc.stopc:
				return false
			}
		case raftpb.EntryConfChange:
			var cc raftpb.ConfChange
			cc.Unmarshal(ents[i].Data)
			rc.node.ApplyConfChange(cc)
			switch cc.Type {
			case raftpb.ConfChangeAddNode:
				if len(cc.Context) > 0 {
					rc.transport.AddPeer(types.ID(cc.NodeID), []string{string(cc.Context)})
				}
			case raftpb.ConfChangeRemoveNode:
				if cc.NodeID == uint64(rc.id) {
					log.Println("I've been removed from the cluster! Shutting down.")
					return false
				}
				rc.transport.RemovePeer(types.ID(cc.NodeID))
			}
		}
	}
	return true
}
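For context, the apply side above expects the joining peer's address to arrive in cc.Context so it can be handed to transport.AddPeer. A minimal, hedged sketch of the propose-side counterpart; rc.node comes from the snippet above, while the method name, id, and url parameters are assumptions for illustration.

// proposeAddPeer is an illustrative propose-side sketch (not part of the
// snippet above): the new peer's URL rides in cc.Context so that the apply
// side can later pass it to transport.AddPeer.
func (rc *raftNode) proposeAddPeer(id uint64, url string) error {
	cc := raftpb.ConfChange{
		Type:    raftpb.ConfChangeAddNode,
		NodeID:  id,
		Context: []byte(url),
	}
	return rc.node.ProposeConfChange(context.TODO(), cc)
}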
// processCommittedEntry tells the application that a command was committed.
// Returns the commandID, or an empty string if the given entry was not a command.
func (s *state) processCommittedEntry(groupID roachpb.RangeID, g *group, entry raftpb.Entry) string {
	var commandID string
	switch entry.Type {
	case raftpb.EntryNormal:
		var command []byte
		commandID, command = decodeCommand(entry.Data)
		s.sendEvent(&EventCommandCommitted{
			GroupID:   groupID,
			CommandID: commandID,
			Command:   command,
			Index:     entry.Index,
		})

	case raftpb.EntryConfChange:
		cc := raftpb.ConfChange{}
		if err := cc.Unmarshal(entry.Data); err != nil {
			log.Fatalf("invalid ConfChange data: %s", err)
		}
		var payload []byte
		if len(cc.Context) > 0 {
			var ctx ConfChangeContext
			if err := ctx.Unmarshal(cc.Context); err != nil {
				log.Fatalf("invalid ConfChangeContext: %s", err)
			}
			commandID = ctx.CommandID
			payload = ctx.Payload
			s.CacheReplicaDescriptor(groupID, ctx.Replica)
		}
		replica, err := s.ReplicaDescriptor(groupID, roachpb.ReplicaID(cc.NodeID))
		if err != nil {
			// TODO(bdarnell): stash Replica information somewhere so we can have it here
			// with no chance of failure.
			log.Fatalf("could not look up replica info (node %s, group %d, replica %d): %s",
				s.nodeID, groupID, cc.NodeID, err)
		}
		s.sendEvent(&EventMembershipChangeCommitted{
			GroupID:    groupID,
			CommandID:  commandID,
			Index:      entry.Index,
			Replica:    replica,
			ChangeType: cc.Type,
			Payload:    payload,
			Callback: func(err error) {
				select {
				case s.callbackChan <- func() {
					gInner, ok := s.groups[groupID]
					if !ok {
						log.Infof("group %d no longer exists, aborting configuration change", groupID)
					} else if gInner != g {
						log.Infof("passed in group and fetched group objects do not match\noriginal:%+v\nfetched:%+v\n, aborting configuration change", g, gInner)
					} else if err == nil {
						if log.V(3) {
							log.Infof("node %v applying configuration change %v", s.nodeID, cc)
						}
						// TODO(bdarnell): dedupe by keeping a record of recently-applied commandIDs
						switch cc.Type {
						case raftpb.ConfChangeAddNode:
							err = s.addNode(replica.NodeID, g)
						case raftpb.ConfChangeRemoveNode:
							err = s.removeNode(replica.NodeID, g)
						case raftpb.ConfChangeUpdateNode:
							// Updates don't concern multiraft, they are simply passed through.
						}
						if err != nil {
							log.Errorf("error applying configuration change %v: %s", cc, err)
						}
						g.raftGroup.ApplyConfChange(cc)
					} else {
						log.Warningf("aborting configuration change: %s", err)
						g.raftGroup.ApplyConfChange(raftpb.ConfChange{})
					}
				}:
				case <-s.stopper.ShouldStop():
				}
			},
		})
	}
	return commandID
}
// processCommittedEntry tells the application that a command was committed.
// Returns the commandID, or an empty string if the given entry was not a command.
func (s *state) processCommittedEntry(groupID proto.RangeID, g *group, entry raftpb.Entry) string {
	var commandID string
	switch entry.Type {
	case raftpb.EntryNormal:
		// etcd raft occasionally adds a nil entry (e.g. upon election); ignore these.
		if entry.Data != nil {
			var command []byte
			commandID, command = decodeCommand(entry.Data)
			s.sendEvent(&EventCommandCommitted{
				GroupID:   groupID,
				CommandID: commandID,
				Command:   command,
				Index:     entry.Index,
			})
		}

	case raftpb.EntryConfChange:
		cc := raftpb.ConfChange{}
		if err := cc.Unmarshal(entry.Data); err != nil {
			log.Fatalf("invalid ConfChange data: %s", err)
		}
		var payload []byte
		if len(cc.Context) > 0 {
			commandID, payload = decodeCommand(cc.Context)
		}
		g.waitForCallback = true
		s.sendEvent(&EventMembershipChangeCommitted{
			GroupID:    groupID,
			CommandID:  commandID,
			Index:      entry.Index,
			NodeID:     proto.RaftNodeID(cc.NodeID),
			ChangeType: cc.Type,
			Payload:    payload,
			Callback: func(err error) {
				select {
				case s.callbackChan <- func() {
					if err == nil {
						if log.V(3) {
							log.Infof("node %v applying configuration change %v", s.nodeID, cc)
						}
						// TODO(bdarnell): dedupe by keeping a record of recently-applied commandIDs
						switch cc.Type {
						case raftpb.ConfChangeAddNode:
							err = s.addNode(proto.RaftNodeID(cc.NodeID), g)
						case raftpb.ConfChangeRemoveNode:
							err = s.removeNode(proto.RaftNodeID(cc.NodeID), g)
						case raftpb.ConfChangeUpdateNode:
							// Updates don't concern multiraft, they are simply passed through.
						}
						if err != nil {
							log.Errorf("error applying configuration change %v: %s", cc, err)
						}
						s.multiNode.ApplyConfChange(uint64(groupID), cc)
					} else {
						log.Warningf("aborting configuration change: %s", err)
						s.multiNode.ApplyConfChange(uint64(groupID), raftpb.ConfChange{})
					}

					// Re-submit all pending proposals that were held
					// while the config change was pending
					g.waitForCallback = false
					for _, prop := range g.pending {
						s.propose(prop)
					}
				}:
				case <-s.stopper.ShouldStop():
				}
			},
		})
	}
	return commandID
}
func (n *Node) readWAL(ctx context.Context, snapshot *raftpb.Snapshot, forceNewCluster bool) (err error) {
	var (
		walsnap  walpb.Snapshot
		metadata []byte
		st       raftpb.HardState
		ents     []raftpb.Entry
	)

	if snapshot != nil {
		walsnap.Index = snapshot.Metadata.Index
		walsnap.Term = snapshot.Metadata.Term
	}

	repaired := false
	for {
		if n.wal, err = wal.Open(n.walDir(), walsnap); err != nil {
			return fmt.Errorf("open WAL error: %v", err)
		}
		if metadata, st, ents, err = n.wal.ReadAll(); err != nil {
			if err := n.wal.Close(); err != nil {
				return err
			}
			// we can only repair ErrUnexpectedEOF and we never repair twice.
			if repaired || err != io.ErrUnexpectedEOF {
				return fmt.Errorf("read WAL error (%v) and cannot be repaired", err)
			}
			if !wal.Repair(n.walDir()) {
				return fmt.Errorf("WAL error (%v) cannot be repaired", err)
			}
			log.G(ctx).Infof("repaired WAL error (%v)", err)
			repaired = true
			continue
		}
		break
	}

	defer func() {
		if err != nil {
			if walErr := n.wal.Close(); walErr != nil {
				n.Config.Logger.Errorf("error closing raft WAL: %v", walErr)
			}
		}
	}()

	var raftNode api.RaftMember
	if err := raftNode.Unmarshal(metadata); err != nil {
		return fmt.Errorf("error unmarshalling WAL metadata: %v", err)
	}
	n.Config.ID = raftNode.RaftID

	// All members that are no longer part of the cluster must be added to
	// the removed list right away, so that we don't try to connect to them
	// before processing the configuration change entries, which could make
	// us get stuck.
	for _, ent := range ents {
		if ent.Index <= st.Commit && ent.Type == raftpb.EntryConfChange {
			var cc raftpb.ConfChange
			if err := cc.Unmarshal(ent.Data); err != nil {
				return fmt.Errorf("error unmarshalling config change: %v", err)
			}
			if cc.Type == raftpb.ConfChangeRemoveNode {
				n.cluster.RemoveMember(cc.NodeID)
			}
		}
	}

	if forceNewCluster {
		// discard the previously uncommitted entries
		for i, ent := range ents {
			if ent.Index > st.Commit {
				log.G(context.Background()).Infof("discarding %d uncommitted WAL entries ", len(ents)-i)
				ents = ents[:i]
				break
			}
		}

		// force append the configuration change entries
		toAppEnts := createConfigChangeEnts(getIDs(snapshot, ents), uint64(n.Config.ID), st.Term, st.Commit)

		// All members that are being removed as part of the
		// force-new-cluster process must be added to the
		// removed list right away, so that we don't try to
		// connect to them before processing the configuration
		// change entries, which could make us get stuck.
		for _, ccEnt := range toAppEnts {
			if ccEnt.Type == raftpb.EntryConfChange {
				var cc raftpb.ConfChange
				if err := cc.Unmarshal(ccEnt.Data); err != nil {
					return fmt.Errorf("error unmarshalling force-new-cluster config change: %v", err)
				}
				if cc.Type == raftpb.ConfChangeRemoveNode {
					n.cluster.RemoveMember(cc.NodeID)
				}
			}
		}
		ents = append(ents, toAppEnts...)

		// force commit newly appended entries
		err := n.wal.Save(st, toAppEnts)
		if err != nil {
			log.G(context.Background()).Fatalf("%v", err)
		}
		if len(toAppEnts) != 0 {
			st.Commit = toAppEnts[len(toAppEnts)-1].Index
		}
	}

	if snapshot != nil {
		if err := n.raftStore.ApplySnapshot(*snapshot); err != nil {
			return err
		}
	}
	if err := n.raftStore.SetHardState(st); err != nil {
		return err
	}
	if err := n.raftStore.Append(ents); err != nil {
		return err
	}

	return nil
}
// processCommittedEntry tells the application that a command was committed.
// Returns the commandID, or an empty string if the given entry was not a command.
func (s *state) processCommittedEntry(groupID roachpb.RangeID, g *group, entry raftpb.Entry) string {
	var commandID string
	switch entry.Type {
	case raftpb.EntryNormal:
		// etcd raft occasionally adds a nil entry (e.g. upon election); ignore these.
		if entry.Data != nil {
			var command []byte
			commandID, command = decodeCommand(entry.Data)
			s.sendEvent(&EventCommandCommitted{
				GroupID:   groupID,
				CommandID: commandID,
				Command:   command,
				Index:     entry.Index,
			})
		}

	case raftpb.EntryConfChange:
		cc := raftpb.ConfChange{}
		if err := cc.Unmarshal(entry.Data); err != nil {
			log.Fatalf("invalid ConfChange data: %s", err)
		}
		var payload []byte
		if len(cc.Context) > 0 {
			var ctx ConfChangeContext
			if err := ctx.Unmarshal(cc.Context); err != nil {
				log.Fatalf("invalid ConfChangeContext: %s", err)
			}
			commandID = ctx.CommandID
			payload = ctx.Payload
			s.CacheReplicaDescriptor(groupID, ctx.Replica)
		}
		replica, err := s.ReplicaDescriptor(groupID, roachpb.ReplicaID(cc.NodeID))
		if err != nil {
			// TODO(bdarnell): stash Replica information somewhere so we can have it here
			// with no chance of failure.
			log.Fatalf("could not look up replica info (node %s, group %d, replica %d): %s",
				s.nodeID, groupID, cc.NodeID, err)
		}
		g.waitForCallback++
		s.sendEvent(&EventMembershipChangeCommitted{
			GroupID:    groupID,
			CommandID:  commandID,
			Index:      entry.Index,
			Replica:    replica,
			ChangeType: cc.Type,
			Payload:    payload,
			Callback: func(err error) {
				var errStr string
				if err != nil {
					errStr = err.Error() // can't leak err into the callback
				}
				select {
				case s.callbackChan <- func() {
					if errStr == "" {
						if log.V(3) {
							log.Infof("node %v applying configuration change %v", s.nodeID, cc)
						}
						// TODO(bdarnell): dedupe by keeping a record of recently-applied commandIDs
						var err error
						switch cc.Type {
						case raftpb.ConfChangeAddNode:
							err = s.addNode(replica.NodeID, g)
						case raftpb.ConfChangeRemoveNode:
							err = s.removeNode(replica.NodeID, g)
						case raftpb.ConfChangeUpdateNode:
							// Updates don't concern multiraft, they are simply passed through.
						}
						if err != nil {
							log.Errorf("error applying configuration change %v: %s", cc, err)
						}
						s.multiNode.ApplyConfChange(uint64(groupID), cc)
					} else {
						log.Warningf("aborting configuration change: %s", errStr)
						s.multiNode.ApplyConfChange(uint64(groupID), raftpb.ConfChange{})
					}

					// Re-submit all pending proposals that were held
					// while the config change was pending
					g.waitForCallback--
					if g.waitForCallback <= 0 {
						for _, prop := range g.pending {
							s.propose(prop)
						}
					}
				}:
				case <-s.stopper.ShouldStop():
				}
			},
		})
	}
	return commandID
}
func (s *state) handleWriteResponse(response *writeResponse, readyGroups map[uint64]raft.Ready) {
	log.V(6).Infof("node %v got write response: %#v", s.nodeID, *response)
	// Everything has been written to disk; now we can apply updates to the state machine
	// and send outgoing messages.
	for groupID, ready := range readyGroups {
		g, ok := s.groups[groupID]
		if !ok {
			log.V(4).Infof("dropping stale write to group %v", groupID)
			continue
		}
		for _, entry := range ready.CommittedEntries {
			var commandID string
			switch entry.Type {
			case raftpb.EntryNormal:
				// etcd raft occasionally adds a nil entry (e.g. upon election); ignore these.
				if entry.Data != nil {
					var command []byte
					commandID, command = decodeCommand(entry.Data)
					s.sendEvent(&EventCommandCommitted{
						GroupID:   groupID,
						CommandID: commandID,
						Command:   command,
					})
				}

			case raftpb.EntryConfChange:
				cc := raftpb.ConfChange{}
				err := cc.Unmarshal(entry.Data)
				if err != nil {
					log.Fatalf("invalid ConfChange data: %s", err)
				}
				var payload []byte
				if len(cc.Context) > 0 {
					commandID, payload = decodeCommand(cc.Context)
				}
				s.sendEvent(&EventMembershipChangeCommitted{
					GroupID:    groupID,
					CommandID:  commandID,
					NodeID:     NodeID(cc.NodeID),
					ChangeType: cc.Type,
					Payload:    payload,
					Callback: func(err error) {
						s.callbackChan <- func() {
							if err == nil {
								log.V(3).Infof("node %v applying configuration change %v", s.nodeID, cc)
								// TODO(bdarnell): dedupe by keeping a record of recently-applied commandIDs
								switch cc.Type {
								case raftpb.ConfChangeAddNode:
									err = s.addNode(NodeID(cc.NodeID), groupID)
								case raftpb.ConfChangeRemoveNode:
									// TODO(bdarnell): support removing nodes; fix double-application of initial entries
								case raftpb.ConfChangeUpdateNode:
									// Updates don't concern multiraft, they are simply passed through.
								}
								if err != nil {
									log.Errorf("error applying configuration change %v: %s", cc, err)
								}
								s.multiNode.ApplyConfChange(groupID, cc)
							} else {
								log.Warningf("aborting configuration change: %s", err)
								s.multiNode.ApplyConfChange(groupID, raftpb.ConfChange{})
							}

							// Re-submit all pending proposals, in case any of them were config changes
							// that were dropped due to the one-at-a-time rule. This is a little
							// redundant since most pending proposals won't benefit from this but
							// config changes should be rare enough (and the size of the pending queue
							// small enough) that it doesn't really matter.
							for _, prop := range g.pending {
								s.proposalChan <- prop
							}
						}
					},
				})
			}

			if p, ok := g.pending[commandID]; ok {
				// TODO(bdarnell): the command is now committed, but not applied until the
				// application consumes EventCommandCommitted. Is closing the channel
				// at this point useful or do we need to wait for the command to be
				// applied too?
				// This could be done with a Callback as in EventMembershipChangeCommitted
				// or perhaps we should move away from a channel to a callback-based system.
				if p.ch != nil {
					// Because of the way we re-queue proposals during leadership
					// changes, we may close the same proposal object twice.
					close(p.ch)
					p.ch = nil
				}
				delete(g.pending, commandID)
			}
		}

		noMoreHeartbeats := make(map[uint64]struct{})
		for _, msg := range ready.Messages {
			switch msg.Type {
			case raftpb.MsgHeartbeat:
				log.V(7).Infof("node %v dropped individual heartbeat to node %v", s.nodeID, msg.To)
				continue
			case raftpb.MsgHeartbeatResp:
				if _, ok := noMoreHeartbeats[msg.To]; ok {
					log.V(7).Infof("node %v dropped redundant heartbeat response to node %v", s.nodeID, msg.To)
					continue
				}
				noMoreHeartbeats[msg.To] = struct{}{}
			}

			log.V(6).Infof("node %v sending message %.200s to %v", s.nodeID,
				raft.DescribeMessage(msg, s.EntryFormatter), msg.To)
			nodeID := NodeID(msg.To)
			if _, ok := s.nodes[nodeID]; !ok {
				log.V(4).Infof("node %v: connecting to new node %v", s.nodeID, nodeID)
				if err := s.addNode(nodeID, groupID); err != nil {
					log.Errorf("node %v: error adding node %v", s.nodeID, nodeID)
				}
			}
			err := s.Transport.Send(NodeID(msg.To), &RaftMessageRequest{groupID, msg})
			snapStatus := raft.SnapshotFinish
			if err != nil {
				log.Warningf("node %v failed to send message to %v", s.nodeID, nodeID)
				s.multiNode.ReportUnreachable(msg.To, groupID)
				snapStatus = raft.SnapshotFailure
			}
			if msg.Type == raftpb.MsgSnap {
				// TODO(bdarnell): add an ack for snapshots and don't report status until
				// ack, error, or timeout.
				s.multiNode.ReportSnapshot(msg.To, groupID, snapStatus)
			}
		}
	}
}
func (n *node) start() {
	n.stopc = make(chan struct{})
	ticker := time.Tick(100 * time.Millisecond)

	go func() {
		for {
			select {
			case <-ticker:
				n.Tick()
				//fmt.Println("node_id:", n.id)
			case rd := <-n.Ready():
				if !raft.IsEmptyHardState(rd.HardState) {
					n.state = rd.HardState
					n.storage.SetHardState(n.state)
				}
				n.storage.Append(rd.Entries)
				//fmt.Printf("------Node_ID:%v---------\n", n.id)
				//n.storage.Dump()
				time.Sleep(time.Millisecond)
				//fmt.Println("rd ready")
				// TODO: make send async, more like real world...
				for _, m := range rd.Messages {
					n.iface.send(m)
				}
				// Process Snapshot
				if !raft.IsEmptySnap(rd.Snapshot) {
					n.storage.ApplySnapshot(rd.Snapshot)
				}
				for _, entry := range rd.CommittedEntries {
					//process(entry)
					if entry.Type == raftpb.EntryConfChange {
						var cc raftpb.ConfChange
						cc.Unmarshal(entry.Data)
						st := n.Node.ApplyConfChange(cc)
						fmt.Printf("CommittedEntries state: %v\n", st.String())
					}
				}
				n.Advance()
			case m := <-n.iface.recv():
				n.Step(context.TODO(), m)
				//fmt.Printf("recv:%v\n", m)
			case <-n.stopc:
				n.Stop()
				log.Printf("raft.%d: stop\n", n.id)
				n.Node = nil
				close(n.stopc)
				return
			case p := <-n.pausec:
				recvms := make([]raftpb.Message, 0)
				for p {
					select {
					case m := <-n.iface.recv():
						recvms = append(recvms, m)
					case p = <-n.pausec:
					}
				}
				// step all pending messages
				for _, m := range recvms {
					n.Step(context.TODO(), m)
				}
			}
		}
	}()
}
// loadAndStart bootstraps a node's raft store from the raft logs and snapshots on disk.
func (n *Node) loadAndStart(ctx context.Context, forceNewCluster bool) error {
	snapshot, waldata, err := n.readFromDisk(ctx)
	if err != nil {
		return err
	}

	// Read logs to fully catch up store
	var raftNode api.RaftMember
	if err := raftNode.Unmarshal(waldata.Metadata); err != nil {
		return errors.Wrap(err, "failed to unmarshal WAL metadata")
	}
	n.Config.ID = raftNode.RaftID

	if snapshot != nil {
		snapCluster, err := n.clusterSnapshot(snapshot.Data)
		if err != nil {
			return err
		}
		var bootstrapMembers []*api.RaftMember
		if forceNewCluster {
			for _, m := range snapCluster.Members {
				if m.RaftID != n.Config.ID {
					n.cluster.RemoveMember(m.RaftID)
					continue
				}
				bootstrapMembers = append(bootstrapMembers, m)
			}
		} else {
			bootstrapMembers = snapCluster.Members
		}
		n.bootstrapMembers = bootstrapMembers
		for _, removedMember := range snapCluster.Removed {
			n.cluster.RemoveMember(removedMember)
		}
	}

	ents, st := waldata.Entries, waldata.HardState

	// All members that are no longer part of the cluster must be added to
	// the removed list right away, so that we don't try to connect to them
	// before processing the configuration change entries, which could make
	// us get stuck.
	for _, ent := range ents {
		if ent.Index <= st.Commit && ent.Type == raftpb.EntryConfChange {
			var cc raftpb.ConfChange
			if err := cc.Unmarshal(ent.Data); err != nil {
				return errors.Wrap(err, "failed to unmarshal config change")
			}
			if cc.Type == raftpb.ConfChangeRemoveNode {
				n.cluster.RemoveMember(cc.NodeID)
			}
		}
	}

	if forceNewCluster {
		// discard the previously uncommitted entries
		for i, ent := range ents {
			if ent.Index > st.Commit {
				log.G(ctx).Infof("discarding %d uncommitted WAL entries", len(ents)-i)
				ents = ents[:i]
				break
			}
		}

		// force append the configuration change entries
		toAppEnts := createConfigChangeEnts(getIDs(snapshot, ents), n.Config.ID, st.Term, st.Commit)

		// All members that are being removed as part of the
		// force-new-cluster process must be added to the
		// removed list right away, so that we don't try to
		// connect to them before processing the configuration
		// change entries, which could make us get stuck.
		for _, ccEnt := range toAppEnts {
			if ccEnt.Type == raftpb.EntryConfChange {
				var cc raftpb.ConfChange
				if err := cc.Unmarshal(ccEnt.Data); err != nil {
					return errors.Wrap(err, "error unmarshalling force-new-cluster config change")
				}
				if cc.Type == raftpb.ConfChangeRemoveNode {
					n.cluster.RemoveMember(cc.NodeID)
				}
			}
		}
		ents = append(ents, toAppEnts...)

		// force commit newly appended entries
		err := n.raftLogger.SaveEntries(st, toAppEnts)
		if err != nil {
			log.G(ctx).WithError(err).Fatalf("failed to save WAL while forcing new cluster")
		}
		if len(toAppEnts) != 0 {
			st.Commit = toAppEnts[len(toAppEnts)-1].Index
		}
	}

	if snapshot != nil {
		if err := n.raftStore.ApplySnapshot(*snapshot); err != nil {
			return err
		}
	}
	if err := n.raftStore.SetHardState(st); err != nil {
		return err
	}

	return n.raftStore.Append(ents)
}
// run is the CSP-style main of raftLog; all local struct fields (except
// channels) belong exclusively to run while it is running. Method invocations
// are signaled through channels.
func (l *raftLog) run() {
	defer close(l.waitCommitted)
	defer close(l.stopped)
	defer close(l.leaderHintSet)

	ticker := l.clk.Ticker(l.tickInterval)
	for {
		select {
		case <-l.stop:
			return
		case <-ticker.C:
			l.node.Tick()
		case r := <-l.grpcDropClient:
			delete(l.grpcClientCache, r)
		case rd := <-l.node.Ready():
			if !raft.IsEmptySnap(rd.Snapshot) {
				log.Panicf("snapshots not supported")
			}
			l.config.Storage.(*raftStorage).save(rd.HardState, rd.Entries)
			for i := range rd.Messages {
				l.send(&rd.Messages[i])
			}
			for _, entry := range rd.CommittedEntries {
				switch entry.Type {
				case raftpb.EntryConfChange:
					var cc raftpb.ConfChange
					cc.Unmarshal(entry.Data)
					var op replication.ConfChangeType
					switch {
					case cc.NodeID == raft.None:
						op = replication.ConfChangeNOP
					case cc.Type == raftpb.ConfChangeAddNode:
						op = replication.ConfChangeAddNode
					case cc.Type == raftpb.ConfChangeRemoveNode:
						op = replication.ConfChangeRemoveNode
					case cc.Type == raftpb.ConfChangeUpdateNode:
						op = replication.ConfChangeUpdateNode
					default:
						panic("unknown conf change type from raft")
					}
					select {
					case l.waitCommitted <- replication.LogEntry{
						Data: cc.Context,
						ConfChange: &replication.ConfChange{
							Operation: op,
							NodeID:    cc.NodeID,
						},
					}:
					case <-l.stop:
						return
					}
				default:
					select {
					case l.waitCommitted <- replication.LogEntry{Data: entry.Data}:
					case <-l.stop:
						return
					}
				}
			}
			if rd.SoftState != nil {
				leaderHint := rd.SoftState.RaftState == raft.StateLeader
				if l.leaderHint != leaderHint {
					l.leaderHint = leaderHint
					select {
					case l.leaderHintSet <- leaderHint:
					default:
					}
				}
			}
			l.node.Advance()
		}
	}
}
func (c *configurator) loop() {
	defer c.logger.Printf("configurator: loop exit")

	ticker := time.NewTicker(time.Second)
	defer ticker.Stop()

	var (
		pendingAdd = uint64set{}
		pendingRem = uint64set{}
	)

	for {
		select {
		case id := <-c.addc:
			if pendingAdd.has(id) {
				c.logger.Printf("configurator: recv add %x, was pending add already", id)
			} else {
				c.logger.Printf("configurator: recv add %x, now pending add", id)
				pendingAdd.add(id)
				// We *must* wait before emitting a ConfChange.
				// https://github.com/coreos/etcd/issues/4759
			}

		case id := <-c.remc:
			if pendingRem.has(id) {
				c.logger.Printf("configurator: recv rem %x, was pending rem already", id)
			} else {
				c.logger.Printf("configurator: recv rem %x, now pending rem", id)
				pendingRem.add(id)
				// We *must* wait before emitting a ConfChange.
				// https://github.com/coreos/etcd/issues/4759
			}

		case <-ticker.C:
			for id := range pendingAdd {
				c.logger.Printf("configurator: send ConfChangeAddNode %x", id)
				c.confchangec <- raftpb.ConfChange{
					Type:   raftpb.ConfChangeAddNode,
					NodeID: id,
				}
			}
			for id := range pendingRem {
				c.logger.Printf("configurator: send ConfChangeRemoveNode %x", id)
				c.confchangec <- raftpb.ConfChange{
					Type:   raftpb.ConfChangeRemoveNode,
					NodeID: id,
				}
			}

		case entry := <-c.entryc:
			if entry.Type != raftpb.EntryConfChange {
				c.logger.Printf("configurator: ignoring %s", entry.Type)
				continue
			}
			var cc raftpb.ConfChange
			if err := cc.Unmarshal(entry.Data); err != nil {
				c.logger.Printf("configurator: got invalid ConfChange (%v); ignoring", err)
				continue
			}
			switch cc.Type {
			case raftpb.ConfChangeAddNode:
				if _, ok := pendingAdd[cc.NodeID]; ok {
					c.logger.Printf("configurator: recv %s %x: was pending add, deleting", cc.Type, cc.NodeID)
					delete(pendingAdd, cc.NodeID)
				} else {
					c.logger.Printf("configurator: recv %s %x: not pending add, ignoring", cc.Type, cc.NodeID)
				}
			case raftpb.ConfChangeRemoveNode:
				if _, ok := pendingRem[cc.NodeID]; ok {
					c.logger.Printf("configurator: recv %s %x: was pending rem, deleting", cc.Type, cc.NodeID)
					delete(pendingRem, cc.NodeID)
				} else {
					c.logger.Printf("configurator: recv %s %x: not pending rem, ignoring", cc.Type, cc.NodeID)
				}
			}

		case <-c.quitc:
			return
		}
	}
}
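The configurator above only emits ConfChange values on c.confchangec and later sees them come back, once committed, on c.entryc. A minimal, hedged sketch of the wiring that usually sits in between (the bridge function, ctx, node, and logger names are assumptions, not part of the source above): each emitted ConfChange is proposed through the raft node so it can be replicated and committed.

// bridgeConfChanges is an illustrative bridge (assumed wiring): it proposes
// every ConfChange emitted by the configurator so that, once committed, the
// entry is fed back to the configurator on entryc.
func bridgeConfChanges(ctx context.Context, node raft.Node, confchangec <-chan raftpb.ConfChange, logger *log.Logger) {
	for cc := range confchangec {
		if err := node.ProposeConfChange(ctx, cc); err != nil {
			logger.Printf("propose conf change: %v", err)
		}
	}
}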
func (mn *multinode) start() {
	mn.stopc = make(chan struct{})
	ticker := time.Tick(100 * time.Millisecond)

	go func() {
		for {
			select {
			case op := <-mn.createGroupChan:
				fmt.Println("mn.createGroupChan")
				op.ch <- mn.MultiNode.CreateGroup(op.groupID, op.config, op.peers)
			case op := <-mn.removeGroupChan:
				fmt.Println("mn.removeGroupChan")
				op.ch <- mn.MultiNode.RemoveGroup(op.groupID)
			case <-ticker:
				mn.Tick()
				//fmt.Println("node_id:%v", mn.nodeid)
			case rds := <-mn.Ready():
				for group_id, rd := range rds {
					if !raft.IsEmptyHardState(rd.HardState) {
						mn.state = rd.HardState
						mn.storage.SetHardState(mn.state)
					}
					mn.storage.Append(rd.Entries)
					//fmt.Printf("------Node_ID:%v---------\n", n.id)
					//n.storage.Dump()
					time.Sleep(time.Millisecond)
					//fmt.Println("rd ready")
					// TODO: make send async, more like real world...
					for _, m := range rd.Messages {
						fmt.Printf("handling message groupId:%v msg:%v\n", group_id, m)
						mm := multiMessage{group: group_id, msg: m}
						mn.network.nodeNetwork(mn.nodeid).send(mm)
					}
					// Process Snapshot
					if !raft.IsEmptySnap(rd.Snapshot) {
						mn.storage.ApplySnapshot(rd.Snapshot)
					}
					for _, entry := range rd.CommittedEntries {
						//process(entry)
						if entry.Type == raftpb.EntryConfChange {
							var cc raftpb.ConfChange
							cc.Unmarshal(entry.Data)
							mn.MultiNode.ApplyConfChange(group_id, cc)
							//fmt.Printf("CommittedEntries state: %v\n", st.String())
						}
					}
				}
				mn.Advance(rds)
			case m := <-mn.network.nodeNetwork(mn.nodeid).recv():
				fmt.Println("recv...")
				mn.Step(context.TODO(), m.group, m.msg)
				/*for {
					select {
					case m := <-mn.iface.recv():
						fmt.Println("iface recv")
					default:
						//fmt.Println("default")
					}
					time.Sleep(10 * time.Millisecond)
				}*/
				//fmt.Printf("recv:%v\n", m)
			case <-mn.stopc:
				mn.Stop()
				log.Printf("raft.%d: stop\n", mn.nodeid)
				mn.MultiNode = nil
				close(mn.stopc)
				return
			case p := <-mn.pausec:
				recvms := make([]multiMessage, 0)
				for p {
					select {
					case m := <-mn.network.nodeNetwork(mn.nodeid).recv():
						recvms = append(recvms, m)
					case p = <-mn.pausec:
					}
				}
				// step all pending messages
				for _, m := range recvms {
					mn.Step(context.TODO(), m.group, m.msg)
				}
			}
		}
	}()
}
func (s *EtcdServer) run() {
	var syncC <-chan time.Time
	// snapi indicates the index of the last submitted snapshot request
	var snapi, appliedi int64
	for {
		select {
		case <-s.ticker:
			s.node.Tick()
		case rd := <-s.node.Ready():
			s.storage.Save(rd.HardState, rd.Entries)
			s.storage.SaveSnap(rd.Snapshot)
			s.send(rd.Messages)

			// TODO(bmizerany): do this in the background, but take
			// care to apply entries in a single goroutine, and not
			// race them.
			// TODO: apply configuration change into ClusterStore.
			for _, e := range rd.CommittedEntries {
				switch e.Type {
				case raftpb.EntryNormal:
					var r pb.Request
					if err := r.Unmarshal(e.Data); err != nil {
						panic("TODO: this is bad, what do we do about it?")
					}
					s.w.Trigger(r.ID, s.apply(r))
				case raftpb.EntryConfChange:
					var cc raftpb.ConfChange
					if err := cc.Unmarshal(e.Data); err != nil {
						panic("TODO: this is bad, what do we do about it?")
					}
					s.node.ApplyConfChange(cc)
					s.w.Trigger(cc.ID, nil)
				default:
					panic("unexpected entry type")
				}
				atomic.StoreInt64(&s.raftIndex, e.Index)
				atomic.StoreInt64(&s.raftTerm, e.Term)
				appliedi = e.Index
			}

			if rd.Snapshot.Index > snapi {
				snapi = rd.Snapshot.Index
			}

			// recover from snapshot if it is more updated than current applied
			if rd.Snapshot.Index > appliedi {
				if err := s.store.Recovery(rd.Snapshot.Data); err != nil {
					panic("TODO: this is bad, what do we do about it?")
				}
				appliedi = rd.Snapshot.Index
			}

			if appliedi-snapi > s.snapCount {
				s.snapshot()
				snapi = appliedi
			}

			if rd.SoftState != nil {
				if rd.RaftState == raft.StateLeader {
					syncC = s.syncTicker
				} else {
					syncC = nil
				}
				if rd.SoftState.ShouldStop {
					s.Stop()
					return
				}
			}
		case <-syncC:
			s.sync(defaultSyncTimeout)
		case <-s.done:
			return
		}
	}
}
// processCommittedEntry tells the application that a command was committed.
// Returns the commandID, or an empty string if the given entry was not a command.
func (s *state) processCommittedEntry(groupID proto.RaftID, g *group, entry raftpb.Entry) string {
	var commandID string
	switch entry.Type {
	case raftpb.EntryNormal:
		// etcd raft occasionally adds a nil entry (e.g. upon election); ignore these.
		if entry.Data != nil {
			var command []byte
			commandID, command = decodeCommand(entry.Data)
			s.sendEvent(&EventCommandCommitted{
				GroupID:   groupID,
				CommandID: commandID,
				Command:   command,
				Index:     entry.Index,
			})
		}

	case raftpb.EntryConfChange:
		cc := raftpb.ConfChange{}
		if err := cc.Unmarshal(entry.Data); err != nil {
			log.Fatalf("invalid ConfChange data: %s", err)
		}
		var payload []byte
		if len(cc.Context) > 0 {
			commandID, payload = decodeCommand(cc.Context)
		}
		s.sendEvent(&EventMembershipChangeCommitted{
			GroupID:    groupID,
			CommandID:  commandID,
			Index:      entry.Index,
			NodeID:     proto.RaftNodeID(cc.NodeID),
			ChangeType: cc.Type,
			Payload:    payload,
			Callback: func(err error) {
				select {
				case s.callbackChan <- func() {
					if err == nil {
						if log.V(3) {
							log.Infof("node %v applying configuration change %v", s.nodeID, cc)
						}
						// TODO(bdarnell): dedupe by keeping a record of recently-applied commandIDs
						switch cc.Type {
						case raftpb.ConfChangeAddNode:
							err = s.addNode(proto.RaftNodeID(cc.NodeID), proto.RaftID(groupID))
						case raftpb.ConfChangeRemoveNode:
							// TODO(bdarnell): support removing nodes; fix double-application of initial entries
						case raftpb.ConfChangeUpdateNode:
							// Updates don't concern multiraft, they are simply passed through.
						}
						if err != nil {
							log.Errorf("error applying configuration change %v: %s", cc, err)
						}
						s.multiNode.ApplyConfChange(uint64(groupID), cc)
					} else {
						log.Warningf("aborting configuration change: %s", err)
						s.multiNode.ApplyConfChange(uint64(groupID), raftpb.ConfChange{})
					}

					// Re-submit all pending proposals, in case any of them were config changes
					// that were dropped due to the one-at-a-time rule. This is a little
					// redundant since most pending proposals won't benefit from this but
					// config changes should be rare enough (and the size of the pending queue
					// small enough) that it doesn't really matter.
					for _, prop := range g.pending {
						s.propose(prop)
					}
				}:
				case <-s.stopper.ShouldStop():
				}
			},
		})
	}
	return commandID
}
// TestNodeProposeAddDuplicateNode ensures that two proposals to add the same node
// do not affect a later proposal to add a new node.
func TestNodeProposeAddDuplicateNode(t *testing.T) {
	n := newNode()
	s := NewMemoryStorage()
	r := newTestRaft(1, []uint64{1}, 10, 1, s)
	go n.run(r)
	n.Campaign(context.TODO())

	rdyEntries := make([]raftpb.Entry, 0)
	ticker := time.NewTicker(time.Millisecond * 100)
	done := make(chan struct{})
	stop := make(chan struct{})
	applyConfChan := make(chan struct{})

	go func() {
		defer close(done)
		for {
			select {
			case <-stop:
				return
			case <-ticker.C:
				n.Tick()
			case rd := <-n.Ready():
				s.Append(rd.Entries)
				for _, e := range rd.Entries {
					rdyEntries = append(rdyEntries, e)
					switch e.Type {
					case raftpb.EntryNormal:
					case raftpb.EntryConfChange:
						var cc raftpb.ConfChange
						cc.Unmarshal(e.Data)
						n.ApplyConfChange(cc)
						applyConfChan <- struct{}{}
					}
				}
				n.Advance()
			}
		}
	}()

	cc1 := raftpb.ConfChange{Type: raftpb.ConfChangeAddNode, NodeID: 1}
	ccdata1, _ := cc1.Marshal()
	n.ProposeConfChange(context.TODO(), cc1)
	<-applyConfChan

	// try to add the same node again
	n.ProposeConfChange(context.TODO(), cc1)
	<-applyConfChan

	// the new node join should be ok
	cc2 := raftpb.ConfChange{Type: raftpb.ConfChangeAddNode, NodeID: 2}
	ccdata2, _ := cc2.Marshal()
	n.ProposeConfChange(context.TODO(), cc2)
	<-applyConfChan

	close(stop)
	<-done

	if len(rdyEntries) != 4 {
		t.Errorf("len(entry) = %d, want %d, %v\n", len(rdyEntries), 4, rdyEntries)
	}
	if !bytes.Equal(rdyEntries[1].Data, ccdata1) {
		t.Errorf("data = %v, want %v", rdyEntries[1].Data, ccdata1)
	}
	if !bytes.Equal(rdyEntries[3].Data, ccdata2) {
		t.Errorf("data = %v, want %v", rdyEntries[3].Data, ccdata2)
	}
	n.Stop()
}