// This is a fork of raft.DescribeMessage with a tweak to avoid logging
// snapshot data.
func raftDescribeMessage(m raftpb.Message, f raft.EntryFormatter) string {
    var buf bytes.Buffer
    fmt.Fprintf(&buf, "%x->%x %v Term:%d Log:%d/%d", m.From, m.To, m.Type, m.Term, m.LogTerm, m.Index)
    if m.Reject {
        fmt.Fprintf(&buf, " Rejected")
        if m.RejectHint != 0 {
            fmt.Fprintf(&buf, "(Hint:%d)", m.RejectHint)
        }
    }
    if m.Commit != 0 {
        fmt.Fprintf(&buf, " Commit:%d", m.Commit)
    }
    if len(m.Entries) > 0 {
        fmt.Fprintf(&buf, " Entries:[")
        for i, e := range m.Entries {
            if i != 0 {
                buf.WriteString(", ")
            }
            buf.WriteString(raft.DescribeEntry(e, f))
        }
        fmt.Fprintf(&buf, "]")
    }
    if !raft.IsEmptySnap(m.Snapshot) {
        snap := m.Snapshot
        snap.Data = nil
        fmt.Fprintf(&buf, " Snapshot:%v", snap)
    }
    return buf.String()
}
// start drives the raft node: it ticks raft, persists Ready state to storage,
// sends outgoing messages, and processes committed entries.
func (n *Node) start() {
    tk := time.Tick(5 * time.Millisecond)
    for {
        select {
        case <-tk:
            n.Tick()
        case rd := <-n.Ready():
            if !raft.IsEmptyHardState(rd.HardState) {
                n.state = rd.HardState
                n.storage.SetHardState(n.state)
            }
            n.storage.Append(rd.Entries)
            n.send(rd.Messages)
            if !raft.IsEmptySnap(rd.Snapshot) {
                n.storage.ApplySnapshot(rd.Snapshot)
            }
            time.Sleep(time.Millisecond)
            for _, entry := range rd.CommittedEntries {
                n.process(entry)
                // if entry.Type == raftpb.EntryConfChange {
                // }
                // var cc raftpb.ConfChange
                // cc.Unmarshal(entry.Data)
                // n.ApplyConfChange(cc)
            }
            n.Advance()
        case m := <-n.receive():
            n.Step(context.TODO(), m)
        }
    }
}
func logRaftReady(storeID roachpb.StoreID, groupID roachpb.RangeID, ready raft.Ready) {
    if log.V(5) {
        // Globally synchronize to avoid interleaving different sets of logs in tests.
        logRaftReadyMu.Lock()
        defer logRaftReadyMu.Unlock()
        log.Infof("store %s: group %s raft ready", storeID, groupID)
        if ready.SoftState != nil {
            log.Infof("SoftState updated: %+v", *ready.SoftState)
        }
        if !raft.IsEmptyHardState(ready.HardState) {
            log.Infof("HardState updated: %+v", ready.HardState)
        }
        for i, e := range ready.Entries {
            log.Infof("New Entry[%d]: %.200s", i, raft.DescribeEntry(e, raftEntryFormatter))
        }
        for i, e := range ready.CommittedEntries {
            log.Infof("Committed Entry[%d]: %.200s", i, raft.DescribeEntry(e, raftEntryFormatter))
        }
        if !raft.IsEmptySnap(ready.Snapshot) {
            log.Infof("Snapshot updated: %.200s", ready.Snapshot.String())
        }
        for i, m := range ready.Messages {
            log.Infof("Outgoing Message[%d]: %.200s", i, raft.DescribeMessage(m, raftEntryFormatter))
        }
    }
}
func logRaftReady(ctx context.Context, prefix fmt.Stringer, ready raft.Ready) {
    if log.V(5) {
        var buf bytes.Buffer
        if ready.SoftState != nil {
            fmt.Fprintf(&buf, " SoftState updated: %+v\n", *ready.SoftState)
        }
        if !raft.IsEmptyHardState(ready.HardState) {
            fmt.Fprintf(&buf, " HardState updated: %+v\n", ready.HardState)
        }
        for i, e := range ready.Entries {
            fmt.Fprintf(&buf, " New Entry[%d]: %.200s\n", i, raft.DescribeEntry(e, raftEntryFormatter))
        }
        for i, e := range ready.CommittedEntries {
            fmt.Fprintf(&buf, " Committed Entry[%d]: %.200s\n", i, raft.DescribeEntry(e, raftEntryFormatter))
        }
        if !raft.IsEmptySnap(ready.Snapshot) {
            fmt.Fprintf(&buf, " Snapshot updated: %.200s\n", ready.Snapshot.String())
        }
        for i, m := range ready.Messages {
            fmt.Fprintf(&buf, " Outgoing Message[%d]: %.200s\n", i, raft.DescribeMessage(m, raftEntryFormatter))
        }
        log.Infof(ctx, "%s raft ready\n%s", prefix, buf.String())
    }
}
func (s *state) logRaftReady(readyGroups map[uint64]raft.Ready) {
    for groupID, ready := range readyGroups {
        if log.V(5) {
            log.Infof("node %v: group %v raft ready", s.nodeID, groupID)
            if ready.SoftState != nil {
                log.Infof("SoftState updated: %+v", *ready.SoftState)
            }
            if !raft.IsEmptyHardState(ready.HardState) {
                log.Infof("HardState updated: %+v", ready.HardState)
            }
            for i, e := range ready.Entries {
                log.Infof("New Entry[%d]: %.200s", i, raft.DescribeEntry(e, s.EntryFormatter))
            }
            for i, e := range ready.CommittedEntries {
                log.Infof("Committed Entry[%d]: %.200s", i, raft.DescribeEntry(e, s.EntryFormatter))
            }
            if !raft.IsEmptySnap(ready.Snapshot) {
                log.Infof("Snapshot updated: %.200s", ready.Snapshot.String())
            }
            for i, m := range ready.Messages {
                log.Infof("Outgoing Message[%d]: %.200s", i, raft.DescribeMessage(m, s.EntryFormatter))
            }
        }
    }
}
// run is the node's main loop: it ticks raft, persists Ready state, sends messages,
// applies snapshots and committed entries (including configuration changes), and
// then advances raft.
func (n *node) run() {
    for {
        select {
        case <-n.ticker:
            n.raft.Tick()
        case rd := <-n.raft.Ready():
            n.saveToStorage(rd.HardState, rd.Entries, rd.Snapshot)
            n.send(rd.Messages)
            if !raft.IsEmptySnap(rd.Snapshot) {
                n.processSnapshot(rd.Snapshot)
            }
            for _, entry := range rd.CommittedEntries {
                n.process(entry)
                if entry.Type == raftpb.EntryConfChange {
                    var cc raftpb.ConfChange
                    cc.Unmarshal(entry.Data)
                    n.raft.ApplyConfChange(cc)
                }
            }
            n.raft.Advance()
        case <-n.done:
            return
        }
    }
}
// HardState contains term, vote and commit.
// Snapshot contains data and snapshot metadata.
func (n *node) saveToStorage(hardState raftpb.HardState, entries []raftpb.Entry, snapshot raftpb.Snapshot) {
    if !raft.IsEmptySnap(snapshot) {
        fmt.Printf("saveToStorage snapshot: %v\n", snapshot.String())
        le, err := n.store.LastIndex()
        if err != nil {
            log.Fatalf("While retrieving last index: %v\n", err)
        }
        te, err := n.store.Term(le)
        if err != nil {
            log.Fatalf("While retrieving term: %v\n", err)
        }
        fmt.Printf("%d node Term for le: %v is %v\n", n.id, le, te)
        if snapshot.Metadata.Index <= le {
            fmt.Printf("%d node ignoring snapshot. Last index: %v\n", n.id, le)
            return
        }
        if err := n.store.ApplySnapshot(snapshot); err != nil {
            log.Fatalf("Applying snapshot: %v", err)
        }
    }
    if !raft.IsEmptyHardState(hardState) {
        n.store.SetHardState(hardState)
    }
    n.store.Append(entries)
}
// handleWriteReady converts a set of raft.Ready structs into a writeRequest
// to be persisted, marks the group as writing and sends it to the writeTask.
func (s *state) handleWriteReady(readyGroups map[uint64]raft.Ready) {
    if log.V(6) {
        log.Infof("node %v write ready, preparing request", s.nodeID)
    }
    writeRequest := newWriteRequest()
    for groupID, ready := range readyGroups {
        raftGroupID := proto.RaftID(groupID)
        g, ok := s.groups[raftGroupID]
        if !ok {
            if log.V(6) {
                log.Infof("dropping write request to group %d", groupID)
            }
            continue
        }
        g.writing = true
        gwr := &groupWriteRequest{}
        if !raft.IsEmptyHardState(ready.HardState) {
            gwr.state = ready.HardState
        }
        if !raft.IsEmptySnap(ready.Snapshot) {
            gwr.snapshot = ready.Snapshot
        }
        if len(ready.Entries) > 0 {
            gwr.entries = ready.Entries
        }
        writeRequest.groups[raftGroupID] = gwr
    }
    s.writeTask.in <- writeRequest
}
// Saves a log entry to our Store
func (n *Node) saveToStorage(
    ctx context.Context,
    raftConfig *api.RaftConfig,
    hardState raftpb.HardState,
    entries []raftpb.Entry,
    snapshot raftpb.Snapshot,
) (err error) {
    if !raft.IsEmptySnap(snapshot) {
        if err := n.raftLogger.SaveSnapshot(snapshot); err != nil {
            return ErrApplySnapshot
        }
        if err := n.raftLogger.GC(snapshot.Metadata.Index, snapshot.Metadata.Term, raftConfig.KeepOldSnapshots); err != nil {
            log.G(ctx).WithError(err).Error("unable to clean old snapshots and WALs")
        }
        if err = n.raftStore.ApplySnapshot(snapshot); err != nil {
            return ErrApplySnapshot
        }
    }
    if err := n.raftLogger.SaveEntries(hardState, entries); err != nil {
        // TODO(aaronl): These error types should really wrap more
        // detailed errors.
        return ErrApplySnapshot
    }
    if err = n.raftStore.Append(entries); err != nil {
        return ErrAppendEntry
    }
    return nil
}
func (s *EtcdServer) applySnapshot(ep *etcdProgress, apply *apply) {
    if raft.IsEmptySnap(apply.snapshot) {
        return
    }
    if apply.snapshot.Metadata.Index <= ep.appliedi {
        plog.Panicf("snapshot index [%d] should > appliedi[%d] + 1", apply.snapshot.Metadata.Index, ep.appliedi)
    }
    if s.cfg.V3demo {
        snapfn, err := s.r.storage.DBFilePath(apply.snapshot.Metadata.Index)
        if err != nil {
            plog.Panicf("get database snapshot file path error: %v", err)
        }
        fn := path.Join(s.cfg.SnapDir(), databaseFilename)
        if err := os.Rename(snapfn, fn); err != nil {
            plog.Panicf("rename snapshot file error: %v", err)
        }
        // TODO: recover lessor
        newbe := backend.NewDefaultBackend(fn)
        if err := s.kv.Restore(newbe); err != nil {
            plog.Panicf("restore KV error: %v", err)
        }
        // Closing old backend might block until all the txns
        // on the backend are finished.
        // We do not want to wait on closing the old backend.
        oldbe := s.be
        go func() {
            if err := oldbe.Close(); err != nil {
                plog.Panicf("close backend error: %v", err)
            }
        }()
        s.be = newbe
    }
    if err := s.store.Recovery(apply.snapshot.Data); err != nil {
        plog.Panicf("recovery store error: %v", err)
    }
    s.cluster.Recover()
    // recover raft transport
    s.r.transport.RemoveAllPeers()
    for _, m := range s.cluster.Members() {
        if m.ID == s.ID() {
            continue
        }
        s.r.transport.AddPeer(m.ID, m.PeerURLs)
    }
    ep.appliedi = apply.snapshot.Metadata.Index
    ep.snapi = ep.appliedi
    ep.confState = apply.snapshot.Metadata.ConfState
    plog.Infof("recovered from incoming snapshot at index %d", ep.snapi)
}
// Start is the main loop for a Raft node; it drives the state machine,
// acting on the messages received from other Raft nodes in the cluster.
func (n *Node) Start() {
    for {
        select {
        case <-n.ticker.C:
            n.Tick()
        case rd := <-n.Ready():
            n.saveToStorage(rd.HardState, rd.Entries, rd.Snapshot)
            n.send(rd.Messages)
            if !raft.IsEmptySnap(rd.Snapshot) {
                n.processSnapshot(rd.Snapshot)
            }
            for _, entry := range rd.CommittedEntries {
                n.process(entry)
                if entry.Type == raftpb.EntryConfChange {
                    var cc raftpb.ConfChange
                    err := cc.Unmarshal(entry.Data)
                    if err != nil {
                        log.Fatal("raft: Can't unmarshal configuration change")
                    }
                    switch cc.Type {
                    case raftpb.ConfChangeAddNode:
                        n.applyAddNode(cc)
                    case raftpb.ConfChangeRemoveNode:
                        n.applyRemoveNode(cc)
                    }
                    n.ApplyConfChange(cc)
                }
            }
            n.Advance()
        case <-n.stopChan:
            n.Stop()
            n.Node = nil
            close(n.stopChan)
            return
        case pause := <-n.pauseChan:
            // FIXME lock hell
            n.SetPaused(pause)
            for n.pause {
                select {
                case pause = <-n.pauseChan:
                    n.SetPaused(pause)
                }
            }
            n.pauseLock.Lock()
            // process pending messages
            for _, m := range n.rcvmsg {
                err := n.Step(n.Ctx, m)
                if err != nil {
                    log.Fatal("Something went wrong when unpausing the node")
                }
            }
            n.rcvmsg = nil
            n.pauseLock.Unlock()
        }
    }
}
func (s *state) handleRaftReady(readyGroups map[uint64]raft.Ready) {
    // Soft state is updated immediately; everything else waits for handleWriteReady.
    for groupID, ready := range readyGroups {
        if log.V(5) {
            log.Infof("node %v: group %v raft ready", s.nodeID, groupID)
            if ready.SoftState != nil {
                log.Infof("SoftState updated: %+v", *ready.SoftState)
            }
            if !raft.IsEmptyHardState(ready.HardState) {
                log.Infof("HardState updated: %+v", ready.HardState)
            }
            for i, e := range ready.Entries {
                log.Infof("New Entry[%d]: %.200s", i, raft.DescribeEntry(e, s.EntryFormatter))
            }
            for i, e := range ready.CommittedEntries {
                log.Infof("Committed Entry[%d]: %.200s", i, raft.DescribeEntry(e, s.EntryFormatter))
            }
            if !raft.IsEmptySnap(ready.Snapshot) {
                log.Infof("Snapshot updated: %.200s", ready.Snapshot.String())
            }
            for i, m := range ready.Messages {
                log.Infof("Outgoing Message[%d]: %.200s", i, raft.DescribeMessage(m, s.EntryFormatter))
            }
        }
        g, ok := s.groups[groupID]
        if !ok {
            // This is a stale message for a removed group
            log.V(4).Infof("node %v: dropping stale ready message for group %v", s.nodeID, groupID)
            continue
        }
        term := g.committedTerm
        if ready.SoftState != nil {
            // Always save the leader whenever we get a SoftState.
            g.leader = NodeID(ready.SoftState.Lead)
        }
        if len(ready.CommittedEntries) > 0 {
            term = ready.CommittedEntries[len(ready.CommittedEntries)-1].Term
        }
        if term != g.committedTerm && g.leader != 0 {
            // Whenever the committed term has advanced and we know our leader,
            // emit an event.
            g.committedTerm = term
            s.sendEvent(&EventLeaderElection{
                GroupID: groupID,
                NodeID:  NodeID(g.leader),
                Term:    g.committedTerm,
            })
            // Re-submit all pending proposals
            for _, prop := range g.pending {
                s.proposalChan <- prop
            }
        }
    }
}
// Saves a log entry to our Store
func (n *Node) saveToStorage(hardState raftpb.HardState, entries []raftpb.Entry, snapshot raftpb.Snapshot) {
    n.Store.Append(entries)
    if !raft.IsEmptyHardState(hardState) {
        n.Store.SetHardState(hardState)
    }
    if !raft.IsEmptySnap(snapshot) {
        n.Store.ApplySnapshot(snapshot)
    }
}
// start runs the storage loop in a goroutine.
func (w *writeTask) start(stopper *stop.Stopper) {
    stopper.RunWorker(func() {
        for {
            var request *writeRequest
            select {
            case <-w.ready:
                continue
            case <-stopper.ShouldStop():
                return
            case request = <-w.in:
            }
            if log.V(6) {
                log.Infof("writeTask got request %#v", *request)
            }
            response := &writeResponse{make(map[roachpb.RangeID]*groupWriteResponse)}
            for groupID, groupReq := range request.groups {
                group, err := w.storage.GroupStorage(groupID, groupReq.replicaID)
                if err == ErrGroupDeleted {
                    if log.V(4) {
                        log.Infof("dropping write to deleted group %v", groupID)
                    }
                    continue
                } else if err != nil {
                    log.Fatalf("GroupStorage(group %s, replica %s) failed: %s", groupID, groupReq.replicaID, err)
                }
                groupResp := &groupWriteResponse{raftpb.HardState{}, -1, -1, groupReq.entries}
                response.groups[groupID] = groupResp
                if !raft.IsEmptyHardState(groupReq.state) {
                    err := group.SetHardState(groupReq.state)
                    if err != nil {
                        panic(err) // TODO(bdarnell): mark this node dead on storage errors
                    }
                    groupResp.state = groupReq.state
                }
                if !raft.IsEmptySnap(groupReq.snapshot) {
                    err := group.ApplySnapshot(groupReq.snapshot)
                    if err != nil {
                        panic(err) // TODO(bdarnell)
                    }
                }
                if len(groupReq.entries) > 0 {
                    err := group.Append(groupReq.entries)
                    if err != nil {
                        panic(err) // TODO(bdarnell)
                    }
                }
            }
            w.out <- response
        }
    })
}
// handleWriteReady converts a set of raft.Ready structs into a writeRequest
// to be persisted, marks the group as writing and sends it to the writeTask.
// It will only do this for groups which are tagged via the map.
func (s *state) handleWriteReady(checkReadyGroupIDs map[roachpb.RangeID]struct{}) map[roachpb.RangeID]raft.Ready {
    if log.V(6) {
        log.Infof("node %v write ready, preparing request", s.nodeID)
    }
    s.lockStorage()
    defer s.unlockStorage()
    writeRequest := newWriteRequest()
    readys := make(map[roachpb.RangeID]raft.Ready)
    for groupID := range checkReadyGroupIDs {
        g, ok := s.groups[groupID]
        if !ok {
            if log.V(6) {
                log.Infof("dropping write request to group %d", groupID)
            }
            continue
        }
        if !g.raftGroup.HasReady() {
            continue
        }
        ready := g.raftGroup.Ready()
        readys[groupID] = ready
        g.writing = true
        gwr := &groupWriteRequest{}
        var err error
        gwr.replicaID, err = s.Storage().ReplicaIDForStore(groupID, s.storeID)
        if err != nil {
            if log.V(1) {
                log.Warningf("failed to look up replica ID for range %v (disabling replica ID check): %s", groupID, err)
            }
            gwr.replicaID = 0
        }
        if !raft.IsEmptyHardState(ready.HardState) {
            gwr.state = ready.HardState
        }
        if !raft.IsEmptySnap(ready.Snapshot) {
            gwr.snapshot = ready.Snapshot
        }
        if len(ready.Entries) > 0 {
            gwr.entries = ready.Entries
        }
        writeRequest.groups[groupID] = gwr
    }
    // If no ready, don't write to writeTask as caller will
    // not wait on s.writeTask.out when len(readys) == 0.
    if len(readys) > 0 {
        s.writeTask.in <- writeRequest
    }
    return readys
}
// Store stores the snapshot, hardstate and entries for a given RAFT group.
func (w *Wal) Store(gid uint32, s raftpb.Snapshot, h raftpb.HardState, es []raftpb.Entry) error {
    b := w.wals.NewWriteBatch()
    defer b.Destroy()
    if !raft.IsEmptySnap(s) {
        data, err := s.Marshal()
        if err != nil {
            return x.Wrapf(err, "wal.Store: While marshal snapshot")
        }
        b.Put(w.snapshotKey(gid), data)
    }
    if !raft.IsEmptyHardState(h) {
        data, err := h.Marshal()
        if err != nil {
            return x.Wrapf(err, "wal.Store: While marshal hardstate")
        }
        b.Put(w.hardStateKey(gid), data)
    }
    var t, i uint64
    for _, e := range es {
        t, i = e.Term, e.Index
        data, err := e.Marshal()
        if err != nil {
            return x.Wrapf(err, "wal.Store: While marshal entry")
        }
        k := w.entryKey(gid, e.Term, e.Index)
        b.Put(k, data)
    }
    // If we get no entries, then the default value of t and i would be zero. That would
    // end up deleting all the previous valid raft entry logs. This check avoids that.
    if t > 0 || i > 0 {
        // Delete all keys above this index.
        start := w.entryKey(gid, t, i+1)
        prefix := w.prefix(gid)
        itr := w.wals.NewIterator()
        defer itr.Close()
        for itr.Seek(start); itr.ValidForPrefix(prefix); itr.Next() {
            b.Delete(itr.Key().Data())
        }
    }
    err := w.wals.WriteBatch(b)
    return x.Wrapf(err, "wal.Store: While WriteBatch")
}
func (rc *raftNode) publishSnapshot(snapshotToSave raftpb.Snapshot) {
    if raft.IsEmptySnap(snapshotToSave) {
        return
    }
    log.Printf("publishing snapshot at index %d", rc.snapshotIndex)
    defer log.Printf("finished publishing snapshot at index %d", rc.snapshotIndex)
    if snapshotToSave.Metadata.Index <= rc.appliedIndex {
        log.Fatalf("snapshot index [%d] should > progress.appliedIndex [%d] + 1", snapshotToSave.Metadata.Index, rc.appliedIndex)
    }
    rc.commitC <- nil // trigger kvstore to load snapshot
    rc.confState = snapshotToSave.Metadata.ConfState
    rc.snapshotIndex = snapshotToSave.Metadata.Index
    rc.appliedIndex = snapshotToSave.Metadata.Index
}
// start runs the storage loop. Blocks until stopped, so should be run in a goroutine.
func (w *writeTask) start() {
    for {
        var request *writeRequest
        select {
        case <-w.ready:
            continue
        case <-w.stopper.ShouldStop():
            w.stopper.SetStopped()
            return
        case request = <-w.in:
        }
        log.V(6).Infof("writeTask got request %#v", *request)
        response := &writeResponse{make(map[uint64]*groupWriteResponse)}
        for groupID, groupReq := range request.groups {
            group := w.storage.GroupStorage(groupID)
            if group == nil {
                log.V(4).Infof("dropping write to group %v", groupID)
                continue
            }
            groupResp := &groupWriteResponse{raftpb.HardState{}, -1, -1, groupReq.entries}
            response.groups[groupID] = groupResp
            if !raft.IsEmptyHardState(groupReq.state) {
                err := group.SetHardState(groupReq.state)
                if err != nil {
                    panic(err) // TODO(bdarnell): mark this node dead on storage errors
                }
                groupResp.state = groupReq.state
            }
            if !raft.IsEmptySnap(groupReq.snapshot) {
                err := group.ApplySnapshot(groupReq.snapshot)
                if err != nil {
                    panic(err) // TODO(bdarnell)
                }
            }
            if len(groupReq.entries) > 0 {
                err := group.Append(groupReq.entries)
                if err != nil {
                    panic(err) // TODO(bdarnell)
                }
            }
        }
        w.out <- response
    }
}
func (s *state) handleWriteReady(readyGroups map[uint64]raft.Ready) {
    log.V(6).Infof("node %v write ready, preparing request", s.nodeID)
    writeRequest := newWriteRequest()
    for groupID, ready := range readyGroups {
        gwr := &groupWriteRequest{}
        if !raft.IsEmptyHardState(ready.HardState) {
            gwr.state = ready.HardState
        }
        if !raft.IsEmptySnap(ready.Snapshot) {
            gwr.snapshot = ready.Snapshot
        }
        if len(ready.Entries) > 0 {
            gwr.entries = ready.Entries
        }
        writeRequest.groups[groupID] = gwr
    }
    s.writeTask.in <- writeRequest
}
func (n *node) initFromWal(wal *raftwal.Wal) (restart bool, rerr error) {
    n.wal = wal
    var sp raftpb.Snapshot
    sp, rerr = wal.Snapshot(n.gid)
    if rerr != nil {
        return
    }
    var term, idx uint64
    if !raft.IsEmptySnap(sp) {
        fmt.Printf("Found Snapshot: %+v\n", sp)
        restart = true
        if rerr = n.store.ApplySnapshot(sp); rerr != nil {
            return
        }
        term = sp.Metadata.Term
        idx = sp.Metadata.Index
    }
    var hd raftpb.HardState
    hd, rerr = wal.HardState(n.gid)
    if rerr != nil {
        return
    }
    if !raft.IsEmptyHardState(hd) {
        fmt.Printf("Found hardstate: %+v\n", hd)
        restart = true
        if rerr = n.store.SetHardState(hd); rerr != nil {
            return
        }
    }
    var es []raftpb.Entry
    es, rerr = wal.Entries(n.gid, term, idx)
    if rerr != nil {
        return
    }
    fmt.Printf("Found %d entries\n", len(es))
    if len(es) > 0 {
        restart = true
    }
    rerr = n.store.Append(es)
    return
}
// save persists the hard state, entries, and snapshot from a Ready to the given MemoryStorage.
func save(rd raft.Ready, st *raft.MemoryStorage) error {
    if !raft.IsEmptyHardState(rd.HardState) {
        if err := st.SetHardState(rd.HardState); err != nil {
            return err
        }
    }
    if len(rd.Entries) > 0 {
        if err := st.Append(rd.Entries); err != nil {
            return err
        }
    }
    if !raft.IsEmptySnap(rd.Snapshot) {
        if err := st.ApplySnapshot(rd.Snapshot); err != nil {
            return err
        }
    }
    return nil
}
func (c *ctrl) readySave(snapshot raftpb.Snapshot, hardState raftpb.HardState, entries []raftpb.Entry) error {
    // For the moment, none of these steps persist to disk. That violates some Raft
    // invariants. But we are ephemeral, and will always boot empty, willingly
    // paying the snapshot cost. I trust that the etcd Raft implementation
    // permits this.
    if !raft.IsEmptySnap(snapshot) {
        if err := c.storage.ApplySnapshot(snapshot); err != nil {
            return fmt.Errorf("apply snapshot: %v", err)
        }
    }
    if !raft.IsEmptyHardState(hardState) {
        if err := c.storage.SetHardState(hardState); err != nil {
            return fmt.Errorf("set hard state: %v", err)
        }
    }
    if err := c.storage.Append(entries); err != nil {
        return fmt.Errorf("append: %v", err)
    }
    return nil
}
func (n *node) Run() {
    firstRun := true
    ticker := time.NewTicker(time.Second)
    for {
        select {
        case <-ticker.C:
            n.raft.Tick()
        case rd := <-n.raft.Ready():
            x.Check(n.wal.Store(n.gid, rd.Snapshot, rd.HardState, rd.Entries))
            n.saveToStorage(rd.Snapshot, rd.HardState, rd.Entries)
            rcBytes, err := n.raftContext.Marshal()
            for _, msg := range rd.Messages {
                // NOTE: We can do some optimizations here to drop messages.
                x.Check(err)
                msg.Context = rcBytes
                n.send(msg)
            }
            if !raft.IsEmptySnap(rd.Snapshot) {
                n.processSnapshot(rd.Snapshot)
            }
            if len(rd.CommittedEntries) > 0 {
                x.Trace(n.ctx, "Found %d committed entries", len(rd.CommittedEntries))
            }
            for _, entry := range rd.CommittedEntries {
                // Just queue up to be processed. Don't wait on them.
                n.commitCh <- entry
            }
            n.raft.Advance()
            if firstRun && n.canCampaign {
                go n.raft.Campaign(n.ctx)
                firstRun = false
            }
        case <-n.done:
            return
        }
    }
}
// handleWriteReady converts a set of raft.Ready structs into a writeRequest
// to be persisted, marks the group as writing and sends it to the writeTask.
func (s *state) handleWriteReady() {
    if log.V(6) {
        log.Infof("node %v write ready, preparing request", s.nodeID)
    }
    s.lockStorage()
    defer s.unlockStorage()
    writeRequest := newWriteRequest()
    for groupID, ready := range s.readyGroups {
        raftGroupID := roachpb.RangeID(groupID)
        g, ok := s.groups[raftGroupID]
        if !ok {
            if log.V(6) {
                log.Infof("dropping write request to group %d", groupID)
            }
            continue
        }
        g.writing = true
        gwr := &groupWriteRequest{}
        var err error
        gwr.replicaID, err = s.Storage().ReplicaIDForStore(roachpb.RangeID(groupID), s.storeID)
        if err != nil {
            if log.V(1) {
                log.Warningf("failed to look up replica ID for range %v (disabling replica ID check): %s", groupID, err)
            }
            gwr.replicaID = 0
        }
        if !raft.IsEmptyHardState(ready.HardState) {
            gwr.state = ready.HardState
        }
        if !raft.IsEmptySnap(ready.Snapshot) {
            gwr.snapshot = ready.Snapshot
        }
        if len(ready.Entries) > 0 {
            gwr.entries = ready.Entries
        }
        writeRequest.groups[raftGroupID] = gwr
    }
    s.writeTask.in <- writeRequest
}
// saveToStorage applies the snapshot (unless it is older than the last stored index),
// records the hard state, and appends new entries to the node's storage.
func (n *node) saveToStorage(s raftpb.Snapshot, h raftpb.HardState, es []raftpb.Entry) {
    if !raft.IsEmptySnap(s) {
        le, err := n.store.LastIndex()
        if err != nil {
            log.Fatalf("While retrieving last index: %v\n", err)
        }
        if s.Metadata.Index <= le {
            return
        }
        if err := n.store.ApplySnapshot(s); err != nil {
            log.Fatalf("Applying snapshot: %v", err)
        }
    }
    if !raft.IsEmptyHardState(h) {
        n.store.SetHardState(h)
    }
    n.store.Append(es)
}
// Saves a log entry to our Store
func (n *Node) saveToStorage(raftConfig *api.RaftConfig, hardState raftpb.HardState, entries []raftpb.Entry, snapshot raftpb.Snapshot) (err error) {
    if !raft.IsEmptySnap(snapshot) {
        if err := n.saveSnapshot(snapshot, raftConfig.KeepOldSnapshots); err != nil {
            return ErrApplySnapshot
        }
        if err = n.raftStore.ApplySnapshot(snapshot); err != nil {
            return ErrApplySnapshot
        }
    }
    if err := n.wal.Save(hardState, entries); err != nil {
        // TODO(aaronl): These error types should really wrap more
        // detailed errors.
        return ErrApplySnapshot
    }
    if err = n.raftStore.Append(entries); err != nil {
        return ErrAppendEntry
    }
    return nil
}
// run is the node's main loop: it ticks raft, persists Ready state, sends messages,
// applies snapshots to the state machine, and processes committed entries.
func (n *node) run() {
    for {
        select {
        case <-time.Tick(time.Second):
            n.raft.Tick()
        case rd := <-n.raft.Ready():
            n.saveToStorage(rd.HardState, rd.Entries, rd.Snapshot)
            n.send(rd.Messages)
            if !raft.IsEmptySnap(rd.Snapshot) {
                fmt.Println("Applying snapshot to state machine")
                n.applyToStateMachine(rd.Snapshot)
            }
            if len(rd.CommittedEntries) > 0 {
                fmt.Printf("Node: %v. Got %d committed entries\n", n.id, len(rd.CommittedEntries))
            }
            for _, entry := range rd.CommittedEntries {
                n.process(entry)
            }
            n.raft.Advance()
        case <-n.done:
            return
        }
    }
}
func (s *EtcdServer) applySnapshot(ep *etcdProgress, apply *apply) {
    if raft.IsEmptySnap(apply.snapshot) {
        return
    }
    plog.Infof("applying snapshot at index %d...", ep.snapi)
    defer plog.Infof("finished applying incoming snapshot at index %d", ep.snapi)
    if apply.snapshot.Metadata.Index <= ep.appliedi {
        plog.Panicf("snapshot index [%d] should > appliedi[%d] + 1", apply.snapshot.Metadata.Index, ep.appliedi)
    }
    snapfn, err := s.r.storage.DBFilePath(apply.snapshot.Metadata.Index)
    if err != nil {
        plog.Panicf("get database snapshot file path error: %v", err)
    }
    fn := path.Join(s.Cfg.SnapDir(), databaseFilename)
    if err := os.Rename(snapfn, fn); err != nil {
        plog.Panicf("rename snapshot file error: %v", err)
    }
    newbe := backend.NewDefaultBackend(fn)
    // always recover lessor before kv. When we recover the mvcc.KV it will reattach keys to its leases.
    // If we recover mvcc.KV first, it will attach the keys to the wrong lessor before it recovers.
    if s.lessor != nil {
        plog.Info("recovering lessor...")
        s.lessor.Recover(newbe, s.kv)
        plog.Info("finished recovering lessor")
    }
    plog.Info("restoring mvcc store...")
    if err := s.kv.Restore(newbe); err != nil {
        plog.Panicf("restore KV error: %v", err)
    }
    s.consistIndex.setConsistentIndex(s.kv.ConsistentIndex())
    plog.Info("finished restoring mvcc store")
    // Closing old backend might block until all the txns
    // on the backend are finished.
    // We do not want to wait on closing the old backend.
    s.bemu.Lock()
    oldbe := s.be
    go func() {
        plog.Info("closing old backend...")
        defer plog.Info("finished closing old backend")
        if err := oldbe.Close(); err != nil {
            plog.Panicf("close backend error: %v", err)
        }
    }()
    s.be = newbe
    s.bemu.Unlock()
    plog.Info("recovering alarms...")
    if err := s.restoreAlarms(); err != nil {
        plog.Panicf("restore alarms error: %v", err)
    }
    plog.Info("finished recovering alarms")
    if s.authStore != nil {
        plog.Info("recovering auth store...")
        s.authStore.Recover(newbe)
        plog.Info("finished recovering auth store")
    }
    plog.Info("recovering store v2...")
    if err := s.store.Recovery(apply.snapshot.Data); err != nil {
        plog.Panicf("recovery store error: %v", err)
    }
    plog.Info("finished recovering store v2")
    s.cluster.SetBackend(s.be)
    plog.Info("recovering cluster configuration...")
    s.cluster.Recover(api.UpdateCapability)
    plog.Info("finished recovering cluster configuration")
    plog.Info("removing old peers from network...")
    // recover raft transport
    s.r.transport.RemoveAllPeers()
    plog.Info("finished removing old peers from network")
    plog.Info("adding peers from new cluster configuration into network...")
    for _, m := range s.cluster.Members() {
        if m.ID == s.ID() {
            continue
        }
        s.r.transport.AddPeer(m.ID, m.PeerURLs)
    }
    plog.Info("finished adding peers from new cluster configuration into network...")
    ep.appliedi = apply.snapshot.Metadata.Index
    ep.snapi = ep.appliedi
    ep.confState = apply.snapshot.Metadata.ConfState
}
// handleWriteResponse updates the state machine and sends messages for a raft Ready batch.
func (s *state) handleWriteResponse(response *writeResponse, readyGroups map[uint64]raft.Ready) {
    if log.V(6) {
        log.Infof("node %v got write response: %#v", s.nodeID, *response)
    }
    // Everything has been written to disk; now we can apply updates to the state machine
    // and send outgoing messages.
    for groupID, ready := range readyGroups {
        raftGroupID := proto.RaftID(groupID)
        g, ok := s.groups[raftGroupID]
        if !ok {
            if log.V(4) {
                log.Infof("dropping stale write to group %v", groupID)
            }
            continue
        } else if !g.writing {
            if log.V(4) {
                log.Infof("dropping stale write to reincarnation of group %v", groupID)
            }
            delete(readyGroups, groupID) // they must not make it to Advance.
            continue
        }
        g.writing = false
        // Process committed entries.
        for _, entry := range ready.CommittedEntries {
            commandID := s.processCommittedEntry(raftGroupID, g, entry)
            // TODO(bdarnell): the command is now committed, but not applied until the
            // application consumes EventCommandCommitted. Is returning via the channel
            // at this point useful or do we need to wait for the command to be
            // applied too?
            // This could be done with a Callback as in EventMembershipChangeCommitted
            // or perhaps we should move away from a channel to a callback-based system.
            s.removePending(g, g.pending[commandID], nil /* err */)
        }
        if !raft.IsEmptySnap(ready.Snapshot) {
            // Sync the group/node mapping with the information contained in the snapshot.
            for _, nodeID := range ready.Snapshot.Metadata.ConfState.Nodes {
                // TODO(bdarnell): if we had any information that predated this snapshot
                // we must remove those nodes.
                if err := s.addNode(proto.RaftNodeID(nodeID), raftGroupID); err != nil {
                    log.Errorf("node %v: error adding node %v", s.nodeID, nodeID)
                }
            }
        }
        // Process SoftState and leader changes.
        s.maybeSendLeaderEvent(raftGroupID, g, &ready)
        // Send all messages.
        for _, msg := range ready.Messages {
            switch msg.Type {
            case raftpb.MsgHeartbeat:
                if log.V(8) {
                    log.Infof("node %v dropped individual heartbeat to node %v", s.nodeID, msg.To)
                }
            case raftpb.MsgHeartbeatResp:
                if log.V(8) {
                    log.Infof("node %v dropped individual heartbeat response to node %v", s.nodeID, msg.To)
                }
            default:
                s.sendMessage(raftGroupID, msg)
            }
        }
    }
}
func (s *EtcdServer) run() {
    var syncC <-chan time.Time
    var shouldstop bool
    shouldstopC := s.sendhub.ShouldStopNotify()
    // load initial state from raft storage
    snap, err := s.raftStorage.Snapshot()
    if err != nil {
        log.Panicf("etcdserver: get snapshot from raft storage error: %v", err)
    }
    // snapi indicates the index of the last submitted snapshot request
    snapi := snap.Metadata.Index
    appliedi := snap.Metadata.Index
    confState := snap.Metadata.ConfState
    defer func() {
        s.node.Stop()
        s.sendhub.Stop()
        if err := s.storage.Close(); err != nil {
            log.Panicf("etcdserver: close storage error: %v", err)
        }
        close(s.done)
    }()
    for {
        select {
        case <-s.Ticker:
            s.node.Tick()
        case rd := <-s.node.Ready():
            if rd.SoftState != nil {
                atomic.StoreUint64(&s.raftLead, rd.SoftState.Lead)
                if rd.RaftState == raft.StateLeader {
                    syncC = s.SyncTicker
                    // TODO: remove the nil checking
                    // current test utility does not provide the stats
                    if s.stats != nil {
                        s.stats.BecomeLeader()
                    }
                } else {
                    syncC = nil
                }
            }
            // apply snapshot to storage if it is more updated than current snapi
            if !raft.IsEmptySnap(rd.Snapshot) && rd.Snapshot.Metadata.Index > snapi {
                if err := s.storage.SaveSnap(rd.Snapshot); err != nil {
                    log.Fatalf("etcdserver: save snapshot error: %v", err)
                }
                s.raftStorage.ApplySnapshot(rd.Snapshot)
                snapi = rd.Snapshot.Metadata.Index
                log.Printf("etcdserver: saved incoming snapshot at index %d", snapi)
            }
            if err := s.storage.Save(rd.HardState, rd.Entries); err != nil {
                log.Fatalf("etcdserver: save state and entries error: %v", err)
            }
            s.raftStorage.Append(rd.Entries)
            s.sendhub.Send(rd.Messages)
            // recover from snapshot if it is more updated than current applied
            if !raft.IsEmptySnap(rd.Snapshot) && rd.Snapshot.Metadata.Index > appliedi {
                if err := s.store.Recovery(rd.Snapshot.Data); err != nil {
                    log.Panicf("recovery store error: %v", err)
                }
                s.Cluster.Recover()
                appliedi = rd.Snapshot.Metadata.Index
                log.Printf("etcdserver: recovered from incoming snapshot at index %d", snapi)
            }
            // TODO(bmizerany): do this in the background, but take
            // care to apply entries in a single goroutine, and not
            // race them.
            if len(rd.CommittedEntries) != 0 {
                firsti := rd.CommittedEntries[0].Index
                if firsti > appliedi+1 {
                    log.Panicf("etcdserver: first index of committed entry[%d] should <= appliedi[%d] + 1", firsti, appliedi)
                }
                var ents []raftpb.Entry
                if appliedi+1-firsti < uint64(len(rd.CommittedEntries)) {
                    ents = rd.CommittedEntries[appliedi+1-firsti:]
                }
                if len(ents) > 0 {
                    if appliedi, shouldstop = s.apply(ents, &confState); shouldstop {
                        return
                    }
                }
            }
            s.node.Advance()
            if appliedi-snapi > s.snapCount {
                log.Printf("etcdserver: start to snapshot (applied: %d, lastsnap: %d)", appliedi, snapi)
                s.snapshot(appliedi, &confState)
                snapi = appliedi
            }
        case <-syncC:
            s.sync(defaultSyncTimeout)
        case <-shouldstopC:
            return
        case <-s.stop:
            return
        }
    }
}