func (s *state) logRaftReady(readyGroups map[uint64]raft.Ready) {
    for groupID, ready := range readyGroups {
        if log.V(5) {
            log.Infof("node %v: group %v raft ready", s.nodeID, groupID)
            if ready.SoftState != nil {
                log.Infof("SoftState updated: %+v", *ready.SoftState)
            }
            if !raft.IsEmptyHardState(ready.HardState) {
                log.Infof("HardState updated: %+v", ready.HardState)
            }
            for i, e := range ready.Entries {
                log.Infof("New Entry[%d]: %.200s", i, raft.DescribeEntry(e, s.EntryFormatter))
            }
            for i, e := range ready.CommittedEntries {
                log.Infof("Committed Entry[%d]: %.200s", i, raft.DescribeEntry(e, s.EntryFormatter))
            }
            if !raft.IsEmptySnap(ready.Snapshot) {
                log.Infof("Snapshot updated: %.200s", ready.Snapshot.String())
            }
            for i, m := range ready.Messages {
                log.Infof("Outgoing Message[%d]: %.200s", i, raft.DescribeMessage(m, s.EntryFormatter))
            }
        }
    }
}
// handleWriteReady converts a set of raft.Ready structs into a writeRequest
// to be persisted, marks the group as writing and sends it to the writeTask.
func (s *state) handleWriteReady(readyGroups map[uint64]raft.Ready) {
    if log.V(6) {
        log.Infof("node %v write ready, preparing request", s.nodeID)
    }
    writeRequest := newWriteRequest()
    for groupID, ready := range readyGroups {
        raftGroupID := proto.RaftID(groupID)
        g, ok := s.groups[raftGroupID]
        if !ok {
            if log.V(6) {
                log.Infof("dropping write request to group %d", groupID)
            }
            continue
        }
        g.writing = true

        gwr := &groupWriteRequest{}
        if !raft.IsEmptyHardState(ready.HardState) {
            gwr.state = ready.HardState
        }
        if !raft.IsEmptySnap(ready.Snapshot) {
            gwr.snapshot = ready.Snapshot
        }
        if len(ready.Entries) > 0 {
            gwr.entries = ready.Entries
        }
        writeRequest.groups[raftGroupID] = gwr
    }
    s.writeTask.in <- writeRequest
}
func (n *Node) start() {
    tk := time.Tick(5 * time.Millisecond)
    for {
        select {
        case <-tk:
            n.Tick()
        case rd := <-n.Ready():
            if !raft.IsEmptyHardState(rd.HardState) {
                n.state = rd.HardState
                n.storage.SetHardState(n.state)
            }
            n.storage.Append(rd.Entries)
            n.send(rd.Messages)
            if !raft.IsEmptySnap(rd.Snapshot) {
                n.storage.ApplySnapshot(rd.Snapshot)
            }
            time.Sleep(time.Millisecond)
            for _, entry := range rd.CommittedEntries {
                n.process(entry)
                // if entry.Type == raftpb.EntryConfChange {
                //     var cc raftpb.ConfChange
                //     cc.Unmarshal(entry.Data)
                //     n.ApplyConfChange(cc)
                // }
            }
            n.Advance()
        case m := <-n.receive():
            n.Step(context.TODO(), m)
        }
    }
}
func logRaftReady(storeID roachpb.StoreID, groupID roachpb.RangeID, ready raft.Ready) {
    if log.V(5) {
        // Globally synchronize to avoid interleaving different sets of logs in tests.
        logRaftReadyMu.Lock()
        defer logRaftReadyMu.Unlock()
        log.Infof("store %s: group %s raft ready", storeID, groupID)
        if ready.SoftState != nil {
            log.Infof("SoftState updated: %+v", *ready.SoftState)
        }
        if !raft.IsEmptyHardState(ready.HardState) {
            log.Infof("HardState updated: %+v", ready.HardState)
        }
        for i, e := range ready.Entries {
            log.Infof("New Entry[%d]: %.200s", i, raft.DescribeEntry(e, raftEntryFormatter))
        }
        for i, e := range ready.CommittedEntries {
            log.Infof("Committed Entry[%d]: %.200s", i, raft.DescribeEntry(e, raftEntryFormatter))
        }
        if !raft.IsEmptySnap(ready.Snapshot) {
            log.Infof("Snapshot updated: %.200s", ready.Snapshot.String())
        }
        for i, m := range ready.Messages {
            log.Infof("Outgoing Message[%d]: %.200s", i, raft.DescribeMessage(m, raftEntryFormatter))
        }
    }
}
func (n *node) start() {
    n.stopc = make(chan struct{})
    ticker := time.Tick(5 * time.Millisecond)

    go func() {
        for {
            select {
            case <-ticker:
                n.Tick()
            case rd := <-n.Ready():
                if !raft.IsEmptyHardState(rd.HardState) {
                    n.state = rd.HardState
                    n.storage.SetHardState(n.state)
                }
                n.storage.Append(rd.Entries)
                go func() {
                    for _, m := range rd.Messages {
                        n.iface.send(m)
                    }
                }()
                n.Advance()
            case m := <-n.iface.recv():
                n.Step(context.TODO(), m)
            case <-n.stopc:
                n.Stop()
                log.Printf("raft.%d: stop", n.id)
                n.Node = nil
                close(n.stopc)
                return
            }
        }
    }()
}
func updateHardState(eng engine.ReadWriter, s storagebase.ReplicaState) error {
    // Load a potentially existing HardState as we may need to preserve
    // information about cast votes. For example, during a Split for which
    // another node's new right-hand side has contacted us before our left-hand
    // side called in here to create the group.
    rangeID := s.Desc.RangeID
    oldHS, err := loadHardState(eng, rangeID)
    if err != nil {
        return err
    }

    newHS := raftpb.HardState{
        Term:   s.TruncatedState.Term,
        Commit: s.RaftAppliedIndex,
    }

    if !raft.IsEmptyHardState(oldHS) {
        if oldHS.Commit > newHS.Commit {
            newHS.Commit = oldHS.Commit
        }
        if oldHS.Term > newHS.Term {
            newHS.Term = oldHS.Term
        }
        newHS.Vote = oldHS.Vote
    }

    return setHardState(eng, rangeID, newHS)
}
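To make the merge rules above concrete, here is a small illustrative fragment with made-up values (not from the snippet itself): if the HardState already on disk is further ahead, it wins field by field, and the recorded vote is always carried forward.

// Illustrative only: values are hypothetical, the merge follows updateHardState above.
oldHS := raftpb.HardState{Term: 7, Vote: 2, Commit: 15} // found on disk
newHS := raftpb.HardState{Term: 5, Commit: 10}          // synthesized from TruncatedState / RaftAppliedIndex
// After merging: Term=7, Commit=15, Vote=2 — the group never moves backwards
// and never forgets a vote it has already cast.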
// Don't call this multiple times concurrently
func (s *raftStorage) save(state raftpb.HardState, entries []raftpb.Entry) error {
    wb := s.db.NewBatch()
    if !raft.IsEmptyHardState(state) {
        stateBytes, err := state.Marshal()
        if err != nil {
            return err
        }
        wb.Put(s.hardStateKey, stateBytes)
    }
    if len(entries) > 0 {
        lastIndex, err := s.LastIndex()
        if err != nil {
            return err
        }
        if entries[0].Index > lastIndex+1 {
            panic(fmt.Errorf("missing log entries [last: %d, append at: %d]", lastIndex, entries[0].Index))
        }
        // clear all old entries past the new index, if any
        for ix := entries[0].Index; ix <= lastIndex; ix++ {
            wb.Delete(s.getEntryKey(ix))
        }
        // append the new entries
        for _, entry := range entries {
            entryBytes, err := entry.Marshal()
            if err != nil {
                return err
            }
            wb.Put(s.getEntryKey(entry.Index), entryBytes)
        }
    }
    err := s.db.Write(wb)
    return err
}
// HardState contains term, vote and commit.
// Snapshot contains data and snapshot metadata.
func (n *node) saveToStorage(hardState raftpb.HardState, entries []raftpb.Entry, snapshot raftpb.Snapshot) {
    if !raft.IsEmptySnap(snapshot) {
        fmt.Printf("saveToStorage snapshot: %v\n", snapshot.String())
        le, err := n.store.LastIndex()
        if err != nil {
            log.Fatalf("While retrieving last index: %v\n", err)
        }
        te, err := n.store.Term(le)
        if err != nil {
            log.Fatalf("While retrieving term: %v\n", err)
        }
        fmt.Printf("%d node Term for le: %v is %v\n", n.id, le, te)
        if snapshot.Metadata.Index <= le {
            fmt.Printf("%d node ignoring snapshot. Last index: %v\n", n.id, le)
            return
        }
        if err := n.store.ApplySnapshot(snapshot); err != nil {
            log.Fatalf("Applying snapshot: %v", err)
        }
    }

    if !raft.IsEmptyHardState(hardState) {
        n.store.SetHardState(hardState)
    }
    n.store.Append(entries)
}
func logRaftReady(ctx context.Context, prefix fmt.Stringer, ready raft.Ready) {
    if log.V(5) {
        var buf bytes.Buffer
        if ready.SoftState != nil {
            fmt.Fprintf(&buf, " SoftState updated: %+v\n", *ready.SoftState)
        }
        if !raft.IsEmptyHardState(ready.HardState) {
            fmt.Fprintf(&buf, " HardState updated: %+v\n", ready.HardState)
        }
        for i, e := range ready.Entries {
            fmt.Fprintf(&buf, " New Entry[%d]: %.200s\n", i, raft.DescribeEntry(e, raftEntryFormatter))
        }
        for i, e := range ready.CommittedEntries {
            fmt.Fprintf(&buf, " Committed Entry[%d]: %.200s\n", i, raft.DescribeEntry(e, raftEntryFormatter))
        }
        if !raft.IsEmptySnap(ready.Snapshot) {
            fmt.Fprintf(&buf, " Snapshot updated: %.200s\n", ready.Snapshot.String())
        }
        for i, m := range ready.Messages {
            fmt.Fprintf(&buf, " Outgoing Message[%d]: %.200s\n", i, raft.DescribeMessage(m, raftEntryFormatter))
        }
        log.Infof(ctx, "%s raft ready\n%s", prefix, buf.String())
    }
}
func (w *WAL) Save(st raftpb.HardState, ents []raftpb.Entry) error {
    w.mu.Lock()
    defer w.mu.Unlock()

    // short cut, do not call sync
    if raft.IsEmptyHardState(st) && len(ents) == 0 {
        return nil
    }

    // TODO(xiangli): no more reference operator
    for i := range ents {
        if err := w.saveEntry(&ents[i]); err != nil {
            return err
        }
    }
    if err := w.saveState(&st); err != nil {
        return err
    }

    fstat, err := w.f.Stat()
    if err != nil {
        return err
    }
    if fstat.Size() < segmentSizeBytes {
        return w.sync()
    }
    // TODO: add a test for this code path when refactoring the tests
    return w.cut()
}
func (w *WAL) Save(st raftpb.HardState, ents []raftpb.Entry) error {
    w.mu.Lock()
    defer w.mu.Unlock()

    // short cut, do not call sync
    if raft.IsEmptyHardState(st) && len(ents) == 0 {
        return nil
    }

    mustSync := mustSync(st, w.state, len(ents))

    // TODO(xiangli): no more reference operator
    for i := range ents {
        if err := w.saveEntry(&ents[i]); err != nil {
            return err
        }
    }
    if err := w.saveState(&st); err != nil {
        return err
    }

    curOff, err := w.tail().Seek(0, os.SEEK_CUR)
    if err != nil {
        return err
    }
    if curOff < SegmentSizeBytes {
        if mustSync {
            return w.sync()
        }
        return nil
    }
    // TODO: add a test for this code path when refactoring the tests
    return w.cut()
}
func (w *WAL) SaveState(s *raftpb.HardState) error {
    if raft.IsEmptyHardState(*s) {
        return nil
    }
    b := pbutil.MustMarshal(s)
    rec := &walpb.Record{Type: stateType, Data: b}
    return w.encoder.encode(rec)
}
func (s *state) handleRaftReady(readyGroups map[uint64]raft.Ready) {
    // Soft state is updated immediately; everything else waits for handleWriteReady.
    for groupID, ready := range readyGroups {
        if log.V(5) {
            log.Infof("node %v: group %v raft ready", s.nodeID, groupID)
            if ready.SoftState != nil {
                log.Infof("SoftState updated: %+v", *ready.SoftState)
            }
            if !raft.IsEmptyHardState(ready.HardState) {
                log.Infof("HardState updated: %+v", ready.HardState)
            }
            for i, e := range ready.Entries {
                log.Infof("New Entry[%d]: %.200s", i, raft.DescribeEntry(e, s.EntryFormatter))
            }
            for i, e := range ready.CommittedEntries {
                log.Infof("Committed Entry[%d]: %.200s", i, raft.DescribeEntry(e, s.EntryFormatter))
            }
            if !raft.IsEmptySnap(ready.Snapshot) {
                log.Infof("Snapshot updated: %.200s", ready.Snapshot.String())
            }
            for i, m := range ready.Messages {
                log.Infof("Outgoing Message[%d]: %.200s", i, raft.DescribeMessage(m, s.EntryFormatter))
            }
        }
        g, ok := s.groups[groupID]
        if !ok {
            // This is a stale message for a removed group
            log.V(4).Infof("node %v: dropping stale ready message for group %v", s.nodeID, groupID)
            continue
        }
        term := g.committedTerm
        if ready.SoftState != nil {
            // Always save the leader whenever we get a SoftState.
            g.leader = NodeID(ready.SoftState.Lead)
        }
        if len(ready.CommittedEntries) > 0 {
            term = ready.CommittedEntries[len(ready.CommittedEntries)-1].Term
        }
        if term != g.committedTerm && g.leader != 0 {
            // Whenever the committed term has advanced and we know our leader,
            // emit an event.
            g.committedTerm = term
            s.sendEvent(&EventLeaderElection{
                GroupID: groupID,
                NodeID:  NodeID(g.leader),
                Term:    g.committedTerm,
            })
            // Re-submit all pending proposals
            for _, prop := range g.pending {
                s.proposalChan <- prop
            }
        }
    }
}
// writeInitialState bootstraps a new Raft group (i.e. it is called when we
// bootstrap a Range, or when setting up the right hand side of a split).
// Its main task is to persist a consistent Raft (and associated Replica) state
// which does not start from zero but presupposes a few entries already having
// applied.
// The supplied MVCCStats are used for the Stats field after adjusting for
// persisting the state itself, and the updated stats are returned.
func writeInitialState(
    eng engine.ReadWriter, ms enginepb.MVCCStats, desc roachpb.RangeDescriptor,
) (enginepb.MVCCStats, error) {
    rangeID := desc.RangeID
    var s storagebase.ReplicaState

    s.TruncatedState = &roachpb.RaftTruncatedState{
        Term:  raftInitialLogTerm,
        Index: raftInitialLogIndex,
    }
    s.RaftAppliedIndex = s.TruncatedState.Index
    s.Desc = &roachpb.RangeDescriptor{
        RangeID: rangeID,
    }
    s.Stats = ms

    newMS, err := saveState(eng, s)
    if err != nil {
        return enginepb.MVCCStats{}, err
    }

    // Load a potentially existing HardState as we may need to preserve
    // information about cast votes. For example, during a Split for which
    // another node's new right-hand side has contacted us before our left-hand
    // side called in here to create the group.
    oldHS, err := loadHardState(eng, rangeID)
    if err != nil {
        return enginepb.MVCCStats{}, err
    }

    newHS := raftpb.HardState{
        Term:   s.TruncatedState.Term,
        Commit: s.TruncatedState.Index,
    }

    if !raft.IsEmptyHardState(oldHS) {
        if oldHS.Commit > newHS.Commit {
            newHS.Commit = oldHS.Commit
        }
        if oldHS.Term > newHS.Term {
            newHS.Term = oldHS.Term
        }
        newHS.Vote = oldHS.Vote
    }

    if err := setHardState(eng, rangeID, newHS); err != nil {
        return enginepb.MVCCStats{}, err
    }

    if err := setLastIndex(eng, rangeID, s.TruncatedState.Index); err != nil {
        return enginepb.MVCCStats{}, err
    }

    return newMS, nil
}
// Saves a log entry to our Store
func (n *Node) saveToStorage(hardState raftpb.HardState, entries []raftpb.Entry, snapshot raftpb.Snapshot) {
    n.Store.Append(entries)

    if !raft.IsEmptyHardState(hardState) {
        n.Store.SetHardState(hardState)
    }

    if !raft.IsEmptySnap(snapshot) {
        n.Store.ApplySnapshot(snapshot)
    }
}
func (w *WAL) SaveState(s *raftpb.HardState) error {
    if raft.IsEmptyHardState(*s) {
        return nil
    }
    b, err := s.Marshal()
    if err != nil {
        panic(err)
    }
    rec := &walpb.Record{Type: stateType, Data: b}
    return w.encoder.encode(rec)
}
func (w *WAL) SaveState(s *raftpb.HardState) error {
    if raft.IsEmptyHardState(*s) {
        return nil
    }
    log.Printf("path=%s wal.saveState state=\"%+v\"", w.f.Name(), s)
    b, err := s.Marshal()
    if err != nil {
        panic(err)
    }
    rec := &walpb.Record{Type: stateType, Data: b}
    return w.encoder.encode(rec)
}
// start runs the storage loop in a goroutine.
func (w *writeTask) start(stopper *stop.Stopper) {
    stopper.RunWorker(func() {
        for {
            var request *writeRequest
            select {
            case <-w.ready:
                continue
            case <-stopper.ShouldStop():
                return
            case request = <-w.in:
            }
            if log.V(6) {
                log.Infof("writeTask got request %#v", *request)
            }
            response := &writeResponse{make(map[roachpb.RangeID]*groupWriteResponse)}

            for groupID, groupReq := range request.groups {
                group, err := w.storage.GroupStorage(groupID, groupReq.replicaID)
                if err == ErrGroupDeleted {
                    if log.V(4) {
                        log.Infof("dropping write to deleted group %v", groupID)
                    }
                    continue
                } else if err != nil {
                    log.Fatalf("GroupStorage(group %s, replica %s) failed: %s", groupID, groupReq.replicaID, err)
                }
                groupResp := &groupWriteResponse{raftpb.HardState{}, -1, -1, groupReq.entries}
                response.groups[groupID] = groupResp
                if !raft.IsEmptyHardState(groupReq.state) {
                    err := group.SetHardState(groupReq.state)
                    if err != nil {
                        panic(err) // TODO(bdarnell): mark this node dead on storage errors
                    }
                    groupResp.state = groupReq.state
                }
                if !raft.IsEmptySnap(groupReq.snapshot) {
                    err := group.ApplySnapshot(groupReq.snapshot)
                    if err != nil {
                        panic(err) // TODO(bdarnell)
                    }
                }
                if len(groupReq.entries) > 0 {
                    err := group.Append(groupReq.entries)
                    if err != nil {
                        panic(err) // TODO(bdarnell)
                    }
                }
            }
            w.out <- response
        }
    })
}
// InitialState implements the raft.Storage interface.
// InitialState requires that the replica lock be held.
func (r *Replica) InitialState() (raftpb.HardState, raftpb.ConfState, error) {
    hs, err := loadHardState(context.Background(), r.store.Engine(), r.RangeID)
    // For uninitialized ranges, membership is unknown at this point.
    if raft.IsEmptyHardState(hs) || err != nil {
        return raftpb.HardState{}, raftpb.ConfState{}, err
    }
    var cs raftpb.ConfState
    for _, rep := range r.mu.state.Desc.Replicas {
        cs.Nodes = append(cs.Nodes, uint64(rep.ReplicaID))
    }
    return hs, cs, nil
}
// handleWriteReady converts a set of raft.Ready structs into a writeRequest
// to be persisted, marks the group as writing and sends it to the writeTask.
// It will only do this for groups which are tagged via the map.
func (s *state) handleWriteReady(checkReadyGroupIDs map[roachpb.RangeID]struct{}) map[roachpb.RangeID]raft.Ready {
    if log.V(6) {
        log.Infof("node %v write ready, preparing request", s.nodeID)
    }
    s.lockStorage()
    defer s.unlockStorage()
    writeRequest := newWriteRequest()
    readys := make(map[roachpb.RangeID]raft.Ready)
    for groupID := range checkReadyGroupIDs {
        g, ok := s.groups[groupID]
        if !ok {
            if log.V(6) {
                log.Infof("dropping write request to group %d", groupID)
            }
            continue
        }
        if !g.raftGroup.HasReady() {
            continue
        }
        ready := g.raftGroup.Ready()
        readys[groupID] = ready
        g.writing = true

        gwr := &groupWriteRequest{}
        var err error
        gwr.replicaID, err = s.Storage().ReplicaIDForStore(groupID, s.storeID)
        if err != nil {
            if log.V(1) {
                log.Warningf("failed to look up replica ID for range %v (disabling replica ID check): %s", groupID, err)
            }
            gwr.replicaID = 0
        }
        if !raft.IsEmptyHardState(ready.HardState) {
            gwr.state = ready.HardState
        }
        if !raft.IsEmptySnap(ready.Snapshot) {
            gwr.snapshot = ready.Snapshot
        }
        if len(ready.Entries) > 0 {
            gwr.entries = ready.Entries
        }
        writeRequest.groups[groupID] = gwr
    }
    // If no ready, don't write to writeTask as caller will
    // not wait on s.writeTask.out when len(readys) == 0.
    if len(readys) > 0 {
        s.writeTask.in <- writeRequest
    }
    return readys
}
func (n *node) start() {
    n.stopc = make(chan struct{})
    ticker := time.Tick(5 * time.Millisecond)

    go func() {
        for {
            select {
            case <-ticker:
                n.Tick()
            case rd := <-n.Ready():
                if !raft.IsEmptyHardState(rd.HardState) {
                    n.mu.Lock()
                    n.state = rd.HardState
                    n.mu.Unlock()
                    n.storage.SetHardState(n.state)
                }
                n.storage.Append(rd.Entries)
                time.Sleep(time.Millisecond)
                // TODO: make send async, more like real world...
                for _, m := range rd.Messages {
                    n.iface.send(m)
                }
                n.Advance()
            case m := <-n.iface.recv():
                go n.Step(context.TODO(), m)
            case <-n.stopc:
                n.Stop()
                log.Printf("raft.%d: stop", n.id)
                n.Node = nil
                close(n.stopc)
                return
            case p := <-n.pausec:
                recvms := make([]raftpb.Message, 0)
                for p {
                    select {
                    case m := <-n.iface.recv():
                        recvms = append(recvms, m)
                    case p = <-n.pausec:
                    }
                }
                // step all pending messages
                for _, m := range recvms {
                    n.Step(context.TODO(), m)
                }
            }
        }
    }()
}
// Store stores the snapshot, hardstate and entries for a given RAFT group.
func (w *Wal) Store(gid uint32, s raftpb.Snapshot, h raftpb.HardState, es []raftpb.Entry) error {
    b := w.wals.NewWriteBatch()
    defer b.Destroy()

    if !raft.IsEmptySnap(s) {
        data, err := s.Marshal()
        if err != nil {
            return x.Wrapf(err, "wal.Store: While marshal snapshot")
        }
        b.Put(w.snapshotKey(gid), data)
    }

    if !raft.IsEmptyHardState(h) {
        data, err := h.Marshal()
        if err != nil {
            return x.Wrapf(err, "wal.Store: While marshal hardstate")
        }
        b.Put(w.hardStateKey(gid), data)
    }

    var t, i uint64
    for _, e := range es {
        t, i = e.Term, e.Index
        data, err := e.Marshal()
        if err != nil {
            return x.Wrapf(err, "wal.Store: While marshal entry")
        }
        k := w.entryKey(gid, e.Term, e.Index)
        b.Put(k, data)
    }

    // If we get no entries, then the default value of t and i would be zero. That would
    // end up deleting all the previous valid raft entry logs. This check avoids that.
    if t > 0 || i > 0 {
        // Delete all keys above this index.
        start := w.entryKey(gid, t, i+1)
        prefix := w.prefix(gid)
        itr := w.wals.NewIterator()
        defer itr.Close()

        for itr.Seek(start); itr.ValidForPrefix(prefix); itr.Next() {
            b.Delete(itr.Key().Data())
        }
    }

    err := w.wals.WriteBatch(b)
    return x.Wrapf(err, "wal.Store: While WriteBatch")
}
// start runs the storage loop. Blocks until stopped, so should be run in a goroutine.
func (w *writeTask) start() {
    for {
        var request *writeRequest
        select {
        case <-w.ready:
            continue
        case <-w.stopper.ShouldStop():
            w.stopper.SetStopped()
            return
        case request = <-w.in:
        }
        log.V(6).Infof("writeTask got request %#v", *request)
        response := &writeResponse{make(map[uint64]*groupWriteResponse)}

        for groupID, groupReq := range request.groups {
            group := w.storage.GroupStorage(groupID)
            if group == nil {
                log.V(4).Infof("dropping write to group %v", groupID)
                continue
            }
            groupResp := &groupWriteResponse{raftpb.HardState{}, -1, -1, groupReq.entries}
            response.groups[groupID] = groupResp
            if !raft.IsEmptyHardState(groupReq.state) {
                err := group.SetHardState(groupReq.state)
                if err != nil {
                    panic(err) // TODO(bdarnell): mark this node dead on storage errors
                }
                groupResp.state = groupReq.state
            }
            if !raft.IsEmptySnap(groupReq.snapshot) {
                err := group.ApplySnapshot(groupReq.snapshot)
                if err != nil {
                    panic(err) // TODO(bdarnell)
                }
            }
            if len(groupReq.entries) > 0 {
                err := group.Append(groupReq.entries)
                if err != nil {
                    panic(err) // TODO(bdarnell)
                }
            }
        }
        w.out <- response
    }
}
func (s *state) handleWriteReady(readyGroups map[uint64]raft.Ready) {
    log.V(6).Infof("node %v write ready, preparing request", s.nodeID)
    writeRequest := newWriteRequest()
    for groupID, ready := range readyGroups {
        gwr := &groupWriteRequest{}
        if !raft.IsEmptyHardState(ready.HardState) {
            gwr.state = ready.HardState
        }
        if !raft.IsEmptySnap(ready.Snapshot) {
            gwr.snapshot = ready.Snapshot
        }
        if len(ready.Entries) > 0 {
            gwr.entries = ready.Entries
        }
        writeRequest.groups[groupID] = gwr
    }
    s.writeTask.in <- writeRequest
}
func (n *node) initFromWal(wal *raftwal.Wal) (restart bool, rerr error) {
    n.wal = wal

    var sp raftpb.Snapshot
    sp, rerr = wal.Snapshot(n.gid)
    if rerr != nil {
        return
    }
    var term, idx uint64
    if !raft.IsEmptySnap(sp) {
        fmt.Printf("Found Snapshot: %+v\n", sp)
        restart = true
        if rerr = n.store.ApplySnapshot(sp); rerr != nil {
            return
        }
        term = sp.Metadata.Term
        idx = sp.Metadata.Index
    }

    var hd raftpb.HardState
    hd, rerr = wal.HardState(n.gid)
    if rerr != nil {
        return
    }
    if !raft.IsEmptyHardState(hd) {
        // Log the HardState itself (the original printed the snapshot here by mistake).
        fmt.Printf("Found hardstate: %+v\n", hd)
        restart = true
        if rerr = n.store.SetHardState(hd); rerr != nil {
            return
        }
    }

    var es []raftpb.Entry
    es, rerr = wal.Entries(n.gid, term, idx)
    if rerr != nil {
        return
    }
    fmt.Printf("Found %d entries\n", len(es))
    if len(es) > 0 {
        restart = true
    }
    rerr = n.store.Append(es)
    return
}
func (c *ctrl) readySave(snapshot raftpb.Snapshot, hardState raftpb.HardState, entries []raftpb.Entry) error {
    // For the moment, none of these steps persist to disk. That violates some Raft
    // invariants. But we are ephemeral, and will always boot empty, willingly
    // paying the snapshot cost. I trust that the etcd Raft implementation
    // permits this.
    if !raft.IsEmptySnap(snapshot) {
        if err := c.storage.ApplySnapshot(snapshot); err != nil {
            return fmt.Errorf("apply snapshot: %v", err)
        }
    }
    if !raft.IsEmptyHardState(hardState) {
        if err := c.storage.SetHardState(hardState); err != nil {
            return fmt.Errorf("set hard state: %v", err)
        }
    }
    if err := c.storage.Append(entries); err != nil {
        return fmt.Errorf("append: %v", err)
    }
    return nil
}
func save(rd raft.Ready, st *raft.MemoryStorage) error {
    if !raft.IsEmptyHardState(rd.HardState) {
        if err := st.SetHardState(rd.HardState); err != nil {
            return err
        }
    }
    if len(rd.Entries) > 0 {
        if err := st.Append(rd.Entries); err != nil {
            return err
        }
    }
    if !raft.IsEmptySnap(rd.Snapshot) {
        if err := st.ApplySnapshot(rd.Snapshot); err != nil {
            return err
        }
    }
    return nil
}
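A helper like save only covers the persistence step of a Ready. As context, here is a minimal sketch of where it would sit in the Ready-handling loop that the etcd raft package documents (persist, send, apply, Advance); run, send and process are hypothetical stand-ins for the application's own plumbing, not names from the snippet above.

// Hypothetical caller of save(); mirrors the loops in the other examples.
func run(node raft.Node, storage *raft.MemoryStorage, ticker *time.Ticker,
    send func([]raftpb.Message), process func(raftpb.Entry)) {
    for {
        select {
        case <-ticker.C:
            node.Tick()
        case rd := <-node.Ready():
            if err := save(rd, storage); err != nil { // persist HardState, entries and snapshot first
                panic(err)
            }
            send(rd.Messages) // then hand outgoing messages to the transport
            for _, entry := range rd.CommittedEntries {
                process(entry) // finally apply committed entries to the state machine
            }
            node.Advance()
        }
    }
}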
// handleWriteReady converts a set of raft.Ready structs into a writeRequest
// to be persisted, marks the group as writing and sends it to the writeTask.
func (s *state) handleWriteReady() {
    if log.V(6) {
        log.Infof("node %v write ready, preparing request", s.nodeID)
    }
    s.lockStorage()
    defer s.unlockStorage()
    writeRequest := newWriteRequest()
    for groupID, ready := range s.readyGroups {
        raftGroupID := roachpb.RangeID(groupID)
        g, ok := s.groups[raftGroupID]
        if !ok {
            if log.V(6) {
                log.Infof("dropping write request to group %d", groupID)
            }
            continue
        }
        g.writing = true

        gwr := &groupWriteRequest{}
        var err error
        gwr.replicaID, err = s.Storage().ReplicaIDForStore(roachpb.RangeID(groupID), s.storeID)
        if err != nil {
            if log.V(1) {
                log.Warningf("failed to look up replica ID for range %v (disabling replica ID check): %s", groupID, err)
            }
            gwr.replicaID = 0
        }
        if !raft.IsEmptyHardState(ready.HardState) {
            gwr.state = ready.HardState
        }
        if !raft.IsEmptySnap(ready.Snapshot) {
            gwr.snapshot = ready.Snapshot
        }
        if len(ready.Entries) > 0 {
            gwr.entries = ready.Entries
        }
        writeRequest.groups[raftGroupID] = gwr
    }
    s.writeTask.in <- writeRequest
}
func (n *node) saveToStorage(s raftpb.Snapshot, h raftpb.HardState, es []raftpb.Entry) {
    if !raft.IsEmptySnap(s) {
        le, err := n.store.LastIndex()
        if err != nil {
            log.Fatalf("While retrieving last index: %v\n", err)
        }
        if s.Metadata.Index <= le {
            return
        }
        if err := n.store.ApplySnapshot(s); err != nil {
            log.Fatalf("Applying snapshot: %v", err)
        }
    }

    if !raft.IsEmptyHardState(h) {
        n.store.SetHardState(h)
    }
    n.store.Append(es)
}
// start prepares and starts raftNode in a new goroutine. It is no longer safe
// to modify the fields after it has been started.
// TODO: Ideally raftNode should get rid of the passed in server structure.
func (r *raftNode) start(s *EtcdServer) {
    r.s = s
    r.applyc = make(chan apply)
    r.stopped = make(chan struct{})
    r.done = make(chan struct{})

    heartbeat := 200 * time.Millisecond
    if s.Cfg != nil {
        heartbeat = time.Duration(s.Cfg.TickMs) * time.Millisecond
    }
    // set up contention detectors for raft heartbeat message.
    // expect to send a heartbeat within 2 heartbeat intervals.
    r.td = contention.NewTimeoutDetector(2 * heartbeat)

    go func() {
        var syncC <-chan time.Time

        defer r.onStop()
        islead := false

        for {
            select {
            case <-r.ticker:
                r.Tick()
            case rd := <-r.Ready():
                if rd.SoftState != nil {
                    if lead := atomic.LoadUint64(&r.lead); rd.SoftState.Lead != raft.None && lead != rd.SoftState.Lead {
                        r.mu.Lock()
                        r.lt = time.Now()
                        r.mu.Unlock()
                        leaderChanges.Inc()
                    }

                    if rd.SoftState.Lead == raft.None {
                        hasLeader.Set(0)
                    } else {
                        hasLeader.Set(1)
                    }

                    atomic.StoreUint64(&r.lead, rd.SoftState.Lead)
                    if rd.RaftState == raft.StateLeader {
                        islead = true
                        // TODO: raft should send server a notification through chan when
                        // it promotes or demotes instead of modifying server directly.
                        syncC = r.s.SyncTicker
                        if r.s.lessor != nil {
                            r.s.lessor.Promote(r.s.Cfg.electionTimeout())
                        }
                        // TODO: remove the nil checking
                        // current test utility does not provide the stats
                        if r.s.stats != nil {
                            r.s.stats.BecomeLeader()
                        }
                        if r.s.compactor != nil {
                            r.s.compactor.Resume()
                        }
                        r.td.Reset()
                    } else {
                        islead = false
                        if r.s.lessor != nil {
                            r.s.lessor.Demote()
                        }
                        if r.s.compactor != nil {
                            r.s.compactor.Pause()
                        }
                        syncC = nil
                    }
                }

                raftDone := make(chan struct{}, 1)
                ap := apply{
                    entries:  rd.CommittedEntries,
                    snapshot: rd.Snapshot,
                    raftDone: raftDone,
                }

                select {
                case r.applyc <- ap:
                case <-r.stopped:
                    return
                }

                // the leader can write to its disk in parallel with replicating to the followers and them
                // writing to their disks.
                // For more details, check raft thesis 10.2.1
                if islead {
                    r.s.send(rd.Messages)
                }

                if !raft.IsEmptySnap(rd.Snapshot) {
                    if err := r.storage.SaveSnap(rd.Snapshot); err != nil {
                        plog.Fatalf("raft save snapshot error: %v", err)
                    }
                    r.raftStorage.ApplySnapshot(rd.Snapshot)
                    plog.Infof("raft applied incoming snapshot at index %d", rd.Snapshot.Metadata.Index)
                }

                if err := r.storage.Save(rd.HardState, rd.Entries); err != nil {
                    plog.Fatalf("raft save state and entries error: %v", err)
                }
                if !raft.IsEmptyHardState(rd.HardState) {
                    proposalsCommitted.Set(float64(rd.HardState.Commit))
                }

                r.raftStorage.Append(rd.Entries)

                if !islead {
                    r.s.send(rd.Messages)
                }
                raftDone <- struct{}{}
                r.Advance()
            case <-syncC:
                r.s.sync(r.s.Cfg.ReqTimeout())
            case <-r.stopped:
                return
            }
        }
    }()
}