// synthesizeHardState synthesizes a HardState from the given ReplicaState and
// any existing on-disk HardState in the context of a snapshot, while verifying
// that the application of the snapshot does not violate Raft invariants. It
// must be called after the supplied state and ReadWriter have been updated
// with the result of the snapshot.
// If there is an existing HardState, we must respect it and we must not apply
// a snapshot that would move the state backwards.
func synthesizeHardState(
	ctx context.Context, eng engine.ReadWriter, s storagebase.ReplicaState, oldHS raftpb.HardState,
) error {
	newHS := raftpb.HardState{
		Term: s.TruncatedState.Term,
		// Note that when applying a Raft snapshot, the applied index is
		// equal to the Commit index represented by the snapshot.
		Commit: s.RaftAppliedIndex,
	}

	if oldHS.Commit > newHS.Commit {
		return errors.Errorf("can't decrease HardState.Commit from %d to %d",
			oldHS.Commit, newHS.Commit)
	}
	if oldHS.Term > newHS.Term {
		// The existing HardState is allowed to be ahead of us, which is
		// relevant in practice for the split trigger. We already checked above
		// that we're not rewinding the acknowledged index, and we haven't
		// updated votes yet.
		newHS.Term = oldHS.Term
	}
	// If the existing HardState voted in this term, remember that.
	if oldHS.Term == newHS.Term {
		newHS.Vote = oldHS.Vote
	}
	return errors.Wrapf(setHardState(ctx, eng, s.Desc.RangeID, newHS), "writing HardState %+v", &newHS)
}
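// synthesizeHardState above and the updateHardState/writeInitialState helpers
// further down share the same merge rule: an existing on-disk HardState must
// never move backwards. The following is a minimal, self-contained sketch of
// just that rule, not the original implementation; mergeHardState is a
// hypothetical name, and the raftpb import path is assumed (older code uses
// github.com/coreos/etcd/raft/raftpb).
package main

import (
	"fmt"

	"go.etcd.io/etcd/raft/v3/raftpb"
)

// mergeHardState builds the HardState implied by a snapshot (snapTerm,
// appliedIndex) while respecting an existing HardState: Commit must not
// regress, Term may only move forward, and a vote cast in the surviving
// term is preserved.
func mergeHardState(snapTerm, appliedIndex uint64, old raftpb.HardState) (raftpb.HardState, error) {
	newHS := raftpb.HardState{Term: snapTerm, Commit: appliedIndex}
	if old.Commit > newHS.Commit {
		return raftpb.HardState{}, fmt.Errorf("can't decrease HardState.Commit from %d to %d", old.Commit, newHS.Commit)
	}
	if old.Term > newHS.Term {
		newHS.Term = old.Term
	}
	if old.Term == newHS.Term {
		newHS.Vote = old.Vote
	}
	return newHS, nil
}

func main() {
	old := raftpb.HardState{Term: 7, Vote: 3, Commit: 20}
	// Snapshot at applied index 25 with term 6: Commit advances, while the
	// higher on-disk term (7) and the vote cast in it are preserved.
	hs, err := mergeHardState(6, 25, old)
	fmt.Println(hs, err)
}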
// InitialState implements the raft.Storage interface.
func (r *Range) InitialState() (raftpb.HardState, raftpb.ConfState, error) {
	var hs raftpb.HardState
	found, err := engine.MVCCGetProto(r.rm.Engine(), keys.RaftHardStateKey(r.Desc().RaftID),
		proto.ZeroTimestamp, true, nil, &hs)
	if err != nil {
		return raftpb.HardState{}, raftpb.ConfState{}, err
	}
	if !found {
		// We don't have a saved HardState, so set up the defaults.
		if r.isInitialized() {
			// Set the initial log term.
			hs.Term = raftInitialLogTerm
			hs.Commit = raftInitialLogIndex
			atomic.StoreUint64(&r.lastIndex, raftInitialLogIndex)
		} else {
			// This is a new range we are receiving from another node. Start
			// from zero so we will receive a snapshot.
			atomic.StoreUint64(&r.lastIndex, 0)
		}
	}

	var cs raftpb.ConfState
	// For uninitialized ranges, membership is unknown at this point.
	if found || r.isInitialized() {
		for _, rep := range r.Desc().Replicas {
			cs.Nodes = append(cs.Nodes, uint64(proto.MakeRaftNodeID(rep.NodeID, rep.StoreID)))
		}
	}

	return hs, cs, nil
}
func mustUnmarshalState(d []byte) raftpb.HardState {
	var s raftpb.HardState
	if err := s.Unmarshal(d); err != nil {
		panic(err)
	}
	return s
}
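// mustUnmarshalState is the inverse of HardState.Marshal; the panic is
// acceptable because the bytes are expected to have been written by this
// process. A tiny standalone round-trip check, illustrative only (raftpb
// import path assumed):
package main

import (
	"fmt"

	"go.etcd.io/etcd/raft/v3/raftpb"
)

func mustUnmarshalState(d []byte) raftpb.HardState {
	var s raftpb.HardState
	if err := s.Unmarshal(d); err != nil {
		panic(err)
	}
	return s
}

func main() {
	hs := raftpb.HardState{Term: 5, Vote: 2, Commit: 10}
	d, err := hs.Marshal()
	if err != nil {
		panic(err)
	}
	// Prints the same term/vote/commit we started with.
	fmt.Println(mustUnmarshalState(d))
}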
// save persists the given HardState (if non-empty) and appends the given
// entries to the log in a single write batch.
// Don't call this multiple times concurrently.
func (s *raftStorage) save(state raftpb.HardState, entries []raftpb.Entry) error {
	wb := s.db.NewBatch()
	if !raft.IsEmptyHardState(state) {
		stateBytes, err := state.Marshal()
		if err != nil {
			return err
		}
		wb.Put(s.hardStateKey, stateBytes)
	}
	if len(entries) > 0 {
		lastIndex, err := s.LastIndex()
		if err != nil {
			return err
		}
		if entries[0].Index > lastIndex+1 {
			panic(fmt.Errorf("missing log entries [last: %d, append at: %d]", lastIndex, entries[0].Index))
		}
		// clear all old entries past the new index, if any
		for ix := entries[0].Index; ix <= lastIndex; ix++ {
			wb.Delete(s.getEntryKey(ix))
		}
		// append the new entries
		for _, entry := range entries {
			entryBytes, err := entry.Marshal()
			if err != nil {
				return err
			}
			wb.Put(s.getEntryKey(entry.Index), entryBytes)
		}
	}
	return s.db.Write(wb)
}
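// The append rule in save above (new entries must attach at or before
// lastIndex+1, the overlapping old suffix is deleted, then the new entries
// are written) can be shown in isolation. The in-memory memLog type below is
// hypothetical and stands in for the persistent store; it is a sketch, not
// the original code.
package main

import (
	"fmt"

	"go.etcd.io/etcd/raft/v3/raftpb"
)

type memLog struct {
	entries map[uint64]raftpb.Entry // index -> entry
	last    uint64
}

func (l *memLog) append(entries []raftpb.Entry) error {
	if len(entries) == 0 {
		return nil
	}
	if entries[0].Index > l.last+1 {
		return fmt.Errorf("missing log entries [last: %d, append at: %d]", l.last, entries[0].Index)
	}
	// Clear all old entries past the new first index, if any.
	for ix := entries[0].Index; ix <= l.last; ix++ {
		delete(l.entries, ix)
	}
	// Append the new entries; they become the tail of the log.
	for _, e := range entries {
		l.entries[e.Index] = e
	}
	l.last = entries[len(entries)-1].Index
	return nil
}

func main() {
	l := &memLog{entries: map[uint64]raftpb.Entry{}}
	_ = l.append([]raftpb.Entry{{Index: 1, Term: 1}, {Index: 2, Term: 1}, {Index: 3, Term: 1}})
	// Re-appending from index 2 at a higher term drops the old suffix (2, 3).
	_ = l.append([]raftpb.Entry{{Index: 2, Term: 2}})
	fmt.Println(l.last, len(l.entries)) // 2 2
}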
// updateHardState synthesizes a new HardState from the given ReplicaState,
// preserving Commit, Term and Vote from any HardState already on disk.
func updateHardState(eng engine.ReadWriter, s storagebase.ReplicaState) error {
	// Load a potentially existing HardState as we may need to preserve
	// information about cast votes. For example, during a Split for which
	// another node's new right-hand side has contacted us before our left-hand
	// side called in here to create the group.
	rangeID := s.Desc.RangeID
	oldHS, err := loadHardState(eng, rangeID)
	if err != nil {
		return err
	}

	newHS := raftpb.HardState{
		Term:   s.TruncatedState.Term,
		Commit: s.RaftAppliedIndex,
	}

	if !raft.IsEmptyHardState(oldHS) {
		if oldHS.Commit > newHS.Commit {
			newHS.Commit = oldHS.Commit
		}
		if oldHS.Term > newHS.Term {
			newHS.Term = oldHS.Term
		}
		newHS.Vote = oldHS.Vote
	}

	return setHardState(eng, rangeID, newHS)
}
// writeInitialState bootstraps a new Raft group (i.e. it is called when we
// bootstrap a Range, or when setting up the right hand side of a split).
// Its main task is to persist a consistent Raft (and associated Replica) state
// which does not start from zero but presupposes a few entries already having
// been applied.
// The supplied MVCCStats are used for the Stats field after adjusting for
// persisting the state itself, and the updated stats are returned.
func writeInitialState(
	eng engine.ReadWriter, ms enginepb.MVCCStats, desc roachpb.RangeDescriptor,
) (enginepb.MVCCStats, error) {
	rangeID := desc.RangeID
	var s storagebase.ReplicaState

	s.TruncatedState = &roachpb.RaftTruncatedState{
		Term:  raftInitialLogTerm,
		Index: raftInitialLogIndex,
	}
	s.RaftAppliedIndex = s.TruncatedState.Index
	s.Desc = &roachpb.RangeDescriptor{
		RangeID: rangeID,
	}
	s.Stats = ms

	newMS, err := saveState(eng, s)
	if err != nil {
		return enginepb.MVCCStats{}, err
	}

	// Load a potentially existing HardState as we may need to preserve
	// information about cast votes. For example, during a Split for which
	// another node's new right-hand side has contacted us before our left-hand
	// side called in here to create the group.
	oldHS, err := loadHardState(eng, rangeID)
	if err != nil {
		return enginepb.MVCCStats{}, err
	}

	newHS := raftpb.HardState{
		Term:   s.TruncatedState.Term,
		Commit: s.TruncatedState.Index,
	}

	if !raft.IsEmptyHardState(oldHS) {
		if oldHS.Commit > newHS.Commit {
			newHS.Commit = oldHS.Commit
		}
		if oldHS.Term > newHS.Term {
			newHS.Term = oldHS.Term
		}
		newHS.Vote = oldHS.Vote
	}

	if err := setHardState(eng, rangeID, newHS); err != nil {
		return enginepb.MVCCStats{}, err
	}

	if err := setLastIndex(eng, rangeID, s.TruncatedState.Index); err != nil {
		return enginepb.MVCCStats{}, err
	}

	return newMS, nil
}
func (w *WAL) SaveState(s *raftpb.HardState) error {
	if raft.IsEmptyHardState(*s) {
		return nil
	}
	b, err := s.Marshal()
	if err != nil {
		panic(err)
	}
	rec := &walpb.Record{Type: stateType, Data: b}
	return w.encoder.encode(rec)
}
func (w *WAL) SaveState(s *raftpb.HardState) error {
	if raft.IsEmptyHardState(*s) {
		return nil
	}
	log.Printf("path=%s wal.saveState state=\"%+v\"", w.f.Name(), s)
	b, err := s.Marshal()
	if err != nil {
		panic(err)
	}
	rec := &walpb.Record{Type: stateType, Data: b}
	return w.encoder.encode(rec)
}
// Store stores the snapshot, hardstate and entries for a given RAFT group.
func (w *Wal) Store(gid uint32, s raftpb.Snapshot, h raftpb.HardState, es []raftpb.Entry) error {
	b := w.wals.NewWriteBatch()
	defer b.Destroy()

	if !raft.IsEmptySnap(s) {
		data, err := s.Marshal()
		if err != nil {
			return x.Wrapf(err, "wal.Store: While marshal snapshot")
		}
		b.Put(w.snapshotKey(gid), data)
	}

	if !raft.IsEmptyHardState(h) {
		data, err := h.Marshal()
		if err != nil {
			return x.Wrapf(err, "wal.Store: While marshal hardstate")
		}
		b.Put(w.hardStateKey(gid), data)
	}

	var t, i uint64
	for _, e := range es {
		t, i = e.Term, e.Index
		data, err := e.Marshal()
		if err != nil {
			return x.Wrapf(err, "wal.Store: While marshal entry")
		}
		k := w.entryKey(gid, e.Term, e.Index)
		b.Put(k, data)
	}

	// If we get no entries, then the default value of t and i would be zero. That would
	// end up deleting all the previous valid raft entry logs. This check avoids that.
	if t > 0 || i > 0 {
		// Delete all keys above this index.
		start := w.entryKey(gid, t, i+1)
		prefix := w.prefix(gid)
		itr := w.wals.NewIterator()
		defer itr.Close()

		for itr.Seek(start); itr.ValidForPrefix(prefix); itr.Next() {
			b.Delete(itr.Key().Data())
		}
	}

	err := w.wals.WriteBatch(b)
	return x.Wrapf(err, "wal.Store: While WriteBatch")
}
// InitialState implements the raft.Storage interface.
func (r *Replica) InitialState() (raftpb.HardState, raftpb.ConfState, error) {
	var hs raftpb.HardState
	desc := r.Desc()
	found, err := engine.MVCCGetProto(r.store.Engine(), keys.RaftHardStateKey(desc.RangeID),
		roachpb.ZeroTimestamp, true, nil, &hs)
	if err != nil {
		return raftpb.HardState{}, raftpb.ConfState{}, err
	}
	initialized := r.isInitialized()
	if !found {
		// We don't have a saved HardState, so set up the defaults.
		if initialized {
			// Set the initial log term.
			hs.Term = raftInitialLogTerm
			hs.Commit = raftInitialLogIndex
			atomic.StoreUint64(&r.lastIndex, raftInitialLogIndex)
		} else {
			// This is a new range we are receiving from another node. Start
			// from zero so we will receive a snapshot.
			atomic.StoreUint64(&r.lastIndex, 0)
		}
	} else if initialized && hs.Commit == 0 {
		// Normally, when the commit index changes, raft gives us a new
		// commit index to persist, however, during initialization, which
		// occurs entirely in cockroach, raft has no knowledge of this.
		// By setting this to the initial log index, we avoid a panic in
		// raft caused by this inconsistency.
		hs.Commit = raftInitialLogIndex
	}

	var cs raftpb.ConfState
	// For uninitialized ranges, membership is unknown at this point.
	if found || initialized {
		for _, rep := range desc.Replicas {
			cs.Nodes = append(cs.Nodes, uint64(rep.ReplicaID))
		}
	}

	return hs, cs, nil
}
func (n *Node) readWAL(ctx context.Context, snapshot *raftpb.Snapshot, forceNewCluster bool) (err error) {
	var (
		walsnap  walpb.Snapshot
		metadata []byte
		st       raftpb.HardState
		ents     []raftpb.Entry
	)

	if snapshot != nil {
		walsnap.Index = snapshot.Metadata.Index
		walsnap.Term = snapshot.Metadata.Term
	}

	repaired := false
	for {
		if n.wal, err = wal.Open(n.walDir(), walsnap); err != nil {
			return fmt.Errorf("open WAL error: %v", err)
		}
		if metadata, st, ents, err = n.wal.ReadAll(); err != nil {
			if err := n.wal.Close(); err != nil {
				return err
			}
			// we can only repair ErrUnexpectedEOF and we never repair twice.
			if repaired || err != io.ErrUnexpectedEOF {
				return fmt.Errorf("read WAL error (%v) and cannot be repaired", err)
			}
			if !wal.Repair(n.walDir()) {
				return fmt.Errorf("WAL error (%v) cannot be repaired", err)
			}
			log.G(ctx).Infof("repaired WAL error (%v)", err)
			repaired = true
			continue
		}
		break
	}

	defer func() {
		if err != nil {
			if walErr := n.wal.Close(); walErr != nil {
				n.Config.Logger.Errorf("error closing raft WAL: %v", walErr)
			}
		}
	}()

	var raftNode api.RaftMember
	if err := raftNode.Unmarshal(metadata); err != nil {
		return fmt.Errorf("error unmarshalling WAL metadata: %v", err)
	}
	n.Config.ID = raftNode.RaftID

	// All members that are no longer part of the cluster must be added to
	// the removed list right away, so that we don't try to connect to them
	// before processing the configuration change entries, which could make
	// us get stuck.
	for _, ent := range ents {
		if ent.Index <= st.Commit && ent.Type == raftpb.EntryConfChange {
			var cc raftpb.ConfChange
			if err := cc.Unmarshal(ent.Data); err != nil {
				return fmt.Errorf("error unmarshalling config change: %v", err)
			}
			if cc.Type == raftpb.ConfChangeRemoveNode {
				n.cluster.RemoveMember(cc.NodeID)
			}
		}
	}

	if forceNewCluster {
		// discard the previously uncommitted entries
		for i, ent := range ents {
			if ent.Index > st.Commit {
				log.G(context.Background()).Infof("discarding %d uncommitted WAL entries", len(ents)-i)
				ents = ents[:i]
				break
			}
		}

		// force append the configuration change entries
		toAppEnts := createConfigChangeEnts(getIDs(snapshot, ents), uint64(n.Config.ID), st.Term, st.Commit)

		// All members that are being removed as part of the
		// force-new-cluster process must be added to the
		// removed list right away, so that we don't try to
		// connect to them before processing the configuration
		// change entries, which could make us get stuck.
		for _, ccEnt := range toAppEnts {
			if ccEnt.Type == raftpb.EntryConfChange {
				var cc raftpb.ConfChange
				if err := cc.Unmarshal(ccEnt.Data); err != nil {
					return fmt.Errorf("error unmarshalling force-new-cluster config change: %v", err)
				}
				if cc.Type == raftpb.ConfChangeRemoveNode {
					n.cluster.RemoveMember(cc.NodeID)
				}
			}
		}
		ents = append(ents, toAppEnts...)

		// force commit newly appended entries
		err := n.wal.Save(st, toAppEnts)
		if err != nil {
			log.G(context.Background()).Fatalf("%v", err)
		}
		if len(toAppEnts) != 0 {
			st.Commit = toAppEnts[len(toAppEnts)-1].Index
		}
	}

	if snapshot != nil {
		if err := n.raftStore.ApplySnapshot(*snapshot); err != nil {
			return err
		}
	}
	if err := n.raftStore.SetHardState(st); err != nil {
		return err
	}
	if err := n.raftStore.Append(ents); err != nil {
		return err
	}

	return nil
}
func (n *Node) readWAL(ctx context.Context, snapshot *raftpb.Snapshot, forceNewCluster bool) (err error) {
	var (
		walsnap  walpb.Snapshot
		metadata []byte
		st       raftpb.HardState
		ents     []raftpb.Entry
	)

	if snapshot != nil {
		walsnap.Index = snapshot.Metadata.Index
		walsnap.Term = snapshot.Metadata.Term
	}

	repaired := false
	for {
		if n.wal, err = wal.Open(n.walDir(), walsnap); err != nil {
			return fmt.Errorf("open wal error: %v", err)
		}
		if metadata, st, ents, err = n.wal.ReadAll(); err != nil {
			if err := n.wal.Close(); err != nil {
				return err
			}
			// we can only repair ErrUnexpectedEOF and we never repair twice.
			if repaired || err != io.ErrUnexpectedEOF {
				return fmt.Errorf("read wal error (%v) and cannot be repaired", err)
			}
			if !wal.Repair(n.walDir()) {
				return fmt.Errorf("WAL error (%v) cannot be repaired", err)
			}
			log.G(ctx).Infof("repaired WAL error (%v)", err)
			repaired = true
			continue
		}
		break
	}

	defer func() {
		if err != nil {
			if walErr := n.wal.Close(); walErr != nil {
				n.Config.Logger.Errorf("error closing raft WAL: %v", walErr)
			}
		}
	}()

	var raftNode api.RaftMember
	if err := raftNode.Unmarshal(metadata); err != nil {
		return fmt.Errorf("error unmarshalling wal metadata: %v", err)
	}
	n.Config.ID = raftNode.RaftID

	if forceNewCluster {
		// discard the previously uncommitted entries
		for i, ent := range ents {
			if ent.Index > st.Commit {
				log.G(context.Background()).Infof("discarding %d uncommitted WAL entries", len(ents)-i)
				ents = ents[:i]
				break
			}
		}

		// force append the configuration change entries
		toAppEnts := createConfigChangeEnts(getIDs(snapshot, ents), uint64(n.Config.ID), st.Term, st.Commit)
		ents = append(ents, toAppEnts...)

		// force commit newly appended entries
		err := n.wal.Save(st, toAppEnts)
		if err != nil {
			log.G(context.Background()).Fatalf("%v", err)
		}
		if len(toAppEnts) != 0 {
			st.Commit = toAppEnts[len(toAppEnts)-1].Index
		}
	}

	if snapshot != nil {
		if err := n.raftStore.ApplySnapshot(*snapshot); err != nil {
			return err
		}
	}
	if err := n.raftStore.SetHardState(st); err != nil {
		return err
	}
	if err := n.raftStore.Append(ents); err != nil {
		return err
	}

	return nil
}
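// The tail of readWAL above (ApplySnapshot, then SetHardState, then Append)
// is the standard sequence for rebuilding an etcd-raft MemoryStorage before
// restarting a node, assuming raftStore is backed by raft.MemoryStorage. A
// minimal, self-contained sketch of that sequence follows; the snapshot,
// HardState and entries are made up and stand in for what a WAL read would
// return, and the v3 import paths are assumed (older code uses
// github.com/coreos/etcd/raft).
package main

import (
	"go.etcd.io/etcd/raft/v3"
	"go.etcd.io/etcd/raft/v3/raftpb"
)

func main() {
	storage := raft.NewMemoryStorage()

	// Pretend these came out of the snapshot store and the WAL.
	snap := raftpb.Snapshot{
		Metadata: raftpb.SnapshotMetadata{
			Index:     10,
			Term:      3,
			ConfState: raftpb.ConfState{Voters: []uint64{1, 2, 3}},
		},
	}
	hs := raftpb.HardState{Term: 3, Vote: 1, Commit: 12}
	ents := []raftpb.Entry{
		{Index: 11, Term: 3},
		{Index: 12, Term: 3},
	}

	// Order matters: the snapshot establishes the log's starting point, the
	// HardState restores term/vote/commit, and Append restores the entries
	// that follow the snapshot. storage can then back raft.Config.Storage
	// when restarting the node.
	if err := storage.ApplySnapshot(snap); err != nil {
		panic(err)
	}
	if err := storage.SetHardState(hs); err != nil {
		panic(err)
	}
	if err := storage.Append(ents); err != nil {
		panic(err)
	}
}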