Example #1
// synthesizeHardState synthesizes a HardState from the given ReplicaState and
// any existing on-disk HardState in the context of a snapshot, while verifying
// that the application of the snapshot does not violate Raft invariants. It
// must be called after the supplied state and ReadWriter have been updated
// with the result of the snapshot.
// If there is an existing HardState, we must respect it and we must not apply
// a snapshot that would move the state backwards.
func synthesizeHardState(
	ctx context.Context, eng engine.ReadWriter, s storagebase.ReplicaState, oldHS raftpb.HardState,
) error {
	newHS := raftpb.HardState{
		Term: s.TruncatedState.Term,
		// Note that when applying a Raft snapshot, the applied index is
		// equal to the Commit index represented by the snapshot.
		Commit: s.RaftAppliedIndex,
	}

	if oldHS.Commit > newHS.Commit {
		return errors.Errorf("can't decrease HardState.Commit from %d to %d",
			oldHS.Commit, newHS.Commit)
	}
	if oldHS.Term > newHS.Term {
		// The existing HardState is allowed to be ahead of us, which is
		// relevant in practice for the split trigger. We already checked above
		// that we're not rewinding the acknowledged index, and we haven't
		// updated votes yet.
		newHS.Term = oldHS.Term
	}
	// If the existing HardState voted in this term, remember that.
	if oldHS.Term == newHS.Term {
		newHS.Vote = oldHS.Vote
	}
	return errors.Wrapf(setHardState(ctx, eng, s.Desc.RangeID, newHS), "writing HardState %+v", &newHS)
}
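
The invariant that synthesizeHardState enforces can be distilled into a pure merge rule: never rewind Commit, never rewind Term, and carry a cast vote forward only within the same term. A minimal self-contained sketch of that rule, using a local stand-in struct rather than the real raftpb.HardState:

package main

import (
	"errors"
	"fmt"
)

// hardState is a local stand-in for raftpb.HardState, for illustration only.
type hardState struct {
	Term   uint64
	Vote   uint64
	Commit uint64
}

// mergeHardState applies the same rules as synthesizeHardState above: refuse to
// move Commit backwards, keep the higher Term, and preserve the old vote when
// the resulting term is unchanged.
func mergeHardState(old, next hardState) (hardState, error) {
	if old.Commit > next.Commit {
		return hardState{}, errors.New("can't decrease HardState.Commit")
	}
	if old.Term > next.Term {
		next.Term = old.Term
	}
	if old.Term == next.Term {
		next.Vote = old.Vote
	}
	return next, nil
}

func main() {
	old := hardState{Term: 5, Vote: 2, Commit: 10}

	merged, err := mergeHardState(old, hardState{Term: 5, Commit: 12})
	fmt.Println(merged, err) // {5 2 12} <nil>: the vote survives within the same term

	_, err = mergeHardState(old, hardState{Term: 6, Commit: 8})
	fmt.Println(err) // rejected: Commit would move backwards
}
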
// InitialState implements the raft.Storage interface.
func (r *Range) InitialState() (raftpb.HardState, raftpb.ConfState, error) {
	var hs raftpb.HardState
	found, err := engine.MVCCGetProto(r.rm.Engine(), keys.RaftHardStateKey(r.Desc().RaftID),
		proto.ZeroTimestamp, true, nil, &hs)
	if err != nil {
		return raftpb.HardState{}, raftpb.ConfState{}, err
	}
	if !found {
		// We don't have a saved HardState, so set up the defaults.
		if r.isInitialized() {
			// Set the initial log term.
			hs.Term = raftInitialLogTerm
			hs.Commit = raftInitialLogIndex

			atomic.StoreUint64(&r.lastIndex, raftInitialLogIndex)
		} else {
			// This is a new range we are receiving from another node. Start
			// from zero so we will receive a snapshot.
			atomic.StoreUint64(&r.lastIndex, 0)
		}
	}

	var cs raftpb.ConfState
	// For uninitialized ranges, membership is unknown at this point.
	if found || r.isInitialized() {
		for _, rep := range r.Desc().Replicas {
			cs.Nodes = append(cs.Nodes, uint64(proto.MakeRaftNodeID(rep.NodeID, rep.StoreID)))
		}
	}

	return hs, cs, nil
}
Example #3
func mustUnmarshalState(d []byte) raftpb.HardState {
	var s raftpb.HardState
	if err := s.Unmarshal(d); err != nil {
		panic(err)
	}
	return s
}
Example #4
// save must not be called concurrently.
func (s *raftStorage) save(state raftpb.HardState, entries []raftpb.Entry) error {
	wb := s.db.NewBatch()
	if !raft.IsEmptyHardState(state) {
		stateBytes, err := state.Marshal()
		if err != nil {
			return err
		}
		wb.Put(s.hardStateKey, stateBytes)
	}
	if len(entries) > 0 {
		lastIndex, err := s.LastIndex()
		if err != nil {
			return err
		}
		if entries[0].Index > lastIndex+1 {
			panic(fmt.Errorf("missing log entries [last: %d, append at: %d]", lastIndex, entries[0].Index))
		}
		// clear any existing entries at or after the first new index, if any
		for ix := entries[0].Index; ix <= lastIndex; ix++ {
			wb.Delete(s.getEntryKey(ix))
		}
		// append the new entries
		for _, entry := range entries {
			entryBytes, err := entry.Marshal()
			if err != nil {
				return err
			}
			wb.Put(s.getEntryKey(entry.Index), entryBytes)
		}
	}
	err := s.db.Write(wb)
	return err
}
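
The gap check and tail truncation in save can be shown in isolation. A minimal sketch, with an in-memory map standing in for the batch and key-value store (the real entry type, getEntryKey, and the write batch are omitted):

package main

import "fmt"

// entry is a minimal stand-in for raftpb.Entry.
type entry struct {
	Index uint64
	Data  string
}

// appendEntries mirrors the logic in save above: reject an append that would
// leave a gap after lastIndex, delete any existing entries at or past the
// first new index, then write the new entries. It returns the new last index.
func appendEntries(log map[uint64]entry, lastIndex uint64, entries []entry) (uint64, error) {
	if len(entries) == 0 {
		return lastIndex, nil
	}
	if entries[0].Index > lastIndex+1 {
		return 0, fmt.Errorf("missing log entries [last: %d, append at: %d]", lastIndex, entries[0].Index)
	}
	for ix := entries[0].Index; ix <= lastIndex; ix++ {
		delete(log, ix) // drop stale entries that the append overwrites or truncates
	}
	for _, e := range entries {
		log[e.Index] = e
	}
	return entries[len(entries)-1].Index, nil
}

func main() {
	log := map[uint64]entry{1: {1, "a"}, 2: {2, "b"}, 3: {3, "c"}}
	last, err := appendEntries(log, 3, []entry{{2, "b'"}, {3, "c'"}, {4, "d"}})
	fmt.Println(last, err, len(log)) // 4 <nil> 4
}
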
Example #5
func updateHardState(eng engine.ReadWriter, s storagebase.ReplicaState) error {
	// Load a potentially existing HardState as we may need to preserve
	// information about cast votes. For example, during a Split for which
	// another node's new right-hand side has contacted us before our left-hand
	// side called in here to create the group.
	rangeID := s.Desc.RangeID
	oldHS, err := loadHardState(eng, rangeID)
	if err != nil {
		return err
	}

	newHS := raftpb.HardState{
		Term:   s.TruncatedState.Term,
		Commit: s.RaftAppliedIndex,
	}

	if !raft.IsEmptyHardState(oldHS) {
		if oldHS.Commit > newHS.Commit {
			newHS.Commit = oldHS.Commit
		}
		if oldHS.Term > newHS.Term {
			newHS.Term = oldHS.Term
		}
		newHS.Vote = oldHS.Vote
	}

	return setHardState(eng, rangeID, newHS)
}
Example #6
// writeInitialState bootstraps a new Raft group (i.e. it is called when we
// bootstrap a Range, or when setting up the right hand side of a split).
// Its main task is to persist a consistent Raft (and associated Replica) state
// which does not start from zero but presupposes that a few entries have
// already been applied.
// The supplied MVCCStats are used for the Stats field after adjusting for
// persisting the state itself, and the updated stats are returned.
func writeInitialState(
	eng engine.ReadWriter, ms enginepb.MVCCStats, desc roachpb.RangeDescriptor,
) (enginepb.MVCCStats, error) {
	rangeID := desc.RangeID
	var s storagebase.ReplicaState

	s.TruncatedState = &roachpb.RaftTruncatedState{
		Term:  raftInitialLogTerm,
		Index: raftInitialLogIndex,
	}
	s.RaftAppliedIndex = s.TruncatedState.Index
	s.Desc = &roachpb.RangeDescriptor{
		RangeID: rangeID,
	}
	s.Stats = ms

	newMS, err := saveState(eng, s)
	if err != nil {
		return enginepb.MVCCStats{}, err
	}

	// Load a potentially existing HardState as we may need to preserve
	// information about cast votes. For example, during a Split for which
	// another node's new right-hand side has contacted us before our left-hand
	// side called in here to create the group.
	oldHS, err := loadHardState(eng, rangeID)
	if err != nil {
		return enginepb.MVCCStats{}, err
	}

	newHS := raftpb.HardState{
		Term:   s.TruncatedState.Term,
		Commit: s.TruncatedState.Index,
	}

	if !raft.IsEmptyHardState(oldHS) {
		if oldHS.Commit > newHS.Commit {
			newHS.Commit = oldHS.Commit
		}
		if oldHS.Term > newHS.Term {
			newHS.Term = oldHS.Term
		}
		newHS.Vote = oldHS.Vote
	}

	if err := setHardState(eng, rangeID, newHS); err != nil {
		return enginepb.MVCCStats{}, err
	}

	if err := setLastIndex(eng, rangeID, s.TruncatedState.Index); err != nil {
		return enginepb.MVCCStats{}, err
	}

	return newMS, nil
}
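
At bootstrap the truncated state, applied index, Commit, and last index all coincide, which is what lets the new replica behave as if the initial entries had already been applied. A minimal sketch of that invariant, with placeholder values standing in for raftInitialLogTerm and raftInitialLogIndex:

package main

import "fmt"

// Placeholder values; the real constants are defined elsewhere in the package.
const (
	initialLogTerm  uint64 = 5
	initialLogIndex uint64 = 10
)

// bootstrapIndexes returns the indexes writeInitialState persists for a new
// range: the log is truncated at the initial index, and everything up to it
// counts as applied and committed.
func bootstrapIndexes() (truncatedTerm, truncatedIndex, appliedIndex, commit, lastIndex uint64) {
	truncatedTerm = initialLogTerm
	truncatedIndex = initialLogIndex
	appliedIndex = truncatedIndex
	commit = truncatedIndex
	lastIndex = truncatedIndex
	return
}

func main() {
	tt, ti, ai, c, li := bootstrapIndexes()
	fmt.Println(tt, ti, ai, c, li) // 5 10 10 10 10
}
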
Example #7
func (w *WAL) SaveState(s *raftpb.HardState) error {
	if raft.IsEmptyHardState(*s) {
		return nil
	}
	b, err := s.Marshal()
	if err != nil {
		panic(err)
	}
	rec := &walpb.Record{Type: stateType, Data: b}
	return w.encoder.encode(rec)
}
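
raft.IsEmptyHardState reports whether the HardState is the zero value, which is how a Ready signals that nothing changed and there is nothing to persist. A minimal sketch of the check, using a local stand-in struct:

package main

import "fmt"

// hardState is a local stand-in for raftpb.HardState.
type hardState struct {
	Term   uint64
	Vote   uint64
	Commit uint64
}

// isEmptyHardState mirrors raft.IsEmptyHardState: a zero-valued HardState
// means there is nothing to persist for this Ready.
func isEmptyHardState(st hardState) bool {
	return st == (hardState{})
}

func main() {
	fmt.Println(isEmptyHardState(hardState{}))                   // true: skip the write
	fmt.Println(isEmptyHardState(hardState{Term: 3, Commit: 7})) // false: must be persisted
}
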
Example #8
func (w *WAL) SaveState(s *raftpb.HardState) error {
	if raft.IsEmptyHardState(*s) {
		return nil
	}
	log.Printf("path=%s wal.saveState state=\"%+v\"", w.f.Name(), s)
	b, err := s.Marshal()
	if err != nil {
		panic(err)
	}
	rec := &walpb.Record{Type: stateType, Data: b}
	return w.encoder.encode(rec)
}
Example #9
// Store stores the snapshot, HardState and entries for a given Raft group.
func (w *Wal) Store(gid uint32, s raftpb.Snapshot, h raftpb.HardState, es []raftpb.Entry) error {
	b := w.wals.NewWriteBatch()
	defer b.Destroy()

	if !raft.IsEmptySnap(s) {
		data, err := s.Marshal()
		if err != nil {
			return x.Wrapf(err, "wal.Store: While marshal snapshot")
		}
		b.Put(w.snapshotKey(gid), data)
	}

	if !raft.IsEmptyHardState(h) {
		data, err := h.Marshal()
		if err != nil {
			return x.Wrapf(err, "wal.Store: While marshal hardstate")
		}
		b.Put(w.hardStateKey(gid), data)
	}

	var t, i uint64
	for _, e := range es {
		t, i = e.Term, e.Index
		data, err := e.Marshal()
		if err != nil {
			return x.Wrapf(err, "wal.Store: While marshal entry")
		}
		k := w.entryKey(gid, e.Term, e.Index)
		b.Put(k, data)
	}

	// If we get no entries, then the default value of t and i would be zero. That would
	// end up deleting all the previous valid raft entry logs. This check avoids that.
	if t > 0 || i > 0 {
		// Delete all keys above this index.
		start := w.entryKey(gid, t, i+1)
		prefix := w.prefix(gid)
		itr := w.wals.NewIterator()
		defer itr.Close()

		for itr.Seek(start); itr.ValidForPrefix(prefix); itr.Next() {
			b.Delete(itr.Key().Data())
		}
	}

	err := w.wals.WriteBatch(b)
	return x.Wrapf(err, "wal.Store: While WriteBatch")
}
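
The deletion loop at the end of Store walks keys in byte order starting at entryKey(gid, t, i+1), so the key encoding has to place later entries after earlier ones. A hypothetical big-endian key layout with that property (the actual encoding behind entryKey in this codebase is not shown here):

package main

import (
	"bytes"
	"encoding/binary"
	"fmt"
)

// entryKey builds a hypothetical key of the form prefix | gid | term | index,
// with integers encoded big-endian so that byte-wise key order matches the
// numeric order of (gid, term, index).
func entryKey(gid uint32, term, index uint64) []byte {
	key := make([]byte, 4+4+8+8)
	copy(key, "ent!")
	binary.BigEndian.PutUint32(key[4:], gid)
	binary.BigEndian.PutUint64(key[8:], term)
	binary.BigEndian.PutUint64(key[16:], index)
	return key
}

func main() {
	a := entryKey(1, 3, 10)
	b := entryKey(1, 3, 11)
	c := entryKey(1, 4, 2)
	// Later (term, index) pairs compare greater, so an iterator seeked to the
	// first stale key walks exactly the keys that must be deleted.
	fmt.Println(bytes.Compare(a, b) < 0, bytes.Compare(b, c) < 0) // true true
}
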
// InitialState implements the raft.Storage interface.
func (r *Replica) InitialState() (raftpb.HardState, raftpb.ConfState, error) {
	var hs raftpb.HardState
	desc := r.Desc()
	found, err := engine.MVCCGetProto(r.store.Engine(), keys.RaftHardStateKey(desc.RangeID),
		roachpb.ZeroTimestamp, true, nil, &hs)
	if err != nil {
		return raftpb.HardState{}, raftpb.ConfState{}, err
	}
	initialized := r.isInitialized()
	if !found {
		// We don't have a saved HardState, so set up the defaults.
		if initialized {
			// Set the initial log term.
			hs.Term = raftInitialLogTerm
			hs.Commit = raftInitialLogIndex

			atomic.StoreUint64(&r.lastIndex, raftInitialLogIndex)
		} else {
			// This is a new range we are receiving from another node. Start
			// from zero so we will receive a snapshot.
			atomic.StoreUint64(&r.lastIndex, 0)
		}
	} else if initialized && hs.Commit == 0 {
		// Normally, when the commit index changes, raft gives us a new
		// commit index to persist. However, initialization happens entirely
		// within cockroach, so raft has no knowledge of it. By setting Commit
		// to the initial log index, we avoid a panic in raft caused by this
		// inconsistency.
		hs.Commit = raftInitialLogIndex
	}

	var cs raftpb.ConfState
	// For uninitialized ranges, membership is unknown at this point.
	if found || initialized {
		for _, rep := range desc.Replicas {
			cs.Nodes = append(cs.Nodes, uint64(rep.ReplicaID))
		}
	}

	return hs, cs, nil
}
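
The three branches above reduce to a small case analysis over whether a HardState was found and whether the replica is initialized. A minimal sketch of that decision as a pure function, with a local HardState stand-in and placeholder initial-log constants (the lastIndex bookkeeping is left out):

package main

import "fmt"

// hardState is a local stand-in for raftpb.HardState.
type hardState struct {
	Term   uint64
	Vote   uint64
	Commit uint64
}

// Placeholder values; the real constants live in the storage package.
const (
	initialLogTerm  uint64 = 5
	initialLogIndex uint64 = 10
)

// initialHardState mirrors the case analysis in InitialState above: fill in
// defaults when nothing was persisted, and repair a zero Commit on an
// initialized range so raft does not see an inconsistent state.
func initialHardState(found, initialized bool, hs hardState) hardState {
	switch {
	case !found && initialized:
		// Bootstrapped range with no persisted HardState: seed the defaults.
		hs.Term = initialLogTerm
		hs.Commit = initialLogIndex
	case !found:
		// Uninitialized range received from another node: leave everything
		// at zero so raft will send a snapshot.
	case initialized && hs.Commit == 0:
		// The persisted HardState never learned about the initial entries;
		// lift Commit to the initial log index to avoid a panic in raft.
		hs.Commit = initialLogIndex
	}
	return hs
}

func main() {
	fmt.Printf("%+v\n", initialHardState(false, true, hardState{}))       // defaults seeded
	fmt.Printf("%+v\n", initialHardState(true, true, hardState{Term: 5})) // Commit repaired
	fmt.Printf("%+v\n", initialHardState(false, false, hardState{}))      // left at zero
}
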
Example #11
func (n *Node) readWAL(ctx context.Context, snapshot *raftpb.Snapshot, forceNewCluster bool) (err error) {
	var (
		walsnap  walpb.Snapshot
		metadata []byte
		st       raftpb.HardState
		ents     []raftpb.Entry
	)

	if snapshot != nil {
		walsnap.Index = snapshot.Metadata.Index
		walsnap.Term = snapshot.Metadata.Term
	}

	repaired := false
	for {
		if n.wal, err = wal.Open(n.walDir(), walsnap); err != nil {
			return fmt.Errorf("open WAL error: %v", err)
		}
		if metadata, st, ents, err = n.wal.ReadAll(); err != nil {
			if err := n.wal.Close(); err != nil {
				return err
			}
			// we can only repair ErrUnexpectedEOF and we never repair twice.
			if repaired || err != io.ErrUnexpectedEOF {
				return fmt.Errorf("read WAL error (%v) and cannot be repaired", err)
			}
			if !wal.Repair(n.walDir()) {
				return fmt.Errorf("WAL error (%v) cannot be repaired", err)
			}
			log.G(ctx).Infof("repaired WAL error (%v)", err)
			repaired = true
			continue
		}
		break
	}

	defer func() {
		if err != nil {
			if walErr := n.wal.Close(); walErr != nil {
				n.Config.Logger.Errorf("error closing raft WAL: %v", walErr)
			}
		}
	}()

	var raftNode api.RaftMember
	if err := raftNode.Unmarshal(metadata); err != nil {
		return fmt.Errorf("error unmarshalling WAL metadata: %v", err)
	}
	n.Config.ID = raftNode.RaftID

	// All members that are no longer part of the cluster must be added to
	// the removed list right away, so that we don't try to connect to them
	// before processing the configuration change entries, which could make
	// us get stuck.
	for _, ent := range ents {
		if ent.Index <= st.Commit && ent.Type == raftpb.EntryConfChange {
			var cc raftpb.ConfChange
			if err := cc.Unmarshal(ent.Data); err != nil {
				return fmt.Errorf("error unmarshalling config change: %v", err)
			}
			if cc.Type == raftpb.ConfChangeRemoveNode {
				n.cluster.RemoveMember(cc.NodeID)
			}
		}
	}

	if forceNewCluster {
		// discard the previously uncommitted entries
		for i, ent := range ents {
			if ent.Index > st.Commit {
				log.G(context.Background()).Infof("discarding %d uncommitted WAL entries ", len(ents)-i)
				ents = ents[:i]
				break
			}
		}

		// force append the configuration change entries
		toAppEnts := createConfigChangeEnts(getIDs(snapshot, ents), uint64(n.Config.ID), st.Term, st.Commit)

		// All members that are being removed as part of the
		// force-new-cluster process must be added to the
		// removed list right away, so that we don't try to
		// connect to them before processing the configuration
		// change entries, which could make us get stuck.
		for _, ccEnt := range toAppEnts {
			if ccEnt.Type == raftpb.EntryConfChange {
				var cc raftpb.ConfChange
				if err := cc.Unmarshal(ccEnt.Data); err != nil {
					return fmt.Errorf("error unmarshalling force-new-cluster config change: %v", err)
				}
				if cc.Type == raftpb.ConfChangeRemoveNode {
					n.cluster.RemoveMember(cc.NodeID)
				}
			}
		}
		ents = append(ents, toAppEnts...)

		// force commit newly appended entries
		err := n.wal.Save(st, toAppEnts)
		if err != nil {
			log.G(context.Background()).Fatalf("%v", err)
		}
		if len(toAppEnts) != 0 {
			st.Commit = toAppEnts[len(toAppEnts)-1].Index
		}
	}

	if snapshot != nil {
		if err := n.raftStore.ApplySnapshot(*snapshot); err != nil {
			return err
		}
	}
	if err := n.raftStore.SetHardState(st); err != nil {
		return err
	}
	if err := n.raftStore.Append(ents); err != nil {
		return err
	}

	return nil
}
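
In the force-new-cluster path, everything beyond the persisted Commit is dropped before the synthetic configuration changes are appended. The truncation step in isolation, with a minimal stand-in for raftpb.Entry:

package main

import "fmt"

// entry is a minimal stand-in for raftpb.Entry.
type entry struct {
	Index uint64
}

// dropUncommitted mirrors the loop in readWAL above: keep only the entries
// whose Index is at or below the committed index from the HardState.
func dropUncommitted(ents []entry, commit uint64) []entry {
	for i, ent := range ents {
		if ent.Index > commit {
			return ents[:i]
		}
	}
	return ents
}

func main() {
	ents := []entry{{Index: 4}, {Index: 5}, {Index: 6}, {Index: 7}}
	fmt.Println(dropUncommitted(ents, 5)) // [{4} {5}]
}
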
Example #12
func (n *Node) readWAL(ctx context.Context, snapshot *raftpb.Snapshot, forceNewCluster bool) (err error) {
	var (
		walsnap  walpb.Snapshot
		metadata []byte
		st       raftpb.HardState
		ents     []raftpb.Entry
	)

	if snapshot != nil {
		walsnap.Index = snapshot.Metadata.Index
		walsnap.Term = snapshot.Metadata.Term
	}

	repaired := false
	for {
		if n.wal, err = wal.Open(n.walDir(), walsnap); err != nil {
			return fmt.Errorf("open wal error: %v", err)
		}
		if metadata, st, ents, err = n.wal.ReadAll(); err != nil {
			if err := n.wal.Close(); err != nil {
				return err
			}
			// we can only repair ErrUnexpectedEOF and we never repair twice.
			if repaired || err != io.ErrUnexpectedEOF {
				return fmt.Errorf("read wal error (%v) and cannot be repaired", err)
			}
			if !wal.Repair(n.walDir()) {
				return fmt.Errorf("WAL error (%v) cannot be repaired", err)
			}
			log.G(ctx).Infof("repaired WAL error (%v)", err)
			repaired = true
			continue
		}
		break
	}

	defer func() {
		if err != nil {
			if walErr := n.wal.Close(); walErr != nil {
				n.Config.Logger.Errorf("error closing raft WAL: %v", walErr)
			}
		}
	}()

	var raftNode api.RaftMember
	if err := raftNode.Unmarshal(metadata); err != nil {
		return fmt.Errorf("error unmarshalling wal metadata: %v", err)
	}
	n.Config.ID = raftNode.RaftID

	if forceNewCluster {
		// discard the previously uncommitted entries
		for i, ent := range ents {
			if ent.Index > st.Commit {
				log.G(context.Background()).Infof("discarding %d uncommitted WAL entries ", len(ents)-i)
				ents = ents[:i]
				break
			}
		}

		// force append the configuration change entries
		toAppEnts := createConfigChangeEnts(getIDs(snapshot, ents), uint64(n.Config.ID), st.Term, st.Commit)
		ents = append(ents, toAppEnts...)

		// force commit newly appended entries
		err := n.wal.Save(st, toAppEnts)
		if err != nil {
			log.G(context.Background()).Fatalf("%v", err)
		}
		if len(toAppEnts) != 0 {
			st.Commit = toAppEnts[len(toAppEnts)-1].Index
		}
	}

	if snapshot != nil {
		if err := n.raftStore.ApplySnapshot(*snapshot); err != nil {
			return err
		}
	}
	if err := n.raftStore.SetHardState(st); err != nil {
		return err
	}
	if err := n.raftStore.Append(ents); err != nil {
		return err
	}

	return nil
}