// InitialState implements the raft.Storage interface.
func (r *Range) InitialState() (raftpb.HardState, raftpb.ConfState, error) {
	var hs raftpb.HardState
	found, err := engine.MVCCGetProto(r.rm.Engine(), keys.RaftHardStateKey(r.Desc().RaftID),
		proto.ZeroTimestamp, true, nil, &hs)
	if err != nil {
		return raftpb.HardState{}, raftpb.ConfState{}, err
	}
	if !found {
		// We don't have a saved HardState, so set up the defaults.
		if r.isInitialized() {
			// Set the initial log term.
			hs.Term = raftInitialLogTerm
			hs.Commit = raftInitialLogIndex
			atomic.StoreUint64(&r.lastIndex, raftInitialLogIndex)
		} else {
			// This is a new range we are receiving from another node. Start
			// from zero so we will receive a snapshot.
			atomic.StoreUint64(&r.lastIndex, 0)
		}
	}

	var cs raftpb.ConfState
	// For uninitialized ranges, membership is unknown at this point.
	if found || r.isInitialized() {
		for _, rep := range r.Desc().Replicas {
			cs.Nodes = append(cs.Nodes, uint64(proto.MakeRaftNodeID(rep.NodeID, rep.StoreID)))
		}
	}
	return hs, cs, nil
}
// setHardState persists the raft HardState for the given range.
func setHardState(
	ctx context.Context, batch engine.ReadWriter, rangeID roachpb.RangeID, st raftpb.HardState,
) error {
	return engine.MVCCPutProto(ctx, batch, nil, keys.RaftHardStateKey(rangeID),
		hlc.ZeroTimestamp, nil, &st)
}
// loadHardState loads the raft HardState for the given range, returning
// an empty HardState if none has been persisted.
func loadHardState(
	ctx context.Context, reader engine.Reader, rangeID roachpb.RangeID,
) (raftpb.HardState, error) {
	var hs raftpb.HardState
	found, err := engine.MVCCGetProto(ctx, reader, keys.RaftHardStateKey(rangeID),
		hlc.ZeroTimestamp, true, nil, &hs)
	if !found || err != nil {
		return raftpb.HardState{}, err
	}
	return hs, nil
}
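// The two helpers above are just a proto round-trip through the MVCC layer.
// As a minimal, self-contained sketch of that round-trip, the following
// stand-in replaces the engine with a plain map; toyStore and its key format
// are hypothetical, and only the raftpb marshaling mirrors the real code.
package main

import (
	"fmt"

	"github.com/coreos/etcd/raft/raftpb"
)

// toyStore is a hypothetical in-memory stand-in for an engine.ReadWriter.
type toyStore map[string][]byte

// toySetHardState marshals the HardState and writes it under a per-range
// key, analogous to MVCCPutProto on keys.RaftHardStateKey(rangeID).
func toySetHardState(s toyStore, rangeID int64, st raftpb.HardState) error {
	data, err := st.Marshal()
	if err != nil {
		return err
	}
	s[fmt.Sprintf("raft-hard-state-%d", rangeID)] = data
	return nil
}

// toyLoadHardState reads the key back, returning a zero HardState (and
// found=false) when nothing has been persisted, like MVCCGetProto.
func toyLoadHardState(s toyStore, rangeID int64) (raftpb.HardState, bool, error) {
	data, ok := s[fmt.Sprintf("raft-hard-state-%d", rangeID)]
	if !ok {
		return raftpb.HardState{}, false, nil
	}
	var hs raftpb.HardState
	if err := hs.Unmarshal(data); err != nil {
		return raftpb.HardState{}, false, err
	}
	return hs, true, nil
}

func main() {
	s := toyStore{}
	if err := toySetHardState(s, 1, raftpb.HardState{Term: 5, Vote: 2, Commit: 10}); err != nil {
		panic(err)
	}
	hs, found, err := toyLoadHardState(s, 1)
	fmt.Println(hs.Term, hs.Vote, hs.Commit, found, err) // 5 2 10 true <nil>
}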
// createRangeData creates sample range data in all possible areas of
// the key space. Returns a slice of the encoded keys of all created
// data.
func createRangeData(t *testing.T, r *Replica) []engine.MVCCKey {
	ts0 := hlc.ZeroTimestamp
	ts := hlc.Timestamp{WallTime: 1}
	desc := r.Desc()
	keyTSs := []struct {
		key roachpb.Key
		ts  hlc.Timestamp
	}{
		{keys.AbortCacheKey(r.RangeID, testTxnID), ts0},
		{keys.AbortCacheKey(r.RangeID, testTxnID2), ts0},
		{keys.RangeFrozenStatusKey(r.RangeID), ts0},
		{keys.RangeLastGCKey(r.RangeID), ts0},
		{keys.RaftAppliedIndexKey(r.RangeID), ts0},
		{keys.RaftTruncatedStateKey(r.RangeID), ts0},
		{keys.LeaseAppliedIndexKey(r.RangeID), ts0},
		{keys.RangeStatsKey(r.RangeID), ts0},
		{keys.RaftHardStateKey(r.RangeID), ts0},
		{keys.RaftLastIndexKey(r.RangeID), ts0},
		{keys.RaftLogKey(r.RangeID, 1), ts0},
		{keys.RaftLogKey(r.RangeID, 2), ts0},
		{keys.RangeLastReplicaGCTimestampKey(r.RangeID), ts0},
		{keys.RangeLastVerificationTimestampKey(r.RangeID), ts0},
		{keys.RangeDescriptorKey(desc.StartKey), ts},
		{keys.TransactionKey(roachpb.Key(desc.StartKey), uuid.NewV4()), ts0},
		{keys.TransactionKey(roachpb.Key(desc.StartKey.Next()), uuid.NewV4()), ts0},
		{keys.TransactionKey(fakePrevKey(desc.EndKey), uuid.NewV4()), ts0},
		// TODO(bdarnell): KeyMin.Next() results in a key in the reserved system-local space.
		// Once we have resolved https://github.com/cockroachdb/cockroach/issues/437,
		// replace this with something that reliably generates the first valid key in the range.
		//{r.Desc().StartKey.Next(), ts},
		// The following line is similar to StartKey.Next() but adds more to the key to
		// avoid falling into the system-local space.
		{append(append([]byte{}, desc.StartKey...), '\x02'), ts},
		{fakePrevKey(r.Desc().EndKey), ts},
	}

	keys := []engine.MVCCKey{}
	for _, keyTS := range keyTSs {
		if err := engine.MVCCPut(context.Background(), r.store.Engine(), nil, keyTS.key, keyTS.ts,
			roachpb.MakeValueFromString("value"), nil); err != nil {
			t.Fatal(err)
		}
		keys = append(keys, engine.MVCCKey{Key: keyTS.key, Timestamp: keyTS.ts})
	}
	return keys
}
// createRangeData creates sample range data in all possible areas of
// the key space. Returns a slice of the encoded keys of all created
// data.
func createRangeData(r *Replica, t *testing.T) []roachpb.EncodedKey {
	ts0 := roachpb.ZeroTimestamp
	ts := roachpb.Timestamp{WallTime: 1}
	keyTSs := []struct {
		key roachpb.Key
		ts  roachpb.Timestamp
	}{
		{keys.ResponseCacheKey(r.Desc().RangeID, &roachpb.ClientCmdID{WallTime: 1, Random: 1}), ts0},
		{keys.ResponseCacheKey(r.Desc().RangeID, &roachpb.ClientCmdID{WallTime: 2, Random: 2}), ts0},
		{keys.RaftHardStateKey(r.Desc().RangeID), ts0},
		{keys.RaftLogKey(r.Desc().RangeID, 1), ts0},
		{keys.RaftLogKey(r.Desc().RangeID, 2), ts0},
		{keys.RangeGCMetadataKey(r.Desc().RangeID), ts0},
		{keys.RangeLastVerificationTimestampKey(r.Desc().RangeID), ts0},
		{keys.RangeStatsKey(r.Desc().RangeID), ts0},
		{keys.RangeDescriptorKey(r.Desc().StartKey), ts},
		{keys.TransactionKey(roachpb.Key(r.Desc().StartKey), []byte("1234")), ts0},
		{keys.TransactionKey(roachpb.Key(r.Desc().StartKey.Next()), []byte("5678")), ts0},
		{keys.TransactionKey(fakePrevKey(r.Desc().EndKey), []byte("2468")), ts0},
		// TODO(bdarnell): KeyMin.Next() results in a key in the reserved system-local space.
		// Once we have resolved https://github.com/cockroachdb/cockroach/issues/437,
		// replace this with something that reliably generates the first valid key in the range.
		//{r.Desc().StartKey.Next(), ts},
		// The following line is similar to StartKey.Next() but adds more to the key to
		// avoid falling into the system-local space.
		{append(append([]byte{}, r.Desc().StartKey...), '\x01'), ts},
		{fakePrevKey(r.Desc().EndKey), ts},
	}

	keys := []roachpb.EncodedKey{}
	for _, keyTS := range keyTSs {
		if err := engine.MVCCPut(r.store.Engine(), nil, keyTS.key, keyTS.ts,
			roachpb.MakeValueFromString("value"), nil); err != nil {
			t.Fatal(err)
		}
		keys = append(keys, engine.MVCCEncodeKey(keyTS.key))
		if !keyTS.ts.Equal(ts0) {
			keys = append(keys, engine.MVCCEncodeVersionKey(keyTS.key, keyTS.ts))
		}
	}
	return keys
}
// InitialState implements the raft.Storage interface.
func (r *Replica) InitialState() (raftpb.HardState, raftpb.ConfState, error) {
	var hs raftpb.HardState
	desc := r.Desc()
	found, err := engine.MVCCGetProto(r.store.Engine(), keys.RaftHardStateKey(desc.RangeID),
		roachpb.ZeroTimestamp, true, nil, &hs)
	if err != nil {
		return raftpb.HardState{}, raftpb.ConfState{}, err
	}
	initialized := r.isInitialized()
	if !found {
		// We don't have a saved HardState, so set up the defaults.
		if initialized {
			// Set the initial log term.
			hs.Term = raftInitialLogTerm
			hs.Commit = raftInitialLogIndex
			atomic.StoreUint64(&r.lastIndex, raftInitialLogIndex)
		} else {
			// This is a new range we are receiving from another node. Start
			// from zero so we will receive a snapshot.
			atomic.StoreUint64(&r.lastIndex, 0)
		}
	} else if initialized && hs.Commit == 0 {
		// Normally, when the commit index changes, raft gives us a new
		// commit index to persist. However, during initialization, which
		// occurs entirely in cockroach, raft has no knowledge of this.
		// By setting this to the initial log index, we avoid a panic in
		// raft caused by this inconsistency.
		hs.Commit = raftInitialLogIndex
	}

	var cs raftpb.ConfState
	// For uninitialized ranges, membership is unknown at this point.
	if found || initialized {
		for _, rep := range desc.Replicas {
			cs.Nodes = append(cs.Nodes, uint64(rep.ReplicaID))
		}
	}
	return hs, cs, nil
}
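// The branching above mixes HardState defaults with lastIndex bookkeeping.
// The HardState part can be distilled into a pure function, sketched below
// under stated assumptions: defaultHardState is hypothetical, and the two
// constants merely stand in for raftInitialLogTerm/raftInitialLogIndex,
// whose actual values are defined elsewhere in the package.
package main

import (
	"fmt"

	"github.com/coreos/etcd/raft/raftpb"
)

const (
	initialLogTerm  = 5  // assumed stand-in for raftInitialLogTerm
	initialLogIndex = 10 // assumed stand-in for raftInitialLogIndex
)

// defaultHardState covers the three cases InitialState distinguishes:
// no saved state on an initialized range (bootstrap), no saved state on an
// uninitialized range (force a snapshot), and saved state whose commit
// index was never reported to raft.
func defaultHardState(saved raftpb.HardState, found, initialized bool) raftpb.HardState {
	switch {
	case !found && initialized:
		// Bootstrap: start the log at the initial term and index.
		return raftpb.HardState{Term: initialLogTerm, Commit: initialLogIndex}
	case !found:
		// New, uninitialized range: stay at zero so raft sends a snapshot.
		return raftpb.HardState{}
	case initialized && saved.Commit == 0:
		// Initialization happened entirely in cockroach; patch the commit
		// index so raft does not see an inconsistent state.
		saved.Commit = initialLogIndex
		return saved
	default:
		return saved
	}
}

func main() {
	fmt.Println(defaultHardState(raftpb.HardState{}, false, true))                  // bootstrap defaults
	fmt.Println(defaultHardState(raftpb.HardState{Term: 7, Vote: 3}, true, true))   // commit patched
	fmt.Println(defaultHardState(raftpb.HardState{Term: 7, Commit: 9}, true, true)) // unchanged
}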
// SetHardState implements the multiraft.WriteableGroupStorage interface.
func (r *Range) SetHardState(st raftpb.HardState) error {
	return engine.MVCCPutProto(r.rm.Engine(), nil, keys.RaftHardStateKey(r.Desc().RaftID),
		proto.ZeroTimestamp, nil, &st)
}
// ApplySnapshot implements the multiraft.WriteableGroupStorage interface.
func (r *Range) ApplySnapshot(snap raftpb.Snapshot) error {
	snapData := proto.RaftSnapshotData{}
	err := gogoproto.Unmarshal(snap.Data, &snapData)
	if err != nil {
		return err
	}

	// First, save the HardState. The HardState must not be changed
	// because it may record a previous vote cast by this node.
	hardStateKey := keys.RaftHardStateKey(r.Desc().RaftID)
	hardState, _, err := engine.MVCCGet(r.rm.Engine(), hardStateKey, proto.ZeroTimestamp,
		true /* consistent */, nil)
	if err != nil {
		return err
	}

	// Extract the updated range descriptor.
	desc := snapData.RangeDescriptor

	batch := r.rm.Engine().NewBatch()
	defer batch.Close()

	// Delete everything in the range and recreate it from the snapshot.
	for iter := newRangeDataIterator(&desc, r.rm.Engine()); iter.Valid(); iter.Next() {
		if err := batch.Clear(iter.Key()); err != nil {
			return err
		}
	}

	// Write the snapshot into the range.
	for _, kv := range snapData.KV {
		if err := batch.Put(kv.Key, kv.Value); err != nil {
			return err
		}
	}

	// Restore the saved HardState.
	if hardState == nil {
		err := engine.MVCCDelete(batch, nil, hardStateKey, proto.ZeroTimestamp, nil)
		if err != nil {
			return err
		}
	} else {
		err := engine.MVCCPut(batch, nil, hardStateKey, proto.ZeroTimestamp, *hardState, nil)
		if err != nil {
			return err
		}
	}

	// Read the leader lease.
	lease, err := loadLeaderLease(batch, desc.RaftID)
	if err != nil {
		return err
	}

	// Copy range stats to new range.
	oldStats := r.stats
	r.stats, err = newRangeStats(desc.RaftID, batch)
	if err != nil {
		r.stats = oldStats
		return err
	}

	// The next line sets the persisted last index to the last applied index.
	// This is not a correctness issue, but means that we may have just
	// transferred some entries we're about to re-request from the leader and
	// overwrite.
	// However, raft.MultiNode currently expects this behaviour, and the
	// performance implications are not likely to be drastic. If our feelings
	// about this ever change, we can add a LastIndex field to
	// raftpb.SnapshotMetadata.
	if err := setLastIndex(batch, r.Desc().RaftID, snap.Metadata.Index); err != nil {
		return err
	}

	if err := batch.Commit(); err != nil {
		return err
	}

	// As outlined above, last and applied index are the same after applying
	// the snapshot.
	atomic.StoreUint64(&r.lastIndex, snap.Metadata.Index)
	atomic.StoreUint64(&r.appliedIndex, snap.Metadata.Index)

	// Atomically update the descriptor and lease.
	if err := r.setDesc(&desc); err != nil {
		return err
	}
	atomic.StorePointer(&r.lease, unsafe.Pointer(lease))
	return nil
}
// setHardState persists the raft HardState.
func (r *Replica) setHardState(batch engine.Engine, st raftpb.HardState) error {
	return engine.MVCCPutProto(batch, nil, keys.RaftHardStateKey(r.RangeID),
		roachpb.ZeroTimestamp, nil, &st)
}
// applySnapshot updates the replica based on the given snapshot.
// Returns the new last index.
func (r *Replica) applySnapshot(batch engine.Engine, snap raftpb.Snapshot) (uint64, error) {
	snapData := roachpb.RaftSnapshotData{}
	err := proto.Unmarshal(snap.Data, &snapData)
	if err != nil {
		return 0, err
	}

	rangeID := r.RangeID

	// First, save the HardState. The HardState must not be changed
	// because it may record a previous vote cast by this node. This is
	// usually unnecessary because a snapshot is nearly always
	// accompanied by a new HardState which incorporates both our former
	// state and new information from the leader, but in the event that
	// the HardState has not changed, we want to use our own previous
	// HardState and not one that was transmitted via the snapshot.
	hardStateKey := keys.RaftHardStateKey(rangeID)
	hardState, _, err := engine.MVCCGet(batch, hardStateKey, roachpb.ZeroTimestamp,
		true /* consistent */, nil)
	if err != nil {
		return 0, err
	}

	// Extract the updated range descriptor.
	desc := snapData.RangeDescriptor

	// Delete everything in the range and recreate it from the snapshot.
	// We need to delete any old Raft log entries here because any log entries
	// that predate the snapshot will be orphaned and never truncated or GC'd.
	iter := newReplicaDataIterator(&desc, batch, false /* !replicatedOnly */)
	defer iter.Close()
	for ; iter.Valid(); iter.Next() {
		if err := batch.Clear(iter.Key()); err != nil {
			return 0, err
		}
	}

	// Determine the unreplicated key prefix so we can drop any
	// unreplicated keys from the snapshot.
	unreplicatedPrefix := keys.MakeRangeIDUnreplicatedPrefix(desc.RangeID)

	// Write the snapshot into the range.
	for _, kv := range snapData.KV {
		if bytes.HasPrefix(kv.Key, unreplicatedPrefix) {
			continue
		}
		mvccKey := engine.MVCCKey{
			Key:       kv.Key,
			Timestamp: kv.Timestamp,
		}
		if err := batch.Put(mvccKey, kv.Value); err != nil {
			return 0, err
		}
	}

	// Write the snapshot's Raft log into the range.
	if _, err := r.append(batch, 0, snapData.LogEntries); err != nil {
		return 0, err
	}

	// Restore the saved HardState.
	if hardState == nil {
		err := engine.MVCCDelete(batch, nil, hardStateKey, roachpb.ZeroTimestamp, nil)
		if err != nil {
			return 0, err
		}
	} else {
		err := engine.MVCCPut(batch, nil, hardStateKey, roachpb.ZeroTimestamp, *hardState, nil)
		if err != nil {
			return 0, err
		}
	}

	// Read the leader lease.
	lease, err := loadLeaderLease(batch, desc.RangeID)
	if err != nil {
		return 0, err
	}

	// Load updated range stats. The local newStats variable will be assigned
	// to r.stats after the batch commits.
	newStats, err := newRangeStats(desc.RangeID, batch)
	if err != nil {
		return 0, err
	}

	// The next line sets the persisted last index to the last applied index.
	// This is not a correctness issue, but means that we may have just
	// transferred some entries we're about to re-request from the leader and
	// overwrite.
	// However, raft.MultiNode currently expects this behaviour, and the
	// performance implications are not likely to be drastic. If our feelings
	// about this ever change, we can add a LastIndex field to
	// raftpb.SnapshotMetadata.
	if err := setLastIndex(batch, rangeID, snap.Metadata.Index); err != nil {
		return 0, err
	}

	batch.Defer(func() {
		// Update the range stats.
		r.stats.Replace(newStats)

		r.mu.Lock()
		// As outlined above, last and applied index are the same after applying
		// the snapshot.
		r.mu.appliedIndex = snap.Metadata.Index
		r.mu.leaderLease = lease
		r.mu.Unlock()

		// Update other fields which are uninitialized or need updating.
		// This may not happen if the system config has not yet been loaded.
		// While config update will correctly set the fields, there is no order
		// guarantee in ApplySnapshot.
		// TODO: should go through the standard store lock when adding a replica.
		if err := r.updateRangeInfo(&desc); err != nil {
			panic(err)
		}

		// Update the range descriptor. This is done last as this is the step that
		// makes the Replica visible in the Store.
		if err := r.setDesc(&desc); err != nil {
			panic(err)
		}
	})
	return snap.Metadata.Index, nil
}
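// The essential invariant in the snapshot application above is the ordering:
// read the existing HardState, clear the range, write the snapshot, then
// restore the saved HardState so a vote cast by this node is never lost.
// Below is a minimal, self-contained sketch of that ordering using a plain
// map in place of the engine batch; toyBatch, the key names, and
// applyToySnapshot are all hypothetical stand-ins.
package main

import (
	"fmt"
	"strings"
)

// toyBatch is a hypothetical in-memory stand-in for an engine batch.
type toyBatch map[string]string

// applyToySnapshot mirrors the ordering in applySnapshot: capture the
// existing HardState, clear all of the range's keys, write the snapshot's
// keys (which may carry the sender's HardState), then restore our own.
func applyToySnapshot(b toyBatch, prefix string, snapKV map[string]string) {
	hardStateKey := prefix + "/hard-state"
	hardState, hadHardState := b[hardStateKey]

	// Delete everything in the range, HardState included.
	for k := range b {
		if strings.HasPrefix(k, prefix) {
			delete(b, k)
		}
	}

	// Write the snapshot into the range.
	for k, v := range snapKV {
		b[k] = v
	}

	// Restore the saved HardState, overwriting whatever the snapshot
	// contained; if we had none, make sure none survives.
	if hadHardState {
		b[hardStateKey] = hardState
	} else {
		delete(b, hardStateKey)
	}
}

func main() {
	b := toyBatch{
		"r1/hard-state": "term=7,vote=n2",
		"r1/data/a":     "stale",
	}
	applyToySnapshot(b, "r1", map[string]string{
		"r1/hard-state": "term=7,vote=n3", // must not win over our own vote
		"r1/data/b":     "fresh",
	})
	fmt.Println(b["r1/hard-state"]) // term=7,vote=n2
	fmt.Println(b["r1/data/b"])     // fresh
}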
// SetHardState implements the multiraft.WriteableGroupStorage interface.
func (r *Replica) SetHardState(st raftpb.HardState) error {
	return engine.MVCCPutProto(r.store.Engine(), nil, keys.RaftHardStateKey(r.Desc().RangeID),
		roachpb.ZeroTimestamp, nil, &st)
}
// applySnapshot updates the replica based on the given snapshot.
func (r *Replica) applySnapshot(snap raftpb.Snapshot) error {
	snapData := roachpb.RaftSnapshotData{}
	err := proto.Unmarshal(snap.Data, &snapData)
	if err != nil {
		return err
	}

	rangeID := r.Desc().RangeID

	// First, save the HardState. The HardState must not be changed
	// because it may record a previous vote cast by this node.
	hardStateKey := keys.RaftHardStateKey(rangeID)
	hardState, _, err := engine.MVCCGet(r.store.Engine(), hardStateKey, roachpb.ZeroTimestamp,
		true /* consistent */, nil)
	if err != nil {
		return err
	}

	// Extract the updated range descriptor.
	desc := snapData.RangeDescriptor

	batch := r.store.Engine().NewBatch()
	defer batch.Close()

	// Delete everything in the range and recreate it from the snapshot.
	iter := newReplicaDataIterator(&desc, r.store.Engine())
	defer iter.Close()
	for ; iter.Valid(); iter.Next() {
		if err := batch.Clear(iter.Key()); err != nil {
			return err
		}
	}

	// Write the snapshot into the range.
	for _, kv := range snapData.KV {
		mvccKey := engine.MVCCKey{
			Key:       kv.Key,
			Timestamp: kv.Timestamp,
		}
		if err := batch.Put(mvccKey, kv.Value); err != nil {
			return err
		}
	}

	// Restore the saved HardState.
	if hardState == nil {
		err := engine.MVCCDelete(batch, nil, hardStateKey, roachpb.ZeroTimestamp, nil)
		if err != nil {
			return err
		}
	} else {
		err := engine.MVCCPut(batch, nil, hardStateKey, roachpb.ZeroTimestamp, *hardState, nil)
		if err != nil {
			return err
		}
	}

	// Read the leader lease.
	lease, err := loadLeaderLease(batch, desc.RangeID)
	if err != nil {
		return err
	}

	// Load updated range stats. The local newStats variable will be assigned
	// to r.stats after the batch commits.
	newStats, err := newRangeStats(desc.RangeID, batch)
	if err != nil {
		return err
	}

	// The next line sets the persisted last index to the last applied index.
	// This is not a correctness issue, but means that we may have just
	// transferred some entries we're about to re-request from the leader and
	// overwrite.
	// However, raft.MultiNode currently expects this behaviour, and the
	// performance implications are not likely to be drastic. If our feelings
	// about this ever change, we can add a LastIndex field to
	// raftpb.SnapshotMetadata.
	if err := setLastIndex(batch, rangeID, snap.Metadata.Index); err != nil {
		return err
	}

	if err := batch.Commit(); err != nil {
		return err
	}

	// Update the range stats.
	r.stats.Replace(newStats)

	// As outlined above, last and applied index are the same after applying
	// the snapshot.
	atomic.StoreUint64(&r.lastIndex, snap.Metadata.Index)
	atomic.StoreUint64(&r.appliedIndex, snap.Metadata.Index)

	// Atomically update the descriptor and lease.
	if err := r.setDesc(&desc); err != nil {
		return err
	}

	// Update other fields which are uninitialized or need updating.
	// This may not happen if the system config has not yet been loaded.
	// While config update will correctly set the fields, there is no order
	// guarantee in ApplySnapshot.
	// TODO: should go through the standard store lock when adding a replica.
	if err := r.updateRangeInfo(); err != nil {
		return err
	}

	atomic.StorePointer(&r.lease, unsafe.Pointer(lease))
	return nil
}
// setHardState persists the raft HardState.
func (r *Replica) setHardState(batch engine.ReadWriter, st raftpb.HardState) error {
	return engine.MVCCPutProto(context.Background(), batch, nil, keys.RaftHardStateKey(r.RangeID),
		roachpb.ZeroTimestamp, nil, &st)
}