// Entries implements the raft.Storage interface. Note that maxBytes is
// advisory and this method will always return at least one entry even if it
// exceeds maxBytes. Passing maxBytes equal to zero disables size checking.
// TODO(bdarnell): consider caching for recent entries, if rocksdb's builtin
// caching is insufficient.
func (r *Range) Entries(lo, hi, maxBytes uint64) ([]raftpb.Entry, error) {
	// Scan over the log to find the requested entries in the range [lo, hi),
	// stopping once we have enough.
	var ents []raftpb.Entry
	size := uint64(0)
	var ent raftpb.Entry
	scanFunc := func(kv proto.KeyValue) (bool, error) {
		err := gogoproto.Unmarshal(kv.Value.GetBytes(), &ent)
		if err != nil {
			return false, err
		}
		size += uint64(ent.Size())
		ents = append(ents, ent)
		return maxBytes > 0 && size > maxBytes, nil
	}

	_, err := engine.MVCCIterate(r.rm.Engine(),
		keys.RaftLogKey(r.Desc().RaftID, lo),
		keys.RaftLogKey(r.Desc().RaftID, hi),
		proto.ZeroTimestamp, true /* consistent */, nil /* txn */, scanFunc)
	if err != nil {
		return nil, err
	}

	// If neither the number of entries nor the size limitations had an
	// effect, we weren't able to supply everything the client wanted.
	if len(ents) != int(hi-lo) && (maxBytes == 0 || size < maxBytes) {
		return nil, raft.ErrUnavailable
	}

	return ents, nil
}
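For context, Entries is one method of the raft.Storage interface from the etcd/raft package that this type implements. The sketch below reproduces the approximate shape of that interface from memory (method set and signatures may differ slightly between etcd versions); it is included only to show where Entries, Term, LastIndex, and error values such as raft.ErrUnavailable and raft.ErrCompacted fit in. Check the vendored raft package for the authoritative definition.

// Approximate shape of the etcd/raft Storage interface (from memory, not
// copied from the source above; consult the vendored package for the real
// definition).
type Storage interface {
	// InitialState returns the saved HardState and ConfState.
	InitialState() (raftpb.HardState, raftpb.ConfState, error)
	// Entries returns log entries in [lo, hi); the size limit is advisory
	// and at least one entry is always returned.
	Entries(lo, hi, maxBytes uint64) ([]raftpb.Entry, error)
	// Term returns the term of entry i.
	Term(i uint64) (uint64, error)
	// LastIndex and FirstIndex bound the currently available log.
	LastIndex() (uint64, error)
	FirstIndex() (uint64, error)
	// Snapshot returns the most recent snapshot.
	Snapshot() (raftpb.Snapshot, error)
}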
// append the given entries to the raft log. Takes the previous value
// of r.lastIndex and returns a new value. We do this rather than
// modifying r.lastIndex directly because this modification needs to
// be atomic with the commit of the batch.
func (r *Replica) append(batch engine.Engine, prevLastIndex uint64, entries []raftpb.Entry) (uint64, error) {
	if len(entries) == 0 {
		return prevLastIndex, nil
	}
	for i := range entries {
		ent := &entries[i]
		key := keys.RaftLogKey(r.RangeID, ent.Index)
		if err := engine.MVCCPutProto(batch, nil, key, roachpb.ZeroTimestamp, nil, ent); err != nil {
			return 0, err
		}
	}
	lastIndex := entries[len(entries)-1].Index
	// Delete any previously appended log entries which never committed.
	for i := lastIndex + 1; i <= prevLastIndex; i++ {
		err := engine.MVCCDelete(batch, nil, keys.RaftLogKey(r.RangeID, i), roachpb.ZeroTimestamp, nil)
		if err != nil {
			return 0, err
		}
	}

	// Update the last index key. The caller commits the batch and updates
	// r.lastIndex from the returned value afterwards.
	if err := setLastIndex(batch, r.RangeID, lastIndex); err != nil {
		return 0, err
	}

	return lastIndex, nil
}
// append the given entries to the raft log. Takes the previous values of
// r.mu.lastIndex and r.mu.raftLogSize, and returns new values. We do this
// rather than modifying them directly because these modifications need to be
// atomic with the commit of the batch.
func (r *Replica) append(batch engine.ReadWriter, prevLastIndex uint64, prevRaftLogSize int64, entries []raftpb.Entry) (uint64, int64, error) {
	if len(entries) == 0 {
		return prevLastIndex, prevRaftLogSize, nil
	}
	var diff enginepb.MVCCStats
	ctx := context.Background()
	for i := range entries {
		ent := &entries[i]
		key := keys.RaftLogKey(r.RangeID, ent.Index)
		if err := engine.MVCCPutProto(ctx, batch, &diff, key, hlc.ZeroTimestamp, nil /* txn */, ent); err != nil {
			return 0, 0, err
		}
	}
	lastIndex := entries[len(entries)-1].Index
	// Delete any previously appended log entries which never committed.
	for i := lastIndex + 1; i <= prevLastIndex; i++ {
		err := engine.MVCCDelete(ctx, batch, &diff, keys.RaftLogKey(r.RangeID, i),
			hlc.ZeroTimestamp, nil /* txn */)
		if err != nil {
			return 0, 0, err
		}
	}

	if err := setLastIndex(batch, r.RangeID, lastIndex); err != nil {
		return 0, 0, err
	}

	raftLogSize := prevRaftLogSize + diff.SysBytes
	return lastIndex, raftLogSize, nil
}
// InternalTruncateLog discards a prefix of the raft log.
func (r *Range) InternalTruncateLog(batch engine.Engine, ms *engine.MVCCStats, args *proto.InternalTruncateLogRequest, reply *proto.InternalTruncateLogResponse) {
	// args.Index is the first index to keep.
	term, err := r.Term(args.Index - 1)
	if err != nil {
		reply.SetGoError(err)
		return
	}
	start := keys.RaftLogKey(r.Desc().RaftID, 0)
	end := keys.RaftLogKey(r.Desc().RaftID, args.Index)
	err = batch.Iterate(engine.MVCCEncodeKey(start), engine.MVCCEncodeKey(end),
		func(kv proto.RawKeyValue) (bool, error) {
			err := batch.Clear(kv.Key)
			return false, err
		})
	if err != nil {
		reply.SetGoError(err)
		return
	}
	ts := proto.RaftTruncatedState{
		Index: args.Index - 1,
		Term:  term,
	}
	err = engine.MVCCPutProto(batch, ms, keys.RaftTruncatedStateKey(r.Desc().RaftID),
		proto.ZeroTimestamp, nil, &ts)
	reply.SetGoError(err)
}
// append the given entries to the raft log.
func (r *Replica) append(batch engine.Engine, entries []raftpb.Entry) error {
	if len(entries) == 0 {
		return nil
	}
	for _, ent := range entries {
		err := engine.MVCCPutProto(batch, nil, keys.RaftLogKey(r.RangeID, ent.Index),
			roachpb.ZeroTimestamp, nil, &ent)
		if err != nil {
			return err
		}
	}
	lastIndex := entries[len(entries)-1].Index
	prevLastIndex := atomic.LoadUint64(&r.lastIndex)
	// Delete any previously appended log entries which never committed.
	for i := lastIndex + 1; i <= prevLastIndex; i++ {
		err := engine.MVCCDelete(batch, nil, keys.RaftLogKey(r.RangeID, i),
			roachpb.ZeroTimestamp, nil)
		if err != nil {
			return err
		}
	}

	// Update the last index key; the in-memory lastIndex is updated via
	// batch.Defer once the caller commits the batch.
	if err := setLastIndex(batch, r.RangeID, lastIndex); err != nil {
		return err
	}

	batch.Defer(func() {
		atomic.StoreUint64(&r.lastIndex, lastIndex)
	})

	return nil
}
// append the given entries to the raft log. Takes the previous value
// of r.lastIndex and returns a new value. We do this rather than
// modifying r.lastIndex directly because this modification needs to
// be atomic with the commit of the batch.
func (r *Replica) append(batch engine.ReadWriter, prevLastIndex uint64, entries []raftpb.Entry) (uint64, error) {
	if len(entries) == 0 {
		return prevLastIndex, nil
	}
	for i := range entries {
		ent := &entries[i]
		key := keys.RaftLogKey(r.RangeID, ent.Index)
		if err := engine.MVCCPutProto(context.Background(), batch, nil, key, hlc.ZeroTimestamp, nil, ent); err != nil {
			return 0, err
		}
	}
	lastIndex := entries[len(entries)-1].Index
	// Delete any previously appended log entries which never committed.
	for i := lastIndex + 1; i <= prevLastIndex; i++ {
		err := engine.MVCCDelete(context.Background(), batch, nil, keys.RaftLogKey(r.RangeID, i),
			hlc.ZeroTimestamp, nil)
		if err != nil {
			return 0, err
		}
	}

	if err := setLastIndex(batch, r.RangeID, lastIndex); err != nil {
		return 0, err
	}

	return lastIndex, nil
}
// iterateEntries iterates over the raft log entries in [lo, hi) for the
// given range, invoking scanFunc for each key/value pair.
func iterateEntries(
	e engine.Reader,
	rangeID roachpb.RangeID,
	lo, hi uint64,
	scanFunc func(roachpb.KeyValue) (bool, error),
) error {
	_, err := engine.MVCCIterate(
		context.Background(), e,
		keys.RaftLogKey(rangeID, lo),
		keys.RaftLogKey(rangeID, hi),
		hlc.ZeroTimestamp,
		true,  /* consistent */
		nil,   /* txn */
		false, /* !reverse */
		scanFunc,
	)
	return err
}
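A hedged usage sketch of the helper above: collectEntries is a hypothetical caller (it does not appear in the source) showing how a scanFunc that decodes each MVCC value into a raftpb.Entry can be passed to iterateEntries; the decoding mirrors what the entries method further below does.

// collectEntries is a hypothetical example caller of iterateEntries; it
// decodes every raft log entry in [lo, hi) for the given range.
func collectEntries(e engine.Reader, rangeID roachpb.RangeID, lo, hi uint64) ([]raftpb.Entry, error) {
	var ents []raftpb.Entry
	scanFunc := func(kv roachpb.KeyValue) (bool, error) {
		var ent raftpb.Entry
		if err := kv.Value.GetProto(&ent); err != nil {
			return false, err
		}
		ents = append(ents, ent)
		// Returning false tells the underlying MVCCIterate to keep going.
		return false, nil
	}
	if err := iterateEntries(e, rangeID, lo, hi, scanFunc); err != nil {
		return nil, err
	}
	return ents, nil
}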
// append the given entries to the raft log. Takes the previous values of
// r.mu.lastIndex and r.mu.raftLogSize, and returns new values. We do this
// rather than modifying them directly because these modifications need to be
// atomic with the commit of the batch.
func (r *Replica) append(
	ctx context.Context,
	batch engine.ReadWriter,
	prevLastIndex uint64,
	prevRaftLogSize int64,
	entries []raftpb.Entry,
) (uint64, int64, error) {
	if len(entries) == 0 {
		return prevLastIndex, prevRaftLogSize, nil
	}
	var diff enginepb.MVCCStats
	var value roachpb.Value
	for i := range entries {
		ent := &entries[i]
		key := keys.RaftLogKey(r.RangeID, ent.Index)
		if err := value.SetProto(ent); err != nil {
			return 0, 0, err
		}
		value.InitChecksum(key)
		var err error
		if ent.Index > prevLastIndex {
			err = engine.MVCCBlindPut(ctx, batch, &diff, key, hlc.ZeroTimestamp, value, nil /* txn */)
		} else {
			err = engine.MVCCPut(ctx, batch, &diff, key, hlc.ZeroTimestamp, value, nil /* txn */)
		}
		if err != nil {
			return 0, 0, err
		}
	}

	// Delete any previously appended log entries which never committed.
	lastIndex := entries[len(entries)-1].Index
	for i := lastIndex + 1; i <= prevLastIndex; i++ {
		err := engine.MVCCDelete(ctx, batch, &diff, keys.RaftLogKey(r.RangeID, i),
			hlc.ZeroTimestamp, nil /* txn */)
		if err != nil {
			return 0, 0, err
		}
	}

	if err := setLastIndex(ctx, batch, r.RangeID, lastIndex); err != nil {
		return 0, 0, err
	}

	raftLogSize := prevRaftLogSize + diff.SysBytes
	return lastIndex, raftLogSize, nil
}
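The comment on append explains why the new lastIndex and raftLogSize are returned rather than written to r.mu directly. Below is a hedged sketch of the calling pattern that comment implies; the surrounding method, the batch variable, and rd.Entries are assumptions rather than code from the source. Only the shape of the handoff matters: stage the writes, commit the batch, and only then publish the new values under the replica mutex.

// Hypothetical caller fragment (not from the source): the returned values are
// published to r.mu only after the batch commits.
r.mu.Lock()
prevLastIndex, prevRaftLogSize := r.mu.lastIndex, r.mu.raftLogSize
r.mu.Unlock()

lastIndex, raftLogSize, err := r.append(ctx, batch, prevLastIndex, prevRaftLogSize, rd.Entries)
if err != nil {
	return err
}
if err := batch.Commit(); err != nil {
	return err
}
// The in-memory copies are updated only once the write is durable, so a
// failed commit never leaves r.mu ahead of the on-disk log.
r.mu.Lock()
r.mu.lastIndex = lastIndex
r.mu.raftLogSize = raftLogSize
r.mu.Unlock()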
// createRangeData creates sample range data in all possible areas of
// the key space. Returns a slice of the encoded keys of all created
// data.
func createRangeData(t *testing.T, r *Replica) []engine.MVCCKey {
	ts0 := hlc.ZeroTimestamp
	ts := hlc.Timestamp{WallTime: 1}
	desc := r.Desc()
	keyTSs := []struct {
		key roachpb.Key
		ts  hlc.Timestamp
	}{
		{keys.AbortCacheKey(r.RangeID, testTxnID), ts0},
		{keys.AbortCacheKey(r.RangeID, testTxnID2), ts0},
		{keys.RangeFrozenStatusKey(r.RangeID), ts0},
		{keys.RangeLastGCKey(r.RangeID), ts0},
		{keys.RaftAppliedIndexKey(r.RangeID), ts0},
		{keys.RaftTruncatedStateKey(r.RangeID), ts0},
		{keys.LeaseAppliedIndexKey(r.RangeID), ts0},
		{keys.RangeStatsKey(r.RangeID), ts0},
		{keys.RaftHardStateKey(r.RangeID), ts0},
		{keys.RaftLastIndexKey(r.RangeID), ts0},
		{keys.RaftLogKey(r.RangeID, 1), ts0},
		{keys.RaftLogKey(r.RangeID, 2), ts0},
		{keys.RangeLastReplicaGCTimestampKey(r.RangeID), ts0},
		{keys.RangeLastVerificationTimestampKey(r.RangeID), ts0},
		{keys.RangeDescriptorKey(desc.StartKey), ts},
		{keys.TransactionKey(roachpb.Key(desc.StartKey), uuid.NewV4()), ts0},
		{keys.TransactionKey(roachpb.Key(desc.StartKey.Next()), uuid.NewV4()), ts0},
		{keys.TransactionKey(fakePrevKey(desc.EndKey), uuid.NewV4()), ts0},
		// TODO(bdarnell): KeyMin.Next() results in a key in the reserved system-local space.
		// Once we have resolved https://github.com/cockroachdb/cockroach/issues/437,
		// replace this with something that reliably generates the first valid key in the range.
		//{r.Desc().StartKey.Next(), ts},
		// The following line is similar to StartKey.Next() but adds more to the key to
		// avoid falling into the system-local space.
		{append(append([]byte{}, desc.StartKey...), '\x02'), ts},
		{fakePrevKey(r.Desc().EndKey), ts},
	}

	keys := []engine.MVCCKey{}
	for _, keyTS := range keyTSs {
		if err := engine.MVCCPut(context.Background(), r.store.Engine(), nil, keyTS.key, keyTS.ts,
			roachpb.MakeValueFromString("value"), nil); err != nil {
			t.Fatal(err)
		}
		keys = append(keys, engine.MVCCKey{Key: keyTS.key, Timestamp: keyTS.ts})
	}
	return keys
}
// createRangeData creates sample range data in all possible areas of
// the key space. Returns a slice of the encoded keys of all created
// data.
func createRangeData(r *Replica, t *testing.T) []roachpb.EncodedKey {
	ts0 := roachpb.ZeroTimestamp
	ts := roachpb.Timestamp{WallTime: 1}
	keyTSs := []struct {
		key roachpb.Key
		ts  roachpb.Timestamp
	}{
		{keys.ResponseCacheKey(r.Desc().RangeID, &roachpb.ClientCmdID{WallTime: 1, Random: 1}), ts0},
		{keys.ResponseCacheKey(r.Desc().RangeID, &roachpb.ClientCmdID{WallTime: 2, Random: 2}), ts0},
		{keys.RaftHardStateKey(r.Desc().RangeID), ts0},
		{keys.RaftLogKey(r.Desc().RangeID, 1), ts0},
		{keys.RaftLogKey(r.Desc().RangeID, 2), ts0},
		{keys.RangeGCMetadataKey(r.Desc().RangeID), ts0},
		{keys.RangeLastVerificationTimestampKey(r.Desc().RangeID), ts0},
		{keys.RangeStatsKey(r.Desc().RangeID), ts0},
		{keys.RangeDescriptorKey(r.Desc().StartKey), ts},
		{keys.TransactionKey(roachpb.Key(r.Desc().StartKey), []byte("1234")), ts0},
		{keys.TransactionKey(roachpb.Key(r.Desc().StartKey.Next()), []byte("5678")), ts0},
		{keys.TransactionKey(fakePrevKey(r.Desc().EndKey), []byte("2468")), ts0},
		// TODO(bdarnell): KeyMin.Next() results in a key in the reserved system-local space.
		// Once we have resolved https://github.com/cockroachdb/cockroach/issues/437,
		// replace this with something that reliably generates the first valid key in the range.
		//{r.Desc().StartKey.Next(), ts},
		// The following line is similar to StartKey.Next() but adds more to the key to
		// avoid falling into the system-local space.
		{append(append([]byte{}, r.Desc().StartKey...), '\x01'), ts},
		{fakePrevKey(r.Desc().EndKey), ts},
	}

	keys := []roachpb.EncodedKey{}
	for _, keyTS := range keyTSs {
		if err := engine.MVCCPut(r.store.Engine(), nil, keyTS.key, keyTS.ts,
			roachpb.MakeValueFromString("value"), nil); err != nil {
			t.Fatal(err)
		}
		keys = append(keys, engine.MVCCEncodeKey(keyTS.key))
		if !keyTS.ts.Equal(ts0) {
			keys = append(keys, engine.MVCCEncodeVersionKey(keyTS.key, keyTS.ts))
		}
	}
	return keys
}
// Append implements the multiraft.WriteableGroupStorage interface.
func (r *Replica) Append(entries []raftpb.Entry) error {
	if len(entries) == 0 {
		return nil
	}
	batch := r.rm.Engine().NewBatch()
	defer batch.Close()

	rangeID := r.Desc().RangeID

	for _, ent := range entries {
		err := engine.MVCCPutProto(batch, nil, keys.RaftLogKey(rangeID, ent.Index),
			proto.ZeroTimestamp, nil, &ent)
		if err != nil {
			return err
		}
	}
	lastIndex := entries[len(entries)-1].Index
	prevLastIndex := atomic.LoadUint64(&r.lastIndex)
	// Delete any previously appended log entries which never committed.
	for i := lastIndex + 1; i <= prevLastIndex; i++ {
		err := engine.MVCCDelete(batch, nil, keys.RaftLogKey(rangeID, i),
			proto.ZeroTimestamp, nil)
		if err != nil {
			return err
		}
	}

	// Commit the batch and update the last index.
	if err := setLastIndex(batch, rangeID, lastIndex); err != nil {
		return err
	}
	if err := batch.Commit(); err != nil {
		return err
	}

	atomic.StoreUint64(&r.lastIndex, lastIndex)
	return nil
}
func (r *Replica) entries(e engine.Engine, lo, hi, maxBytes uint64) ([]raftpb.Entry, error) {
	if lo > hi {
		return nil, util.Errorf("lo:%d is greater than hi:%d", lo, hi)
	}
	// Scan over the log to find the requested entries in the range [lo, hi),
	// stopping once we have enough.
	var ents []raftpb.Entry
	size := uint64(0)
	var ent raftpb.Entry
	expectedIndex := lo
	exceededMaxBytes := false
	scanFunc := func(kv roachpb.KeyValue) (bool, error) {
		if err := kv.Value.GetProto(&ent); err != nil {
			return false, err
		}
		// Exit early if we have any gaps or it has been compacted.
		if ent.Index != expectedIndex {
			return true, nil
		}
		expectedIndex++
		size += uint64(ent.Size())
		ents = append(ents, ent)
		exceededMaxBytes = maxBytes > 0 && size > maxBytes
		return exceededMaxBytes, nil
	}

	rangeID := r.RangeID
	_, err := engine.MVCCIterate(e, keys.RaftLogKey(rangeID, lo), keys.RaftLogKey(rangeID, hi),
		roachpb.ZeroTimestamp, true /* consistent */, nil /* txn */, false /* !reverse */, scanFunc)
	if err != nil {
		return nil, err
	}

	// Did the correct number of results come back? If so, we're all good.
	if len(ents) == int(hi)-int(lo) {
		return ents, nil
	}

	// Did we hit the size limit? If so, return what we have.
	if exceededMaxBytes {
		return ents, nil
	}

	// We got some results, but not all of them; something went wrong.
	if len(ents) > 0 {
		// Was lo already truncated?
		if ents[0].Index > lo {
			return nil, raft.ErrCompacted
		}

		// Was the missing index after the last index?
		lastIndex, err := r.LastIndex()
		if err != nil {
			return nil, err
		}
		if lastIndex <= expectedIndex {
			return nil, raft.ErrUnavailable
		}

		// There is a gap in the log record; return an error.
		return nil, util.Errorf("there is a gap in the index record between lo:%d and hi:%d at index:%d",
			lo, hi, expectedIndex)
	}

	// No results at all: was it due to unavailability or truncation?
	ts, err := r.raftTruncatedStateLocked()
	if err != nil {
		return nil, err
	}
	if ts.Index >= lo {
		// The requested lo index has already been truncated.
		return nil, raft.ErrCompacted
	}
	// The requested lo index does not yet exist.
	return nil, raft.ErrUnavailable
}