// Entries implements the raft.Storage interface. Note that maxBytes is advisory
// and this method will always return at least one entry, even if that entry
// exceeds maxBytes. Passing maxBytes equal to zero disables size checking.
// TODO(bdarnell): consider caching for recent entries, if rocksdb's builtin caching
// is insufficient.
func (r *Range) Entries(lo, hi, maxBytes uint64) ([]raftpb.Entry, error) {
	// Scan over the log to find the requested entries in the range [lo, hi),
	// stopping once we have enough.
	var ents []raftpb.Entry
	size := uint64(0)
	var ent raftpb.Entry
	scanFunc := func(kv proto.KeyValue) (bool, error) {
		err := gogoproto.Unmarshal(kv.Value.GetBytes(), &ent)
		if err != nil {
			return false, err
		}
		size += uint64(ent.Size())
		ents = append(ents, ent)
		return maxBytes > 0 && size > maxBytes, nil
	}

	_, err := engine.MVCCIterate(r.rm.Engine(),
		keys.RaftLogKey(r.Desc().RaftID, lo),
		keys.RaftLogKey(r.Desc().RaftID, hi),
		proto.ZeroTimestamp, true /* consistent */, nil /* txn */, scanFunc)

	if err != nil {
		return nil, err
	}

	// If we neither returned the full number of requested entries nor stopped
	// because of the size limit, we weren't able to supply everything the
	// client wanted.
	if len(ents) != int(hi-lo) && (maxBytes == 0 || size < maxBytes) {
		return nil, raft.ErrUnavailable
	}

	return ents, nil
}
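A minimal caller sketch (hypothetical; in the real system the raft library drives this method through the raft.Storage interface) illustrating the advisory nature of maxBytes described in the comment above: even a budget smaller than a single entry still yields the first entry, because the size check only stops the scan after an entry has been appended.

// exampleEntriesContract is a hypothetical illustration, assuming the log
// contains indexes [10, 15).
func exampleEntriesContract(r *Range) ([]raftpb.Entry, error) {
	// Request [10, 15) with a deliberately tiny budget of one byte.
	ents, err := r.Entries(10, 15, 1 /* maxBytes */)
	if err != nil {
		return nil, err
	}
	// len(ents) is at least 1 here: scanFunc appends the entry before it
	// evaluates size > maxBytes, so the first entry always comes back.
	return ents, nil
}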
Example #2
// append the given entries to the raft log. Takes the previous value
// of r.lastIndex and returns a new value. We do this rather than
// modifying r.lastIndex directly because this modification needs to
// be atomic with the commit of the batch.
func (r *Replica) append(batch engine.Engine, prevLastIndex uint64, entries []raftpb.Entry) (uint64, error) {
	if len(entries) == 0 {
		return prevLastIndex, nil
	}
	for i := range entries {
		ent := &entries[i]
		key := keys.RaftLogKey(r.RangeID, ent.Index)
		if err := engine.MVCCPutProto(batch, nil, key, roachpb.ZeroTimestamp, nil, ent); err != nil {
			return 0, err
		}
	}
	lastIndex := entries[len(entries)-1].Index
	// Delete any previously appended log entries which never committed.
	for i := lastIndex + 1; i <= prevLastIndex; i++ {
		err := engine.MVCCDelete(batch, nil,
			keys.RaftLogKey(r.RangeID, i), roachpb.ZeroTimestamp, nil)
		if err != nil {
			return 0, err
		}
	}

	// Commit the batch and update the last index.
	if err := setLastIndex(batch, r.RangeID, lastIndex); err != nil {
		return 0, err
	}

	return lastIndex, nil
}
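A hedged sketch of the caller pattern the comment above implies (a hypothetical helper, not the actual Raft-ready handling code; the store/engine and atomic access to r.lastIndex are assumptions based on the other snippets in this collection): the returned last index is published to the in-memory field only after the batch commits, which is exactly why append does not modify r.lastIndex itself.

func (r *Replica) applyEntries(entries []raftpb.Entry) error {
	batch := r.store.Engine().NewBatch()
	defer batch.Close()

	prevLastIndex := atomic.LoadUint64(&r.lastIndex)
	lastIndex, err := r.append(batch, prevLastIndex, entries)
	if err != nil {
		return err
	}
	if err := batch.Commit(); err != nil {
		// The batch never made it to disk, so the in-memory last index is
		// left untouched and still agrees with storage.
		return err
	}
	// Publish the new last index only after a successful commit.
	atomic.StoreUint64(&r.lastIndex, lastIndex)
	return nil
}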
// append the given entries to the raft log. Takes the previous values of
// r.mu.lastIndex and r.mu.raftLogSize, and returns new values. We do this
// rather than modifying them directly because these modifications need to be
// atomic with the commit of the batch.
func (r *Replica) append(batch engine.ReadWriter, prevLastIndex uint64, prevRaftLogSize int64, entries []raftpb.Entry) (uint64, int64, error) {
	if len(entries) == 0 {
		return prevLastIndex, prevRaftLogSize, nil
	}
	var diff enginepb.MVCCStats
	ctx := context.Background()
	for i := range entries {
		ent := &entries[i]
		key := keys.RaftLogKey(r.RangeID, ent.Index)
		if err := engine.MVCCPutProto(ctx, batch, &diff, key, hlc.ZeroTimestamp, nil /* txn */, ent); err != nil {
			return 0, 0, err
		}
	}
	lastIndex := entries[len(entries)-1].Index
	// Delete any previously appended log entries which never committed.
	for i := lastIndex + 1; i <= prevLastIndex; i++ {
		err := engine.MVCCDelete(ctx, batch, &diff, keys.RaftLogKey(r.RangeID, i),
			hlc.ZeroTimestamp, nil /* txn */)
		if err != nil {
			return 0, 0, err
		}
	}

	if err := setLastIndex(batch, r.RangeID, lastIndex); err != nil {
		return 0, 0, err
	}

	raftLogSize := prevRaftLogSize + diff.SysBytes

	return lastIndex, raftLogSize, nil
}
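The same pattern extended to the log-size bookkeeping (again a hypothetical sketch, assuming the replica's mutable state lives under an embedded mutex named r.mu as the comment suggests): both values are read before the write and published together only after the commit.

func (r *Replica) applyEntriesWithSize(entries []raftpb.Entry) error {
	batch := r.store.Engine().NewBatch()
	defer batch.Close()

	r.mu.Lock()
	prevLastIndex, prevRaftLogSize := r.mu.lastIndex, r.mu.raftLogSize
	r.mu.Unlock()

	lastIndex, raftLogSize, err := r.append(batch, prevLastIndex, prevRaftLogSize, entries)
	if err != nil {
		return err
	}
	if err := batch.Commit(); err != nil {
		return err
	}

	// Only now is it safe to let the in-memory view reflect the new state.
	r.mu.Lock()
	r.mu.lastIndex = lastIndex
	r.mu.raftLogSize = raftLogSize
	r.mu.Unlock()
	return nil
}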
Example #4
// InternalTruncateLog discards a prefix of the raft log.
func (r *Range) InternalTruncateLog(batch engine.Engine, ms *engine.MVCCStats, args *proto.InternalTruncateLogRequest, reply *proto.InternalTruncateLogResponse) {
	// args.Index is the first index to keep.
	term, err := r.Term(args.Index - 1)
	if err != nil {
		reply.SetGoError(err)
		return
	}
	start := keys.RaftLogKey(r.Desc().RaftID, 0)
	end := keys.RaftLogKey(r.Desc().RaftID, args.Index)
	err = batch.Iterate(engine.MVCCEncodeKey(start), engine.MVCCEncodeKey(end),
		func(kv proto.RawKeyValue) (bool, error) {
			err := batch.Clear(kv.Key)
			return false, err
		})
	if err != nil {
		reply.SetGoError(err)
		return
	}
	ts := proto.RaftTruncatedState{
		Index: args.Index - 1,
		Term:  term,
	}
	err = engine.MVCCPutProto(batch, ms, keys.RaftTruncatedStateKey(r.Desc().RaftID),
		proto.ZeroTimestamp, nil, &ts)
	reply.SetGoError(err)
}
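A hedged illustration (hypothetical helper, not part of the real code) of the effect a committed truncation at args.Index == 10 has on readers: raft log keys below index 10 have been cleared and the truncated state records Index: 9, so requests reaching below that point can no longer be served in full.

func exampleTruncationEffect(r *Range) error {
	// The older Entries implementation above reports the shortfall as
	// raft.ErrUnavailable; the later entries implementation shown below
	// consults the truncated state and returns raft.ErrCompacted instead.
	if _, err := r.Entries(5, 10, 0); err == nil {
		return util.Errorf("expected an error for truncated entries")
	}
	// Index 10 and above are untouched and still readable (assuming the
	// log still contains them).
	_, err := r.Entries(10, 12, 0)
	return err
}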
// append the given entries to the raft log.
func (r *Replica) append(batch engine.Engine, entries []raftpb.Entry) error {
	if len(entries) == 0 {
		return nil
	}
	for _, ent := range entries {
		err := engine.MVCCPutProto(batch, nil, keys.RaftLogKey(r.RangeID, ent.Index),
			roachpb.ZeroTimestamp, nil, &ent)
		if err != nil {
			return err
		}
	}
	lastIndex := entries[len(entries)-1].Index
	prevLastIndex := atomic.LoadUint64(&r.lastIndex)
	// Delete any previously appended log entries which never committed.
	for i := lastIndex + 1; i <= prevLastIndex; i++ {
		err := engine.MVCCDelete(batch, nil,
			keys.RaftLogKey(r.RangeID, i), roachpb.ZeroTimestamp, nil)
		if err != nil {
			return err
		}
	}

	// Commit the batch and update the last index.
	if err := setLastIndex(batch, r.RangeID, lastIndex); err != nil {
		return err
	}
	batch.Defer(func() {
		atomic.StoreUint64(&r.lastIndex, lastIndex)
	})

	return nil
}
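A brief hedged sketch of how this variant would be used (a hypothetical caller, assuming batch.Defer registers a callback that only runs once the batch commits successfully): the caller just commits, and the deferred atomic store of r.lastIndex fires as part of that commit, so in-memory and on-disk state cannot diverge.

func (r *Replica) applyEntriesViaDefer(entries []raftpb.Entry) error {
	batch := r.store.Engine().NewBatch()
	defer batch.Close()

	if err := r.append(batch, entries); err != nil {
		return err
	}
	// If Commit fails, the deferred callback never runs and r.lastIndex
	// keeps its previous, still-correct value.
	return batch.Commit()
}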
Example #6
// append the given entries to the raft log. Takes the previous value
// of r.lastIndex and returns a new value. We do this rather than
// modifying r.lastIndex directly because this modification needs to
// be atomic with the commit of the batch.
func (r *Replica) append(batch engine.ReadWriter, prevLastIndex uint64, entries []raftpb.Entry) (uint64, error) {
	if len(entries) == 0 {
		return prevLastIndex, nil
	}
	for i := range entries {
		ent := &entries[i]
		key := keys.RaftLogKey(r.RangeID, ent.Index)
		if err := engine.MVCCPutProto(context.Background(), batch, nil, key, hlc.ZeroTimestamp, nil, ent); err != nil {
			return 0, err
		}
	}
	lastIndex := entries[len(entries)-1].Index
	// Delete any previously appended log entries which never committed.
	for i := lastIndex + 1; i <= prevLastIndex; i++ {
		err := engine.MVCCDelete(context.Background(), batch, nil,
			keys.RaftLogKey(r.RangeID, i), hlc.ZeroTimestamp, nil)
		if err != nil {
			return 0, err
		}
	}

	if err := setLastIndex(batch, r.RangeID, lastIndex); err != nil {
		return 0, err
	}

	return lastIndex, nil
}
func iterateEntries(
	e engine.Reader, rangeID roachpb.RangeID, lo, hi uint64, scanFunc func(roachpb.KeyValue) (bool, error),
) error {
	_, err := engine.MVCCIterate(
		context.Background(), e,
		keys.RaftLogKey(rangeID, lo),
		keys.RaftLogKey(rangeID, hi),
		hlc.ZeroTimestamp,
		true,  /* consistent */
		nil,   /* txn */
		false, /* !reverse */
		scanFunc,
	)
	return err
}
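A hedged usage sketch of the helper above (hypothetical function): the caller supplies a scanFunc that decodes each stored value back into a raftpb.Entry, which is the same pattern the entries implementations in this section follow.

// countEntries drives iterateEntries and counts the log entries stored in
// the range [lo, hi).
func countEntries(e engine.Reader, rangeID roachpb.RangeID, lo, hi uint64) (int, error) {
	var ent raftpb.Entry
	count := 0
	err := iterateEntries(e, rangeID, lo, hi, func(kv roachpb.KeyValue) (bool, error) {
		if err := kv.Value.GetProto(&ent); err != nil {
			return false, err
		}
		count++
		// Returning false keeps the iteration going; returning true would
		// stop it early.
		return false, nil
	})
	return count, err
}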
// append the given entries to the raft log. Takes the previous values of
// r.mu.lastIndex and r.mu.raftLogSize, and returns new values. We do this
// rather than modifying them directly because these modifications need to be
// atomic with the commit of the batch.
func (r *Replica) append(
	ctx context.Context,
	batch engine.ReadWriter,
	prevLastIndex uint64,
	prevRaftLogSize int64,
	entries []raftpb.Entry,
) (uint64, int64, error) {
	if len(entries) == 0 {
		return prevLastIndex, prevRaftLogSize, nil
	}
	var diff enginepb.MVCCStats
	var value roachpb.Value
	for i := range entries {
		ent := &entries[i]
		key := keys.RaftLogKey(r.RangeID, ent.Index)
		if err := value.SetProto(ent); err != nil {
			return 0, 0, err
		}
		value.InitChecksum(key)
		var err error
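		// Entries beyond the previous last index cannot already exist in the
		// engine, so a blind put can skip the read that a regular MVCCPut
		// performs to look for an existing value. Entries at or below the
		// previous last index may overwrite uncommitted entries from an old
		// leader and take the regular path.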
		if ent.Index > prevLastIndex {
			err = engine.MVCCBlindPut(ctx, batch, &diff, key, hlc.ZeroTimestamp, value, nil /* txn */)
		} else {
			err = engine.MVCCPut(ctx, batch, &diff, key, hlc.ZeroTimestamp, value, nil /* txn */)
		}
		if err != nil {
			return 0, 0, err
		}
	}

	// Delete any previously appended log entries which never committed.
	lastIndex := entries[len(entries)-1].Index
	for i := lastIndex + 1; i <= prevLastIndex; i++ {
		err := engine.MVCCDelete(ctx, batch, &diff, keys.RaftLogKey(r.RangeID, i),
			hlc.ZeroTimestamp, nil /* txn */)
		if err != nil {
			return 0, 0, err
		}
	}

	if err := setLastIndex(ctx, batch, r.RangeID, lastIndex); err != nil {
		return 0, 0, err
	}

	raftLogSize := prevRaftLogSize + diff.SysBytes

	return lastIndex, raftLogSize, nil
}
// createRangeData creates sample range data in all possible areas of
// the key space. Returns a slice of the encoded keys of all created
// data.
func createRangeData(t *testing.T, r *Replica) []engine.MVCCKey {
	ts0 := hlc.ZeroTimestamp
	ts := hlc.Timestamp{WallTime: 1}
	desc := r.Desc()
	keyTSs := []struct {
		key roachpb.Key
		ts  hlc.Timestamp
	}{
		{keys.AbortCacheKey(r.RangeID, testTxnID), ts0},
		{keys.AbortCacheKey(r.RangeID, testTxnID2), ts0},
		{keys.RangeFrozenStatusKey(r.RangeID), ts0},
		{keys.RangeLastGCKey(r.RangeID), ts0},
		{keys.RaftAppliedIndexKey(r.RangeID), ts0},
		{keys.RaftTruncatedStateKey(r.RangeID), ts0},
		{keys.LeaseAppliedIndexKey(r.RangeID), ts0},
		{keys.RangeStatsKey(r.RangeID), ts0},
		{keys.RaftHardStateKey(r.RangeID), ts0},
		{keys.RaftLastIndexKey(r.RangeID), ts0},
		{keys.RaftLogKey(r.RangeID, 1), ts0},
		{keys.RaftLogKey(r.RangeID, 2), ts0},
		{keys.RangeLastReplicaGCTimestampKey(r.RangeID), ts0},
		{keys.RangeLastVerificationTimestampKey(r.RangeID), ts0},
		{keys.RangeDescriptorKey(desc.StartKey), ts},
		{keys.TransactionKey(roachpb.Key(desc.StartKey), uuid.NewV4()), ts0},
		{keys.TransactionKey(roachpb.Key(desc.StartKey.Next()), uuid.NewV4()), ts0},
		{keys.TransactionKey(fakePrevKey(desc.EndKey), uuid.NewV4()), ts0},
		// TODO(bdarnell): KeyMin.Next() results in a key in the reserved system-local space.
		// Once we have resolved https://github.com/cockroachdb/cockroach/issues/437,
		// replace this with something that reliably generates the first valid key in the range.
		//{r.Desc().StartKey.Next(), ts},
		// The following line is similar to StartKey.Next() but adds more to the key to
		// avoid falling into the system-local space.
		{append(append([]byte{}, desc.StartKey...), '\x02'), ts},
		{fakePrevKey(r.Desc().EndKey), ts},
	}

	keys := []engine.MVCCKey{}
	for _, keyTS := range keyTSs {
		if err := engine.MVCCPut(context.Background(), r.store.Engine(), nil, keyTS.key, keyTS.ts, roachpb.MakeValueFromString("value"), nil); err != nil {
			t.Fatal(err)
		}
		keys = append(keys, engine.MVCCKey{Key: keyTS.key, Timestamp: keyTS.ts})
	}
	return keys
}
// createRangeData creates sample range data in all possible areas of
// the key space. Returns a slice of the encoded keys of all created
// data.
func createRangeData(r *Replica, t *testing.T) []roachpb.EncodedKey {
	ts0 := roachpb.ZeroTimestamp
	ts := roachpb.Timestamp{WallTime: 1}
	keyTSs := []struct {
		key roachpb.Key
		ts  roachpb.Timestamp
	}{
		{keys.ResponseCacheKey(r.Desc().RangeID, &roachpb.ClientCmdID{WallTime: 1, Random: 1}), ts0},
		{keys.ResponseCacheKey(r.Desc().RangeID, &roachpb.ClientCmdID{WallTime: 2, Random: 2}), ts0},
		{keys.RaftHardStateKey(r.Desc().RangeID), ts0},
		{keys.RaftLogKey(r.Desc().RangeID, 1), ts0},
		{keys.RaftLogKey(r.Desc().RangeID, 2), ts0},
		{keys.RangeGCMetadataKey(r.Desc().RangeID), ts0},
		{keys.RangeLastVerificationTimestampKey(r.Desc().RangeID), ts0},
		{keys.RangeStatsKey(r.Desc().RangeID), ts0},
		{keys.RangeDescriptorKey(r.Desc().StartKey), ts},
		{keys.TransactionKey(roachpb.Key(r.Desc().StartKey), []byte("1234")), ts0},
		{keys.TransactionKey(roachpb.Key(r.Desc().StartKey.Next()), []byte("5678")), ts0},
		{keys.TransactionKey(fakePrevKey(r.Desc().EndKey), []byte("2468")), ts0},
		// TODO(bdarnell): KeyMin.Next() results in a key in the reserved system-local space.
		// Once we have resolved https://github.com/cockroachdb/cockroach/issues/437,
		// replace this with something that reliably generates the first valid key in the range.
		//{r.Desc().StartKey.Next(), ts},
		// The following line is similar to StartKey.Next() but adds more to the key to
		// avoid falling into the system-local space.
		{append(append([]byte{}, r.Desc().StartKey...), '\x01'), ts},
		{fakePrevKey(r.Desc().EndKey), ts},
	}

	keys := []roachpb.EncodedKey{}
	for _, keyTS := range keyTSs {
		if err := engine.MVCCPut(r.store.Engine(), nil, keyTS.key, keyTS.ts, roachpb.MakeValueFromString("value"), nil); err != nil {
			t.Fatal(err)
		}
		keys = append(keys, engine.MVCCEncodeKey(keyTS.key))
		if !keyTS.ts.Equal(ts0) {
			keys = append(keys, engine.MVCCEncodeVersionKey(keyTS.key, keyTS.ts))
		}
	}
	return keys
}
// Append implements the multiraft.WriteableGroupStorage interface.
func (r *Replica) Append(entries []raftpb.Entry) error {
	if len(entries) == 0 {
		return nil
	}
	batch := r.rm.Engine().NewBatch()
	defer batch.Close()

	rangeID := r.Desc().RangeID

	for _, ent := range entries {
		err := engine.MVCCPutProto(batch, nil, keys.RaftLogKey(rangeID, ent.Index),
			proto.ZeroTimestamp, nil, &ent)
		if err != nil {
			return err
		}
	}
	lastIndex := entries[len(entries)-1].Index
	prevLastIndex := atomic.LoadUint64(&r.lastIndex)
	// Delete any previously appended log entries which never committed.
	for i := lastIndex + 1; i <= prevLastIndex; i++ {
		err := engine.MVCCDelete(batch, nil,
			keys.RaftLogKey(rangeID, i), proto.ZeroTimestamp, nil)
		if err != nil {
			return err
		}
	}

	// Commit the batch and update the last index.
	if err := setLastIndex(batch, rangeID, lastIndex); err != nil {
		return err
	}
	if err := batch.Commit(); err != nil {
		return err
	}

	atomic.StoreUint64(&r.lastIndex, lastIndex)
	return nil
}
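The doc comment states that Replica implements multiraft.WriteableGroupStorage; a conventional way to make that claim checkable at compile time is an interface assertion like the hedged sketch below (assuming the multiraft package is importable from this package without an import cycle).

// Compile-time check that *Replica satisfies the interface named in the
// comment above; it adds no runtime behavior.
var _ multiraft.WriteableGroupStorage = (*Replica)(nil)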
Example #12
func (r *Replica) entries(e engine.Engine, lo, hi, maxBytes uint64) ([]raftpb.Entry, error) {
	if lo > hi {
		return nil, util.Errorf("lo:%d is greater than hi:%d", lo, hi)
	}
	// Scan over the log to find the requested entries in the range [lo, hi),
	// stopping once we have enough.
	var ents []raftpb.Entry
	size := uint64(0)
	var ent raftpb.Entry
	expectedIndex := lo
	exceededMaxBytes := false
	scanFunc := func(kv roachpb.KeyValue) (bool, error) {
		if err := kv.Value.GetProto(&ent); err != nil {
			return false, err
		}
		// Exit early if the log has a gap or has been compacted: the entry's
		// index doesn't match the index we expect next.
		if ent.Index != expectedIndex {
			return true, nil
		}
		expectedIndex++
		size += uint64(ent.Size())
		ents = append(ents, ent)
		exceededMaxBytes = maxBytes > 0 && size > maxBytes
		return exceededMaxBytes, nil
	}

	rangeID := r.RangeID
	_, err := engine.MVCCIterate(e,
		keys.RaftLogKey(rangeID, lo),
		keys.RaftLogKey(rangeID, hi),
		roachpb.ZeroTimestamp,
		true /* consistent */, nil /* txn */, false /* !reverse */, scanFunc)

	if err != nil {
		return nil, err
	}

	// Did the correct number of results come back? If so, we're all good.
	if len(ents) == int(hi)-int(lo) {
		return ents, nil
	}

	// Did we hit the size limit? If so, return what we have.
	if exceededMaxBytes {
		return ents, nil
	}

	// We returned fewer entries than requested without hitting the size
	// limit, so something went wrong. Did we get any results at all?
	if len(ents) > 0 {
		// Was the lo already truncated?
		if ents[0].Index > lo {
			return nil, raft.ErrCompacted
		}

		// Was the missing index after the last index?
		lastIndex, err := r.LastIndex()
		if err != nil {
			return nil, err
		}
		if lastIndex <= expectedIndex {
			return nil, raft.ErrUnavailable
		}

		// Otherwise there is a gap in the log; return an explicit error.
		return nil, util.Errorf("there is a gap in the index record between lo:%d and hi:%d at index:%d", lo, hi, expectedIndex)
	}

	// No results at all; was it due to truncation or unavailability?
	ts, err := r.raftTruncatedStateLocked()
	if err != nil {
		return nil, err
	}
	if ts.Index >= lo {
		// The requested lo index has already been truncated.
		return nil, raft.ErrCompacted
	}
	// The requested lo index does not yet exist.
	return nil, raft.ErrUnavailable
}
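To summarize the error contract of entries for callers, a hypothetical helper (not part of the real code) that restates the cases handled above:

// classifyEntriesError maps the errors produced by entries back to the
// situations they describe. A nil error covers both a complete result and
// a result cut short by maxBytes.
func classifyEntriesError(err error) string {
	switch err {
	case nil:
		return "entries returned (possibly fewer than requested due to maxBytes)"
	case raft.ErrCompacted:
		return "lo is at or below the truncated index"
	case raft.ErrUnavailable:
		return "the requested range reaches past the last written index"
	default:
		return "a gap in the log or a storage error"
	}
}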