Example No. 1
// append the given entries to the raft log.
func (r *Replica) append(batch engine.Engine, entries []raftpb.Entry) error {
	if len(entries) == 0 {
		return nil
	}
	for _, ent := range entries {
		err := engine.MVCCPutProto(batch, nil, keys.RaftLogKey(r.RangeID, ent.Index),
			roachpb.ZeroTimestamp, nil, &ent)
		if err != nil {
			return err
		}
	}
	lastIndex := entries[len(entries)-1].Index
	prevLastIndex := atomic.LoadUint64(&r.lastIndex)
	// Delete any previously appended log entries which never committed.
	for i := lastIndex + 1; i <= prevLastIndex; i++ {
		err := engine.MVCCDelete(batch, nil,
			keys.RaftLogKey(r.RangeID, i), roachpb.ZeroTimestamp, nil)
		if err != nil {
			return err
		}
	}

	// Stage the updated last index in the batch; the in-memory copy is
	// published by the deferred callback once the batch commits.
	if err := setLastIndex(batch, r.RangeID, lastIndex); err != nil {
		return err
	}
	batch.Defer(func() {
		atomic.StoreUint64(&r.lastIndex, lastIndex)
	})

	return nil
}
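How a caller might drive this is not shown above; the following is a minimal sketch, assuming the engine's NewBatch returns a batch that can be passed where an engine.Engine is expected and whose Commit runs callbacks registered via Defer. The helper name appendAndCommit is illustrative, not part of the original code.

func appendAndCommit(r *Replica, eng engine.Engine, entries []raftpb.Entry) error {
	batch := eng.NewBatch()
	defer batch.Close()
	// Stage the entries and the updated last index in the batch using the
	// two-argument append shown above.
	if err := r.append(batch, entries); err != nil {
		return err
	}
	// Committing persists the staged writes; the Defer callback registered in
	// append then publishes the new in-memory lastIndex.
	return batch.Commit()
}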
Example No. 2
func newRangeDataIterator(d *proto.RangeDescriptor, e engine.Engine) *rangeDataIterator {
	// The first range in the keyspace starts at KeyMin, which includes the node-local
	// space. We need the original StartKey to find the range metadata, but the
	// actual data starts at LocalMax.
	dataStartKey := d.StartKey
	if d.StartKey.Equal(proto.KeyMin) {
		dataStartKey = keys.LocalMax
	}
	ri := &rangeDataIterator{
		ranges: []keyRange{
			{
				start: engine.MVCCEncodeKey(keys.MakeKey(keys.LocalRangeIDPrefix, encoding.EncodeUvarint(nil, uint64(d.RangeID)))),
				end:   engine.MVCCEncodeKey(keys.MakeKey(keys.LocalRangeIDPrefix, encoding.EncodeUvarint(nil, uint64(d.RangeID+1)))),
			},
			{
				start: engine.MVCCEncodeKey(keys.MakeKey(keys.LocalRangePrefix, encoding.EncodeBytes(nil, d.StartKey))),
				end:   engine.MVCCEncodeKey(keys.MakeKey(keys.LocalRangePrefix, encoding.EncodeBytes(nil, d.EndKey))),
			},
			{
				start: engine.MVCCEncodeKey(dataStartKey),
				end:   engine.MVCCEncodeKey(d.EndKey),
			},
		},
		iter: e.NewIterator(),
	}
	ri.iter.Seek(ri.ranges[ri.curIndex].start)
	ri.advance()
	return ri
}
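The constructor above (and its variants in later examples) relies on a keyRange type, a curIndex field, and an advance method that are not shown. Below is a rough sketch of what they could look like for this encoded-key variant; the field types and comparison logic are assumptions for illustration (using the standard bytes package and assuming iter.Key() returns the encoded key), not the project's verbatim code.

type keyRange struct {
	start, end proto.EncodedKey
}

// advance bumps curIndex whenever the underlying iterator has moved past the
// end of the current key range, seeking to the start of the next range.
func (ri *rangeDataIterator) advance() {
	for ri.curIndex < len(ri.ranges) && ri.iter.Valid() &&
		bytes.Compare(ri.iter.Key(), ri.ranges[ri.curIndex].end) >= 0 {
		ri.curIndex++
		if ri.curIndex < len(ri.ranges) {
			ri.iter.Seek(ri.ranges[ri.curIndex].start)
		}
	}
}

// Valid reports whether the iterator still points at a key inside one of the
// configured ranges.
func (ri *rangeDataIterator) Valid() bool {
	return ri.curIndex < len(ri.ranges) && ri.iter.Valid()
}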
Example No. 3
func newRangeDataIterator(r *Range, e engine.Engine) *rangeDataIterator {
	r.RLock()
	startKey := r.Desc().StartKey
	if startKey.Equal(engine.KeyMin) {
		startKey = engine.KeyLocalMax
	}
	endKey := r.Desc().EndKey
	r.RUnlock()
	ri := &rangeDataIterator{
		ranges: []keyRange{
			{
				start: engine.MVCCEncodeKey(engine.MakeKey(engine.KeyLocalRangeIDPrefix, encoding.EncodeUvarint(nil, uint64(r.Desc().RaftID)))),
				end:   engine.MVCCEncodeKey(engine.MakeKey(engine.KeyLocalRangeIDPrefix, encoding.EncodeUvarint(nil, uint64(r.Desc().RaftID+1)))),
			},
			{
				start: engine.MVCCEncodeKey(engine.MakeKey(engine.KeyLocalRangeKeyPrefix, encoding.EncodeBytes(nil, startKey))),
				end:   engine.MVCCEncodeKey(engine.MakeKey(engine.KeyLocalRangeKeyPrefix, encoding.EncodeBytes(nil, endKey))),
			},
			{
				start: engine.MVCCEncodeKey(startKey),
				end:   engine.MVCCEncodeKey(endKey),
			},
		},
		iter: e.NewIterator(),
	}
	ri.iter.Seek(ri.ranges[ri.curIndex].start)
	ri.advance()
	return ri
}
Example No. 4
// CopyFrom copies all the cached results from the originRangeID
// response cache into this one. Note that the cache will not be
// locked while copying is in progress. Failures decoding individual
// cache entries return an error. The copy is done directly using the
// engine instead of interpreting values through MVCC for efficiency.
func (rc *ResponseCache) CopyFrom(e engine.Engine, originRangeID proto.RangeID) error {
	prefix := keys.ResponseCacheKey(originRangeID, nil) // response cache prefix
	start := engine.MVCCEncodeKey(prefix)
	end := engine.MVCCEncodeKey(prefix.PrefixEnd())

	return e.Iterate(start, end, func(kv proto.RawKeyValue) (bool, error) {
		// Decode the key into a command ID; a decoding failure aborts the
		// copy. Otherwise, write the entry to the corresponding key in the
		// new cache.
		cmdID, err := rc.decodeResponseCacheKey(kv.Key)
		if err != nil {
			return false, util.Errorf("could not decode a response cache key %s: %s",
				proto.Key(kv.Key), err)
		}
		key := keys.ResponseCacheKey(rc.rangeID, &cmdID)
		encKey := engine.MVCCEncodeKey(key)
		// Decode the value, update the checksum and re-encode.
		meta := &engine.MVCCMetadata{}
		if err := gogoproto.Unmarshal(kv.Value, meta); err != nil {
			return false, util.Errorf("could not decode response cache value %s [% x]: %s",
				proto.Key(kv.Key), kv.Value, err)
		}
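		// The value checksum incorporates the key it is stored under, so it
		// must be cleared and re-initialized against the destination key
		// before the entry is written into the new range's cache.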
		meta.Value.Checksum = nil
		meta.Value.InitChecksum(key)
		_, _, err = engine.PutProto(e, encKey, meta)
		return false, err
	})
}
Example No. 5
func newReplicaDataIterator(d *roachpb.RangeDescriptor, e engine.Engine) *replicaDataIterator {
	// The first range in the keyspace starts at KeyMin, which includes the node-local
	// space. We need the original StartKey to find the range metadata, but the
	// actual data starts at LocalMax.
	dataStartKey := d.StartKey.AsRawKey()
	if d.StartKey.Equal(roachpb.RKeyMin) {
		dataStartKey = keys.LocalMax
	}
	ri := &replicaDataIterator{
		ranges: []keyRange{
			{
				start: engine.MVCCEncodeKey(keys.MakeRangeIDPrefix(d.RangeID)),
				end:   engine.MVCCEncodeKey(keys.MakeRangeIDPrefix(d.RangeID + 1)),
			},
			{
				start: engine.MVCCEncodeKey(keys.MakeRangeKeyPrefix(d.StartKey)),
				end:   engine.MVCCEncodeKey(keys.MakeRangeKeyPrefix(d.EndKey)),
			},
			{
				start: engine.MVCCEncodeKey(dataStartKey),
				end:   engine.MVCCEncodeKey(d.EndKey.AsRawKey()),
			},
		},
		iter: e.NewIterator(),
	}
	ri.iter.Seek(ri.ranges[ri.curIndex].start)
	ri.advance()
	return ri
}
Example No. 6
// CopyInto copies all the cached results from this response cache
// into the destRangeID response cache. Failures decoding individual
// cache entries return an error.
func (rc *ResponseCache) CopyInto(e engine.Engine, destRangeID roachpb.RangeID) error {
	start := engine.MVCCEncodeKey(
		keys.ResponseCacheKey(rc.rangeID, roachpb.KeyMin))
	end := engine.MVCCEncodeKey(
		keys.ResponseCacheKey(rc.rangeID, roachpb.KeyMax))

	return e.Iterate(start, end, func(kv engine.MVCCKeyValue) (bool, error) {
		// Decode the key into its family; a decoding failure aborts the copy.
		// Otherwise, write the entry to the corresponding key in the new cache.
		family, err := rc.decodeResponseCacheKey(kv.Key)
		if err != nil {
			return false, util.Errorf("could not decode a response cache key %s: %s",
				roachpb.Key(kv.Key), err)
		}
		key := keys.ResponseCacheKey(destRangeID, family)
		encKey := engine.MVCCEncodeKey(key)
		// Decode the value, update the checksum and re-encode.
		meta := &engine.MVCCMetadata{}
		if err := proto.Unmarshal(kv.Value, meta); err != nil {
			return false, util.Errorf("could not decode response cache value %s [% x]: %s",
				roachpb.Key(kv.Key), kv.Value, err)
		}
		meta.Value.Checksum = nil
		meta.Value.InitChecksum(key)
		_, _, err = engine.PutProto(e, encKey, meta)
		return false, err
	})
}
Example No. 7
// InternalTruncateLog discards a prefix of the raft log.
func (r *Range) InternalTruncateLog(batch engine.Engine, ms *engine.MVCCStats, args *proto.InternalTruncateLogRequest, reply *proto.InternalTruncateLogResponse) {
	// args.Index is the first index to keep.
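	// The term of the entry just before the new first index is looked up so
	// it can be recorded in the RaftTruncatedState written below.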
	term, err := r.Term(args.Index - 1)
	if err != nil {
		reply.SetGoError(err)
		return
	}
	start := keys.RaftLogKey(r.Desc().RaftID, 0)
	end := keys.RaftLogKey(r.Desc().RaftID, args.Index)
	err = batch.Iterate(engine.MVCCEncodeKey(start), engine.MVCCEncodeKey(end),
		func(kv proto.RawKeyValue) (bool, error) {
			err := batch.Clear(kv.Key)
			return false, err
		})
	if err != nil {
		reply.SetGoError(err)
		return
	}
	ts := proto.RaftTruncatedState{
		Index: args.Index - 1,
		Term:  term,
	}
	err = engine.MVCCPutProto(batch, ms, keys.RaftTruncatedStateKey(r.Desc().RaftID),
		proto.ZeroTimestamp, nil, &ts)
	reply.SetGoError(err)
}
Example No. 8
func loadRangeDescriptor(
	db engine.Engine, rangeID roachpb.RangeID,
) (roachpb.RangeDescriptor, error) {
	var desc roachpb.RangeDescriptor
	handleKV := func(kv engine.MVCCKeyValue) (bool, error) {
		if kv.Key.Timestamp == hlc.ZeroTimestamp {
			// We only want values, not MVCCMetadata.
			return false, nil
		}
		if err := checkRangeDescriptorKey(kv.Key); err != nil {
			// Range descriptor keys are interleaved with others, so if it
			// doesn't parse as a range descriptor just skip it.
			return false, nil
		}
		if err := getProtoValue(kv.Value, &desc); err != nil {
			return false, err
		}
		return desc.RangeID == rangeID, nil
	}

	// Range descriptors are stored by key, so we have to scan over the
	// range-local data to find the one for this RangeID.
	start := engine.MakeMVCCMetadataKey(keys.LocalRangePrefix)
	end := engine.MakeMVCCMetadataKey(keys.LocalRangeMax)

	if err := db.Iterate(start, end, handleKV); err != nil {
		return roachpb.RangeDescriptor{}, err
	}
	if desc.RangeID == rangeID {
		return desc, nil
	}
	return roachpb.RangeDescriptor{}, fmt.Errorf("range descriptor %d not found", rangeID)
}
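A hypothetical caller, for example a debugging helper that prints a range's bounds from an open store engine; the function name and output format are illustrative only.

func printRangeBounds(eng engine.Engine, rangeID roachpb.RangeID) error {
	desc, err := loadRangeDescriptor(eng, rangeID)
	if err != nil {
		return err
	}
	// StartKey and EndKey are the range's user-visible bounds.
	fmt.Printf("range %d spans [%s, %s)\n", desc.RangeID, desc.StartKey, desc.EndKey)
	return nil
}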
Example No. 9
func copySeqCache(e engine.Engine, srcID, dstID roachpb.RangeID, keyMin, keyMax engine.MVCCKey) error {
	var scratch [64]byte
	return e.Iterate(keyMin, keyMax,
		func(kv engine.MVCCKeyValue) (bool, error) {
			// Decode the key into its components; a decoding failure aborts
			// the copy. Otherwise, write the entry to the corresponding key
			// in the new cache.
			id, epoch, seq, err := decodeSequenceCacheMVCCKey(kv.Key, scratch[:0])
			if err != nil {
				return false, util.Errorf("could not decode a sequence cache key %s: %s",
					kv.Key, err)
			}
			key := keys.SequenceCacheKey(dstID, id, epoch, seq)
			encKey := engine.MakeMVCCMetadataKey(key)
			// Decode the value, update the checksum and re-encode.
			meta := &engine.MVCCMetadata{}
			if err := proto.Unmarshal(kv.Value, meta); err != nil {
				return false, util.Errorf("could not decode sequence cache value %s [% x]: %s",
					kv.Key, kv.Value, err)
			}
			value := meta.Value()
			value.ClearChecksum()
			value.InitChecksum(key)
			meta.RawBytes = value.RawBytes
			_, _, err = engine.PutProto(e, encKey, meta)
			return false, err
		})
}
Example No. 10
func newReplicaDataIterator(d *roachpb.RangeDescriptor, e engine.Engine) *replicaDataIterator {
	ri := &replicaDataIterator{
		ranges:   makeReplicaKeyRanges(d),
		Iterator: e.NewIterator(false),
	}
	ri.Seek(ri.ranges[ri.curIndex].start)
	return ri
}
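This shorter constructor delegates span construction to a makeReplicaKeyRanges helper that is not shown. A plausible sketch, mirroring the three spans built inline in Examples No. 2 and No. 5, follows; it is an illustration under those assumptions rather than the project's exact implementation.

func makeReplicaKeyRanges(d *roachpb.RangeDescriptor) []keyRange {
	// As in Example No. 5: user data for the first range starts at LocalMax.
	dataStartKey := d.StartKey.AsRawKey()
	if d.StartKey.Equal(roachpb.RKeyMin) {
		dataStartKey = keys.LocalMax
	}
	return []keyRange{
		// Range-ID local keys (raft state, caches, etc.).
		{
			start: engine.MakeMVCCMetadataKey(keys.MakeRangeIDPrefix(d.RangeID)),
			end:   engine.MakeMVCCMetadataKey(keys.MakeRangeIDPrefix(d.RangeID + 1)),
		},
		// Range-local keys addressed by the range's start/end keys.
		{
			start: engine.MakeMVCCMetadataKey(keys.MakeRangeKeyPrefix(d.StartKey)),
			end:   engine.MakeMVCCMetadataKey(keys.MakeRangeKeyPrefix(d.EndKey)),
		},
		// The user keyspace owned by the range.
		{
			start: engine.MakeMVCCMetadataKey(dataStartKey),
			end:   engine.MakeMVCCMetadataKey(d.EndKey.AsRawKey()),
		},
	}
}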
Example No. 11
// addStore creates a new store on the same Transport but doesn't create any ranges.
func (m *multiTestContext) addStore() {
	idx := len(m.stores)
	var clock *hlc.Clock
	if len(m.clocks) > idx {
		clock = m.clocks[idx]
	} else {
		clock = m.clock
		m.clocks = append(m.clocks, clock)
	}
	var eng engine.Engine
	var needBootstrap bool
	if len(m.engines) > idx {
		eng = m.engines[idx]
	} else {
		eng = engine.NewInMem(proto.Attributes{}, 1<<20)
		m.engines = append(m.engines, eng)
		needBootstrap = true
		// Add an extra refcount to the engine so the underlying rocksdb instances
		// aren't closed when stopping and restarting the stores.
		// These refcounts are removed in Stop().
		if err := eng.Open(); err != nil {
			m.t.Fatal(err)
		}
	}

	stopper := stop.NewStopper()
	ctx := m.makeContext(idx)
	store := storage.NewStore(ctx, eng, &proto.NodeDescriptor{NodeID: proto.NodeID(idx + 1)})
	if needBootstrap {
		err := store.Bootstrap(proto.StoreIdent{
			NodeID:  proto.NodeID(idx + 1),
			StoreID: proto.StoreID(idx + 1),
		}, stopper)
		if err != nil {
			m.t.Fatal(err)
		}

		// Bootstrap the initial range on the first store
		if idx == 0 {
			if err := store.BootstrapRange(nil); err != nil {
				m.t.Fatal(err)
			}
		}
	}
	if err := store.Start(stopper); err != nil {
		m.t.Fatal(err)
	}
	store.WaitForInit()
	m.stores = append(m.stores, store)
	if len(m.senders) == idx {
		m.senders = append(m.senders, kv.NewLocalSender())
	}
	m.senders[idx].AddStore(store)
	// Save the store identities for later so we can use them in
	// replication operations even while the store is stopped.
	m.idents = append(m.idents, store.Ident)
	m.stoppers = append(m.stoppers, stopper)
}
Example No. 12
// ClearData removes all persisted items stored in the cache.
func (sc *AbortCache) ClearData(e engine.Engine) error {
	b := e.NewBatch()
	defer b.Close()
	_, err := engine.ClearRange(b, engine.MakeMVCCMetadataKey(sc.min()), engine.MakeMVCCMetadataKey(sc.max()))
	if err != nil {
		return err
	}
	return b.Commit()
}
Example No. 13
func newReplicaDataIterator(d *roachpb.RangeDescriptor, e engine.Engine, replicatedOnly bool) *replicaDataIterator {
	rangeFunc := makeAllKeyRanges
	if replicatedOnly {
		rangeFunc = makeReplicatedKeyRanges
	}
	ri := &replicaDataIterator{
		ranges:   rangeFunc(d),
		Iterator: e.NewIterator(nil),
	}
	ri.Seek(ri.ranges[ri.curIndex].start)
	return ri
}
Example No. 14
// ComputeStatsForRange computes the stats for a given range by
// iterating over all key ranges for the given range that should
// be accounted for in its stats.
func ComputeStatsForRange(d *roachpb.RangeDescriptor, e engine.Engine, nowNanos int64) (engine.MVCCStats, error) {
	iter := e.NewIterator(nil)
	defer iter.Close()

	ms := engine.MVCCStats{}
	for _, r := range makeReplicatedKeyRanges(d) {
		msDelta, err := iter.ComputeStats(r.start, r.end, nowNanos)
		if err != nil {
			return engine.MVCCStats{}, err
		}
		ms.Add(msDelta)
	}
	return ms, nil
}
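A hypothetical consistency-style caller that recomputes the stats and compares them with a previously persisted copy; the surrounding names are illustrative, and the comparison assumes both sides were aged to the same nowNanos. reflect.DeepEqual is used to avoid assuming the stats struct is directly comparable.

func checkStats(d *roachpb.RangeDescriptor, e engine.Engine, persisted engine.MVCCStats, nowNanos int64) error {
	recomputed, err := ComputeStatsForRange(d, e, nowNanos)
	if err != nil {
		return err
	}
	if !reflect.DeepEqual(recomputed, persisted) {
		return fmt.Errorf("stats mismatch: recomputed %+v, persisted %+v", recomputed, persisted)
	}
	return nil
}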
Example No. 15
// mergeTrigger is called on a successful commit of an AdminMerge
// transaction. It recomputes stats for the receiving range.
func (r *Range) mergeTrigger(batch engine.Engine, merge *proto.MergeTrigger) error {
	if !bytes.Equal(r.Desc().StartKey, merge.UpdatedDesc.StartKey) {
		return util.Errorf("range and updated range start keys do not match: %s != %s",
			r.Desc().StartKey, merge.UpdatedDesc.StartKey)
	}

	if !r.Desc().EndKey.Less(merge.UpdatedDesc.EndKey) {
		return util.Errorf("range end key is not less than the post merge end key: %s >= %s",
			r.Desc().EndKey, merge.UpdatedDesc.EndKey)
	}

	if merge.SubsumedRaftID <= 0 {
		return util.Errorf("subsumed raft ID must be provided: %d", merge.SubsumedRaftID)
	}

	// Copy the subsumed range's response cache to the subsuming one.
	if err := r.respCache.CopyFrom(batch, merge.SubsumedRaftID); err != nil {
		return util.Errorf("unable to copy response cache from subsumed range: %s", err)
	}

	// Compute stats for updated range.
	now := r.rm.Clock().Timestamp()
	iter := newRangeDataIterator(&merge.UpdatedDesc, batch)
	ms, err := engine.MVCCComputeStats(iter, now.WallTime)
	iter.Close()
	if err != nil {
		return util.Errorf("unable to compute stats for the range after merge: %s", err)
	}
	if err = r.stats.SetMVCCStats(batch, ms); err != nil {
		return util.Errorf("unable to write MVCC stats: %s", err)
	}

	// Clear the timestamp cache. In the case that this replica and the
	// subsumed replica each held their respective leader leases, we
	// could merge the timestamp caches for efficiency. But it's unlikely
	// and not worth the extra logic and potential for error.
	r.Lock()
	r.tsCache.Clear(r.rm.Clock())
	r.Unlock()

	batch.Defer(func() {
		if err := r.rm.MergeRange(r, merge.UpdatedDesc.EndKey, merge.SubsumedRaftID); err != nil {
			// Our in-memory state has diverged from the on-disk state.
			log.Fatalf("failed to update store after merging range: %s", err)
		}
	})
	return nil
}
Example No. 16
// CopyFrom copies all the cached results from another response cache
// into this one. Note that the cache will not be locked while copying
// is in progress. Failures decoding individual cache entries return an
// error. The copy is done directly using the engine instead of interpreting
// values through MVCC for efficiency.
func (rc *ResponseCache) CopyFrom(e engine.Engine, originRaftID int64) error {
	prefix := engine.ResponseCacheKey(originRaftID, nil) // response cache prefix
	start := engine.MVCCEncodeKey(prefix)
	end := engine.MVCCEncodeKey(prefix.PrefixEnd())

	return e.Iterate(start, end, func(kv proto.RawKeyValue) (bool, error) {
		// Decode the key into a command ID; a decoding failure aborts the
		// copy. Otherwise, write the entry to the corresponding key in the
		// new cache.
		cmdID, err := rc.decodeResponseCacheKey(kv.Key)
		if err != nil {
			return false, util.Errorf("could not decode a response cache key %q: %s", kv.Key, err)
		}
		encKey := engine.MVCCEncodeKey(engine.ResponseCacheKey(rc.raftID, &cmdID))
		return false, rc.engine.Put(encKey, kv.Value)
	})
}
Example No. 17
func verifyCleanup(key proto.Key, coord *TxnCoordSender, eng engine.Engine, t *testing.T) {
	if len(coord.txns) != 0 {
		t.Errorf("expected empty transactions map; got %d", len(coord.txns))
	}

	if err := util.IsTrueWithin(func() bool {
		meta := &engine.MVCCMetadata{}
		ok, _, _, err := eng.GetProto(engine.MVCCEncodeKey(key), meta)
		if err != nil {
			t.Errorf("error getting MVCC metadata: %s", err)
		}
		return !ok || meta.Txn == nil
	}, 500*time.Millisecond); err != nil {
		t.Errorf("expected intents to be cleaned up within 500ms")
	}
}
Example No. 18
func copySeqCache(
	e engine.Engine,
	ms *engine.MVCCStats,
	srcID, dstID roachpb.RangeID,
	keyMin, keyMax engine.MVCCKey,
) (int, error) {
	var scratch [64]byte
	var count int
	var meta engine.MVCCMetadata
	// TODO(spencer): look into making this an MVCCIteration and writing
	// the values using MVCC so we can avoid the ugliness of updating
	// the MVCCStats by hand below.
	err := e.Iterate(keyMin, keyMax,
		func(kv engine.MVCCKeyValue) (bool, error) {
			// Decode the key; a decoding failure aborts the copy. Otherwise,
			// write the entry to the corresponding key in the new cache.
			txnID, err := decodeAbortCacheMVCCKey(kv.Key, scratch[:0])
			if err != nil {
				return false, util.Errorf("could not decode an abort cache key %s: %s", kv.Key, err)
			}
			key := keys.AbortCacheKey(dstID, txnID)
			encKey := engine.MakeMVCCMetadataKey(key)
			// Decode the MVCCMetadata value.
			if err := proto.Unmarshal(kv.Value, &meta); err != nil {
				return false, util.Errorf("could not decode mvcc metadata %s [% x]: %s", kv.Key, kv.Value, err)
			}
			value := meta.Value()
			value.ClearChecksum()
			value.InitChecksum(key)
			meta.RawBytes = value.RawBytes

			keyBytes, valBytes, err := engine.PutProto(e, encKey, &meta)
			if err != nil {
				return false, err
			}
			count++
			if ms != nil {
				ms.SysBytes += keyBytes + valBytes
				ms.SysCount++
			}
			return false, nil
		})
	return count, err
}
Example No. 19
func verifyCleanup(key proto.Key, coord *TxnCoordSender, eng engine.Engine, t *testing.T) {
	util.SucceedsWithin(t, 500*time.Millisecond, func() error {
		coord.Lock()
		l := len(coord.txns)
		coord.Unlock()
		if l != 0 {
			return fmt.Errorf("expected empty transactions map; got %d", l)
		}
		meta := &engine.MVCCMetadata{}
		ok, _, _, err := eng.GetProto(engine.MVCCEncodeKey(key), meta)
		if err != nil {
			return fmt.Errorf("error getting MVCC metadata: %s", err)
		}
		if !ok || meta.Txn == nil {
			return nil
		}
		return errors.New("intents not cleaned up")
	})
}
Example No. 20
func verifyCleanup(key roachpb.Key, coord *TxnCoordSender, eng engine.Engine, t *testing.T) {
	util.SucceedsWithin(t, 500*time.Millisecond, func() error {
		coord.Lock()
		l := len(coord.txns)
		coord.Unlock()
		if l != 0 {
			return fmt.Errorf("expected empty transactions map; got %d", l)
		}
		meta := &engine.MVCCMetadata{}
		ok, _, _, err := eng.GetProto(engine.MakeMVCCMetadataKey(key), meta)
		if err != nil {
			return fmt.Errorf("error getting MVCC metadata: %s", err)
		}
		if ok && meta.Txn != nil {
			return fmt.Errorf("found unexpected write intent: %s", meta)
		}
		return nil
	})
}
Example No. 21
// applySnapshot updates the replica based on the given snapshot.
// Returns the new last index.
func (r *Replica) applySnapshot(batch engine.Engine, snap raftpb.Snapshot) (uint64, error) {
	snapData := roachpb.RaftSnapshotData{}
	err := proto.Unmarshal(snap.Data, &snapData)
	if err != nil {
		return 0, err
	}

	rangeID := r.RangeID

	// First, save the HardState. The HardState must not be changed
	// because it may record a previous vote cast by this node. This is
	// usually unnecessary because a snapshot is nearly always
	// accompanied by a new HardState which incorporates both our former
	// state and new information from the leader, but in the event that
	// the HardState has not changed, we want to use our own previous
	// HardState and not one that was transmitted via the snapshot.
	hardStateKey := keys.RaftHardStateKey(rangeID)
	hardState, _, err := engine.MVCCGet(batch, hardStateKey, roachpb.ZeroTimestamp, true /* consistent */, nil)
	if err != nil {
		return 0, err
	}

	// Extract the updated range descriptor.
	desc := snapData.RangeDescriptor

	// Delete everything in the range and recreate it from the snapshot.
	// We need to delete any old Raft log entries here because any log entries
	// that predate the snapshot will be orphaned and never truncated or GC'd.
	iter := newReplicaDataIterator(&desc, batch, false /* !replicatedOnly */)
	defer iter.Close()
	for ; iter.Valid(); iter.Next() {
		if err := batch.Clear(iter.Key()); err != nil {
			return 0, err
		}
	}

	// Determine the unreplicated key prefix so we can drop any
	// unreplicated keys from the snapshot.
	unreplicatedPrefix := keys.MakeRangeIDUnreplicatedPrefix(desc.RangeID)

	// Write the snapshot into the range.
	for _, kv := range snapData.KV {
		if bytes.HasPrefix(kv.Key, unreplicatedPrefix) {
			continue
		}
		mvccKey := engine.MVCCKey{
			Key:       kv.Key,
			Timestamp: kv.Timestamp,
		}
		if err := batch.Put(mvccKey, kv.Value); err != nil {
			return 0, err
		}
	}

	// Write the snapshot's Raft log into the range.
	if _, err := r.append(batch, 0, snapData.LogEntries); err != nil {
		return 0, err
	}

	// Restore the saved HardState.
	if hardState == nil {
		err := engine.MVCCDelete(batch, nil, hardStateKey, roachpb.ZeroTimestamp, nil)
		if err != nil {
			return 0, err
		}
	} else {
		err := engine.MVCCPut(batch, nil, hardStateKey, roachpb.ZeroTimestamp, *hardState, nil)
		if err != nil {
			return 0, err
		}
	}

	// Read the leader lease.
	lease, err := loadLeaderLease(batch, desc.RangeID)
	if err != nil {
		return 0, err
	}

	// Load updated range stats. The local newStats variable will be assigned
	// to r.stats after the batch commits.
	newStats, err := newRangeStats(desc.RangeID, batch)
	if err != nil {
		return 0, err
	}

	// The next line sets the persisted last index to the last applied index.
	// This is not a correctness issue, but means that we may have just
	// transferred some entries we're about to re-request from the leader and
	// overwrite.
	// However, raft.MultiNode currently expects this behaviour, and the
	// performance implications are not likely to be drastic. If our feelings
	// about this ever change, we can add a LastIndex field to
	// raftpb.SnapshotMetadata.
	if err := setLastIndex(batch, rangeID, snap.Metadata.Index); err != nil {
		return 0, err
	}

	batch.Defer(func() {
		// Update the range stats.
		r.stats.Replace(newStats)

		r.mu.Lock()
		// As outlined above, last and applied index are the same after applying
		// the snapshot.
		r.mu.appliedIndex = snap.Metadata.Index
		r.mu.leaderLease = lease
		r.mu.Unlock()

		// Update other fields which are uninitialized or need updating.
		// This may not happen if the system config has not yet been loaded.
		// While config update will correctly set the fields, there is no order
		// guarantee in ApplySnapshot.
		// TODO: should go through the standard store lock when adding a replica.
		if err := r.updateRangeInfo(&desc); err != nil {
			panic(err)
		}

		// Update the range descriptor. This is done last as this is the step that
		// makes the Replica visible in the Store.
		if err := r.setDesc(&desc); err != nil {
			panic(err)
		}
	})
	return snap.Metadata.Index, nil
}
Example No. 22
// splitTrigger is called on a successful commit of an AdminSplit
// transaction. It copies the response cache for the new range and
// recomputes stats for both the existing, updated range and the new
// range.
func (r *Range) splitTrigger(batch engine.Engine, split *proto.SplitTrigger) error {
	if !bytes.Equal(r.Desc().StartKey, split.UpdatedDesc.StartKey) ||
		!bytes.Equal(r.Desc().EndKey, split.NewDesc.EndKey) {
		return util.Errorf("range does not match splits: (%s-%s) + (%s-%s) != %s",
			split.UpdatedDesc.StartKey, split.UpdatedDesc.EndKey,
			split.NewDesc.StartKey, split.NewDesc.EndKey, r)
	}

	// Copy the GC metadata.
	gcMeta, err := r.GetGCMetadata()
	if err != nil {
		return util.Errorf("unable to fetch GC metadata: %s", err)
	}
	if err := engine.MVCCPutProto(batch, nil, keys.RangeGCMetadataKey(split.NewDesc.RaftID), proto.ZeroTimestamp, nil, gcMeta); err != nil {
		return util.Errorf("unable to copy GC metadata: %s", err)
	}

	// Copy the last verification timestamp.
	verifyTS, err := r.GetLastVerificationTimestamp()
	if err != nil {
		return util.Errorf("unable to fetch last verification timestamp: %s", err)
	}
	if err := engine.MVCCPutProto(batch, nil, keys.RangeLastVerificationTimestampKey(split.NewDesc.RaftID), proto.ZeroTimestamp, nil, &verifyTS); err != nil {
		return util.Errorf("unable to copy last verification timestamp: %s", err)
	}

	// Compute stats for updated range.
	now := r.rm.Clock().Timestamp()
	iter := newRangeDataIterator(&split.UpdatedDesc, batch)
	ms, err := engine.MVCCComputeStats(iter, now.WallTime)
	iter.Close()
	if err != nil {
		return util.Errorf("unable to compute stats for updated range after split: %s", err)
	}
	if err := r.stats.SetMVCCStats(batch, ms); err != nil {
		return util.Errorf("unable to write MVCC stats: %s", err)
	}

	// Initialize the new range's response cache by copying the original's.
	if err = r.respCache.CopyInto(batch, split.NewDesc.RaftID); err != nil {
		return util.Errorf("unable to copy response cache to new split range: %s", err)
	}

	// Add the new split range to the store. This step atomically
	// updates the EndKey of the updated range and also adds the
	// new range to the store's range map.
	newRng, err := NewRange(&split.NewDesc, r.rm)
	if err != nil {
		return err
	}

	// Compute stats for new range.
	iter = newRangeDataIterator(&split.NewDesc, batch)
	ms, err = engine.MVCCComputeStats(iter, now.WallTime)
	iter.Close()
	if err != nil {
		return util.Errorf("unable to compute stats for new range after split: %s", err)
	}
	if err = newRng.stats.SetMVCCStats(batch, ms); err != nil {
		return util.Errorf("unable to write MVCC stats: %s", err)
	}

	// Copy the timestamp cache into the new range.
	r.Lock()
	r.tsCache.MergeInto(newRng.tsCache, true /* clear */)
	r.Unlock()

	batch.Defer(func() {
		if err := r.rm.SplitRange(r, newRng); err != nil {
			// Our in-memory state has diverged from the on-disk state.
			log.Fatalf("failed to update Store after split: %s", err)
		}
	})

	return nil
}