Example 1
// applySnapshot updates the replica based on the given snapshot.
// Returns the new last index.
func (r *Replica) applySnapshot(batch engine.Engine, snap raftpb.Snapshot) (uint64, error) {
	snapData := roachpb.RaftSnapshotData{}
	err := proto.Unmarshal(snap.Data, &snapData)
	if err != nil {
		return 0, err
	}

	rangeID := r.RangeID

	// First, save the HardState. The HardState must not be changed
	// because it may record a previous vote cast by this node. This is
	// usually unnecessary because a snapshot is nearly always
	// accompanied by a new HardState which incorporates both our former
	// state and new information from the leader, but in the event that
	// the HardState has not changed, we want to use our own previous
	// HardState and not one that was transmitted via the snapshot.
	hardStateKey := keys.RaftHardStateKey(rangeID)
	hardState, _, err := engine.MVCCGet(batch, hardStateKey, roachpb.ZeroTimestamp, true /* consistent */, nil)
	if err != nil {
		return 0, err
	}

	// Extract the updated range descriptor.
	desc := snapData.RangeDescriptor

	// Delete everything in the range and recreate it from the snapshot.
	// We need to delete any old Raft log entries here because any log entries
	// that predate the snapshot will be orphaned and never truncated or GC'd.
	iter := newReplicaDataIterator(&desc, batch, false /* !replicatedOnly */)
	defer iter.Close()
	for ; iter.Valid(); iter.Next() {
		if err := batch.Clear(iter.Key()); err != nil {
			return 0, err
		}
	}

	// Determine the unreplicated key prefix so we can drop any
	// unreplicated keys from the snapshot.
	unreplicatedPrefix := keys.MakeRangeIDUnreplicatedPrefix(desc.RangeID)

	// Write the snapshot into the range.
	for _, kv := range snapData.KV {
		if bytes.HasPrefix(kv.Key, unreplicatedPrefix) {
			continue
		}
		mvccKey := engine.MVCCKey{
			Key:       kv.Key,
			Timestamp: kv.Timestamp,
		}
		if err := batch.Put(mvccKey, kv.Value); err != nil {
			return 0, err
		}
	}

	// Write the snapshot's Raft log into the range.
	if _, err := r.append(batch, 0, snapData.LogEntries); err != nil {
		return 0, err
	}

	// Restore the saved HardState.
	if hardState == nil {
		err := engine.MVCCDelete(batch, nil, hardStateKey, roachpb.ZeroTimestamp, nil)
		if err != nil {
			return 0, err
		}
	} else {
		err := engine.MVCCPut(batch, nil, hardStateKey, roachpb.ZeroTimestamp, *hardState, nil)
		if err != nil {
			return 0, err
		}
	}

	// Read the leader lease.
	lease, err := loadLeaderLease(batch, desc.RangeID)
	if err != nil {
		return 0, err
	}

	// Load updated range stats. The local newStats variable will be assigned
	// to r.stats after the batch commits.
	newStats, err := newRangeStats(desc.RangeID, batch)
	if err != nil {
		return 0, err
	}

	// The next line sets the persisted last index to the last applied index.
	// This is not a correctness issue, but means that we may have just
	// transferred some entries we're about to re-request from the leader and
	// overwrite.
	// However, raft.MultiNode currently expects this behaviour, and the
	// performance implications are not likely to be drastic. If our feelings
	// about this ever change, we can add a LastIndex field to
	// raftpb.SnapshotMetadata.
	if err := setLastIndex(batch, rangeID, snap.Metadata.Index); err != nil {
		return 0, err
	}

	batch.Defer(func() {
		// Update the range stats.
		r.stats.Replace(newStats)

		r.mu.Lock()
		// As outlined above, last and applied index are the same after applying
		// the snapshot.
		r.mu.appliedIndex = snap.Metadata.Index
		r.mu.leaderLease = lease
		r.mu.Unlock()

		// Update other fields which are uninitialized or need updating.
		// This may not happen if the system config has not yet been loaded.
		// While config update will correctly set the fields, there is no order
		// guarantee in ApplySnapshot.
		// TODO: should go through the standard store lock when adding a replica.
		if err := r.updateRangeInfo(&desc); err != nil {
			panic(err)
		}

		// Update the range descriptor. This is done last as this is the step that
		// makes the Replica visible in the Store.
		if err := r.setDesc(&desc); err != nil {
			panic(err)
		}
	})
	return snap.Metadata.Index, nil
}
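
This example, and every one that follows, shares the same filtering step: any key carrying the range's unreplicated prefix is skipped when the snapshot's KV data is written into the range. Below is a minimal, standard-library-only sketch of just that step; the kvPair type and the literal prefix string are illustrative placeholders, not the real engine.MVCCKey type or CockroachDB key encoding.

package main

import (
	"bytes"
	"fmt"
)

// kvPair is a simplified stand-in for the key/value pairs carried in
// snapData.KV above.
type kvPair struct {
	Key, Value []byte
}

// filterReplicated returns only the pairs whose keys do not start with the
// unreplicated prefix, mirroring the `continue` in the snapshot-write loops.
func filterReplicated(kvs []kvPair, unreplicatedPrefix []byte) []kvPair {
	var out []kvPair
	for _, kv := range kvs {
		if bytes.HasPrefix(kv.Key, unreplicatedPrefix) {
			continue // unreplicated keys are never copied from a snapshot
		}
		out = append(out, kv)
	}
	return out
}

func main() {
	prefix := []byte("/Local/RangeID/1/u/") // placeholder, not the real encoding
	kvs := []kvPair{
		{Key: []byte("/Local/RangeID/1/u/HardState"), Value: []byte("x")},
		{Key: []byte("a"), Value: []byte("1")},
	}
	for _, kv := range filterReplicated(kvs, prefix) {
		fmt.Printf("%s\n", kv.Key) // prints only "a"
	}
}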
Example 2
// applySnapshot updates the replica based on the given snapshot and associated
// HardState (which may be empty, as Raft may apply some snapshots which don't
// require an update to the HardState). All snapshots must pass through Raft
// for correctness, i.e. the parameters to this method must be taken from
// a raft.Ready. It is the caller's responsibility to call
// r.store.processRangeDescriptorUpdate(r) after a successful applySnapshot.
func (r *Replica) applySnapshot(
	ctx context.Context, snap raftpb.Snapshot, hs raftpb.HardState,
) error {
	// We use a separate batch to apply the snapshot since the Replica (and in
	// particular the last index) is updated after the batch commits. Using a
	// separate batch also allows for future optimization (such as using a
	// Distinct() batch).
	batch := r.store.Engine().NewBatch()
	defer batch.Close()

	snapData := roachpb.RaftSnapshotData{}
	err := proto.Unmarshal(snap.Data, &snapData)
	if err != nil {
		return err
	}

	// Extract the updated range descriptor.
	desc := snapData.RangeDescriptor
	// Fill the reservation if there was one for this range, regardless of
	// whether the application succeeded.
	defer r.store.bookie.Fill(desc.RangeID)

	r.mu.Lock()
	replicaID := r.mu.replicaID
	raftLogSize := r.mu.raftLogSize
	r.mu.Unlock()

	isPreemptive := replicaID == 0 // only used for accounting and log format

	replicaIDStr := "[?]"
	snapType := "preemptive"
	if !isPreemptive {
		replicaIDStr = strconv.FormatInt(int64(replicaID), 10)
		snapType = "Raft"
	}

	log.Infof(ctx, "%s: with replicaID %s, applying %s snapshot at index %d "+
		"(encoded size=%d, %d KV pairs, %d log entries)",
		r, replicaIDStr, snapType, snap.Metadata.Index,
		len(snap.Data), len(snapData.KV), len(snapData.LogEntries))
	defer func(start time.Time) {
		log.Infof(ctx, "%s: with replicaID %s, applied %s snapshot in %.3fs",
			r, replicaIDStr, snapType, timeutil.Since(start).Seconds())
	}(timeutil.Now())

	// Delete everything in the range and recreate it from the snapshot.
	// We need to delete any old Raft log entries here because any log entries
	// that predate the snapshot will be orphaned and never truncated or GC'd.
	//
	// The KVs and log entries are all written to distinct keys so we can use a
	// distinct batch. Note that we clear keys here that are potentially
	// overwritten below, which violates the spirit of the distinct batch. This
	// is safe because we don't do any reads until after the distinct batch is
	// closed (the raft log writes are "blind").
	distinctBatch := batch.Distinct()
	iter := NewReplicaDataIterator(&desc, distinctBatch, false /* !replicatedOnly */)
	defer iter.Close()
	for ; iter.Valid(); iter.Next() {
		if err := distinctBatch.Clear(iter.Key()); err != nil {
			return err
		}
	}

	// Determine the unreplicated key prefix so we can drop any
	// unreplicated keys from the snapshot.
	unreplicatedPrefix := keys.MakeRangeIDUnreplicatedPrefix(desc.RangeID)

	// Write the snapshot into the range.
	for _, kv := range snapData.KV {
		if bytes.HasPrefix(kv.Key, unreplicatedPrefix) {
			continue
		}
		mvccKey := engine.MVCCKey{
			Key:       kv.Key,
			Timestamp: kv.Timestamp,
		}
		if err := distinctBatch.Put(mvccKey, kv.Value); err != nil {
			return err
		}
	}

	logEntries := make([]raftpb.Entry, len(snapData.LogEntries))
	for i, bytes := range snapData.LogEntries {
		if err := logEntries[i].Unmarshal(bytes); err != nil {
			return err
		}
	}

	// Write the snapshot's Raft log into the range.
	_, raftLogSize, err = r.append(ctx, distinctBatch, 0, raftLogSize, logEntries)
	if err != nil {
		return err
	}

	if !raft.IsEmptyHardState(hs) {
		if err := setHardState(ctx, distinctBatch, r.RangeID, hs); err != nil {
			return errors.Wrapf(err, "unable to persist HardState %+v", &hs)
		}
	} else {
		// Note that we don't require that Raft supply us with a nonempty
		// HardState on a snapshot. We don't want to make that assumption
		// because it's not guaranteed by the contract. Raft *must* send us
		// a HardState when it increases the committed index as a result of the
		// snapshot, but who is to say it isn't going to accept a snapshot
		// which is identical to the current state?
	}

	// We need to close the distinct batch and start using the normal batch for
	// the read below.
	distinctBatch.Close()

	s, err := loadState(ctx, batch, &desc)
	if err != nil {
		return err
	}

	if s.Desc.RangeID != r.RangeID {
		log.Fatalf(ctx, "%s: unexpected range ID %d", r, s.Desc.RangeID)
	}

	// As outlined above, last and applied index are the same after applying
	// the snapshot (i.e. the snapshot has no uncommitted tail).
	if s.RaftAppliedIndex != snap.Metadata.Index {
		log.Fatalf(ctx, "%s: with state loaded from %d: snapshot resulted in appliedIndex=%d, metadataIndex=%d",
			r, s.Desc.RangeID, s.RaftAppliedIndex, snap.Metadata.Index)
	}

	if err := batch.Commit(); err != nil {
		return err
	}

	r.mu.Lock()
	// We set the persisted last index to the last applied index. This is
	// not a correctness issue, but means that we may have just transferred
	// some entries we're about to re-request from the leader and overwrite.
	// However, raft.MultiNode currently expects this behaviour, and the
	// performance implications are not likely to be drastic. If our
	// feelings about this ever change, we can add a LastIndex field to
	// raftpb.SnapshotMetadata.
	r.mu.lastIndex = s.RaftAppliedIndex
	r.mu.raftLogSize = raftLogSize
	// Update the range and store stats.
	r.store.metrics.subtractMVCCStats(r.mu.state.Stats)
	r.store.metrics.addMVCCStats(s.Stats)
	r.mu.state = s
	r.assertStateLocked(r.store.Engine())
	r.mu.Unlock()

	// As the last deferred action after committing the batch, update other
	// fields which are uninitialized or need updating. This may not happen
	// if the system config has not yet been loaded. While config update
	// will correctly set the fields, there is no order guarantee in
	// ApplySnapshot.
	// TODO: should go through the standard store lock when adding a replica.
	if err := r.updateRangeInfo(&desc); err != nil {
		panic(err)
	}

	r.setDescWithoutProcessUpdate(&desc)

	if !isPreemptive {
		r.store.metrics.RangeSnapshotsNormalApplied.Inc(1)
	} else {
		r.store.metrics.RangeSnapshotsPreemptiveApplied.Inc(1)
	}
	return nil
}
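
This and the later examples carry the snapshot's Raft log as individually marshaled byte slices and decode them into raftpb.Entry values before handing them to r.append. The following is a minimal sketch of that decoding step, assuming the etcd raft protobuf package; the import path shown matches the etcd v3.5 layout, while older CockroachDB revisions vendor github.com/coreos/etcd/raft/raftpb instead.

package snapshot

import "go.etcd.io/etcd/raft/v3/raftpb"

// decodeLogEntries unmarshals the individually marshaled entries carried in
// RaftSnapshotData.LogEntries, as the loop over snapData.LogEntries does
// before the entries are appended to the Raft log.
func decodeLogEntries(raw [][]byte) ([]raftpb.Entry, error) {
	entries := make([]raftpb.Entry, len(raw))
	for i, b := range raw {
		if err := entries[i].Unmarshal(b); err != nil {
			return nil, err
		}
	}
	return entries, nil
}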
Example 3
// applySnapshot updates the replica based on the given snapshot.
// Returns the new last index.
func (r *Replica) applySnapshot(snap raftpb.Snapshot, typ snapshotType) (uint64, error) {
	// We use a separate batch to apply the snapshot since the Replica (and in
	// particular the last index) is updated after the batch commits. Using a
	// separate batch also allows for future optimization (such as using a
	// Distinct() batch).
	batch := r.store.Engine().NewBatch()
	defer batch.Close()

	snapData := roachpb.RaftSnapshotData{}
	err := proto.Unmarshal(snap.Data, &snapData)
	if err != nil {
		return 0, err
	}

	// Extract the updated range descriptor.
	desc := snapData.RangeDescriptor

	r.mu.Lock()
	replicaID := r.mu.replicaID
	raftLogSize := r.mu.raftLogSize
	r.mu.Unlock()

	log.Infof("replica %d received snapshot for range %d at index %d. "+
		"encoded size=%d, %d KV pairs, %d log entries",
		replicaID, desc.RangeID, snap.Metadata.Index,
		len(snap.Data), len(snapData.KV), len(snapData.LogEntries))
	defer func(start time.Time) {
		log.Infof("replica %d applied snapshot for range %d in %s",
			replicaID, desc.RangeID, timeutil.Since(start))
	}(timeutil.Now())

	// Delete everything in the range and recreate it from the snapshot.
	// We need to delete any old Raft log entries here because any log entries
	// that predate the snapshot will be orphaned and never truncated or GC'd.
	iter := NewReplicaDataIterator(&desc, batch, false /* !replicatedOnly */)
	defer iter.Close()
	for ; iter.Valid(); iter.Next() {
		if err := batch.Clear(iter.Key()); err != nil {
			return 0, err
		}
	}

	// Determine the unreplicated key prefix so we can drop any
	// unreplicated keys from the snapshot.
	unreplicatedPrefix := keys.MakeRangeIDUnreplicatedPrefix(desc.RangeID)

	// Write the snapshot into the range.
	for _, kv := range snapData.KV {
		if bytes.HasPrefix(kv.Key, unreplicatedPrefix) {
			continue
		}
		mvccKey := engine.MVCCKey{
			Key:       kv.Key,
			Timestamp: kv.Timestamp,
		}
		if err := batch.Put(mvccKey, kv.Value); err != nil {
			return 0, err
		}
	}

	logEntries := make([]raftpb.Entry, len(snapData.LogEntries))
	for i, bytes := range snapData.LogEntries {
		if err := logEntries[i].Unmarshal(bytes); err != nil {
			return 0, err
		}
	}

	// Write the snapshot's Raft log into the range.
	_, raftLogSize, err = r.append(batch, 0, raftLogSize, logEntries)
	if err != nil {
		return 0, err
	}

	s, err := loadState(batch, &desc)
	if err != nil {
		return 0, err
	}

	// As outlined above, last and applied index are the same after applying
	// the snapshot.
	if s.RaftAppliedIndex != snap.Metadata.Index {
		log.Fatalf("%d: snapshot resulted in appliedIndex=%d, metadataIndex=%d",
			s.Desc.RangeID, s.RaftAppliedIndex, snap.Metadata.Index)
	}

	if replicaID == 0 {
		// The replica is not part of the Raft group so we need to write the Raft
		// hard state for the replica in order for the Raft state machine to start
		// correctly.
		if err := updateHardState(batch, s); err != nil {
			return 0, err
		}
	}

	if err := batch.Commit(); err != nil {
		return 0, err
	}

	r.mu.Lock()
	// We set the persisted last index to the last applied index. This is
	// not a correctness issue, but means that we may have just transferred
	// some entries we're about to re-request from the leader and overwrite.
	// However, raft.MultiNode currently expects this behaviour, and the
	// performance implications are not likely to be drastic. If our
	// feelings about this ever change, we can add a LastIndex field to
	// raftpb.SnapshotMetadata.
	r.mu.lastIndex = s.RaftAppliedIndex
	r.mu.raftLogSize = raftLogSize
	// Update the range and store stats.
	r.store.metrics.subtractMVCCStats(r.mu.state.Stats)
	r.store.metrics.addMVCCStats(s.Stats)
	r.mu.state = s
	lastIndex := r.mu.lastIndex
	r.assertStateLocked(r.store.Engine())
	r.mu.Unlock()

	// Update other fields which are uninitialized or need updating.
	// This may not happen if the system config has not yet been loaded.
	// While config update will correctly set the fields, there is no order
	// guarantee in ApplySnapshot.
	// TODO: should go through the standard store lock when adding a replica.
	if err := r.updateRangeInfo(&desc); err != nil {
		panic(err)
	}

	// Fill the reservation if there was one for this range.
	r.store.bookie.Fill(desc.RangeID)

	// Update the range descriptor. This is done last as this is the step that
	// makes the Replica visible in the Store.
	if err := r.setDesc(&desc); err != nil {
		panic(err)
	}

	switch typ {
	case normalSnapshot:
		r.store.metrics.rangeSnapshotsNormalApplied.Inc(1)
	case preemptiveSnapshot:
		r.store.metrics.rangeSnapshotsPreemptiveApplied.Inc(1)
	default:
		panic("not reached")
	}
	return lastIndex, nil
}
Example 4
// TestReplicaDataIterator creates three ranges {"a"-"b" (pre), "b"-"c"
// (main test range), "c"-"d" (post)} and fills each with data. It
// first verifies the contents of the "b"-"c" range. Next, it makes sure
// a replicated-only iterator does not show any unreplicated keys from
// the range. Then, it deletes the range and verifies it's empty. Finally,
// it verifies the pre and post ranges still contain the expected data.
func TestReplicaDataIterator(t *testing.T) {
	defer leaktest.AfterTest(t)()
	tc := testContext{
		bootstrapMode: bootstrapRangeOnly,
	}
	tc.Start(t)
	defer tc.Stop()

	// See notes in EmptyRange test method for adjustment to descriptor.
	newDesc := *tc.rng.Desc()
	newDesc.StartKey = roachpb.RKey("b")
	newDesc.EndKey = roachpb.RKey("c")
	if err := tc.rng.setDesc(&newDesc); err != nil {
		t.Fatal(err)
	}
	// Create two more ranges, one before the test range and one after.
	preRng := createRange(tc.store, 2, roachpb.RKeyMin, roachpb.RKey("b"))
	if err := tc.store.AddReplicaTest(preRng); err != nil {
		t.Fatal(err)
	}
	postRng := createRange(tc.store, 3, roachpb.RKey("c"), roachpb.RKeyMax)
	if err := tc.store.AddReplicaTest(postRng); err != nil {
		t.Fatal(err)
	}

	// Create range data for all three ranges.
	preKeys := createRangeData(t, preRng)
	curKeys := createRangeData(t, tc.rng)
	postKeys := createRangeData(t, postRng)

	// Verify the contents of the "b"-"c" range.
	iter := NewReplicaDataIterator(tc.rng.Desc(), tc.rng.store.Engine(), false /* !replicatedOnly */)
	defer iter.Close()
	i := 0
	for ; iter.Valid(); iter.Next() {
		if err := iter.Error(); err != nil {
			t.Fatal(err)
		}
		if i >= len(curKeys) {
			t.Fatal("there are more keys in the iteration than expected")
		}
		if key := iter.Key(); !key.Equal(curKeys[i]) {
			k1, ts1 := key.Key, key.Timestamp
			k2, ts2 := curKeys[i].Key, curKeys[i].Timestamp
			t.Errorf("%d: expected %q(%d); got %q(%d)", i, k2, ts2, k1, ts1)
		}
		i++
	}
	if i != len(curKeys) {
		t.Fatal("there are fewer keys in the iteration than expected")
	}

	// Verify that the replicated-only iterator ignores unreplicated keys.
	unreplicatedPrefix := keys.MakeRangeIDUnreplicatedPrefix(tc.rng.RangeID)
	iter = NewReplicaDataIterator(tc.rng.Desc(), tc.rng.store.Engine(), true /* replicatedOnly */)
	defer iter.Close()
	for ; iter.Valid(); iter.Next() {
		if err := iter.Error(); err != nil {
			t.Fatal(err)
		}
		if bytes.HasPrefix(iter.Key().Key, unreplicatedPrefix) {
			t.Errorf("unexpected unreplicated key %s in replicated-only iterator", iter.Key().Key)
		}
	}

	// Destroy range and verify that its data has been completely cleared.
	if err := tc.rng.Destroy(*tc.rng.Desc()); err != nil {
		t.Fatal(err)
	}
	iter = NewReplicaDataIterator(tc.rng.Desc(), tc.rng.store.Engine(), false /* !replicatedOnly */)
	defer iter.Close()
	if iter.Valid() {
		// If the range is destroyed, only a tombstone key should be there.
		k1 := iter.Key().Key
		if tombstoneKey := keys.RaftTombstoneKey(tc.rng.RangeID); !bytes.Equal(k1, tombstoneKey) {
			t.Errorf("expected a tombstone key %q, but found %q", tombstoneKey, k1)
		}

		if iter.Next(); iter.Valid() {
			t.Errorf("expected a destroyed replica to have only a tombstone key, but found more")
		}
	} else {
		t.Errorf("expected a tombstone key, but got an empty iteration")
	}

	// Verify the keys in pre & post ranges.
	for j, test := range []struct {
		r    *Replica
		keys []engine.MVCCKey
	}{
		{preRng, preKeys},
		{postRng, postKeys},
	} {
		iter = NewReplicaDataIterator(test.r.Desc(), test.r.store.Engine(), false /* !replicatedOnly */)
		defer iter.Close()
		i = 0
		for ; iter.Valid(); iter.Next() {
			k1, ts1 := iter.Key().Key, iter.Key().Timestamp
			if bytes.HasPrefix(k1, keys.StatusPrefix) {
				// Some data is written into the system prefix by Store.BootstrapRange,
				// but it is not in our expected key list so skip it.
				// TODO(bdarnell): validate this data instead of skipping it.
				continue
			}
			if key := iter.Key(); !key.Equal(test.keys[i]) {
				k2, ts2 := test.keys[i].Key, test.keys[i].Timestamp
				t.Errorf("%d/%d: key mismatch %q(%d) != %q(%d) [%x]", j, i, k1, ts1, k2, ts2, []byte(k2))
			}
			i++
		}
		if i != len(test.keys) {
			t.Fatal("there are fewer keys in the iteration than expected")
		}
	}
}
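
The first and last verification loops in this test follow the same pattern: walk the iterator, compare each key against an expected slice, and fail if either side runs long or short. A hedged sketch of that pattern is shown below, with hypothetical mvccKey and keyIterator types standing in for engine.MVCCKey and ReplicaDataIterator.

package replicadatatest

import (
	"bytes"
	"fmt"
)

// mvccKey and keyIterator are illustrative stand-ins for the real types used
// by the test above.
type mvccKey struct {
	Key []byte
}

type keyIterator interface {
	Valid() bool
	Next()
	Key() mvccKey
	Error() error
}

// verifyKeys walks iter and checks that it yields exactly the expected keys,
// in order, failing on extra, missing, or mismatched keys.
func verifyKeys(iter keyIterator, expected []mvccKey) error {
	i := 0
	for ; iter.Valid(); iter.Next() {
		if err := iter.Error(); err != nil {
			return err
		}
		if i >= len(expected) {
			return fmt.Errorf("more keys in the iteration than expected")
		}
		if !bytes.Equal(iter.Key().Key, expected[i].Key) {
			return fmt.Errorf("%d: expected %q; got %q", i, expected[i].Key, iter.Key().Key)
		}
		i++
	}
	if i != len(expected) {
		return fmt.Errorf("fewer keys in the iteration than expected: %d vs %d", i, len(expected))
	}
	return nil
}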
Example 5
// applySnapshot updates the replica based on the given snapshot and associated
// HardState. The supplied HardState must be empty if a preemptive snapshot is
// being applied (which is the case if and only if the ReplicaID is zero), in
// which case it will be synthesized from any existing on-disk HardState
// appropriately. For a regular snapshot, a HardState may or may not be
// supplied, though in the common case it is (since the commit index changes as
// a result of the snapshot application, so Raft will supply us with one).
// The HardState, if altered or supplied, is persisted along with the applied
// snapshot and the new last index is returned.
//
// During preemptive snapshots, we (must) run additional safety checks. For
// example, the HardState (Raft's record of term, vote, and committed index)
// and other Raft state (such as acknowledged log entries) must not move
// backwards.
func (r *Replica) applySnapshot(
	snap raftpb.Snapshot, hs raftpb.HardState,
) (uint64, error) {
	// We use a separate batch to apply the snapshot since the Replica (and in
	// particular the last index) is updated after the batch commits. Using a
	// separate batch also allows for future optimization (such as using a
	// Distinct() batch).
	batch := r.store.Engine().NewBatch()
	defer batch.Close()

	snapData := roachpb.RaftSnapshotData{}
	err := proto.Unmarshal(snap.Data, &snapData)
	if err != nil {
		return 0, err
	}

	// Extract the updated range descriptor.
	desc := snapData.RangeDescriptor
	// Fill the reservation if there was one for this range, regardless of
	// whether the application succeeded.
	defer r.store.bookie.Fill(desc.RangeID)

	r.mu.Lock()
	replicaID := r.mu.replicaID
	raftLogSize := r.mu.raftLogSize
	r.mu.Unlock()

	isPreemptive := replicaID == 0

	replicaIDStr := "[?]"
	snapType := "preemptive"
	if !isPreemptive {
		replicaIDStr = strconv.FormatInt(int64(replicaID), 10)
		snapType = "Raft"
	}

	log.Infof("%s: with replicaID %s, applying %s snapshot for range %d at index %d "+
		"(encoded size=%d, %d KV pairs, %d log entries)",
		r, replicaIDStr, snapType, desc.RangeID, snap.Metadata.Index,
		len(snap.Data), len(snapData.KV), len(snapData.LogEntries))
	defer func(start time.Time) {
		log.Infof("%s: with replicaID %s, applied %s snapshot for range %d in %s",
			r, replicaIDStr, snapType, desc.RangeID, timeutil.Since(start))
	}(timeutil.Now())

	// Remember the old last index to verify that the snapshot doesn't wipe out
	// log entries which have been acknowledged, which is possible with
	// preemptive snapshots. We assert on it later in this call.
	oldLastIndex, err := loadLastIndex(batch, desc.RangeID)
	if err != nil {
		return 0, errors.Wrap(err, "error loading last index")
	}
	// Similar strategy for the HardState.
	oldHardState, err := loadHardState(batch, desc.RangeID)
	if err != nil {
		return 0, errors.Wrap(err, "unable to load HardState")
	}

	// Delete everything in the range and recreate it from the snapshot.
	// We need to delete any old Raft log entries here because any log entries
	// that predate the snapshot will be orphaned and never truncated or GC'd.
	iter := NewReplicaDataIterator(&desc, batch, false /* !replicatedOnly */)
	defer iter.Close()
	for ; iter.Valid(); iter.Next() {
		if err := batch.Clear(iter.Key()); err != nil {
			return 0, err
		}
	}

	// Determine the unreplicated key prefix so we can drop any
	// unreplicated keys from the snapshot.
	unreplicatedPrefix := keys.MakeRangeIDUnreplicatedPrefix(desc.RangeID)

	// Write the snapshot into the range.
	for _, kv := range snapData.KV {
		if bytes.HasPrefix(kv.Key, unreplicatedPrefix) {
			continue
		}
		mvccKey := engine.MVCCKey{
			Key:       kv.Key,
			Timestamp: kv.Timestamp,
		}
		if err := batch.Put(mvccKey, kv.Value); err != nil {
			return 0, err
		}
	}

	logEntries := make([]raftpb.Entry, len(snapData.LogEntries))
	for i, bytes := range snapData.LogEntries {
		if err := logEntries[i].Unmarshal(bytes); err != nil {
			return 0, err
		}
	}

	// Write the snapshot's Raft log into the range.
	_, raftLogSize, err = r.append(batch, 0, raftLogSize, logEntries)
	if err != nil {
		return 0, err
	}

	s, err := loadState(batch, &desc)
	if err != nil {
		return 0, err
	}

	// As outlined above, last and applied index are the same after applying
	// the snapshot (i.e. the snapshot has no uncommitted tail).
	if s.RaftAppliedIndex != snap.Metadata.Index {
		log.Fatalf("%s with state loaded from %d: snapshot resulted in appliedIndex=%d, metadataIndex=%d",
			r, s.Desc.RangeID, s.RaftAppliedIndex, snap.Metadata.Index)
	}

	if !raft.IsEmptyHardState(hs) {
		if isPreemptive {
			return 0, errors.Errorf("unexpected HardState %+v on preemptive snapshot", &hs)
		}
		if err := setHardState(batch, s.Desc.RangeID, hs); err != nil {
			return 0, errors.Wrapf(err, "unable to persist HardState %+v", &hs)
		}
	} else if isPreemptive {
		// Preemptive snapshots get special verifications (see #7619) of their
		// last index and (necessarily synthesized) HardState.
		if snap.Metadata.Index < oldLastIndex {
			// We are not aware of a specific way in which this could happen
			// (Raft itself should not emit such snapshots, and no Replica can
			// ever apply two preemptive snapshots), but it doesn't hurt to
			// check.
			return 0, errors.Errorf("%s: preemptive snapshot would erase acknowledged log entries",
				r)
		}
		if snap.Metadata.Term < oldHardState.Term {
			return 0, errors.Errorf("%s: cannot apply preemptive snapshot from past term %d at term %d",
				r, snap.Metadata.Term, oldHardState.Term)
		}
		if err := synthesizeHardState(batch, s, oldHardState); err != nil {
			return 0, errors.Wrapf(err, "%s: unable to write synthesized HardState", r)
		}
	} else {
		// Note that we don't require that Raft supply us with a nonempty
		// HardState on a snapshot. We don't want to make that assumption
		// because it's not guaranteed by the contract. Raft *must* send us
		// a HardState when it increases the committed index as a result of the
		// snapshot, but who is to say it isn't going to accept a snapshot
		// which is identical to the current state?
	}

	if err := batch.Commit(); err != nil {
		return 0, err
	}

	r.mu.Lock()
	// We set the persisted last index to the last applied index. This is
	// not a correctness issue, but means that we may have just transferred
	// some entries we're about to re-request from the leader and overwrite.
	// However, raft.MultiNode currently expects this behaviour, and the
	// performance implications are not likely to be drastic. If our
	// feelings about this ever change, we can add a LastIndex field to
	// raftpb.SnapshotMetadata.
	r.mu.lastIndex = s.RaftAppliedIndex
	r.mu.raftLogSize = raftLogSize
	// Update the range and store stats.
	r.store.metrics.subtractMVCCStats(r.mu.state.Stats)
	r.store.metrics.addMVCCStats(s.Stats)
	r.mu.state = s
	lastIndex := r.mu.lastIndex
	r.assertStateLocked(r.store.Engine())
	r.mu.Unlock()

	// Update other fields which are uninitialized or need updating.
	// This may not happen if the system config has not yet been loaded.
	// While config update will correctly set the fields, there is no order
	// guarantee in ApplySnapshot.
	// TODO: should go through the standard store lock when adding a replica.
	if err := r.updateRangeInfo(&desc); err != nil {
		panic(err)
	}
	// Update the range descriptor. This is done last as this is the step that
	// makes the Replica visible in the Store.
	if err := r.setDesc(&desc); err != nil {
		panic(err)
	}

	if !isPreemptive {
		r.store.metrics.rangeSnapshotsNormalApplied.Inc(1)
	} else {
		r.store.metrics.rangeSnapshotsPreemptiveApplied.Inc(1)
	}
	return lastIndex, nil
}
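
The preemptive-snapshot guards in this example reduce to two monotonicity checks: the snapshot may not roll back acknowledged log entries, and it may not originate from a term older than the existing on-disk HardState. A standard-library-only sketch of those checks follows; the function and parameter names are illustrative, not part of the original code.

package snapshot

import "fmt"

// checkPreemptive mirrors the two guards above: reject index regression
// (which would erase acknowledged log entries) and term regression (a
// snapshot from a past term).
func checkPreemptive(snapIndex, snapTerm, oldLastIndex, oldHardStateTerm uint64) error {
	if snapIndex < oldLastIndex {
		return fmt.Errorf("preemptive snapshot at index %d would erase acknowledged log entries through %d",
			snapIndex, oldLastIndex)
	}
	if snapTerm < oldHardStateTerm {
		return fmt.Errorf("cannot apply preemptive snapshot from past term %d at term %d",
			snapTerm, oldHardStateTerm)
	}
	return nil
}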
Example 6
// applySnapshot updates the replica based on the given snapshot.
// Returns the new last index.
func (r *Replica) applySnapshot(batch engine.Batch, snap raftpb.Snapshot) (uint64, error) {
	snapData := roachpb.RaftSnapshotData{}
	err := proto.Unmarshal(snap.Data, &snapData)
	if err != nil {
		return 0, err
	}

	rangeID := r.RangeID

	// Extract the updated range descriptor.
	desc := snapData.RangeDescriptor

	r.mu.Lock()
	replicaID := r.mu.replicaID
	r.mu.Unlock()

	log.Infof("replica %d received snapshot for range %d at index %d. encoded size=%d, %d KV pairs, %d log entries",
		replicaID, rangeID, snap.Metadata.Index, len(snap.Data), len(snapData.KV), len(snapData.LogEntries))
	defer func(start time.Time) {
		log.Infof("replica %d applied snapshot for range %d in %s", replicaID, rangeID, timeutil.Since(start))
	}(timeutil.Now())

	// Delete everything in the range and recreate it from the snapshot.
	// We need to delete any old Raft log entries here because any log entries
	// that predate the snapshot will be orphaned and never truncated or GC'd.
	iter := newReplicaDataIterator(&desc, batch, false /* !replicatedOnly */)
	defer iter.Close()
	for ; iter.Valid(); iter.Next() {
		if err := batch.Clear(iter.Key()); err != nil {
			return 0, err
		}
	}

	// Determine the unreplicated key prefix so we can drop any
	// unreplicated keys from the snapshot.
	unreplicatedPrefix := keys.MakeRangeIDUnreplicatedPrefix(desc.RangeID)

	// Write the snapshot into the range.
	for _, kv := range snapData.KV {
		if bytes.HasPrefix(kv.Key, unreplicatedPrefix) {
			continue
		}
		mvccKey := engine.MVCCKey{
			Key:       kv.Key,
			Timestamp: kv.Timestamp,
		}
		if err := batch.Put(mvccKey, kv.Value); err != nil {
			return 0, err
		}
	}

	logEntries := make([]raftpb.Entry, len(snapData.LogEntries))
	for i, bytes := range snapData.LogEntries {
		if err := logEntries[i].Unmarshal(bytes); err != nil {
			return 0, err
		}
	}

	// Write the snapshot's Raft log into the range.
	if _, err := r.append(batch, 0, logEntries); err != nil {
		return 0, err
	}

	// Read the leader lease.
	lease, err := loadLeaderLease(batch, desc.RangeID)
	if err != nil {
		return 0, err
	}

	frozen, err := loadFrozenStatus(batch, desc.RangeID)
	if err != nil {
		return 0, err
	}

	lastThreshold, err := loadGCThreshold(batch, desc.RangeID)
	if err != nil {
		return 0, err
	}

	// Load updated range stats. The local newStats variable will be assigned
	// to r.stats after the batch commits.
	newStats, err := newRangeStats(desc.RangeID, batch)
	if err != nil {
		return 0, err
	}

	// The next line sets the persisted last index to the last applied index.
	// This is not a correctness issue, but means that we may have just
	// transferred some entries we're about to re-request from the leader and
	// overwrite.
	// However, raft.MultiNode currently expects this behaviour, and the
	// performance implications are not likely to be drastic. If our feelings
	// about this ever change, we can add a LastIndex field to
	// raftpb.SnapshotMetadata.
	if err := setLastIndex(batch, rangeID, snap.Metadata.Index); err != nil {
		return 0, err
	}

	batch.Defer(func() {
		// Update the range and store stats.
		r.store.metrics.subtractMVCCStats(r.stats.mvccStats)
		r.stats.Replace(newStats)
		r.store.metrics.addMVCCStats(r.stats.mvccStats)

		r.mu.Lock()
		// As outlined above, last and applied index are the same after applying
		// the snapshot.
		r.mu.appliedIndex = snap.Metadata.Index
		r.mu.leaderLease = lease
		r.mu.frozen = frozen
		r.mu.gcThreshold = lastThreshold
		r.mu.Unlock()

		// Update other fields which are uninitialized or need updating.
		// This may not happen if the system config has not yet been loaded.
		// While config update will correctly set the fields, there is no order
		// guarantee in ApplySnapshot.
		// TODO: should go through the standard store lock when adding a replica.
		if err := r.updateRangeInfo(&desc); err != nil {
			panic(err)
		}

		// Fill the reservation if there was one for this range.
		r.store.bookie.Fill(rangeID)

		// Update the range descriptor. This is done last as this is the step that
		// makes the Replica visible in the Store.
		if err := r.setDesc(&desc); err != nil {
			panic(err)
		}
	})
	return snap.Metadata.Index, nil
}