// applySnapshot updates the replica based on the given snapshot. // Returns the new last index. func (r *Replica) applySnapshot(batch engine.Engine, snap raftpb.Snapshot) (uint64, error) { snapData := roachpb.RaftSnapshotData{} err := proto.Unmarshal(snap.Data, &snapData) if err != nil { return 0, err } rangeID := r.RangeID // First, save the HardState. The HardState must not be changed // because it may record a previous vote cast by this node. This is // usually unnecessary because a snapshot is nearly always // accompanied by a new HardState which incorporates both our former // state and new information from the leader, but in the event that // the HardState has not changed, we want to use our own previous // HardState and not one that was transmitted via the snapshot. hardStateKey := keys.RaftHardStateKey(rangeID) hardState, _, err := engine.MVCCGet(batch, hardStateKey, roachpb.ZeroTimestamp, true /* consistent */, nil) if err != nil { return 0, err } // Extract the updated range descriptor. desc := snapData.RangeDescriptor // Delete everything in the range and recreate it from the snapshot. // We need to delete any old Raft log entries here because any log entries // that predate the snapshot will be orphaned and never truncated or GC'd. iter := newReplicaDataIterator(&desc, batch, false /* !replicatedOnly */) defer iter.Close() for ; iter.Valid(); iter.Next() { if err := batch.Clear(iter.Key()); err != nil { return 0, err } } // Determine the unreplicated key prefix so we can drop any // unreplicated keys from the snapshot. unreplicatedPrefix := keys.MakeRangeIDUnreplicatedPrefix(desc.RangeID) // Write the snapshot into the range. for _, kv := range snapData.KV { if bytes.HasPrefix(kv.Key, unreplicatedPrefix) { continue } mvccKey := engine.MVCCKey{ Key: kv.Key, Timestamp: kv.Timestamp, } if err := batch.Put(mvccKey, kv.Value); err != nil { return 0, err } } // Write the snapshot's Raft log into the range. if _, err := r.append(batch, 0, snapData.LogEntries); err != nil { return 0, err } // Restore the saved HardState. if hardState == nil { err := engine.MVCCDelete(batch, nil, hardStateKey, roachpb.ZeroTimestamp, nil) if err != nil { return 0, err } } else { err := engine.MVCCPut(batch, nil, hardStateKey, roachpb.ZeroTimestamp, *hardState, nil) if err != nil { return 0, err } } // Read the leader lease. lease, err := loadLeaderLease(batch, desc.RangeID) if err != nil { return 0, err } // Load updated range stats. The local newStats variable will be assigned // to r.stats after the batch commits. newStats, err := newRangeStats(desc.RangeID, batch) if err != nil { return 0, err } // The next line sets the persisted last index to the last applied index. // This is not a correctness issue, but means that we may have just // transferred some entries we're about to re-request from the leader and // overwrite. // However, raft.MultiNode currently expects this behaviour, and the // performance implications are not likely to be drastic. If our feelings // about this ever change, we can add a LastIndex field to // raftpb.SnapshotMetadata. if err := setLastIndex(batch, rangeID, snap.Metadata.Index); err != nil { return 0, err } batch.Defer(func() { // Update the range stats. r.stats.Replace(newStats) r.mu.Lock() // As outlined above, last and applied index are the same after applying // the snapshot. r.mu.appliedIndex = snap.Metadata.Index r.mu.leaderLease = lease r.mu.Unlock() // Update other fields which are uninitialized or need updating. // This may not happen if the system config has not yet been loaded. // While config update will correctly set the fields, there is no order // guarantee in ApplySnapshot. // TODO: should go through the standard store lock when adding a replica. if err := r.updateRangeInfo(&desc); err != nil { panic(err) } // Update the range descriptor. This is done last as this is the step that // makes the Replica visible in the Store. if err := r.setDesc(&desc); err != nil { panic(err) } }) return snap.Metadata.Index, nil }
// applySnapshot updates the replica based on the given snapshot and associated // HardState (which may be empty, as Raft may apply some snapshots which don't // require an update to the HardState). All snapshots must pass through Raft // for correctness, i.e. the parameters to this method must be taken from // a raft.Ready. It is the caller's responsibility to call // r.store.processRangeDescriptorUpdate(r) after a successful applySnapshot. func (r *Replica) applySnapshot( ctx context.Context, snap raftpb.Snapshot, hs raftpb.HardState, ) error { // We use a separate batch to apply the snapshot since the Replica (and in // particular the last index) is updated after the batch commits. Using a // separate batch also allows for future optimization (such as using a // Distinct() batch). batch := r.store.Engine().NewBatch() defer batch.Close() snapData := roachpb.RaftSnapshotData{} err := proto.Unmarshal(snap.Data, &snapData) if err != nil { return err } // Extract the updated range descriptor. desc := snapData.RangeDescriptor // Fill the reservation if there was one for this range, regardless of // whether the application succeeded. defer r.store.bookie.Fill(desc.RangeID) r.mu.Lock() replicaID := r.mu.replicaID raftLogSize := r.mu.raftLogSize r.mu.Unlock() isPreemptive := replicaID == 0 // only used for accounting and log format replicaIDStr := "[?]" snapType := "preemptive" if !isPreemptive { replicaIDStr = strconv.FormatInt(int64(replicaID), 10) snapType = "Raft" } log.Infof(ctx, "%s: with replicaID %s, applying %s snapshot at index %d "+ "(encoded size=%d, %d KV pairs, %d log entries)", r, replicaIDStr, snapType, snap.Metadata.Index, len(snap.Data), len(snapData.KV), len(snapData.LogEntries)) defer func(start time.Time) { log.Infof(ctx, "%s: with replicaID %s, applied %s snapshot in %.3fs", r, replicaIDStr, snapType, timeutil.Since(start).Seconds()) }(timeutil.Now()) // Delete everything in the range and recreate it from the snapshot. // We need to delete any old Raft log entries here because any log entries // that predate the snapshot will be orphaned and never truncated or GC'd. // // The KVs and log entries are all written to distinct keys so we can use a // distinct batch. Note that we clear keys here that are potentially // overwritten below, which violates the spirit of the distinct batch. This // is safe because we don't do any reads until after the distinct batch is // closed (the raft log writes are "blind"). distinctBatch := batch.Distinct() iter := NewReplicaDataIterator(&desc, distinctBatch, false /* !replicatedOnly */) defer iter.Close() for ; iter.Valid(); iter.Next() { if err := distinctBatch.Clear(iter.Key()); err != nil { return err } } // Determine the unreplicated key prefix so we can drop any // unreplicated keys from the snapshot. unreplicatedPrefix := keys.MakeRangeIDUnreplicatedPrefix(desc.RangeID) // Write the snapshot into the range. for _, kv := range snapData.KV { if bytes.HasPrefix(kv.Key, unreplicatedPrefix) { continue } mvccKey := engine.MVCCKey{ Key: kv.Key, Timestamp: kv.Timestamp, } if err := distinctBatch.Put(mvccKey, kv.Value); err != nil { return err } } logEntries := make([]raftpb.Entry, len(snapData.LogEntries)) for i, bytes := range snapData.LogEntries { if err := logEntries[i].Unmarshal(bytes); err != nil { return err } } // Write the snapshot's Raft log into the range. _, raftLogSize, err = r.append(ctx, distinctBatch, 0, raftLogSize, logEntries) if err != nil { return err } if !raft.IsEmptyHardState(hs) { if err := setHardState(ctx, distinctBatch, r.RangeID, hs); err != nil { return errors.Wrapf(err, "unable to persist HardState %+v", &hs) } } else { // Note that we don't require that Raft supply us with a nonempty // HardState on a snapshot. We don't want to make that assumption // because it's not guaranteed by the contract. Raft *must* send us // a HardState when it increases the committed index as a result of the // snapshot, but who is to say it isn't going to accept a snapshot // which is identical to the current state? } // We need to close the distinct batch and start using the normal batch for // the read below. distinctBatch.Close() s, err := loadState(ctx, batch, &desc) if err != nil { return err } if s.Desc.RangeID != r.RangeID { log.Fatalf(ctx, "%s: unexpected range ID %d", r, s.Desc.RangeID) } // As outlined above, last and applied index are the same after applying // the snapshot (i.e. the snapshot has no uncommitted tail). if s.RaftAppliedIndex != snap.Metadata.Index { log.Fatalf(ctx, "%s: with state loaded from %d: snapshot resulted in appliedIndex=%d, metadataIndex=%d", r, s.Desc.RangeID, s.RaftAppliedIndex, snap.Metadata.Index) } if err := batch.Commit(); err != nil { return err } r.mu.Lock() // We set the persisted last index to the last applied index. This is // not a correctness issue, but means that we may have just transferred // some entries we're about to re-request from the leader and overwrite. // However, raft.MultiNode currently expects this behaviour, and the // performance implications are not likely to be drastic. If our // feelings about this ever change, we can add a LastIndex field to // raftpb.SnapshotMetadata. r.mu.lastIndex = s.RaftAppliedIndex r.mu.raftLogSize = raftLogSize // Update the range and store stats. r.store.metrics.subtractMVCCStats(r.mu.state.Stats) r.store.metrics.addMVCCStats(s.Stats) r.mu.state = s r.assertStateLocked(r.store.Engine()) r.mu.Unlock() // As the last deferred action after committing the batch, update other // fields which are uninitialized or need updating. This may not happen // if the system config has not yet been loaded. While config update // will correctly set the fields, there is no order guarantee in // ApplySnapshot. // TODO: should go through the standard store lock when adding a replica. if err := r.updateRangeInfo(&desc); err != nil { panic(err) } r.setDescWithoutProcessUpdate(&desc) if !isPreemptive { r.store.metrics.RangeSnapshotsNormalApplied.Inc(1) } else { r.store.metrics.RangeSnapshotsPreemptiveApplied.Inc(1) } return nil }
// applySnapshot updates the replica based on the given snapshot. // Returns the new last index. func (r *Replica) applySnapshot(snap raftpb.Snapshot, typ snapshotType) (uint64, error) { // We use a separate batch to apply the snapshot since the Replica (and in // particular the last index) is updated after the batch commits. Using a // separate batch also allows for future optimization (such as using a // Distinct() batch). batch := r.store.Engine().NewBatch() defer batch.Close() snapData := roachpb.RaftSnapshotData{} err := proto.Unmarshal(snap.Data, &snapData) if err != nil { return 0, err } // Extract the updated range descriptor. desc := snapData.RangeDescriptor r.mu.Lock() replicaID := r.mu.replicaID raftLogSize := r.mu.raftLogSize r.mu.Unlock() log.Infof("replica %d received snapshot for range %d at index %d. "+ "encoded size=%d, %d KV pairs, %d log entries", replicaID, desc.RangeID, snap.Metadata.Index, len(snap.Data), len(snapData.KV), len(snapData.LogEntries)) defer func(start time.Time) { log.Infof("replica %d applied snapshot for range %d in %s", replicaID, desc.RangeID, timeutil.Since(start)) }(timeutil.Now()) // Delete everything in the range and recreate it from the snapshot. // We need to delete any old Raft log entries here because any log entries // that predate the snapshot will be orphaned and never truncated or GC'd. iter := NewReplicaDataIterator(&desc, batch, false /* !replicatedOnly */) defer iter.Close() for ; iter.Valid(); iter.Next() { if err := batch.Clear(iter.Key()); err != nil { return 0, err } } // Determine the unreplicated key prefix so we can drop any // unreplicated keys from the snapshot. unreplicatedPrefix := keys.MakeRangeIDUnreplicatedPrefix(desc.RangeID) // Write the snapshot into the range. for _, kv := range snapData.KV { if bytes.HasPrefix(kv.Key, unreplicatedPrefix) { continue } mvccKey := engine.MVCCKey{ Key: kv.Key, Timestamp: kv.Timestamp, } if err := batch.Put(mvccKey, kv.Value); err != nil { return 0, err } } logEntries := make([]raftpb.Entry, len(snapData.LogEntries)) for i, bytes := range snapData.LogEntries { if err := logEntries[i].Unmarshal(bytes); err != nil { return 0, err } } // Write the snapshot's Raft log into the range. _, raftLogSize, err = r.append(batch, 0, raftLogSize, logEntries) if err != nil { return 0, err } s, err := loadState(batch, &desc) if err != nil { return 0, err } // As outlined above, last and applied index are the same after applying // the snapshot. if s.RaftAppliedIndex != snap.Metadata.Index { log.Fatalf("%d: snapshot resulted in appliedIndex=%d, metadataIndex=%d", s.Desc.RangeID, s.RaftAppliedIndex, snap.Metadata.Index) } if replicaID == 0 { // The replica is not part of the Raft group so we need to write the Raft // hard state for the replica in order for the Raft state machine to start // correctly. if err := updateHardState(batch, s); err != nil { return 0, err } } if err := batch.Commit(); err != nil { return 0, err } r.mu.Lock() // We set the persisted last index to the last applied index. This is // not a correctness issue, but means that we may have just transferred // some entries we're about to re-request from the leader and overwrite. // However, raft.MultiNode currently expects this behaviour, and the // performance implications are not likely to be drastic. If our // feelings about this ever change, we can add a LastIndex field to // raftpb.SnapshotMetadata. r.mu.lastIndex = s.RaftAppliedIndex r.mu.raftLogSize = raftLogSize // Update the range and store stats. r.store.metrics.subtractMVCCStats(r.mu.state.Stats) r.store.metrics.addMVCCStats(s.Stats) r.mu.state = s lastIndex := r.mu.lastIndex r.assertStateLocked(r.store.Engine()) r.mu.Unlock() // Update other fields which are uninitialized or need updating. // This may not happen if the system config has not yet been loaded. // While config update will correctly set the fields, there is no order // guarantee in ApplySnapshot. // TODO: should go through the standard store lock when adding a replica. if err := r.updateRangeInfo(&desc); err != nil { panic(err) } // Fill the reservation if there was any one for this range. r.store.bookie.Fill(desc.RangeID) // Update the range descriptor. This is done last as this is the step that // makes the Replica visible in the Store. if err := r.setDesc(&desc); err != nil { panic(err) } switch typ { case normalSnapshot: r.store.metrics.rangeSnapshotsNormalApplied.Inc(1) case preemptiveSnapshot: r.store.metrics.rangeSnapshotsPreemptiveApplied.Inc(1) default: panic("not reached") } return lastIndex, nil }
// TestReplicaDataIterator creates three ranges {"a"-"b" (pre), "b"-"c" // (main test range), "c"-"d" (post)} and fills each with data. It // first verifies the contents of the "b"-"c" range. Next, it makes sure // a replicated-only iterator does not show any unreplicated keys from // the range. Then, it deletes the range and verifies it's empty. Finally, // it verifies the pre and post ranges still contain the expected data. func TestReplicaDataIterator(t *testing.T) { defer leaktest.AfterTest(t)() tc := testContext{ bootstrapMode: bootstrapRangeOnly, } tc.Start(t) defer tc.Stop() // See notes in EmptyRange test method for adjustment to descriptor. newDesc := *tc.rng.Desc() newDesc.StartKey = roachpb.RKey("b") newDesc.EndKey = roachpb.RKey("c") if err := tc.rng.setDesc(&newDesc); err != nil { t.Fatal(err) } // Create two more ranges, one before the test range and one after. preRng := createRange(tc.store, 2, roachpb.RKeyMin, roachpb.RKey("b")) if err := tc.store.AddReplicaTest(preRng); err != nil { t.Fatal(err) } postRng := createRange(tc.store, 3, roachpb.RKey("c"), roachpb.RKeyMax) if err := tc.store.AddReplicaTest(postRng); err != nil { t.Fatal(err) } // Create range data for all three ranges. preKeys := createRangeData(t, preRng) curKeys := createRangeData(t, tc.rng) postKeys := createRangeData(t, postRng) // Verify the contents of the "b"-"c" range. iter := NewReplicaDataIterator(tc.rng.Desc(), tc.rng.store.Engine(), false /* !replicatedOnly */) defer iter.Close() i := 0 for ; iter.Valid(); iter.Next() { if err := iter.Error(); err != nil { t.Fatal(err) } if i >= len(curKeys) { t.Fatal("there are more keys in the iteration than expected") } if key := iter.Key(); !key.Equal(curKeys[i]) { k1, ts1 := key.Key, key.Timestamp k2, ts2 := curKeys[i].Key, curKeys[i].Timestamp t.Errorf("%d: expected %q(%d); got %q(%d)", i, k2, ts2, k1, ts1) } i++ } if i != len(curKeys) { t.Fatal("there are fewer keys in the iteration than expected") } // Verify that the replicated-only iterator ignores unreplicated keys. unreplicatedPrefix := keys.MakeRangeIDUnreplicatedPrefix(tc.rng.RangeID) iter = NewReplicaDataIterator(tc.rng.Desc(), tc.rng.store.Engine(), true /* replicatedOnly */) defer iter.Close() for ; iter.Valid(); iter.Next() { if err := iter.Error(); err != nil { t.Fatal(err) } if bytes.HasPrefix(iter.Key().Key, unreplicatedPrefix) { } } // Destroy range and verify that its data has been completely cleared. if err := tc.rng.Destroy(*tc.rng.Desc()); err != nil { t.Fatal(err) } iter = NewReplicaDataIterator(tc.rng.Desc(), tc.rng.store.Engine(), false /* !replicatedOnly */) defer iter.Close() if iter.Valid() { // If the range is destroyed, only a tombstone key should be there. k1 := iter.Key().Key if tombstoneKey := keys.RaftTombstoneKey(tc.rng.RangeID); !bytes.Equal(k1, tombstoneKey) { t.Errorf("expected a tombstone key %q, but found %q", tombstoneKey, k1) } if iter.Next(); iter.Valid() { t.Errorf("expected a destroyed replica to have only a tombstone key, but found more") } } else { t.Errorf("expected a tombstone key, but got an empty iteration") } // Verify the keys in pre & post ranges. for j, test := range []struct { r *Replica keys []engine.MVCCKey }{ {preRng, preKeys}, {postRng, postKeys}, } { iter = NewReplicaDataIterator(test.r.Desc(), test.r.store.Engine(), false /* !replicatedOnly */) defer iter.Close() i = 0 for ; iter.Valid(); iter.Next() { k1, ts1 := iter.Key().Key, iter.Key().Timestamp if bytes.HasPrefix(k1, keys.StatusPrefix) { // Some data is written into the system prefix by Store.BootstrapRange, // but it is not in our expected key list so skip it. // TODO(bdarnell): validate this data instead of skipping it. continue } if key := iter.Key(); !key.Equal(test.keys[i]) { k2, ts2 := test.keys[i].Key, test.keys[i].Timestamp t.Errorf("%d/%d: key mismatch %q(%d) != %q(%d) [%x]", j, i, k1, ts1, k2, ts2, []byte(k2)) } i++ } if i != len(curKeys) { t.Fatal("there are fewer keys in the iteration than expected") } } }
// applySnapshot updates the replica based on the given snapshot and associated // HardState. The supplied HardState must be empty if a preemptive snapshot is // being applied (which is the case if and only if the ReplicaID is zero), in // which case it will be synthesized from any existing on-disk HardState // appropriately. For a regular snapshot, a HardState may or may not be // supplied, though in the common case it is (since the commit index changes as // a result of the snapshot application, so Raft will supply us with one). // The HardState, if altered or supplied, is persisted along with the applied // snapshot and the new last index is returned. // // During preemptive snapshots, we (must) run additional safety checks. For // example, the HardState, Raft's view of term, vote and committed log entries, // and other Raft state (like acknowledged log entries) must not move backwards. func (r *Replica) applySnapshot( snap raftpb.Snapshot, hs raftpb.HardState, ) (uint64, error) { // We use a separate batch to apply the snapshot since the Replica (and in // particular the last index) is updated after the batch commits. Using a // separate batch also allows for future optimization (such as using a // Distinct() batch). batch := r.store.Engine().NewBatch() defer batch.Close() snapData := roachpb.RaftSnapshotData{} err := proto.Unmarshal(snap.Data, &snapData) if err != nil { return 0, err } // Extract the updated range descriptor. desc := snapData.RangeDescriptor // Fill the reservation if there was one for this range, regardless of // whether the application succeeded. defer r.store.bookie.Fill(desc.RangeID) r.mu.Lock() replicaID := r.mu.replicaID raftLogSize := r.mu.raftLogSize r.mu.Unlock() isPreemptive := replicaID == 0 replicaIDStr := "[?]" snapType := "preemptive" if !isPreemptive { replicaIDStr = strconv.FormatInt(int64(replicaID), 10) snapType = "Raft" } log.Infof("%s: with replicaID %s, applying %s snapshot for range %d at index %d "+ "(encoded size=%d, %d KV pairs, %d log entries)", r, replicaIDStr, snapType, desc.RangeID, snap.Metadata.Index, len(snap.Data), len(snapData.KV), len(snapData.LogEntries)) defer func(start time.Time) { log.Infof("%s: with replicaID %s, applied %s snapshot for range %d in %s", r, replicaIDStr, snapType, desc.RangeID, timeutil.Since(start)) }(timeutil.Now()) // Remember the old last index to verify that the snapshot doesn't wipe out // log entries which have been acknowledged, which is possible with // preemptive snapshots. We assert on it later in this call. oldLastIndex, err := loadLastIndex(batch, desc.RangeID) if err != nil { return 0, errors.Wrap(err, "error loading last index") } // Similar strategy for the HardState. oldHardState, err := loadHardState(batch, desc.RangeID) if err != nil { return 0, errors.Wrap(err, "unable to load HardState") } // Delete everything in the range and recreate it from the snapshot. // We need to delete any old Raft log entries here because any log entries // that predate the snapshot will be orphaned and never truncated or GC'd. iter := NewReplicaDataIterator(&desc, batch, false /* !replicatedOnly */) defer iter.Close() for ; iter.Valid(); iter.Next() { if err := batch.Clear(iter.Key()); err != nil { return 0, err } } // Determine the unreplicated key prefix so we can drop any // unreplicated keys from the snapshot. unreplicatedPrefix := keys.MakeRangeIDUnreplicatedPrefix(desc.RangeID) // Write the snapshot into the range. for _, kv := range snapData.KV { if bytes.HasPrefix(kv.Key, unreplicatedPrefix) { continue } mvccKey := engine.MVCCKey{ Key: kv.Key, Timestamp: kv.Timestamp, } if err := batch.Put(mvccKey, kv.Value); err != nil { return 0, err } } logEntries := make([]raftpb.Entry, len(snapData.LogEntries)) for i, bytes := range snapData.LogEntries { if err := logEntries[i].Unmarshal(bytes); err != nil { return 0, err } } // Write the snapshot's Raft log into the range. _, raftLogSize, err = r.append(batch, 0, raftLogSize, logEntries) if err != nil { return 0, err } s, err := loadState(batch, &desc) if err != nil { return 0, err } // As outlined above, last and applied index are the same after applying // the snapshot (i.e. the snapshot has no uncommitted tail). if s.RaftAppliedIndex != snap.Metadata.Index { log.Fatalf("%s with state loaded from %d: snapshot resulted in appliedIndex=%d, metadataIndex=%d", r, s.Desc.RangeID, s.RaftAppliedIndex, snap.Metadata.Index) } if !raft.IsEmptyHardState(hs) { if isPreemptive { return 0, errors.Errorf("unexpected HardState %+v on preemptive snapshot", &hs) } if err := setHardState(batch, s.Desc.RangeID, hs); err != nil { return 0, errors.Wrapf(err, "unable to persist HardState %+v", &hs) } } else if isPreemptive { // Preemptive snapshots get special verifications (see #7619) of their // last index and (necessarily synthesized) HardState. if snap.Metadata.Index < oldLastIndex { // We are not aware of a specific way in which this could happen // (Raft itself should not emit such snapshots, and no Replica can // ever apply two preemptive snapshots), but it doesn't hurt to // check. return 0, errors.Errorf("%s: preemptive snapshot would erase acknowledged log entries", r) } if snap.Metadata.Term < oldHardState.Term { return 0, errors.Errorf("%s: cannot apply preemptive snapshot from past term %d at term %d", r, snap.Metadata.Term, oldHardState.Term) } if err := synthesizeHardState(batch, s, oldHardState); err != nil { return 0, errors.Wrapf(err, "%s: unable to write synthesized HardState", r) } } else { // Note that we don't require that Raft supply us with a nonempty // HardState on a snapshot. We don't want to make that assumption // because it's not guaranteed by the contract. Raft *must* send us // a HardState when it increases the committed index as a result of the // snapshot, but who is to say it isn't going to accept a snapshot // which is identical to the current state? } if err := batch.Commit(); err != nil { return 0, err } r.mu.Lock() // We set the persisted last index to the last applied index. This is // not a correctness issue, but means that we may have just transferred // some entries we're about to re-request from the leader and overwrite. // However, raft.MultiNode currently expects this behaviour, and the // performance implications are not likely to be drastic. If our // feelings about this ever change, we can add a LastIndex field to // raftpb.SnapshotMetadata. r.mu.lastIndex = s.RaftAppliedIndex r.mu.raftLogSize = raftLogSize // Update the range and store stats. r.store.metrics.subtractMVCCStats(r.mu.state.Stats) r.store.metrics.addMVCCStats(s.Stats) r.mu.state = s lastIndex := r.mu.lastIndex r.assertStateLocked(r.store.Engine()) r.mu.Unlock() // Update other fields which are uninitialized or need updating. // This may not happen if the system config has not yet been loaded. // While config update will correctly set the fields, there is no order // guarantee in ApplySnapshot. // TODO: should go through the standard store lock when adding a replica. if err := r.updateRangeInfo(&desc); err != nil { panic(err) } // Update the range descriptor. This is done last as this is the step that // makes the Replica visible in the Store. if err := r.setDesc(&desc); err != nil { panic(err) } if !isPreemptive { r.store.metrics.rangeSnapshotsNormalApplied.Inc(1) } else { r.store.metrics.rangeSnapshotsPreemptiveApplied.Inc(1) } return lastIndex, nil }
// applySnapshot updates the replica based on the given snapshot. // Returns the new last index. func (r *Replica) applySnapshot(batch engine.Batch, snap raftpb.Snapshot) (uint64, error) { snapData := roachpb.RaftSnapshotData{} err := proto.Unmarshal(snap.Data, &snapData) if err != nil { return 0, err } rangeID := r.RangeID // Extract the updated range descriptor. desc := snapData.RangeDescriptor r.mu.Lock() replicaID := r.mu.replicaID r.mu.Unlock() log.Infof("replica %d received snapshot for range %d at index %d. encoded size=%d, %d KV pairs, %d log entries", replicaID, rangeID, snap.Metadata.Index, len(snap.Data), len(snapData.KV), len(snapData.LogEntries)) defer func(start time.Time) { log.Infof("replica %d applied snapshot for range %d in %s", replicaID, rangeID, timeutil.Since(start)) }(timeutil.Now()) // Delete everything in the range and recreate it from the snapshot. // We need to delete any old Raft log entries here because any log entries // that predate the snapshot will be orphaned and never truncated or GC'd. iter := newReplicaDataIterator(&desc, batch, false /* !replicatedOnly */) defer iter.Close() for ; iter.Valid(); iter.Next() { if err := batch.Clear(iter.Key()); err != nil { return 0, err } } // Determine the unreplicated key prefix so we can drop any // unreplicated keys from the snapshot. unreplicatedPrefix := keys.MakeRangeIDUnreplicatedPrefix(desc.RangeID) // Write the snapshot into the range. for _, kv := range snapData.KV { if bytes.HasPrefix(kv.Key, unreplicatedPrefix) { continue } mvccKey := engine.MVCCKey{ Key: kv.Key, Timestamp: kv.Timestamp, } if err := batch.Put(mvccKey, kv.Value); err != nil { return 0, err } } logEntries := make([]raftpb.Entry, len(snapData.LogEntries)) for i, bytes := range snapData.LogEntries { if err := logEntries[i].Unmarshal(bytes); err != nil { return 0, err } } // Write the snapshot's Raft log into the range. if _, err := r.append(batch, 0, logEntries); err != nil { return 0, err } // Read the leader lease. lease, err := loadLeaderLease(batch, desc.RangeID) if err != nil { return 0, err } frozen, err := loadFrozenStatus(batch, desc.RangeID) if err != nil { return 0, err } lastThreshold, err := loadGCThreshold(batch, desc.RangeID) if err != nil { return 0, err } // Load updated range stats. The local newStats variable will be assigned // to r.stats after the batch commits. newStats, err := newRangeStats(desc.RangeID, batch) if err != nil { return 0, err } // The next line sets the persisted last index to the last applied index. // This is not a correctness issue, but means that we may have just // transferred some entries we're about to re-request from the leader and // overwrite. // However, raft.MultiNode currently expects this behaviour, and the // performance implications are not likely to be drastic. If our feelings // about this ever change, we can add a LastIndex field to // raftpb.SnapshotMetadata. if err := setLastIndex(batch, rangeID, snap.Metadata.Index); err != nil { return 0, err } batch.Defer(func() { // Update the range and store stats. r.store.metrics.subtractMVCCStats(r.stats.mvccStats) r.stats.Replace(newStats) r.store.metrics.addMVCCStats(r.stats.mvccStats) r.mu.Lock() // As outlined above, last and applied index are the same after applying // the snapshot. r.mu.appliedIndex = snap.Metadata.Index r.mu.leaderLease = lease r.mu.frozen = frozen r.mu.gcThreshold = lastThreshold r.mu.Unlock() // Update other fields which are uninitialized or need updating. // This may not happen if the system config has not yet been loaded. // While config update will correctly set the fields, there is no order // guarantee in ApplySnapshot. // TODO: should go through the standard store lock when adding a replica. if err := r.updateRangeInfo(&desc); err != nil { panic(err) } // Fill the reservation if there was any one for this range. r.store.bookie.Fill(rangeID) // Update the range descriptor. This is done last as this is the step that // makes the Replica visible in the Store. if err := r.setDesc(&desc); err != nil { panic(err) } }) return snap.Metadata.Index, nil }