// append the given entries to the raft log.
func (r *Replica) append(batch engine.Engine, entries []raftpb.Entry) error {
	if len(entries) == 0 {
		return nil
	}
	for _, ent := range entries {
		err := engine.MVCCPutProto(batch, nil, keys.RaftLogKey(r.RangeID, ent.Index),
			roachpb.ZeroTimestamp, nil, &ent)
		if err != nil {
			return err
		}
	}
	lastIndex := entries[len(entries)-1].Index
	prevLastIndex := atomic.LoadUint64(&r.lastIndex)
	// Delete any previously appended log entries which never committed.
	for i := lastIndex + 1; i <= prevLastIndex; i++ {
		err := engine.MVCCDelete(batch, nil, keys.RaftLogKey(r.RangeID, i),
			roachpb.ZeroTimestamp, nil)
		if err != nil {
			return err
		}
	}

	// Commit the batch and update the last index.
	if err := setLastIndex(batch, r.RangeID, lastIndex); err != nil {
		return err
	}

	batch.Defer(func() {
		atomic.StoreUint64(&r.lastIndex, lastIndex)
	})
	return nil
}
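// setLastIndex is not shown above; the following is a minimal sketch of what
// it is expected to do, namely persist the new last index under a range-local
// key within the same batch. The key helper name (keys.RaftLastIndexKey), the
// roachpb.RangeID parameter type, and the integer encoding via
// roachpb.Value.SetInt are assumptions, not necessarily the real
// implementation.
func setLastIndex(eng engine.Engine, rangeID roachpb.RangeID, lastIndex uint64) error {
	var value roachpb.Value
	value.SetInt(int64(lastIndex))
	// Write at the zero timestamp and outside any transaction (an inline,
	// non-versioned value), matching how the Raft log entries are written in
	// append above.
	return engine.MVCCPut(eng, nil, keys.RaftLastIndexKey(rangeID),
		roachpb.ZeroTimestamp, value, nil /* txn */)
}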
// mergeTrigger is called on a successful commit of an AdminMerge
// transaction. It recomputes stats for the receiving range.
func (r *Range) mergeTrigger(batch engine.Engine, merge *proto.MergeTrigger) error {
	if !bytes.Equal(r.Desc().StartKey, merge.UpdatedDesc.StartKey) {
		return util.Errorf("range and updated range start keys do not match: %s != %s",
			r.Desc().StartKey, merge.UpdatedDesc.StartKey)
	}
	if !r.Desc().EndKey.Less(merge.UpdatedDesc.EndKey) {
		return util.Errorf("range end key is not less than the post merge end key: %s >= %s",
			r.Desc().EndKey, merge.UpdatedDesc.EndKey)
	}
	if merge.SubsumedRaftID <= 0 {
		return util.Errorf("subsumed raft ID must be provided: %d", merge.SubsumedRaftID)
	}

	// Copy the subsumed range's response cache to the subsuming one.
	if err := r.respCache.CopyFrom(batch, merge.SubsumedRaftID); err != nil {
		return util.Errorf("unable to copy response cache to subsuming range: %s", err)
	}

	// Compute stats for the updated range.
	now := r.rm.Clock().Timestamp()
	iter := newRangeDataIterator(&merge.UpdatedDesc, batch)
	ms, err := engine.MVCCComputeStats(iter, now.WallTime)
	iter.Close()
	if err != nil {
		return util.Errorf("unable to compute stats for the range after merge: %s", err)
	}
	if err = r.stats.SetMVCCStats(batch, ms); err != nil {
		return util.Errorf("unable to write MVCC stats: %s", err)
	}

	// Clear the timestamp cache. In the case that this replica and the
	// subsumed replica each held their respective leader leases, we
	// could merge the timestamp caches for efficiency. But it's unlikely
	// and not worth the extra logic and potential for error.
	r.Lock()
	r.tsCache.Clear(r.rm.Clock())
	r.Unlock()

	batch.Defer(func() {
		if err := r.rm.MergeRange(r, merge.UpdatedDesc.EndKey, merge.SubsumedRaftID); err != nil {
			// Our in-memory state has diverged from the on-disk state.
			log.Fatalf("failed to update store after merging range: %s", err)
		}
	})
	return nil
}
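// The compute-then-write stats sequence above also appears twice in
// splitTrigger below; the following is a small refactoring sketch of that
// shared pattern. The helper name and the *proto.RangeDescriptor and
// *rangeStats parameter types are illustrative assumptions; the calls
// themselves mirror the surrounding code. A caller would use it roughly as
// recomputeAndSetStats(batch, &merge.UpdatedDesc, r.stats, now.WallTime).
func recomputeAndSetStats(batch engine.Engine, desc *proto.RangeDescriptor,
	stats *rangeStats, nowNanos int64) error {
	// Iterate over all of the range's data to recompute its MVCC stats from
	// scratch, then persist them to the same batch.
	iter := newRangeDataIterator(desc, batch)
	defer iter.Close()
	ms, err := engine.MVCCComputeStats(iter, nowNanos)
	if err != nil {
		return err
	}
	return stats.SetMVCCStats(batch, ms)
}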
// applySnapshot updates the replica based on the given snapshot.
// Returns the new last index.
func (r *Replica) applySnapshot(batch engine.Engine, snap raftpb.Snapshot) (uint64, error) {
	snapData := roachpb.RaftSnapshotData{}
	err := proto.Unmarshal(snap.Data, &snapData)
	if err != nil {
		return 0, err
	}

	rangeID := r.RangeID

	// First, save the HardState. The HardState must not be changed
	// because it may record a previous vote cast by this node. This is
	// usually unnecessary because a snapshot is nearly always
	// accompanied by a new HardState which incorporates both our former
	// state and new information from the leader, but in the event that
	// the HardState has not changed, we want to use our own previous
	// HardState and not one that was transmitted via the snapshot.
	hardStateKey := keys.RaftHardStateKey(rangeID)
	hardState, _, err := engine.MVCCGet(batch, hardStateKey, roachpb.ZeroTimestamp,
		true /* consistent */, nil)
	if err != nil {
		return 0, err
	}

	// Extract the updated range descriptor.
	desc := snapData.RangeDescriptor

	// Delete everything in the range and recreate it from the snapshot.
	// We need to delete any old Raft log entries here because any log entries
	// that predate the snapshot will be orphaned and never truncated or GC'd.
	iter := newReplicaDataIterator(&desc, batch, false /* !replicatedOnly */)
	defer iter.Close()
	for ; iter.Valid(); iter.Next() {
		if err := batch.Clear(iter.Key()); err != nil {
			return 0, err
		}
	}

	// Determine the unreplicated key prefix so we can drop any
	// unreplicated keys from the snapshot.
	unreplicatedPrefix := keys.MakeRangeIDUnreplicatedPrefix(desc.RangeID)

	// Write the snapshot into the range.
	for _, kv := range snapData.KV {
		if bytes.HasPrefix(kv.Key, unreplicatedPrefix) {
			continue
		}
		mvccKey := engine.MVCCKey{
			Key:       kv.Key,
			Timestamp: kv.Timestamp,
		}
		if err := batch.Put(mvccKey, kv.Value); err != nil {
			return 0, err
		}
	}

	// Write the snapshot's Raft log into the range.
	if _, err := r.append(batch, 0, snapData.LogEntries); err != nil {
		return 0, err
	}

	// Restore the saved HardState.
	if hardState == nil {
		err := engine.MVCCDelete(batch, nil, hardStateKey, roachpb.ZeroTimestamp, nil)
		if err != nil {
			return 0, err
		}
	} else {
		err := engine.MVCCPut(batch, nil, hardStateKey, roachpb.ZeroTimestamp, *hardState, nil)
		if err != nil {
			return 0, err
		}
	}

	// Read the leader lease.
	lease, err := loadLeaderLease(batch, desc.RangeID)
	if err != nil {
		return 0, err
	}

	// Load updated range stats. The local newStats variable will be assigned
	// to r.stats after the batch commits.
	newStats, err := newRangeStats(desc.RangeID, batch)
	if err != nil {
		return 0, err
	}

	// The next line sets the persisted last index to the last applied index.
	// This is not a correctness issue, but means that we may have just
	// transferred some entries we're about to re-request from the leader and
	// overwrite.
	// However, raft.MultiNode currently expects this behaviour, and the
	// performance implications are not likely to be drastic. If our feelings
	// about this ever change, we can add a LastIndex field to
	// raftpb.SnapshotMetadata.
	if err := setLastIndex(batch, rangeID, snap.Metadata.Index); err != nil {
		return 0, err
	}

	batch.Defer(func() {
		// Update the range stats.
		r.stats.Replace(newStats)

		r.mu.Lock()
		// As outlined above, last and applied index are the same after applying
		// the snapshot.
		r.mu.appliedIndex = snap.Metadata.Index
		r.mu.leaderLease = lease
		r.mu.Unlock()

		// Update other fields which are uninitialized or need updating.
		// This may not happen if the system config has not yet been loaded.
		// While config update will correctly set the fields, there is no order
		// guarantee in ApplySnapshot.
		// TODO: should go through the standard store lock when adding a replica.
		if err := r.updateRangeInfo(&desc); err != nil {
			panic(err)
		}

		// Update the range descriptor. This is done last as this is the step that
		// makes the Replica visible in the Store.
		if err := r.setDesc(&desc); err != nil {
			panic(err)
		}
	})
	return snap.Metadata.Index, nil
}
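// loadLeaderLease is not shown above; the following is a hedged sketch of the
// read it is expected to perform. The key helper (keys.RangeLeaderLeaseKey),
// the roachpb.Lease type, and the MVCCGetProto call are assumptions based on
// the surrounding API, not a copy of the real helper.
func loadLeaderLease(eng engine.Engine, rangeID roachpb.RangeID) (*roachpb.Lease, error) {
	lease := &roachpb.Lease{}
	// Read the lease consistently and outside of any transaction, mirroring
	// how the HardState is read in applySnapshot above.
	if _, err := engine.MVCCGetProto(eng, keys.RangeLeaderLeaseKey(rangeID),
		roachpb.ZeroTimestamp, true /* consistent */, nil /* txn */, lease); err != nil {
		return nil, err
	}
	return lease, nil
}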
// splitTrigger is called on a successful commit of an AdminSplit
// transaction. It copies the response cache for the new range and
// recomputes stats for both the existing, updated range and the new
// range.
func (r *Range) splitTrigger(batch engine.Engine, split *proto.SplitTrigger) error {
	if !bytes.Equal(r.Desc().StartKey, split.UpdatedDesc.StartKey) ||
		!bytes.Equal(r.Desc().EndKey, split.NewDesc.EndKey) {
		return util.Errorf("range does not match splits: (%s-%s) + (%s-%s) != %s",
			split.UpdatedDesc.StartKey, split.UpdatedDesc.EndKey,
			split.NewDesc.StartKey, split.NewDesc.EndKey, r)
	}

	// Copy the GC metadata.
	gcMeta, err := r.GetGCMetadata()
	if err != nil {
		return util.Errorf("unable to fetch GC metadata: %s", err)
	}
	if err := engine.MVCCPutProto(batch, nil, keys.RangeGCMetadataKey(split.NewDesc.RaftID),
		proto.ZeroTimestamp, nil, gcMeta); err != nil {
		return util.Errorf("unable to copy GC metadata: %s", err)
	}

	// Copy the last verification timestamp.
	verifyTS, err := r.GetLastVerificationTimestamp()
	if err != nil {
		return util.Errorf("unable to fetch last verification timestamp: %s", err)
	}
	if err := engine.MVCCPutProto(batch, nil, keys.RangeLastVerificationTimestampKey(split.NewDesc.RaftID),
		proto.ZeroTimestamp, nil, &verifyTS); err != nil {
		return util.Errorf("unable to copy last verification timestamp: %s", err)
	}

	// Compute stats for updated range.
	now := r.rm.Clock().Timestamp()
	iter := newRangeDataIterator(&split.UpdatedDesc, batch)
	ms, err := engine.MVCCComputeStats(iter, now.WallTime)
	iter.Close()
	if err != nil {
		return util.Errorf("unable to compute stats for updated range after split: %s", err)
	}
	if err := r.stats.SetMVCCStats(batch, ms); err != nil {
		return util.Errorf("unable to write MVCC stats: %s", err)
	}

	// Initialize the new range's response cache by copying the original's.
	if err = r.respCache.CopyInto(batch, split.NewDesc.RaftID); err != nil {
		return util.Errorf("unable to copy response cache to new split range: %s", err)
	}

	// Add the new split range to the store. This step atomically
	// updates the EndKey of the updated range and also adds the
	// new range to the store's range map.
	newRng, err := NewRange(&split.NewDesc, r.rm)
	if err != nil {
		return err
	}

	// Compute stats for new range.
	iter = newRangeDataIterator(&split.NewDesc, batch)
	ms, err = engine.MVCCComputeStats(iter, now.WallTime)
	iter.Close()
	if err != nil {
		return util.Errorf("unable to compute stats for new range after split: %s", err)
	}
	if err = newRng.stats.SetMVCCStats(batch, ms); err != nil {
		return util.Errorf("unable to write MVCC stats: %s", err)
	}

	// Copy the timestamp cache into the new range.
	r.Lock()
	r.tsCache.MergeInto(newRng.tsCache, true /* clear */)
	r.Unlock()

	batch.Defer(func() {
		if err := r.rm.SplitRange(r, newRng); err != nil {
			// Our in-memory state has diverged from the on-disk state.
			log.Fatalf("failed to update Store after split: %s", err)
		}
	})
	return nil
}
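// For orientation, a hedged sketch of how an AdminSplit might assemble the
// trigger that splitTrigger consumes: the updated descriptor keeps the
// original start key and ends at the split key, while the new descriptor
// covers the remainder up to the original end key. Only the field names
// checked by splitTrigger (UpdatedDesc, NewDesc, RaftID, StartKey, EndKey)
// come from the code above; the helper name, the splitKey/newRaftID
// parameters, their types, and the Replicas copy are illustrative
// assumptions.
func makeSplitTrigger(desc *proto.RangeDescriptor, splitKey proto.Key,
	newRaftID proto.RaftID) *proto.SplitTrigger {
	updatedDesc := *desc
	updatedDesc.EndKey = splitKey // left half keeps the original StartKey
	newDesc := proto.RangeDescriptor{
		RaftID:   newRaftID,
		StartKey: splitKey,
		EndKey:   desc.EndKey,
		Replicas: desc.Replicas,
	}
	return &proto.SplitTrigger{
		UpdatedDesc: updatedDesc,
		NewDesc:     newDesc,
	}
}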