// append the given entries to the raft log. Takes the previous value
// of r.lastIndex and returns a new value. We do this rather than
// modifying r.lastIndex directly because this modification needs to
// be atomic with the commit of the batch.
func (r *Replica) append(batch engine.Engine, prevLastIndex uint64, entries []raftpb.Entry) (uint64, error) {
	if len(entries) == 0 {
		return prevLastIndex, nil
	}
	for i := range entries {
		ent := &entries[i]
		key := keys.RaftLogKey(r.RangeID, ent.Index)
		if err := engine.MVCCPutProto(batch, nil, key, roachpb.ZeroTimestamp, nil, ent); err != nil {
			return 0, err
		}
	}
	lastIndex := entries[len(entries)-1].Index
	// Delete any previously appended log entries which never committed.
	for i := lastIndex + 1; i <= prevLastIndex; i++ {
		err := engine.MVCCDelete(batch, nil, keys.RaftLogKey(r.RangeID, i),
			roachpb.ZeroTimestamp, nil)
		if err != nil {
			return 0, err
		}
	}
	// Commit the batch and update the last index.
	if err := setLastIndex(batch, r.RangeID, lastIndex); err != nil {
		return 0, err
	}
	return lastIndex, nil
}
// append the given entries to the raft log. Takes the previous values of
// r.mu.lastIndex and r.mu.raftLogSize, and returns new values. We do this
// rather than modifying them directly because these modifications need to be
// atomic with the commit of the batch.
func (r *Replica) append(batch engine.ReadWriter, prevLastIndex uint64, prevRaftLogSize int64, entries []raftpb.Entry) (uint64, int64, error) {
	if len(entries) == 0 {
		return prevLastIndex, prevRaftLogSize, nil
	}
	var diff enginepb.MVCCStats
	ctx := context.Background()
	for i := range entries {
		ent := &entries[i]
		key := keys.RaftLogKey(r.RangeID, ent.Index)
		if err := engine.MVCCPutProto(ctx, batch, &diff, key, hlc.ZeroTimestamp, nil /* txn */, ent); err != nil {
			return 0, 0, err
		}
	}
	lastIndex := entries[len(entries)-1].Index
	// Delete any previously appended log entries which never committed.
	for i := lastIndex + 1; i <= prevLastIndex; i++ {
		err := engine.MVCCDelete(ctx, batch, &diff, keys.RaftLogKey(r.RangeID, i),
			hlc.ZeroTimestamp, nil /* txn */)
		if err != nil {
			return 0, 0, err
		}
	}
	if err := setLastIndex(batch, r.RangeID, lastIndex); err != nil {
		return 0, 0, err
	}
	raftLogSize := prevRaftLogSize + diff.SysBytes
	return lastIndex, raftLogSize, nil
}
// append the given entries to the raft log.
func (r *Replica) append(batch engine.Engine, entries []raftpb.Entry) error {
	if len(entries) == 0 {
		return nil
	}
	for _, ent := range entries {
		err := engine.MVCCPutProto(batch, nil, keys.RaftLogKey(r.RangeID, ent.Index),
			roachpb.ZeroTimestamp, nil, &ent)
		if err != nil {
			return err
		}
	}
	lastIndex := entries[len(entries)-1].Index
	prevLastIndex := atomic.LoadUint64(&r.lastIndex)
	// Delete any previously appended log entries which never committed.
	for i := lastIndex + 1; i <= prevLastIndex; i++ {
		err := engine.MVCCDelete(batch, nil, keys.RaftLogKey(r.RangeID, i),
			roachpb.ZeroTimestamp, nil)
		if err != nil {
			return err
		}
	}
	// Commit the batch and update the last index.
	if err := setLastIndex(batch, r.RangeID, lastIndex); err != nil {
		return err
	}
	batch.Defer(func() {
		atomic.StoreUint64(&r.lastIndex, lastIndex)
	})
	return nil
}
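The Defer pattern in this variant assumes the engine batch runs queued functions only after Commit succeeds. A minimal caller-side sketch under that assumption; the rd variable and the engine accessor are illustrative, not from the source:

batch := r.store.Engine().NewBatch()
defer batch.Close()
if err := r.append(batch, rd.Entries); err != nil {
	return err
}
// The deferred atomic.StoreUint64 fires here, after a successful commit,
// so readers never observe a lastIndex that is not durably written.
if err := batch.Commit(); err != nil {
	return err
}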
// append the given entries to the raft log. Takes the previous value
// of r.lastIndex and returns a new value. We do this rather than
// modifying r.lastIndex directly because this modification needs to
// be atomic with the commit of the batch.
func (r *Replica) append(batch engine.ReadWriter, prevLastIndex uint64, entries []raftpb.Entry) (uint64, error) {
	if len(entries) == 0 {
		return prevLastIndex, nil
	}
	for i := range entries {
		ent := &entries[i]
		key := keys.RaftLogKey(r.RangeID, ent.Index)
		if err := engine.MVCCPutProto(context.Background(), batch, nil, key, hlc.ZeroTimestamp, nil, ent); err != nil {
			return 0, err
		}
	}
	lastIndex := entries[len(entries)-1].Index
	// Delete any previously appended log entries which never committed.
	for i := lastIndex + 1; i <= prevLastIndex; i++ {
		err := engine.MVCCDelete(context.Background(), batch, nil, keys.RaftLogKey(r.RangeID, i),
			hlc.ZeroTimestamp, nil)
		if err != nil {
			return 0, err
		}
	}
	if err := setLastIndex(batch, r.RangeID, lastIndex); err != nil {
		return 0, err
	}
	return lastIndex, nil
}
// Del removes all abort cache entries for the given transaction.
func (sc *AbortCache) Del(
	ctx context.Context,
	e engine.Engine,
	ms *engine.MVCCStats,
	txnID *uuid.UUID,
) error {
	key := keys.AbortCacheKey(sc.rangeID, txnID)
	return engine.MVCCDelete(ctx, e, ms, key, roachpb.ZeroTimestamp, nil /* txn */)
}
// append the given entries to the raft log. Takes the previous values of
// r.mu.lastIndex and r.mu.raftLogSize, and returns new values. We do this
// rather than modifying them directly because these modifications need to be
// atomic with the commit of the batch.
func (r *Replica) append(
	ctx context.Context,
	batch engine.ReadWriter,
	prevLastIndex uint64,
	prevRaftLogSize int64,
	entries []raftpb.Entry,
) (uint64, int64, error) {
	if len(entries) == 0 {
		return prevLastIndex, prevRaftLogSize, nil
	}
	var diff enginepb.MVCCStats
	var value roachpb.Value
	for i := range entries {
		ent := &entries[i]
		key := keys.RaftLogKey(r.RangeID, ent.Index)
		if err := value.SetProto(ent); err != nil {
			return 0, 0, err
		}
		value.InitChecksum(key)
		var err error
		if ent.Index > prevLastIndex {
			err = engine.MVCCBlindPut(ctx, batch, &diff, key, hlc.ZeroTimestamp,
				value, nil /* txn */)
		} else {
			err = engine.MVCCPut(ctx, batch, &diff, key, hlc.ZeroTimestamp,
				value, nil /* txn */)
		}
		if err != nil {
			return 0, 0, err
		}
	}

	// Delete any previously appended log entries which never committed.
	lastIndex := entries[len(entries)-1].Index
	for i := lastIndex + 1; i <= prevLastIndex; i++ {
		err := engine.MVCCDelete(ctx, batch, &diff, keys.RaftLogKey(r.RangeID, i),
			hlc.ZeroTimestamp, nil /* txn */)
		if err != nil {
			return 0, 0, err
		}
	}

	if err := setLastIndex(ctx, batch, r.RangeID, lastIndex); err != nil {
		return 0, 0, err
	}

	raftLogSize := prevRaftLogSize + diff.SysBytes
	return lastIndex, raftLogSize, nil
}
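A minimal sketch of the caller-side contract this signature implies: the new lastIndex and raftLogSize are published only after the batch commits, keeping the in-memory fields consistent with what is durably stored. All names outside Replica.append here (the engine accessor, rd) are illustrative assumptions:

r.mu.Lock()
prevLastIndex, prevRaftLogSize := r.mu.lastIndex, r.mu.raftLogSize
r.mu.Unlock()

batch := r.store.Engine().NewBatch()
defer batch.Close()
lastIndex, raftLogSize, err := r.append(ctx, batch, prevLastIndex, prevRaftLogSize, rd.Entries)
if err != nil {
	return err
}
if err := batch.Commit(); err != nil {
	return err
}

// Only now is it safe to publish the new values.
r.mu.Lock()
r.mu.lastIndex, r.mu.raftLogSize = lastIndex, raftLogSize
r.mu.Unlock()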
// Append implements the multiraft.WriteableGroupStorage interface.
func (r *Replica) Append(entries []raftpb.Entry) error {
	if len(entries) == 0 {
		return nil
	}
	batch := r.rm.Engine().NewBatch()
	defer batch.Close()
	rangeID := r.Desc().RangeID

	for _, ent := range entries {
		err := engine.MVCCPutProto(batch, nil, keys.RaftLogKey(rangeID, ent.Index),
			proto.ZeroTimestamp, nil, &ent)
		if err != nil {
			return err
		}
	}
	lastIndex := entries[len(entries)-1].Index
	prevLastIndex := atomic.LoadUint64(&r.lastIndex)
	// Delete any previously appended log entries which never committed.
	for i := lastIndex + 1; i <= prevLastIndex; i++ {
		err := engine.MVCCDelete(batch, nil, keys.RaftLogKey(rangeID, i),
			proto.ZeroTimestamp, nil)
		if err != nil {
			return err
		}
	}

	// Commit the batch and update the last index.
	if err := setLastIndex(batch, rangeID, lastIndex); err != nil {
		return err
	}
	if err := batch.Commit(); err != nil {
		return err
	}
	atomic.StoreUint64(&r.lastIndex, lastIndex)
	return nil
}
// ApplySnapshot implements the multiraft.WriteableGroupStorage interface.
func (r *Range) ApplySnapshot(snap raftpb.Snapshot) error {
	snapData := proto.RaftSnapshotData{}
	err := gogoproto.Unmarshal(snap.Data, &snapData)
	if err != nil {
		return err
	}

	// First, save the HardState. The HardState must not be changed
	// because it may record a previous vote cast by this node.
	hardStateKey := keys.RaftHardStateKey(r.Desc().RaftID)
	hardState, _, err := engine.MVCCGet(r.rm.Engine(), hardStateKey, proto.ZeroTimestamp, true /* consistent */, nil)
	if err != nil {
		return err
	}

	// Extract the updated range descriptor.
	desc := snapData.RangeDescriptor

	batch := r.rm.Engine().NewBatch()
	defer batch.Close()

	// Delete everything in the range and recreate it from the snapshot.
	for iter := newRangeDataIterator(&desc, r.rm.Engine()); iter.Valid(); iter.Next() {
		if err := batch.Clear(iter.Key()); err != nil {
			return err
		}
	}

	// Write the snapshot into the range.
	for _, kv := range snapData.KV {
		if err := batch.Put(kv.Key, kv.Value); err != nil {
			return err
		}
	}

	// Restore the saved HardState.
	if hardState == nil {
		err := engine.MVCCDelete(batch, nil, hardStateKey, proto.ZeroTimestamp, nil)
		if err != nil {
			return err
		}
	} else {
		err := engine.MVCCPut(batch, nil, hardStateKey, proto.ZeroTimestamp, *hardState, nil)
		if err != nil {
			return err
		}
	}

	// Read the leader lease.
	lease, err := loadLeaderLease(batch, desc.RaftID)
	if err != nil {
		return err
	}

	// Copy range stats to new range.
	oldStats := r.stats
	r.stats, err = newRangeStats(desc.RaftID, batch)
	if err != nil {
		r.stats = oldStats
		return err
	}

	// The next line sets the persisted last index to the last applied index.
	// This is not a correctness issue, but means that we may have just
	// transferred some entries we're about to re-request from the leader and
	// overwrite.
	// However, raft.MultiNode currently expects this behaviour, and the
	// performance implications are not likely to be drastic. If our feelings
	// about this ever change, we can add a LastIndex field to
	// raftpb.SnapshotMetadata.
	if err := setLastIndex(batch, r.Desc().RaftID, snap.Metadata.Index); err != nil {
		return err
	}

	if err := batch.Commit(); err != nil {
		return err
	}

	// As outlined above, last and applied index are the same after applying
	// the snapshot.
	atomic.StoreUint64(&r.lastIndex, snap.Metadata.Index)
	atomic.StoreUint64(&r.appliedIndex, snap.Metadata.Index)

	// Atomically update the descriptor and lease.
	if err := r.setDesc(&desc); err != nil {
		return err
	}
	atomic.StorePointer(&r.lease, unsafe.Pointer(lease))
	return nil
}
// applySnapshot updates the replica based on the given snapshot.
// Returns the new last index.
func (r *Replica) applySnapshot(batch engine.Engine, snap raftpb.Snapshot) (uint64, error) {
	snapData := roachpb.RaftSnapshotData{}
	err := proto.Unmarshal(snap.Data, &snapData)
	if err != nil {
		return 0, err
	}

	rangeID := r.RangeID

	// First, save the HardState. The HardState must not be changed
	// because it may record a previous vote cast by this node. This is
	// usually unnecessary because a snapshot is nearly always
	// accompanied by a new HardState which incorporates both our former
	// state and new information from the leader, but in the event that
	// the HardState has not changed, we want to use our own previous
	// HardState and not one that was transmitted via the snapshot.
	hardStateKey := keys.RaftHardStateKey(rangeID)
	hardState, _, err := engine.MVCCGet(batch, hardStateKey, roachpb.ZeroTimestamp, true /* consistent */, nil)
	if err != nil {
		return 0, err
	}

	// Extract the updated range descriptor.
	desc := snapData.RangeDescriptor

	// Delete everything in the range and recreate it from the snapshot.
	// We need to delete any old Raft log entries here because any log entries
	// that predate the snapshot will be orphaned and never truncated or GC'd.
	iter := newReplicaDataIterator(&desc, batch, false /* !replicatedOnly */)
	defer iter.Close()
	for ; iter.Valid(); iter.Next() {
		if err := batch.Clear(iter.Key()); err != nil {
			return 0, err
		}
	}

	// Determine the unreplicated key prefix so we can drop any
	// unreplicated keys from the snapshot.
	unreplicatedPrefix := keys.MakeRangeIDUnreplicatedPrefix(desc.RangeID)

	// Write the snapshot into the range.
	for _, kv := range snapData.KV {
		if bytes.HasPrefix(kv.Key, unreplicatedPrefix) {
			continue
		}
		mvccKey := engine.MVCCKey{
			Key:       kv.Key,
			Timestamp: kv.Timestamp,
		}
		if err := batch.Put(mvccKey, kv.Value); err != nil {
			return 0, err
		}
	}

	// Write the snapshot's Raft log into the range.
	if _, err := r.append(batch, 0, snapData.LogEntries); err != nil {
		return 0, err
	}

	// Restore the saved HardState.
	if hardState == nil {
		err := engine.MVCCDelete(batch, nil, hardStateKey, roachpb.ZeroTimestamp, nil)
		if err != nil {
			return 0, err
		}
	} else {
		err := engine.MVCCPut(batch, nil, hardStateKey, roachpb.ZeroTimestamp, *hardState, nil)
		if err != nil {
			return 0, err
		}
	}

	// Read the leader lease.
	lease, err := loadLeaderLease(batch, desc.RangeID)
	if err != nil {
		return 0, err
	}

	// Load updated range stats. The local newStats variable will be assigned
	// to r.stats after the batch commits.
	newStats, err := newRangeStats(desc.RangeID, batch)
	if err != nil {
		return 0, err
	}

	// The next line sets the persisted last index to the last applied index.
	// This is not a correctness issue, but means that we may have just
	// transferred some entries we're about to re-request from the leader and
	// overwrite.
	// However, raft.MultiNode currently expects this behaviour, and the
	// performance implications are not likely to be drastic. If our feelings
	// about this ever change, we can add a LastIndex field to
	// raftpb.SnapshotMetadata.
	if err := setLastIndex(batch, rangeID, snap.Metadata.Index); err != nil {
		return 0, err
	}

	batch.Defer(func() {
		// Update the range stats.
		r.stats.Replace(newStats)

		r.mu.Lock()
		// As outlined above, last and applied index are the same after applying
		// the snapshot.
		r.mu.appliedIndex = snap.Metadata.Index
		r.mu.leaderLease = lease
		r.mu.Unlock()

		// Update other fields which are uninitialized or need updating.
		// This may not happen if the system config has not yet been loaded.
		// While config update will correctly set the fields, there is no order
		// guarantee in ApplySnapshot.
		// TODO: should go through the standard store lock when adding a replica.
		if err := r.updateRangeInfo(&desc); err != nil {
			panic(err)
		}

		// Update the range descriptor. This is done last as this is the step that
		// makes the Replica visible in the Store.
		if err := r.setDesc(&desc); err != nil {
			panic(err)
		}
	})
	return snap.Metadata.Index, nil
}
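A caller-side sketch of this variant's contract, assuming batch.Defer callbacks run only on a successful Commit; the engine accessor and mutex field names are illustrative:

batch := r.store.Engine().NewBatch()
defer batch.Close()
lastIndex, err := r.applySnapshot(batch, snap)
if err != nil {
	return err
}
// Committing fires the deferred closure above, which swaps in the new
// stats, lease, and descriptor; the returned last index is then published
// by the caller.
if err := batch.Commit(); err != nil {
	return err
}
r.mu.Lock()
r.mu.lastIndex = lastIndex
r.mu.Unlock()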
// Delete deletes the key and value specified by key.
func (r *Range) Delete(batch engine.Engine, ms *engine.MVCCStats, args *proto.DeleteRequest, reply *proto.DeleteResponse) {
	reply.SetGoError(engine.MVCCDelete(batch, ms, args.Key, args.Timestamp, args.Txn))
}
// Delete deletes the key and value specified by key.
func (r *Range) Delete(batch engine.Engine, ms *engine.MVCCStats, args proto.DeleteRequest) (proto.DeleteResponse, error) {
	var reply proto.DeleteResponse
	return reply, engine.MVCCDelete(batch, ms, args.Key, args.Timestamp, args.Txn)
}
// applySnapshot updates the replica based on the given snapshot.
func (r *Replica) applySnapshot(snap raftpb.Snapshot) error {
	snapData := roachpb.RaftSnapshotData{}
	err := proto.Unmarshal(snap.Data, &snapData)
	if err != nil {
		return err
	}

	rangeID := r.Desc().RangeID

	// First, save the HardState. The HardState must not be changed
	// because it may record a previous vote cast by this node.
	hardStateKey := keys.RaftHardStateKey(rangeID)
	hardState, _, err := engine.MVCCGet(r.store.Engine(), hardStateKey, roachpb.ZeroTimestamp, true /* consistent */, nil)
	if err != nil {
		return err
	}

	// Extract the updated range descriptor.
	desc := snapData.RangeDescriptor

	batch := r.store.Engine().NewBatch()
	defer batch.Close()

	// Delete everything in the range and recreate it from the snapshot.
	iter := newReplicaDataIterator(&desc, r.store.Engine())
	defer iter.Close()
	for ; iter.Valid(); iter.Next() {
		if err := batch.Clear(iter.Key()); err != nil {
			return err
		}
	}

	// Write the snapshot into the range.
	for _, kv := range snapData.KV {
		mvccKey := engine.MVCCKey{
			Key:       kv.Key,
			Timestamp: kv.Timestamp,
		}
		if err := batch.Put(mvccKey, kv.Value); err != nil {
			return err
		}
	}

	// Restore the saved HardState.
	if hardState == nil {
		err := engine.MVCCDelete(batch, nil, hardStateKey, roachpb.ZeroTimestamp, nil)
		if err != nil {
			return err
		}
	} else {
		err := engine.MVCCPut(batch, nil, hardStateKey, roachpb.ZeroTimestamp, *hardState, nil)
		if err != nil {
			return err
		}
	}

	// Read the leader lease.
	lease, err := loadLeaderLease(batch, desc.RangeID)
	if err != nil {
		return err
	}

	// Load updated range stats. The local newStats variable will be assigned
	// to r.stats after the batch commits.
	newStats, err := newRangeStats(desc.RangeID, batch)
	if err != nil {
		return err
	}

	// The next line sets the persisted last index to the last applied index.
	// This is not a correctness issue, but means that we may have just
	// transferred some entries we're about to re-request from the leader and
	// overwrite.
	// However, raft.MultiNode currently expects this behaviour, and the
	// performance implications are not likely to be drastic. If our feelings
	// about this ever change, we can add a LastIndex field to
	// raftpb.SnapshotMetadata.
	if err := setLastIndex(batch, rangeID, snap.Metadata.Index); err != nil {
		return err
	}

	if err := batch.Commit(); err != nil {
		return err
	}

	// Update the range stats.
	r.stats.Replace(newStats)

	// As outlined above, last and applied index are the same after applying
	// the snapshot.
	atomic.StoreUint64(&r.lastIndex, snap.Metadata.Index)
	atomic.StoreUint64(&r.appliedIndex, snap.Metadata.Index)

	// Atomically update the descriptor and lease.
	if err := r.setDesc(&desc); err != nil {
		return err
	}

	// Update other fields which are uninitialized or need updating.
	// This may not happen if the system config has not yet been loaded.
	// While config update will correctly set the fields, there is no order
	// guarantee in ApplySnapshot.
	// TODO: should go through the standard store lock when adding a replica.
	if err := r.updateRangeInfo(); err != nil {
		return err
	}

	atomic.StorePointer(&r.lease, unsafe.Pointer(lease))
	return nil
}