// Send forwards the call to the single store. This is a poor man's
// version of kv.TxnCoordSender, but it serves the purposes of
// supporting tests in this package. Transactions are not supported.
// Since kv/ depends on storage/, we can't get access to a
// TxnCoordSender from here.
// TODO(tschottdorf): {kv->storage}.LocalSender
func (db *testSender) Send(ctx context.Context, ba roachpb.BatchRequest) (*roachpb.BatchResponse, *roachpb.Error) {
	if et, ok := ba.GetArg(roachpb.EndTransaction); ok {
		return nil, roachpb.NewError(util.Errorf("%s method not supported", et.Method()))
	}
	// Lookup range and direct request.
	key, endKey := keys.Range(ba)
	rng := db.store.LookupReplica(key, endKey)
	if rng == nil {
		return nil, roachpb.NewError(roachpb.NewRangeKeyMismatchError(key, endKey, nil))
	}
	ba.RangeID = rng.Desc().RangeID
	replica := rng.GetReplica()
	if replica == nil {
		return nil, roachpb.NewError(util.Errorf("own replica missing in range"))
	}
	ba.Replica = *replica
	br, pErr := db.store.Send(ctx, ba)
	if br != nil && br.Error != nil {
		panic(roachpb.ErrorUnexpectedlySet(db.store, br))
	}
	if pErr != nil {
		return nil, pErr
	}
	return br, nil
}
// sendRPC sends one or more RPCs to replicas from the supplied roachpb.Replica
// slice. Returns an RPC error if the request could not be sent. Note
// that the reply may contain a higher level error and must be checked in
// addition to the RPC error.
// The replicas are assumed to have been ordered by preference, with closer
// ones (if any) at the front.
func (ds *DistSender) sendRPC(
	ctx context.Context, rangeID roachpb.RangeID, replicas ReplicaSlice, ba roachpb.BatchRequest,
) (*roachpb.BatchResponse, error) {
	if len(replicas) == 0 {
		return nil, noNodeAddrsAvailError{}
	}
	// TODO(pmattis): This needs to be tested. If it isn't set we'll
	// still route the request appropriately by key, but won't receive
	// RangeNotFoundErrors.
	ba.RangeID = rangeID

	// Set RPC opts with stipulation that one of N RPCs must succeed.
	rpcOpts := SendOptions{
		SendNextTimeout:  ds.sendNextTimeout,
		Timeout:          base.NetworkTimeout,
		Context:          ctx,
		transportFactory: ds.transportFactory,
	}
	tracing.AnnotateTrace()
	defer tracing.AnnotateTrace()

	reply, err := ds.sendToReplicas(rpcOpts, rangeID, replicas, ba, ds.rpcContext)
	if err != nil {
		return nil, err
	}
	return reply, nil
}
// sendRPC sends one or more RPCs to replicas from the supplied roachpb.Replica
// slice. First, replicas which have gossiped addresses are corralled (and
// rearranged depending on proximity and whether the request needs to go to a
// leader) and then sent via Send, with the requirement that one RPC to a
// server must succeed. Returns an RPC error if the request could not be sent.
// Note that the reply may contain a higher level error and must be checked in
// addition to the RPC error.
func (ds *DistSender) sendRPC(trace opentracing.Span, rangeID roachpb.RangeID, replicas ReplicaSlice,
	order orderingPolicy, ba roachpb.BatchRequest) (*roachpb.BatchResponse, *roachpb.Error) {
	if len(replicas) == 0 {
		return nil, roachpb.NewError(noNodeAddrsAvailError{})
	}
	// TODO(pmattis): This needs to be tested. If it isn't set we'll
	// still route the request appropriately by key, but won't receive
	// RangeNotFoundErrors.
	ba.RangeID = rangeID

	// Set RPC opts with stipulation that one of N RPCs must succeed.
	rpcOpts := SendOptions{
		Ordering:        order,
		SendNextTimeout: defaultSendNextTimeout,
		Timeout:         rpc.DefaultRPCTimeout,
		Trace:           trace,
	}
	tracing.AnnotateTrace()
	defer tracing.AnnotateTrace()

	reply, err := ds.rpcSend(rpcOpts, replicas, ba, ds.rpcContext)
	if err != nil {
		return nil, roachpb.NewError(err)
	}
	return reply.(*roachpb.BatchResponse), nil
}
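// The SendOptions in the two sendRPC variants above encode a hedged fan-out:
// the batch is tried against the first replica, and if no reply arrives within
// SendNextTimeout the next replica is tried as well, with the first successful
// response winning. The following is a minimal, self-contained sketch of that
// pattern, not the DistSender transport itself; sendHedged, sendOne, and the
// string replica names are made up for illustration.
package main

import (
	"context"
	"errors"
	"fmt"
	"time"
)

// result pairs a reply with the error from one attempt.
type result struct {
	reply string
	err   error
}

// sendHedged tries replicas in order. After sendNextTimeout elapses without a
// reply, the next replica is tried in parallel; the first success wins.
func sendHedged(
	ctx context.Context,
	replicas []string,
	sendNextTimeout time.Duration,
	sendOne func(ctx context.Context, replica string) (string, error),
) (string, error) {
	if len(replicas) == 0 {
		return "", errors.New("no replicas available")
	}
	results := make(chan result, len(replicas))
	launch := func(r string) {
		go func() {
			reply, err := sendOne(ctx, r)
			results <- result{reply, err}
		}()
	}

	next, pending := 0, 0
	launch(replicas[next])
	next++
	pending++

	timer := time.NewTimer(sendNextTimeout)
	defer timer.Stop()

	var lastErr error
	for pending > 0 {
		select {
		case <-timer.C:
			// No reply yet: hedge by also trying the next replica, if any.
			if next < len(replicas) {
				launch(replicas[next])
				next++
				pending++
				timer.Reset(sendNextTimeout)
			}
		case res := <-results:
			pending--
			if res.err == nil {
				return res.reply, nil // one of N succeeded
			}
			lastErr = res.err
			// On error, immediately try the next replica.
			if next < len(replicas) {
				launch(replicas[next])
				next++
				pending++
			}
		case <-ctx.Done():
			return "", ctx.Err()
		}
	}
	return "", lastErr
}

func main() {
	slow := func(ctx context.Context, r string) (string, error) {
		if r == "replica-1" {
			time.Sleep(200 * time.Millisecond) // simulate a slow first replica
		}
		return "ok from " + r, nil
	}
	reply, err := sendHedged(context.Background(),
		[]string{"replica-1", "replica-2"}, 50*time.Millisecond, slow)
	fmt.Println(reply, err) // expect the hedged attempt to replica-2 to win
}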
// Send implements the client.Sender interface. The store is looked up from the
// store map if specified by the request; otherwise, the command is being
// executed locally, and the replica is determined via lookup through each
// store's LookupRange method. The latter path is taken only by unit tests.
func (ls *Stores) Send(ctx context.Context, ba roachpb.BatchRequest) (*roachpb.BatchResponse, *roachpb.Error) {
	var store *Store
	var err error

	// If we aren't given a Replica, we do a little bending over
	// backwards here. This case applies exclusively to unittests.
	if ba.RangeID == 0 || ba.Replica.StoreID == 0 {
		var repl *roachpb.ReplicaDescriptor
		var rangeID roachpb.RangeID
		rs := keys.Range(ba)
		rangeID, repl, err = ls.lookupReplica(rs.Key, rs.EndKey)
		if err == nil {
			ba.RangeID = rangeID
			ba.Replica = *repl
		}
	}

	ctx = log.Add(ctx, log.RangeID, ba.RangeID)

	if err == nil {
		store, err = ls.GetStore(ba.Replica.StoreID)
	}

	if err != nil {
		return nil, roachpb.NewError(err)
	}

	sp, cleanupSp := tracing.SpanFromContext(opStores, store.Tracer(), ctx)
	defer cleanupSp()

	if ba.Txn != nil {
		// For calls that read data within a txn, we keep track of timestamps
		// observed from the various participating nodes' HLC clocks. If we have
		// a timestamp on file for this Node which is smaller than MaxTimestamp,
		// we can lower MaxTimestamp accordingly. If MaxTimestamp drops below
		// OrigTimestamp, we effectively can't see uncertainty restarts any
		// more.
		// Note that it's not an issue if MaxTimestamp propagates back out to
		// the client via a returned Transaction update - when updating a Txn
		// from another, the larger MaxTimestamp wins.
		if maxTS, ok := ba.Txn.GetObservedTimestamp(ba.Replica.NodeID); ok && maxTS.Less(ba.Txn.MaxTimestamp) {
			// Copy-on-write to protect others we might be sharing the Txn with.
			shallowTxn := *ba.Txn
			// The uncertainty window is [OrigTimestamp, maxTS), so if that window
			// is empty, there won't be any uncertainty restarts.
			if !ba.Txn.OrigTimestamp.Less(maxTS) {
				sp.LogEvent("read has no clock uncertainty")
			}
			shallowTxn.MaxTimestamp.Backward(maxTS)
			ba.Txn = &shallowTxn
		}
	}
	br, pErr := store.Send(ctx, ba)
	if br != nil && br.Error != nil {
		panic(roachpb.ErrorUnexpectedlySet(store, br))
	}
	return br, pErr
}
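// The clamping above is easiest to see with concrete numbers. The sketch below
// uses a simplified wall-time-only timestamp and a toy txn type (not the real
// hlc or roachpb types) to show how MaxTimestamp is lowered to a node's
// observed timestamp and when the uncertainty window [OrigTimestamp, maxTS)
// becomes empty. applyObserved is a hypothetical helper, not part of the source.
package main

import "fmt"

// ts is a simplified stand-in for an HLC timestamp.
type ts struct{ wall int64 }

func (t ts) Less(o ts) bool { return t.wall < o.wall }

// backward lowers t to o if o is smaller, mirroring Timestamp.Backward.
func (t *ts) backward(o ts) {
	if o.Less(*t) {
		*t = o
	}
}

// txn carries the two timestamps relevant to uncertainty restarts.
type txn struct {
	orig, max ts
}

// applyObserved clamps MaxTimestamp using the timestamp previously observed
// from the node serving the request. It returns a copy, leaving any shared
// transaction untouched (the copy-on-write done in Stores.Send above), plus a
// flag saying whether uncertainty restarts are now impossible.
func applyObserved(t txn, observed ts, haveObserved bool) (txn, bool) {
	if !haveObserved || !observed.Less(t.max) {
		return t, false
	}
	noUncertainty := !t.orig.Less(observed) // window [orig, observed) is empty
	shallow := t                            // copy; callers sharing t are unaffected
	shallow.max.backward(observed)
	return shallow, noUncertainty
}

func main() {
	t := txn{orig: ts{1000}, max: ts{3000}}

	clamped, certain := applyObserved(t, ts{2000}, true)
	fmt.Println(clamped.max.wall, certain) // 2000 false: restarts still possible in (1000, 2000)

	clamped, certain = applyObserved(t, ts{900}, true)
	fmt.Println(clamped.max.wall, certain) // 900 true: window empty, no uncertainty restarts
}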
// Send implements the client.Sender interface. The store is looked up from the
// store map if specified by the request; otherwise, the command is being
// executed locally, and the replica is determined via lookup through each
// store's LookupRange method. The latter path is taken only by unit tests.
func (ls *Stores) Send(ctx context.Context, ba roachpb.BatchRequest) (*roachpb.BatchResponse, *roachpb.Error) {
	sp := tracing.SpanFromContext(ctx)
	var store *Store
	var pErr *roachpb.Error

	// If we aren't given a Replica, we do a little bending over
	// backwards here. This case applies exclusively to unittests.
	if ba.RangeID == 0 || ba.Replica.StoreID == 0 {
		var repl *roachpb.ReplicaDescriptor
		var rangeID roachpb.RangeID
		rs := keys.Range(ba)
		rangeID, repl, pErr = ls.lookupReplica(rs.Key, rs.EndKey)
		if pErr == nil {
			ba.RangeID = rangeID
			ba.Replica = *repl
		}
	}

	ctx = log.Add(ctx, log.RangeID, ba.RangeID)

	if pErr == nil {
		store, pErr = ls.GetStore(ba.Replica.StoreID)
	}

	var br *roachpb.BatchResponse
	if pErr != nil {
		return nil, pErr
	}
	// For calls that read data within a txn, we can avoid uncertainty-related
	// retries in certain situations. If the node is in
	// "CertainNodes", we need not worry about uncertain reads any
	// more. Setting MaxTimestamp=OrigTimestamp for the operation
	// accomplishes that. See roachpb.Transaction.CertainNodes for details.
	if ba.Txn != nil && ba.Txn.CertainNodes.Contains(ba.Replica.NodeID) {
		// MaxTimestamp = Timestamp corresponds to no clock uncertainty.
		sp.LogEvent("read has no clock uncertainty")
		// Copy-on-write to protect others we might be sharing the Txn with.
		shallowTxn := *ba.Txn
		// We set to OrigTimestamp because that works for both SNAPSHOT and
		// SERIALIZABLE: If we used Timestamp instead, we could run into
		// unnecessary retries at SNAPSHOT. For example, a SNAPSHOT txn at
		// OrigTimestamp = 1000.0, Timestamp = 2000.0, MaxTimestamp = 3000.0
		// will always read at 1000, so a MaxTimestamp of 2000 will still let
		// it restart with uncertainty when it finds a value in (1000, 2000).
		shallowTxn.MaxTimestamp = ba.Txn.OrigTimestamp
		ba.Txn = &shallowTxn
	}
	br, pErr = store.Send(ctx, ba)
	if br != nil && br.Error != nil {
		panic(roachpb.ErrorUnexpectedlySet(store, br))
	}
	return br, pErr
}
// process iterates through all keys in a replica's range, calling the garbage
// collector for each key and associated set of values. GC'd keys are batched
// into GC calls. Extant intents are resolved if intents are older than
// intentAgeThreshold. The transaction and abort cache records are also
// scanned and old entries evicted. During normal operation, both of these
// records are cleaned up when their respective transaction finishes, so the
// amount of work done here is expected to be small.
//
// Some care needs to be taken to avoid cyclic recreation of entries during GC:
// * a Push initiated due to an intent may recreate a transaction entry
// * resolving an intent may write a new abort cache entry
// * obtaining the transaction for an abort cache entry requires a Push
//
// The following order is taken below:
// 1) collect all intents with sufficiently old txn record
// 2) collect these intents' transactions
// 3) scan the transaction table, collecting abandoned or completed txns
// 4) push all of these transactions (possibly recreating entries)
// 5) resolve all intents (unless the txn is still PENDING), which will recreate
//    abort cache entries (but with the txn timestamp; i.e. likely gc'able)
// 6) scan the abort cache table for old entries
// 7) push these transactions (again, recreating txn entries)
// 8) send a GCRequest
func (gcq *gcQueue) process(
	ctx context.Context, now hlc.Timestamp, repl *Replica, sysCfg config.SystemConfig,
) error {
	snap := repl.store.Engine().NewSnapshot()
	desc := repl.Desc()
	defer snap.Close()

	// Lookup the GC policy for the zone containing this key range.
	zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey)
	if err != nil {
		return errors.Errorf("could not find zone config for range %s: %s", repl, err)
	}

	gcKeys, info, err := RunGC(ctx, desc, snap, now, zone.GC,
		func(now hlc.Timestamp, txn *roachpb.Transaction, typ roachpb.PushTxnType) {
			pushTxn(gcq.store.DB(), now, txn, typ)
		},
		func(intents []roachpb.Intent, poison bool, wait bool) error {
			return repl.store.intentResolver.resolveIntents(ctx, intents, poison, wait)
		})
	if err != nil {
		return err
	}

	gcq.eventLog.VInfof(true, "completed with stats %+v", info)

	var ba roachpb.BatchRequest
	var gcArgs roachpb.GCRequest
	// TODO(tschottdorf): This is one of these instances in which we want
	// to be more careful that the request ends up on the correct Replica,
	// and we might have to worry about mixing range-local and global keys
	// in a batch which might end up spanning Ranges by the time it executes.
	gcArgs.Key = desc.StartKey.AsRawKey()
	gcArgs.EndKey = desc.EndKey.AsRawKey()
	gcArgs.Keys = gcKeys
	gcArgs.Threshold = info.Threshold

	// Technically not needed since we're talking directly to the Range.
	ba.RangeID = desc.RangeID
	ba.Timestamp = now
	ba.Add(&gcArgs)
	if _, pErr := repl.Send(ctx, ba); pErr != nil {
		return pErr.GoError()
	}
	return nil
}
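// RunGC in the version above receives the push and intent-resolution steps as
// closures, which keeps the scan logic free of queue plumbing and easy to stub
// in tests. The sketch below shows that dependency-injection shape with toy
// types and a hypothetical runGC; it is not the real RunGC signature.
package main

import "fmt"

// intent and txnRef are toy stand-ins for the real types.
type intent struct{ key string }
type txnRef struct{ id string }

// runGC scans toy inputs and delegates side effects to the injected callbacks,
// mirroring how the gc queue hands pushTxn/resolveIntents closures to RunGC.
func runGC(
	oldTxns []txnRef,
	oldIntents []intent,
	pushTxn func(txnRef),
	resolveIntents func([]intent) error,
) (gcKeys []string, err error) {
	for _, txn := range oldTxns {
		pushTxn(txn) // may recreate a txn entry; see the ordering comment above
	}
	if err := resolveIntents(oldIntents); err != nil {
		return nil, err
	}
	for _, in := range oldIntents {
		gcKeys = append(gcKeys, in.key)
	}
	return gcKeys, nil
}

func main() {
	// In a test, the callbacks can simply record what they were asked to do.
	var pushed []string
	keys, err := runGC(
		[]txnRef{{id: "txn-1"}},
		[]intent{{key: "a"}, {key: "b"}},
		func(t txnRef) { pushed = append(pushed, t.id) },
		func(ins []intent) error { return nil },
	)
	fmt.Println(pushed, keys, err) // [txn-1] [a b] <nil>
}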
// Send implements the client.Sender interface. The store is looked up from the
// store map if specified by the request; otherwise, the command is being
// executed locally, and the replica is determined via lookup through each
// store's LookupRange method. The latter path is taken only by unit tests.
func (ls *Stores) Send(ctx context.Context, ba roachpb.BatchRequest) (*roachpb.BatchResponse, *roachpb.Error) {
	sp := tracing.SpanFromContext(ctx)
	var store *Store
	var pErr *roachpb.Error

	// If we aren't given a Replica, we do a little bending over
	// backwards here. This case applies exclusively to unittests.
	if ba.RangeID == 0 || ba.Replica.StoreID == 0 {
		var repl *roachpb.ReplicaDescriptor
		var rangeID roachpb.RangeID
		rs := keys.Range(ba)
		rangeID, repl, pErr = ls.lookupReplica(rs.Key, rs.EndKey)
		if pErr == nil {
			ba.RangeID = rangeID
			ba.Replica = *repl
		}
	}

	ctx = log.Add(ctx, log.RangeID, ba.RangeID)

	if pErr == nil {
		store, pErr = ls.GetStore(ba.Replica.StoreID)
	}

	var br *roachpb.BatchResponse
	if pErr != nil {
		return nil, pErr
	}
	// For calls that read data within a txn, we can avoid uncertainty-related
	// retries in certain situations. If the node is in
	// "CertainNodes", we need not worry about uncertain reads any
	// more. Setting MaxTimestamp=Timestamp for the operation
	// accomplishes that. See roachpb.Transaction.CertainNodes for details.
	if ba.Txn != nil && ba.Txn.CertainNodes.Contains(ba.Replica.NodeID) {
		// MaxTimestamp = Timestamp corresponds to no clock uncertainty.
		sp.LogEvent("read has no clock uncertainty")
		// Copy-on-write to protect others we might be sharing the Txn with.
		shallowTxn := *ba.Txn
		shallowTxn.MaxTimestamp = ba.Txn.Timestamp
		ba.Txn = &shallowTxn
	}
	br, pErr = store.Send(ctx, ba)
	if br != nil && br.Error != nil {
		panic(roachpb.ErrorUnexpectedlySet(store, br))
	}
	return br, pErr
}
// Send implements the client.Sender interface. The store is looked up from the
// store map if specified by the request; otherwise, the command is being
// executed locally, and the replica is determined via lookup through each
// store's LookupRange method. The latter path is taken only by unit tests.
func (ls *LocalSender) Send(ctx context.Context, ba roachpb.BatchRequest) (*roachpb.BatchResponse, *roachpb.Error) {
	trace := tracer.FromCtx(ctx)

	var store *storage.Store
	var err error

	// If we aren't given a Replica, we do a little bending over
	// backwards here. This case applies exclusively to unittests.
	if ba.RangeID == 0 || ba.Replica.StoreID == 0 {
		var repl *roachpb.ReplicaDescriptor
		var rangeID roachpb.RangeID
		key, endKey := keys.Range(ba)
		rangeID, repl, err = ls.lookupReplica(key, endKey)
		if err == nil {
			ba.RangeID = rangeID
			ba.Replica = *repl
		}
	}

	ctx = log.Add(ctx, log.RangeID, ba.RangeID)

	if err == nil {
		store, err = ls.GetStore(ba.Replica.StoreID)
	}

	var br *roachpb.BatchResponse
	if err != nil {
		return nil, roachpb.NewError(err)
	}
	// For calls that read data within a txn, we can avoid uncertainty-related
	// retries in certain situations. If the node is in
	// "CertainNodes", we need not worry about uncertain reads any
	// more. Setting MaxTimestamp=Timestamp for the operation
	// accomplishes that. See roachpb.Transaction.CertainNodes for details.
	if ba.Txn != nil && ba.Txn.CertainNodes.Contains(ba.Replica.NodeID) {
		// MaxTimestamp = Timestamp corresponds to no clock uncertainty.
		trace.Event("read has no clock uncertainty")
		ba.Txn.MaxTimestamp = ba.Txn.Timestamp
	}
	br, pErr := store.Send(ctx, ba)
	if br != nil && br.Error != nil {
		panic(roachpb.ErrorUnexpectedlySet(store, br))
	}
	return br, pErr
}
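// Unlike the copy-on-write variants above, this version writes MaxTimestamp
// through the shared *Txn pointer. The following toy sketch (hypothetical txn
// and batch types, not the roachpb ones) illustrates why the later versions
// take a shallow copy before modifying the transaction.
package main

import "fmt"

type txn struct {
	maxTimestamp int64
}

type batch struct {
	txn *txn
}

// clampShared writes through the shared pointer: every other batch holding the
// same *txn sees the change, which is the hazard the copy-on-write versions of
// Send avoid.
func clampShared(b *batch, observed int64) {
	if observed < b.txn.maxTimestamp {
		b.txn.maxTimestamp = observed
	}
}

// clampCopyOnWrite takes a shallow copy first, so only this batch sees the
// lowered MaxTimestamp.
func clampCopyOnWrite(b *batch, observed int64) {
	if observed < b.txn.maxTimestamp {
		shallow := *b.txn
		shallow.maxTimestamp = observed
		b.txn = &shallow
	}
}

func main() {
	shared := &txn{maxTimestamp: 3000}
	b1, b2 := &batch{txn: shared}, &batch{txn: shared}
	clampShared(b1, 2000)
	fmt.Println(b2.txn.maxTimestamp) // 2000: b2 was affected too

	shared2 := &txn{maxTimestamp: 3000}
	b3, b4 := &batch{txn: shared2}, &batch{txn: shared2}
	clampCopyOnWrite(b3, 2000)
	fmt.Println(b3.txn.maxTimestamp, b4.txn.maxTimestamp) // 2000 3000
}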
func TestStoreExecuteNoop(t *testing.T) {
	defer leaktest.AfterTest(t)
	store, _, stopper := createTestStore(t)
	defer stopper.Stop()
	ba := roachpb.BatchRequest{}
	ba.Key = nil // intentional
	ba.RangeID = 1
	ba.Replica = roachpb.ReplicaDescriptor{StoreID: store.StoreID()}
	ba.Add(&roachpb.GetRequest{RequestHeader: roachpb.RequestHeader{Key: roachpb.Key("a")}})
	ba.Add(&roachpb.NoopRequest{})

	br, pErr := store.Send(context.Background(), ba)
	if pErr != nil {
		t.Error(pErr)
	}
	reply := br.Responses[1].GetInner()
	if _, ok := reply.(*roachpb.NoopResponse); !ok {
		t.Errorf("expected *roachpb.NoopResponse, got %T", reply)
	}
}
// sendRPC sends one or more RPCs to replicas from the supplied roachpb.Replica
// slice. First, replicas which have gossiped addresses are corralled (and
// rearranged depending on proximity and whether the request needs to go to a
// leader) and then sent via rpc.Send, with the requirement that one RPC to a
// server must succeed. Returns an RPC error if the request could not be sent.
// Note that the reply may contain a higher level error and must be checked in
// addition to the RPC error.
func (ds *DistSender) sendRPC(trace *tracer.Trace, rangeID roachpb.RangeID, replicas replicaSlice,
	order rpc.OrderingPolicy, ba roachpb.BatchRequest) (*roachpb.BatchResponse, *roachpb.Error) {
	if len(replicas) == 0 {
		return nil, roachpb.NewError(noNodeAddrsAvailError{})
	}

	// Build a slice of replica addresses.
	addrs := make([]net.Addr, 0, len(replicas))
	replicaMap := make(map[string]*roachpb.ReplicaDescriptor, len(replicas))
	for i := range replicas {
		addr := replicas[i].NodeDesc.Address
		addrs = append(addrs, addr)
		replicaMap[addr.String()] = &replicas[i].ReplicaDescriptor
	}

	// TODO(pmattis): This needs to be tested. If it isn't set we'll
	// still route the request appropriately by key, but won't receive
	// RangeNotFoundErrors.
	ba.RangeID = rangeID

	// Set RPC opts with stipulation that one of N RPCs must succeed.
	rpcOpts := rpc.Options{
		N:               1,
		Ordering:        order,
		SendNextTimeout: defaultSendNextTimeout,
		Timeout:         defaultRPCTimeout,
		Trace:           trace,
	}
	// getArgs clones the arguments on demand for all but the first replica.
	firstArgs := true
	getArgs := func(addr net.Addr) proto.Message {
		var a *roachpb.BatchRequest
		// Use the supplied args proto if this is our first address.
		if firstArgs {
			firstArgs = false
			a = &ba
		} else {
			// Otherwise, copy the args value and set the replica in the header.
			a = proto.Clone(&ba).(*roachpb.BatchRequest)
		}
		if addr != nil {
			// TODO(tschottdorf): see len(replicas) above.
			a.Replica = *replicaMap[addr.String()]
		}
		return a
	}
	// RPCs are sent asynchronously and there is no synchronized access to
	// the reply object, so we don't pass the reply object itself to rpcSend.
	// Otherwise there may be a race: if the RPC call times out while we hold
	// on to our original reply object, we must not use it any more; the RPC
	// call might still return and write to it at any time.
	// args.CreateReply() should be cheaper than proto.Clone, which uses reflection.
	getReply := func() proto.Message {
		return &roachpb.BatchResponse{}
	}

	const method = "Node.Batch"
	replies, err := ds.rpcSend(rpcOpts, method, addrs, getArgs, getReply, ds.rpcContext)
	if err != nil {
		return nil, roachpb.NewError(err)
	}
	return replies[0].(*roachpb.BatchResponse), nil
}
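// The getArgs/getReply closures above exist because requests and replies are
// handed to concurrent RPC attempts: the first attempt can reuse the original
// request, every later attempt gets a clone, and every attempt gets a fresh
// reply. Below is a minimal sketch of that allocation pattern using toy
// request/response types; it is not the rpc package API.
package main

import (
	"fmt"
	"sync"
)

// request and response are toy stand-ins for the batch request/response protos.
type request struct {
	replica string
	keys    []string
}

type response struct{ from string }

// clone performs the deep copy that proto.Clone would do for a real request.
func (r *request) clone() *request {
	c := *r
	c.keys = append([]string(nil), r.keys...)
	return &c
}

func main() {
	orig := &request{keys: []string{"a", "b"}}
	replicas := []string{"r1", "r2", "r3"}

	// Clone on demand: the first attempt reuses the original request, later
	// attempts get copies, so concurrent attempts never share a mutable request.
	first := true
	getArgs := func(replica string) *request {
		var a *request
		if first {
			first = false
			a = orig
		} else {
			a = orig.clone()
		}
		a.replica = replica
		return a
	}
	// Every attempt also gets a fresh reply; a timed-out attempt may still
	// write to its reply later, so replies are never shared either.
	getReply := func() *response { return &response{} }

	var wg sync.WaitGroup
	for _, r := range replicas {
		args, reply := getArgs(r), getReply()
		wg.Add(1)
		go func(args *request, reply *response) {
			defer wg.Done()
			reply.from = args.replica // simulated RPC writing only its own objects
		}(args, reply)
	}
	wg.Wait()
	fmt.Println(orig.replica) // "r1": only the first attempt used the original request
}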
// process iterates through all keys in a replica's range, calling the garbage
// collector for each key and associated set of values. GC'd keys are batched
// into GC calls. Extant intents are resolved if intents are older than
// intentAgeThreshold. The transaction and sequence cache records are also
// scanned and old entries evicted. During normal operation, both of these
// records are cleaned up when their respective transaction finishes, so the
// amount of work done here is expected to be small.
//
// Some care needs to be taken to avoid cyclic recreation of entries during GC:
// * a Push initiated due to an intent may recreate a transaction entry
// * resolving an intent may write a new sequence cache entry
// * obtaining the transaction for a sequence cache entry requires a Push
//
// The following order is taken below:
// 1) collect all intents with sufficiently old txn record
// 2) collect these intents' transactions
// 3) scan the transaction table, collecting abandoned or completed txns
// 4) push all of these transactions (possibly recreating entries)
// 5) resolve all intents (unless the txn is still PENDING), which will recreate
//    sequence cache entries (but with the txn timestamp; i.e. likely gc'able)
// 6) scan the sequence table for old entries
// 7) push these transactions (again, recreating txn entries)
// 8) send a GCRequest
func (gcq *gcQueue) process(now roachpb.Timestamp, repl *Replica, sysCfg config.SystemConfig) error {
	snap := repl.store.Engine().NewSnapshot()
	desc := repl.Desc()
	iter := newReplicaDataIterator(desc, snap, true /* replicatedOnly */)
	defer iter.Close()
	defer snap.Close()

	// Lookup the GC policy for the zone containing this key range.
	zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey)
	if err != nil {
		return util.Errorf("could not find zone config for range %s: %s", repl, err)
	}

	gc := engine.NewGarbageCollector(now, zone.GC)

	// Compute intent expiration (intent age at which we attempt to resolve).
	intentExp := now
	intentExp.WallTime -= intentAgeThreshold.Nanoseconds()
	txnExp := now
	txnExp.WallTime -= txnCleanupThreshold.Nanoseconds()

	gcArgs := &roachpb.GCRequest{}
	// TODO(tschottdorf): This is one of these instances in which we want
	// to be more careful that the request ends up on the correct Replica,
	// and we might have to worry about mixing range-local and global keys
	// in a batch which might end up spanning Ranges by the time it executes.
	gcArgs.Key = desc.StartKey.AsRawKey()
	gcArgs.EndKey = desc.EndKey.AsRawKey()

	var expBaseKey roachpb.Key
	var keys []engine.MVCCKey
	var vals [][]byte

	// Maps from txn ID to txn and intent key slice.
	txnMap := map[uuid.UUID]*roachpb.Transaction{}
	intentSpanMap := map[uuid.UUID][]roachpb.Span{}

	// processKeysAndValues is invoked with each key and its set of
	// values. Intents older than the intent age threshold are sent for
	// resolution and values after the MVCC metadata, and possible
	// intent, are sent for garbage collection.
	var intentCount int
	processKeysAndValues := func() {
		// If there's more than a single value for the key, possibly send for GC.
		if len(keys) > 1 {
			meta := &engine.MVCCMetadata{}
			if err := proto.Unmarshal(vals[0], meta); err != nil {
				log.Errorf("unable to unmarshal MVCC metadata for key %q: %s", keys[0], err)
			} else {
				// In the event that there's an active intent, send for
				// intent resolution if older than the threshold.
				startIdx := 1
				if meta.Txn != nil {
					// Keep track of intent to resolve if older than the intent
					// expiration threshold.
					if meta.Timestamp.Less(intentExp) {
						txnID := *meta.Txn.ID
						txn := &roachpb.Transaction{
							TxnMeta: *meta.Txn,
						}
						txnMap[txnID] = txn
						intentCount++
						intentSpanMap[txnID] = append(intentSpanMap[txnID], roachpb.Span{Key: expBaseKey})
					}
					// With an active intent, GC ignores MVCC metadata & intent value.
					startIdx = 2
				}
				// See if any values may be GC'd.
				if gcTS := gc.Filter(keys[startIdx:], vals[startIdx:]); !gcTS.Equal(roachpb.ZeroTimestamp) {
					// TODO(spencer): need to split the requests up into
					// multiple requests in the event that more than X keys
					// are added to the request.
					gcArgs.Keys = append(gcArgs.Keys, roachpb.GCRequest_GCKey{Key: expBaseKey, Timestamp: gcTS})
				}
			}
		}
	}

	// Iterate through the keys and values of this replica's range.
	for ; iter.Valid(); iter.Next() {
		iterKey := iter.Key()
		if !iterKey.IsValue() || !iterKey.Key.Equal(expBaseKey) {
			// Moving to the next key (& values).
			processKeysAndValues()
			expBaseKey = iterKey.Key
			if !iterKey.IsValue() {
				keys = []engine.MVCCKey{iter.Key()}
				vals = [][]byte{iter.Value()}
				continue
			}
			// An implicit metadata.
			keys = []engine.MVCCKey{engine.MakeMVCCMetadataKey(iterKey.Key)}
			// A nil value for the encoded MVCCMetadata. This will unmarshal to an
			// empty MVCCMetadata which is sufficient for processKeysAndValues to
			// determine that there is no intent.
			vals = [][]byte{nil}
		}
		keys = append(keys, iter.Key())
		vals = append(vals, iter.Value())
	}
	if iter.Error() != nil {
		return iter.Error()
	}
	// Handle last collected set of keys/vals.
	processKeysAndValues()

	gcq.eventLog.Infof(true, "assembled %d transactions from %d old intents; found %d gc'able keys",
		len(txnMap), intentCount, len(gcArgs.Keys))

	txnKeys, err := gcq.processTransactionTable(repl, txnMap, txnExp)
	if err != nil {
		return err
	}
	// From now on, all newly added keys are range-local.
	// TODO(tschottdorf): Might need to use two requests at some point since we
	// hard-coded the full non-local key range in the header, but that does
	// not take into account the range-local keys. It will be OK as long as
	// we send directly to the Replica, though.
	gcArgs.Keys = append(gcArgs.Keys, txnKeys...)

	// Process push transactions in parallel.
	var wg sync.WaitGroup
	gcq.eventLog.Infof(true, "pushing %d txns", len(txnMap))
	for _, txn := range txnMap {
		if txn.Status != roachpb.PENDING {
			continue
		}
		wg.Add(1)
		go gcq.pushTxn(repl, now, txn, roachpb.PUSH_ABORT, &wg)
	}
	wg.Wait()

	// Resolve all intents.
	var intents []roachpb.Intent
	for txnID, txn := range txnMap {
		if txn.Status != roachpb.PENDING {
			for _, intent := range intentSpanMap[txnID] {
				intents = append(intents, roachpb.Intent{Span: intent, Status: txn.Status, Txn: txn.TxnMeta})
			}
		}
	}

	gcq.eventLog.Infof(true, "resolving %d intents", len(intents))

	if pErr := repl.store.intentResolver.resolveIntents(repl.context(), repl, intents,
		true /* wait */, false /* !poison */); pErr != nil {
		return pErr.GoError()
	}

	// Deal with any leftover sequence cache keys. There shouldn't be many of
	// them.
	leftoverSeqCacheKeys := gcq.processSequenceCache(repl, now, txnExp, txnMap)
	gcq.eventLog.Infof(true, "collected %d leftover sequence cache keys", len(leftoverSeqCacheKeys))
	gcArgs.Keys = append(gcArgs.Keys, leftoverSeqCacheKeys...)

	gcq.eventLog.Infof(true, "sending gc request for %d keys", len(gcArgs.Keys))

	var ba roachpb.BatchRequest
	// Technically not needed since we're talking directly to the Range.
	ba.RangeID = desc.RangeID
	ba.Timestamp = now
	ba.Add(gcArgs)
	if _, pErr := repl.Send(repl.context(), ba); pErr != nil {
		return pErr.GoError()
	}

	return nil
}
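// The intent and txn expiration thresholds above are plain wall-clock
// subtractions from now. A tiny self-contained sketch of that arithmetic,
// using a simplified wall-time-only timestamp and made-up threshold values
// (the real constants live in the gc queue package):
package main

import (
	"fmt"
	"time"
)

// timestamp is a simplified stand-in for roachpb.Timestamp.
type timestamp struct{ WallTime int64 }

func (t timestamp) Less(o timestamp) bool { return t.WallTime < o.WallTime }

func main() {
	// Illustrative thresholds only.
	const intentAgeThreshold = 2 * time.Hour
	const txnCleanupThreshold = time.Hour

	now := timestamp{WallTime: time.Date(2016, 1, 1, 12, 0, 0, 0, time.UTC).UnixNano()}

	// Same pattern as the code above: copy now and subtract the threshold.
	intentExp := now
	intentExp.WallTime -= intentAgeThreshold.Nanoseconds()
	txnExp := now
	txnExp.WallTime -= txnCleanupThreshold.Nanoseconds()

	// An intent written three hours ago is older than both thresholds, so it
	// would be collected for resolution.
	intentTS := timestamp{WallTime: now.WallTime - (3 * time.Hour).Nanoseconds()}
	fmt.Println(intentTS.Less(intentExp)) // true: old enough to resolve
	fmt.Println(intentTS.Less(txnExp))    // true: its txn record is also stale
}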
// process iterates through all keys in a replica's range, calling the garbage
// collector for each key and associated set of values. GC'd keys are batched
// into GC calls. Extant intents are resolved if intents are older than
// intentAgeThreshold.
func (gcq *gcQueue) process(now roachpb.Timestamp, repl *Replica, sysCfg *config.SystemConfig) error {
	snap := repl.store.Engine().NewSnapshot()
	desc := repl.Desc()
	iter := newReplicaDataIterator(desc, snap)
	defer iter.Close()
	defer snap.Close()

	// Lookup the GC policy for the zone containing this key range.
	zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey)
	if err != nil {
		return fmt.Errorf("could not find GC policy for range %s: %s", repl, err)
	}
	policy := zone.GC

	gcMeta := roachpb.NewGCMetadata(now.WallTime)
	gc := engine.NewGarbageCollector(now, *policy)

	// Compute intent expiration (intent age at which we attempt to resolve).
	intentExp := now
	intentExp.WallTime -= intentAgeThreshold.Nanoseconds()
	txnExp := now
	txnExp.WallTime -= txnCleanupThreshold.Nanoseconds()

	gcArgs := &roachpb.GCRequest{}
	// TODO(tschottdorf): This is one of these instances in which we want
	// to be more careful that the request ends up on the correct Replica,
	// and we might have to worry about mixing range-local and global keys
	// in a batch which might end up spanning Ranges by the time it executes.
	gcArgs.Key = desc.StartKey.AsRawKey()
	gcArgs.EndKey = desc.EndKey.AsRawKey()

	var expBaseKey roachpb.Key
	var keys []engine.MVCCKey
	var vals [][]byte

	// Maps from txn ID to txn and intent key slice.
	txnMap := map[string]*roachpb.Transaction{}
	intentSpanMap := map[string][]roachpb.Span{}

	// processKeysAndValues is invoked with each key and its set of
	// values. Intents older than the intent age threshold are sent for
	// resolution and values after the MVCC metadata, and possible
	// intent, are sent for garbage collection.
	processKeysAndValues := func() {
		// If there's more than a single value for the key, possibly send for GC.
		if len(keys) > 1 {
			meta := &engine.MVCCMetadata{}
			if err := proto.Unmarshal(vals[0], meta); err != nil {
				log.Errorf("unable to unmarshal MVCC metadata for key %q: %s", keys[0], err)
			} else {
				// In the event that there's an active intent, send for
				// intent resolution if older than the threshold.
				startIdx := 1
				if meta.Txn != nil {
					// Keep track of intent to resolve if older than the intent
					// expiration threshold.
					if meta.Timestamp.Less(intentExp) {
						id := string(meta.Txn.ID)
						txnMap[id] = meta.Txn
						intentSpanMap[id] = append(intentSpanMap[id], roachpb.Span{Key: expBaseKey})
					}
					// With an active intent, GC ignores MVCC metadata & intent value.
					startIdx = 2
				}
				// See if any values may be GC'd.
				if gcTS := gc.Filter(keys[startIdx:], vals[startIdx:]); !gcTS.Equal(roachpb.ZeroTimestamp) {
					// TODO(spencer): need to split the requests up into
					// multiple requests in the event that more than X keys
					// are added to the request.
					gcArgs.Keys = append(gcArgs.Keys, roachpb.GCRequest_GCKey{Key: expBaseKey, Timestamp: gcTS})
				}
			}
		}
	}

	// Iterate through the keys and values of this replica's range.
	for ; iter.Valid(); iter.Next() {
		baseKey, ts, isValue, err := engine.MVCCDecodeKey(iter.Key())
		if err != nil {
			log.Errorf("unable to decode MVCC key: %q: %v", iter.Key(), err)
			continue
		}
		if !isValue {
			// Moving to the next key (& values).
			processKeysAndValues()
			expBaseKey = baseKey
			keys = []engine.MVCCKey{iter.Key()}
			vals = [][]byte{iter.Value()}
		} else {
			if !baseKey.Equal(expBaseKey) {
				log.Errorf("unexpectedly found a value for %q with ts=%s; expected key %q", baseKey, ts, expBaseKey)
				continue
			}
			keys = append(keys, iter.Key())
			vals = append(vals, iter.Value())
		}
	}
	if iter.Error() != nil {
		return iter.Error()
	}
	// Handle last collected set of keys/vals.
	processKeysAndValues()

	txnKeys, err := processTransactionTable(repl, txnMap, txnExp)
	if err != nil {
		return err
	}
	// From now on, all newly added keys are range-local.
	// TODO(tschottdorf): Might need to use two requests at some point since we
	// hard-coded the full non-local key range in the header, but that does
	// not take into account the range-local keys. It will be OK as long as
	// we send directly to the Replica, though.
	gcArgs.Keys = append(gcArgs.Keys, txnKeys...)

	// Process push transactions in parallel.
	var wg sync.WaitGroup
	for _, txn := range txnMap {
		if txn.Status != roachpb.PENDING {
			continue
		}
		wg.Add(1)
		go pushTxn(repl, now, txn, roachpb.ABORT_TXN, &wg)
	}
	wg.Wait()

	// Resolve all intents.
	var intents []roachpb.Intent
	for id, txn := range txnMap {
		if txn.Status != roachpb.PENDING {
			for _, intent := range intentSpanMap[id] {
				intents = append(intents, roachpb.Intent{Span: intent, Txn: *txn})
			}
		}
	}

	if err := repl.resolveIntents(repl.context(), intents, true /* wait */, false /* !poison */); err != nil {
		return err
	}

	// Deal with any leftover sequence cache keys. There shouldn't be many of
	// them.
	gcArgs.Keys = append(gcArgs.Keys, processSequenceCache(repl, now, txnExp, txnMap)...)

	// Send GC request through range.
	gcArgs.GCMeta = *gcMeta

	var ba roachpb.BatchRequest
	// Technically not needed since we're talking directly to the Range.
	ba.RangeID = desc.RangeID
	ba.Timestamp = now
	ba.Add(gcArgs)
	if _, pErr := repl.Send(repl.context(), ba); pErr != nil {
		return pErr.GoError()
	}

	// Store current timestamp as last verification for this replica, as
	// we've just successfully scanned.
	if err := repl.SetLastVerificationTimestamp(now); err != nil {
		log.Errorf("failed to set last verification timestamp for replica %s: %s", repl, err)
	}
	return nil
}
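// The iteration above relies on a streaming group-by: consecutive MVCC keys
// with the same base key are accumulated, the accumulated group is processed
// whenever the base key changes, and once more after the loop. The standalone
// sketch below shows that pattern with plain strings instead of MVCC keys; the
// kv type and the data are made up for illustration.
package main

import "fmt"

// kv is a toy stand-in for an MVCC key/value pair; base groups versions of one key.
type kv struct {
	base, version string
}

func main() {
	data := []kv{
		{"a", "meta"}, {"a", "ts3"}, {"a", "ts1"},
		{"b", "meta"}, {"b", "ts2"},
	}

	var expBase string
	var group []kv

	// process mirrors processKeysAndValues: it sees one base key's versions at a time.
	process := func() {
		if len(group) > 0 {
			fmt.Printf("key %q has %d entries\n", expBase, len(group))
		}
	}

	for _, item := range data {
		if item.base != expBase {
			process() // flush the previous key's group
			expBase = item.base
			group = group[:0]
		}
		group = append(group, item)
	}
	process() // flush the last group, as the code above does after the loop
}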
// InitOrJoinRequest executes a RequestLease command asynchronously and returns a
// channel on which the result will be posted. If there's already a request in
// progress, we join in waiting for the results of that request.
// It is an error to call InitOrJoinRequest() while a request is in progress
// naming another replica as lease holder.
//
// replica is used to schedule and execute async work (proposing a RequestLease
// command). replica.mu is locked when delivering results, so calls from the
// replica happen either before or after a result for a pending request has
// happened.
//
// transfer needs to be set if the request represents a lease transfer (as
// opposed to an extension, or acquiring the lease when none is held).
//
// Note: Once this function gets a context to be used for cancellation, instead
// of replica.store.Stopper().ShouldQuiesce(), care will be needed for cancelling
// the Raft command, similar to replica.addWriteCmd.
func (p *pendingLeaseRequest) InitOrJoinRequest(
	replica *Replica,
	nextLeaseHolder roachpb.ReplicaDescriptor,
	timestamp hlc.Timestamp,
	startKey roachpb.Key,
	transfer bool,
) <-chan *roachpb.Error {
	if nextLease := p.RequestPending(); nextLease != nil {
		if nextLease.Replica.ReplicaID == nextLeaseHolder.ReplicaID {
			// Join a pending request asking for the same replica to become lease
			// holder.
			return p.JoinRequest()
		}
		llChan := make(chan *roachpb.Error, 1)
		// We can't join the request in progress.
		llChan <- roachpb.NewErrorf("request for different replica in progress "+
			"(requesting: %+v, in progress: %+v)",
			nextLeaseHolder.ReplicaID, nextLease.Replica.ReplicaID)
		return llChan
	}
	llChan := make(chan *roachpb.Error, 1)
	// No request in progress. Let's propose a Lease command asynchronously.
	// TODO(tschottdorf): get duration from configuration, either as a
	// config flag or, later, dynamically adjusted.
	startStasis := timestamp.Add(int64(replica.store.ctx.rangeLeaseActiveDuration), 0)
	expiration := startStasis.Add(int64(replica.store.Clock().MaxOffset()), 0)
	reqSpan := roachpb.Span{
		Key: startKey,
	}
	var leaseReq roachpb.Request
	reqLease := roachpb.Lease{
		Start:       timestamp,
		StartStasis: startStasis,
		Expiration:  expiration,
		Replica:     nextLeaseHolder,
	}
	if transfer {
		leaseReq = &roachpb.TransferLeaseRequest{
			Span:  reqSpan,
			Lease: reqLease,
		}
	} else {
		leaseReq = &roachpb.RequestLeaseRequest{
			Span:  reqSpan,
			Lease: reqLease,
		}
	}
	if replica.store.Stopper().RunAsyncTask(func() {
		// Propose a RequestLease command and wait for it to apply.
		var execPErr *roachpb.Error
		ba := roachpb.BatchRequest{}
		ba.Timestamp = replica.store.Clock().Now()
		ba.RangeID = replica.RangeID
		ba.Add(leaseReq)
		// Send the lease request directly to raft in order to skip unnecessary
		// checks from the normal request machinery (e.g. the command queue).
		// Note that the command itself isn't traced, but usually the caller
		// waiting for the result has an active Trace.
		ch, _, err := replica.proposeRaftCommand(
			replica.context(context.Background()), ba)
		if err != nil {
			execPErr = roachpb.NewError(err)
		} else {
			// If the command was committed, wait for the range to apply it.
			select {
			case c := <-ch:
				if c.Err != nil {
					if log.V(1) {
						log.Infof("failed to acquire lease for replica %s: %s", replica.store, c.Err)
					}
					execPErr = c.Err
				}
			case <-replica.store.Stopper().ShouldQuiesce():
				execPErr = roachpb.NewError(
					replica.newNotLeaseHolderError(nil, replica.store.StoreID(), replica.Desc()))
			}
		}
		// Send result of lease to all waiter channels.
		replica.mu.Lock()
		defer replica.mu.Unlock()
		for i, llChan := range p.llChans {
			// Don't send the same pErr object twice; this can lead to races. We could
			// clone every time but it's more efficient to send pErr itself to one of
			// the channels (the last one; if we send it earlier the race can still
			// happen).
			if i == len(p.llChans)-1 {
				llChan <- execPErr
			} else {
				llChan <- protoutil.Clone(execPErr).(*roachpb.Error) // works with `nil`
			}
		}
		p.llChans = p.llChans[:0]
		p.nextLease = roachpb.Lease{}
	}) != nil {
		// We failed to start the asynchronous task. Send a blank NotLeaseHolderError
		// back to indicate that we have no idea who the range lease holder might
		// be; we've withdrawn from active duty.
		llChan <- roachpb.NewError(
			replica.newNotLeaseHolderError(nil, replica.store.StoreID(), replica.mu.state.Desc))
		return llChan
	}
	p.llChans = append(p.llChans, llChan)
	p.nextLease = reqLease
	return llChan
}
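// The delivery loop above avoids handing the same mutable error object to more
// than one waiter: every channel but the last receives a clone. The sketch
// below shows that broadcast pattern with a toy result type; a plain struct
// copy stands in for protoutil.Clone, and the names are made up.
package main

import "fmt"

// leaseResult is a toy mutable result; sharing one instance across waiters
// could race if any waiter modified it after receipt.
type leaseResult struct{ msg string }

func (r *leaseResult) clone() *leaseResult {
	if r == nil {
		return nil // mirrors the "works with nil" property relied on above
	}
	c := *r
	return &c
}

// broadcast delivers res to every waiter channel, cloning for all but the
// last so no two waiters ever share the same object.
func broadcast(waiters []chan *leaseResult, res *leaseResult) {
	for i, ch := range waiters {
		if i == len(waiters)-1 {
			ch <- res
		} else {
			ch <- res.clone()
		}
	}
}

func main() {
	waiters := []chan *leaseResult{
		make(chan *leaseResult, 1),
		make(chan *leaseResult, 1),
	}
	broadcast(waiters, &leaseResult{msg: "lease acquired"})
	a, b := <-waiters[0], <-waiters[1]
	fmt.Println(a.msg, b.msg, a != b) // same contents, distinct objects
}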
// process iterates through all keys in a replica's range, calling the garbage
// collector for each key and associated set of values. GC'd keys are batched
// into GC calls. Extant intents are resolved if intents are older than
// intentAgeThreshold.
func (gcq *gcQueue) process(now roachpb.Timestamp, repl *Replica, sysCfg *config.SystemConfig) error {
	snap := repl.store.Engine().NewSnapshot()
	desc := repl.Desc()
	iter := newReplicaDataIterator(desc, snap)
	defer iter.Close()
	defer snap.Close()

	// Lookup the GC policy for the zone containing this key range.
	zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey)
	if err != nil {
		return fmt.Errorf("could not find GC policy for range %s: %s", repl, err)
	}
	policy := zone.GC

	gcMeta := roachpb.NewGCMetadata(now.WallTime)
	gc := engine.NewGarbageCollector(now, *policy)

	// Compute intent expiration (intent age at which we attempt to resolve).
	intentExp := now
	intentExp.WallTime -= intentAgeThreshold.Nanoseconds()

	// TODO(tschottdorf): execution will use a leader-assigned local
	// timestamp to compute intent age. While this should be fine, could
	// consider adding a Now timestamp to GCRequest which would be used
	// instead.
	gcArgs := &roachpb.GCRequest{}
	var mu sync.Mutex
	var oldestIntentNanos int64 = math.MaxInt64
	var expBaseKey roachpb.Key
	var keys []engine.MVCCKey
	var vals [][]byte

	// Maps from txn ID to txn and intent key slice.
	txnMap := map[string]*roachpb.Transaction{}
	intentSpanMap := map[string][]roachpb.Span{}

	// updateOldestIntent atomically updates the oldest intent.
	updateOldestIntent := func(intentNanos int64) {
		mu.Lock()
		defer mu.Unlock()
		if intentNanos < oldestIntentNanos {
			oldestIntentNanos = intentNanos
		}
	}

	// processKeysAndValues is invoked with each key and its set of
	// values. Intents older than the intent age threshold are sent for
	// resolution and values after the MVCC metadata, and possible
	// intent, are sent for garbage collection.
	processKeysAndValues := func() {
		// If there's more than a single value for the key, possibly send for GC.
		if len(keys) > 1 {
			meta := &engine.MVCCMetadata{}
			if err := proto.Unmarshal(vals[0], meta); err != nil {
				log.Errorf("unable to unmarshal MVCC metadata for key %q: %s", keys[0], err)
			} else {
				// In the event that there's an active intent, send for
				// intent resolution if older than the threshold.
				startIdx := 1
				if meta.Txn != nil {
					// Keep track of intent to resolve if older than the intent
					// expiration threshold.
					if meta.Timestamp.Less(intentExp) {
						id := string(meta.Txn.ID)
						txnMap[id] = meta.Txn
						intentSpanMap[id] = append(intentSpanMap[id], roachpb.Span{Key: expBaseKey})
					} else {
						updateOldestIntent(meta.Txn.OrigTimestamp.WallTime)
					}
					// With an active intent, GC ignores MVCC metadata & intent value.
					startIdx = 2
				}
				// See if any values may be GC'd.
				if gcTS := gc.Filter(keys[startIdx:], vals[startIdx:]); !gcTS.Equal(roachpb.ZeroTimestamp) {
					// TODO(spencer): need to split the requests up into
					// multiple requests in the event that more than X keys
					// are added to the request.
					gcArgs.Keys = append(gcArgs.Keys, roachpb.GCRequest_GCKey{Key: expBaseKey, Timestamp: gcTS})
				}
			}
		}
	}

	// Iterate through the keys and values of this replica's range.
	for ; iter.Valid(); iter.Next() {
		baseKey, ts, isValue, err := engine.MVCCDecodeKey(iter.Key())
		if err != nil {
			log.Errorf("unable to decode MVCC key: %q: %v", iter.Key(), err)
			continue
		}
		if !isValue {
			// Moving to the next key (& values).
			processKeysAndValues()
			expBaseKey = baseKey
			keys = []engine.MVCCKey{iter.Key()}
			vals = [][]byte{iter.Value()}
		} else {
			if !baseKey.Equal(expBaseKey) {
				log.Errorf("unexpectedly found a value for %q with ts=%s; expected key %q", baseKey, ts, expBaseKey)
				continue
			}
			keys = append(keys, iter.Key())
			vals = append(vals, iter.Value())
		}
	}
	if iter.Error() != nil {
		return iter.Error()
	}
	// Handle last collected set of keys/vals.
	processKeysAndValues()

	// Process push transactions in parallel.
	var wg sync.WaitGroup
	for _, txn := range txnMap {
		wg.Add(1)
		go gcq.pushTxn(repl, now, txn, updateOldestIntent, &wg)
	}
	wg.Wait()

	// Resolve all intents.
	var intents []roachpb.Intent
	for id, txn := range txnMap {
		if txn.Status != roachpb.PENDING {
			for _, intent := range intentSpanMap[id] {
				intents = append(intents, roachpb.Intent{Span: intent, Txn: *txn})
			}
		}
	}

	done := true
	if len(intents) > 0 {
		done = false
		repl.resolveIntents(repl.context(), intents)
	}

	// Set start and end keys.
	if len(gcArgs.Keys) > 0 {
		done = false
		gcArgs.Key = gcArgs.Keys[0].Key
		gcArgs.EndKey = gcArgs.Keys[len(gcArgs.Keys)-1].Key.Next()
	}

	if done {
		return nil
	}

	// Send GC request through range.
	gcMeta.OldestIntentNanos = proto.Int64(oldestIntentNanos)
	gcArgs.GCMeta = *gcMeta

	var ba roachpb.BatchRequest
	// Technically not needed since we're talking directly to the Range.
	ba.RangeID = desc.RangeID
	ba.Add(gcArgs)
	if _, pErr := repl.Send(repl.context(), ba); pErr != nil {
		return pErr.GoError()
	}

	// Store current timestamp as last verification for this replica, as
	// we've just successfully scanned.
	if err := repl.SetLastVerificationTimestamp(now); err != nil {
		log.Errorf("failed to set last verification timestamp for replica %s: %s", repl, err)
	}
	return nil
}
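// updateOldestIntent above is a mutex-guarded running minimum that multiple
// pushTxn goroutines can feed concurrently. A compact, self-contained sketch
// of that idiom follows; minTracker and the sample values are made up for
// illustration.
package main

import (
	"fmt"
	"math"
	"sync"
)

// minTracker keeps the smallest value reported by any goroutine, the same
// shape as oldestIntentNanos/updateOldestIntent in the code above.
type minTracker struct {
	mu  sync.Mutex
	min int64
}

func newMinTracker() *minTracker { return &minTracker{min: math.MaxInt64} }

func (m *minTracker) update(v int64) {
	m.mu.Lock()
	defer m.mu.Unlock()
	if v < m.min {
		m.min = v
	}
}

func main() {
	tracker := newMinTracker()
	var wg sync.WaitGroup
	for _, nanos := range []int64{5000, 1200, 9000, 700} {
		wg.Add(1)
		go func(n int64) {
			defer wg.Done()
			tracker.update(n) // e.g. an intent timestamp observed during a push
		}(nanos)
	}
	wg.Wait()
	fmt.Println(tracker.min) // 700
}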