// process iterates through all keys in a replica's range, calling the garbage // collector for each key and associated set of values. GC'd keys are batched // into GC calls. Extant intents are resolved if intents are older than // intentAgeThreshold. The transaction and abort cache records are also // scanned and old entries evicted. During normal operation, both of these // records are cleaned up when their respective transaction finishes, so the // amount of work done here is expected to be small. // // Some care needs to be taken to avoid cyclic recreation of entries during GC: // * a Push initiated due to an intent may recreate a transaction entry // * resolving an intent may write a new abort cache entry // * obtaining the transaction for a abort cache entry requires a Push // // The following order is taken below: // 1) collect all intents with sufficiently old txn record // 2) collect these intents' transactions // 3) scan the transaction table, collecting abandoned or completed txns // 4) push all of these transactions (possibly recreating entries) // 5) resolve all intents (unless the txn is still PENDING), which will recreate // abort cache entries (but with the txn timestamp; i.e. likely gc'able) // 6) scan the abort cache table for old entries // 7) push these transactions (again, recreating txn entries). // 8) send a GCRequest. func (gcq *gcQueue) process( ctx context.Context, now hlc.Timestamp, repl *Replica, sysCfg config.SystemConfig, ) error { snap := repl.store.Engine().NewSnapshot() desc := repl.Desc() defer snap.Close() // Lookup the GC policy for the zone containing this key range. zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey) if err != nil { return errors.Errorf("could not find zone config for range %s: %s", repl, err) } gcKeys, info, err := RunGC(ctx, desc, snap, now, zone.GC, func(now hlc.Timestamp, txn *roachpb.Transaction, typ roachpb.PushTxnType) { pushTxn(ctx, gcq.store.DB(), now, txn, typ) }, func(intents []roachpb.Intent, poison bool, wait bool) error { return repl.store.intentResolver.resolveIntents(ctx, intents, poison, wait) }) if err != nil { return err } log.VEventf(ctx, 1, "completed with stats %+v", info) info.updateMetrics(gcq.store.metrics) var ba roachpb.BatchRequest var gcArgs roachpb.GCRequest // TODO(tschottdorf): This is one of these instances in which we want // to be more careful that the request ends up on the correct Replica, // and we might have to worry about mixing range-local and global keys // in a batch which might end up spanning Ranges by the time it executes. gcArgs.Key = desc.StartKey.AsRawKey() gcArgs.EndKey = desc.EndKey.AsRawKey() gcArgs.Keys = gcKeys gcArgs.Threshold = info.Threshold gcArgs.TxnSpanGCThreshold = info.TxnSpanGCThreshold // Technically not needed since we're talking directly to the Range. ba.RangeID = desc.RangeID ba.Timestamp = now ba.Add(&gcArgs) if _, pErr := repl.Send(ctx, ba); pErr != nil { log.ErrEvent(ctx, pErr.String()) return pErr.GoError() } return nil }
// processIntentsAsync asynchronously processes intents which were // encountered during another command but did not interfere with the // execution of that command. This occurs in two cases: inconsistent // reads and EndTransaction (which queues its own external intents for // processing via this method). The two cases are handled somewhat // differently and would be better served by different entry points, // but combining them simplifies the plumbing necessary in Replica. func (ir *intentResolver) processIntentsAsync(r *Replica, intents []intentsWithArg) { now := r.store.Clock().Now() ctx := context.TODO() stopper := r.store.Stopper() for _, item := range intents { if item.args.Method() != roachpb.EndTransaction { if err := stopper.RunLimitedAsyncTask( ctx, ir.sem, true /* wait */, func(ctx context.Context) { // Everything here is best effort; give up rather than waiting // too long (helps avoid deadlocks during test shutdown, // although this is imperfect due to the use of an // uninterruptible WaitGroup.Wait in beginCmds). ctxWithTimeout, cancel := context.WithTimeout(ctx, base.NetworkTimeout) defer cancel() h := roachpb.Header{Timestamp: now} resolveIntents, pushErr := ir.maybePushTransactions(ctxWithTimeout, item.intents, h, roachpb.PUSH_TOUCH, true /* skipInFlight */) // resolveIntents with poison=true because we're resolving // intents outside of the context of an EndTransaction. // // Naively, it doesn't seem like we need to poison the abort // cache since we're pushing with PUSH_TOUCH - meaning that // the primary way our Push leads to aborting intents is that // of the transaction having timed out (and thus presumably no // client being around any more, though at the time of writing // we don't guarantee that). But there are other paths in which // the Push comes back successful while the coordinating client // may still be active. Examples of this are when: // // - the transaction was aborted by someone else, but the // coordinating client may still be running. // - the transaction entry wasn't written yet, which at the // time of writing has our push abort it, leading to the // same situation as above. // // Thus, we must poison. if err := ir.resolveIntents(ctxWithTimeout, resolveIntents, true /* wait */, true /* poison */); err != nil { log.Warningf(ctx, "%s: failed to resolve intents: %s", r, err) return } if pushErr != nil { log.Warningf(ctx, "%s: failed to push during intent resolution: %s", r, pushErr) return } }); err != nil { log.Warningf(ctx, "failed to resolve intents: %s", err) return } } else { // EndTransaction if err := stopper.RunLimitedAsyncTask( ctx, ir.sem, true /* wait */, func(ctx context.Context) { ctxWithTimeout, cancel := context.WithTimeout(ctx, base.NetworkTimeout) defer cancel() // For EndTransaction, we know the transaction is finalized so // we can skip the push and go straight to the resolve. // // This mechanism assumes that when an EndTransaction fails, // the client makes no assumptions about the result. For // example, an attempt to explicitly rollback the transaction // may succeed (triggering this code path), but the result may // not make it back to the client. if err := ir.resolveIntents(ctxWithTimeout, item.intents, true /* wait */, false /* !poison */); err != nil { log.Warningf(ctx, "%s: failed to resolve intents: %s", r, err) return } // We successfully resolved the intents, so we're able to GC from // the txn span directly. b := &client.Batch{} txn := item.intents[0].Txn txnKey := keys.TransactionKey(txn.Key, *txn.ID) // This is pretty tricky. Transaction keys are range-local and // so they are encoded specially. The key range addressed by // (txnKey, txnKey.Next()) might be empty (since Next() does // not imply monotonicity on the address side). Instead, we // send this request to a range determined using the resolved // transaction anchor, i.e. if the txn is anchored on // /Local/RangeDescriptor/"a"/uuid, the key range below would // be ["a", "a\x00"). However, the first range is special again // because the above procedure results in KeyMin, but we need // at least KeyLocalMax. // // #7880 will address this by making GCRequest less special and // thus obviating the need to cook up an artificial range here. var gcArgs roachpb.GCRequest { key := keys.MustAddr(txn.Key) if localMax := keys.MustAddr(keys.LocalMax); key.Less(localMax) { key = localMax } endKey := key.Next() gcArgs.Span = roachpb.Span{ Key: key.AsRawKey(), EndKey: endKey.AsRawKey(), } } gcArgs.Keys = append(gcArgs.Keys, roachpb.GCRequest_GCKey{ Key: txnKey, }) b.AddRawRequest(&gcArgs) if err := ir.store.db.Run(ctx, b); err != nil { log.Warningf(ctx, "could not GC completed transaction anchored at %s: %s", roachpb.Key(txn.Key), err) return } }); err != nil { log.Warningf(ctx, "failed to resolve intents: %s", err) return } } } }