// Send implements the batch.Sender interface. It subdivides
// the Batch into batches admissible for sending (preventing certain
// illegal mixtures of requests), executes each individual part
// (which may span multiple ranges), and recombines the response.
func (ds *DistSender) Send(ctx context.Context, ba roachpb.BatchRequest) (*roachpb.BatchResponse, *roachpb.Error) {
	// In the event that timestamp isn't set and read consistency isn't
	// required, set the timestamp using the local clock.
	// TODO(tschottdorf): right place for this?
	if ba.ReadConsistency == roachpb.INCONSISTENT && ba.Timestamp.Equal(roachpb.ZeroTimestamp) {
		// Make sure that after the call, args hasn't changed.
		defer func(timestamp roachpb.Timestamp) {
			ba.Timestamp = timestamp
		}(ba.Timestamp)
		ba.Timestamp = ds.clock.Now()
	}

	if ba.Txn != nil && len(ba.Txn.CertainNodes.Nodes) == 0 {
		// Ensure the local NodeID is marked as free from clock offset;
		// the transaction's timestamp was taken off the local clock.
		if nDesc := ds.getNodeDescriptor(); nDesc != nil {
			// TODO(tschottdorf): bad style to assume that ba.Txn is ours.
			// No race here, but should have a better way of doing this.
			// TODO(tschottdorf): future refactoring should move this to txn
			// creation in TxnCoordSender, which is currently unaware of the
			// NodeID (and wraps *DistSender through client.Sender since it
			// also needs test compatibility with *LocalSender).
			ba.Txn.CertainNodes.Add(nDesc.NodeID)
		}
	}

	// TODO(tschottdorf): provisional instantiation.
	return newChunkingSender(ds.sendChunk).Send(ctx, ba)
}
// Send implements the batch.Sender interface. It subdivides
// the Batch into batches admissible for sending (preventing certain
// illegal mixtures of requests), executes each individual part
// (which may span multiple ranges), and recombines the response.
func (ds *DistSender) Send(ctx context.Context, ba roachpb.BatchRequest) (*roachpb.BatchResponse, *roachpb.Error) {
	// In the event that timestamp isn't set and read consistency isn't
	// required, set the timestamp using the local clock.
	// TODO(tschottdorf): right place for this?
	if ba.ReadConsistency == roachpb.INCONSISTENT && ba.Timestamp.Equal(roachpb.ZeroTimestamp) {
		// Make sure that after the call, args hasn't changed.
		defer func(timestamp roachpb.Timestamp) {
			ba.Timestamp = timestamp
		}(ba.Timestamp)
		ba.Timestamp = ds.clock.Now()
	}

	// TODO(tschottdorf): provisional instantiation.
	return newChunkingSender(ds.sendChunk).Send(ctx, ba)
}
func testPut() roachpb.BatchRequest {
	var ba roachpb.BatchRequest
	ba.Timestamp = testTS
	put := &roachpb.PutRequest{}
	put.Key = testKey
	ba.Add(put)
	return ba
}
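// A minimal sketch of how a helper like testPut might be used to drive a
// sender in a unit test, assuming the Send(ctx, ba) signature shown above.
// newTestSender is a hypothetical constructor for the sender under test;
// the usual "testing" and context imports are assumed.
func TestSendPutSketch(t *testing.T) {
	ds := newTestSender(t) // hypothetical: builds a sender wired to a test store
	ba := testPut()
	if _, pErr := ds.Send(context.Background(), ba); pErr != nil {
		t.Fatalf("unexpected error sending put batch: %s", pErr)
	}
}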
func (tc *TxnCoordSender) heartbeat(id string, trace *tracer.Trace, ctx context.Context) bool {
	tc.Lock()
	proceed := true
	txnMeta := tc.txns[id]
	// Before we send a heartbeat, determine whether this transaction
	// should be considered abandoned. If so, exit heartbeat.
	if txnMeta.hasClientAbandonedCoord(tc.clock.PhysicalNow()) {
		// TODO(tschottdorf): should we be more proactive here?
		// The client might be continuing the transaction
		// through another coordinator, but in the most likely
		// case it's just gone and the open transaction record
		// could block concurrent operations.
		if log.V(1) {
			log.Infof("transaction %s abandoned; stopping heartbeat", txnMeta.txn)
		}
		proceed = false
	}
	// txnMeta.txn is possibly replaced concurrently,
	// so grab a copy before unlocking.
	txn := txnMeta.txn
	tc.Unlock()
	if !proceed {
		return false
	}

	hb := &roachpb.HeartbeatTxnRequest{}
	hb.Key = txn.Key
	ba := roachpb.BatchRequest{}
	ba.Timestamp = tc.clock.Now()
	ba.CmdID = ba.GetOrCreateCmdID(ba.Timestamp.WallTime)
	ba.Txn = txn.Clone()
	ba.Add(hb)

	epochEnds := trace.Epoch("heartbeat")
	_, err := tc.wrapped.Send(ctx, ba)
	epochEnds()
	// If the transaction is not in pending state, then we can stop
	// the heartbeat. It's either aborted or committed, and we resolve
	// write intents accordingly.
	if err != nil {
		log.Warningf("heartbeat to %s failed: %s", txn, err)
	}
	// TODO(bdarnell): once we have gotten a heartbeat response with
	// Status != PENDING, future heartbeats are useless. However, we
	// need to continue the heartbeatLoop until the client either
	// commits or abandons the transaction. We could save a little
	// pointless work by restructuring this loop to stop sending
	// heartbeats between the time that the transaction is aborted and
	// the client finds out. Furthermore, we could use this information
	// to send TransactionAbortedErrors to the client so it can restart
	// immediately instead of running until its EndTransaction.
	return true
}
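// Self-contained sketch (not the actual txnMetadata implementation) of the
// abandonment check performed by hasClientAbandonedCoord above: a
// transaction counts as abandoned when the coordinator has not heard from
// its client for longer than a configured timeout. The type and field
// names here are illustrative assumptions.
type txnMetaSketch struct {
	lastUpdateNanos int64 // wall time of the last client request, in ns
	timeoutNanos    int64 // client inactivity threshold, in ns
}

func (tm *txnMetaSketch) hasClientAbandonedCoord(nowNanos int64) bool {
	return nowNanos-tm.lastUpdateNanos > tm.timeoutNanos
}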
// process iterates through all keys in a replica's range, calling the garbage
// collector for each key and associated set of values. GC'd keys are batched
// into GC calls. Extant intents are resolved if intents are older than
// intentAgeThreshold. The transaction and abort cache records are also
// scanned and old entries evicted. During normal operation, both of these
// records are cleaned up when their respective transaction finishes, so the
// amount of work done here is expected to be small.
//
// Some care needs to be taken to avoid cyclic recreation of entries during GC:
// * a Push initiated due to an intent may recreate a transaction entry
// * resolving an intent may write a new abort cache entry
// * obtaining the transaction for an abort cache entry requires a Push
//
// The following order is taken below:
// 1) collect all intents with sufficiently old txn record
// 2) collect these intents' transactions
// 3) scan the transaction table, collecting abandoned or completed txns
// 4) push all of these transactions (possibly recreating entries)
// 5) resolve all intents (unless the txn is still PENDING), which will recreate
//    abort cache entries (but with the txn timestamp; i.e. likely gc'able)
// 6) scan the abort cache table for old entries
// 7) push these transactions (again, recreating txn entries)
// 8) send a GCRequest
func (gcq *gcQueue) process(
	ctx context.Context, now hlc.Timestamp, repl *Replica, sysCfg config.SystemConfig,
) error {
	snap := repl.store.Engine().NewSnapshot()
	desc := repl.Desc()
	defer snap.Close()

	// Lookup the GC policy for the zone containing this key range.
	zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey)
	if err != nil {
		return errors.Errorf("could not find zone config for range %s: %s", repl, err)
	}

	gcKeys, info, err := RunGC(ctx, desc, snap, now, zone.GC,
		func(now hlc.Timestamp, txn *roachpb.Transaction, typ roachpb.PushTxnType) {
			pushTxn(gcq.store.DB(), now, txn, typ)
		},
		func(intents []roachpb.Intent, poison bool, wait bool) error {
			return repl.store.intentResolver.resolveIntents(ctx, intents, poison, wait)
		})
	if err != nil {
		return err
	}

	gcq.eventLog.VInfof(true, "completed with stats %+v", info)

	var ba roachpb.BatchRequest
	var gcArgs roachpb.GCRequest
	// TODO(tschottdorf): This is one of these instances in which we want
	// to be more careful that the request ends up on the correct Replica,
	// and we might have to worry about mixing range-local and global keys
	// in a batch which might end up spanning Ranges by the time it executes.
	gcArgs.Key = desc.StartKey.AsRawKey()
	gcArgs.EndKey = desc.EndKey.AsRawKey()
	gcArgs.Keys = gcKeys
	gcArgs.Threshold = info.Threshold

	// Technically not needed since we're talking directly to the Range.
	ba.RangeID = desc.RangeID
	ba.Timestamp = now
	ba.Add(&gcArgs)
	if _, pErr := repl.Send(ctx, ba); pErr != nil {
		return pErr.GoError()
	}
	return nil
}
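// Sketch of the kind of threshold computation behind info.Threshold above:
// values older than now minus the zone's GC TTL become candidates for
// collection. This is an illustration only, not RunGC's actual internals;
// the helper name and ttlSeconds parameter are assumptions, and the "time"
// import is assumed.
func gcThresholdSketch(now hlc.Timestamp, ttlSeconds int32) hlc.Timestamp {
	threshold := now
	threshold.WallTime -= int64(ttlSeconds) * time.Second.Nanoseconds()
	return threshold
}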
// Send implements the batch.Sender interface. It subdivides
// the Batch into batches admissible for sending (preventing certain
// illegal mixtures of requests), executes each individual part
// (which may span multiple ranges), and recombines the response.
// When the request spans ranges, it is split up and the corresponding
// ranges queried serially, in ascending order.
// In particular, the first write in a transaction may not be part of the first
// request sent. This is relevant since the first write is a BeginTransaction
// request, thus opening up a window of time during which there may be intents
// of a transaction, but no entry. Pushing such a transaction will succeed, and
// may lead to the transaction being aborted early.
func (ds *DistSender) Send(ctx context.Context, ba roachpb.BatchRequest) (*roachpb.BatchResponse, *roachpb.Error) {
	// In the event that timestamp isn't set and read consistency isn't
	// required, set the timestamp using the local clock.
	if ba.ReadConsistency == roachpb.INCONSISTENT && ba.Timestamp.Equal(roachpb.ZeroTimestamp) {
		ba.Timestamp = ds.clock.Now()
	}

	if ba.Txn != nil && len(ba.Txn.CertainNodes.Nodes) == 0 {
		// Ensure the local NodeID is marked as free from clock offset;
		// the transaction's timestamp was taken off the local clock.
		if nDesc := ds.getNodeDescriptor(); nDesc != nil {
			// TODO(tschottdorf): bad style to assume that ba.Txn is ours.
			// No race here, but should have a better way of doing this.
			// TODO(tschottdorf): future refactoring should move this to txn
			// creation in TxnCoordSender, which is currently unaware of the
			// NodeID (and wraps *DistSender through client.Sender since it
			// also needs test compatibility with *LocalSender).
			ba.Txn.CertainNodes.Add(nDesc.NodeID)
		}
	}

	if len(ba.Requests) < 1 {
		panic("empty batch")
	}

	var rplChunks []*roachpb.BatchResponse
	parts := ba.Split(false /* don't split ET */)
	for len(parts) > 0 {
		part := parts[0]
		ba.Requests = part
		rpl, pErr, shouldSplitET := ds.sendChunk(ctx, ba)
		if shouldSplitET {
			// If we tried to send a single round-trip EndTransaction but
			// it looks like it's going to hit multiple ranges, split it
			// here and try again.
			if len(parts) != 1 {
				panic("EndTransaction not in last chunk of batch")
			}
			parts = ba.Split(true /* split ET */)
			if len(parts) != 2 {
				panic("split of final EndTransaction chunk resulted in != 2 parts")
			}
			continue
		}
		if pErr != nil {
			return nil, pErr
		}
		// Propagate transaction from last reply to next request. The final
		// update is taken and put into the response's main header.
		ba.Txn.Update(rpl.Header().Txn)
		rplChunks = append(rplChunks, rpl)
		parts = parts[1:]
	}

	reply := rplChunks[0]
	for _, rpl := range rplChunks[1:] {
		reply.Responses = append(reply.Responses, rpl.Responses...)
	}
	*reply.Header() = rplChunks[len(rplChunks)-1].BatchResponse_Header
	return reply, nil
}
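// Conceptual sketch of the splitting step above: requests are partitioned
// into contiguous chunks of compatible flavors (so that illegal mixtures are
// never sent together), and with splitET a trailing EndTransaction, which
// otherwise counts as a write, is forced into its own final chunk — this
// mirrors the second ba.Split(true) call in the loop above. The types and
// helper below are an illustration only, not roachpb's actual Split.
type reqSketch struct {
	isWrite  bool
	isEndTxn bool // EndTransaction counts as a write
}

func splitSketch(reqs []reqSketch, splitET bool) [][]reqSketch {
	var parts [][]reqSketch
	start := 0
	for i := 1; i <= len(reqs); i++ {
		boundary := i == len(reqs) ||
			reqs[i].isWrite != reqs[i-1].isWrite ||
			(splitET && reqs[i].isEndTxn)
		if boundary {
			parts = append(parts, reqs[start:i])
			start = i
		}
	}
	return parts
}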
// Send implements the batch.Sender interface. If the request is part of a // transaction, the TxnCoordSender adds the transaction to a map of active // transactions and begins heartbeating it. Every subsequent request for the // same transaction updates the lastUpdate timestamp to prevent live // transactions from being considered abandoned and garbage collected. // Read/write mutating requests have their key or key range added to the // transaction's interval tree of key ranges for eventual cleanup via resolved // write intents; they're tagged to an outgoing EndTransaction request, with // the receiving replica in charge of resolving them. func (tc *TxnCoordSender) Send(ctx context.Context, ba roachpb.BatchRequest) (*roachpb.BatchResponse, *roachpb.Error) { if err := tc.maybeBeginTxn(&ba); err != nil { return nil, roachpb.NewError(err) } ba.CmdID = ba.GetOrCreateCmdID(tc.clock.PhysicalNow()) var startNS int64 // This is the earliest point at which the request has a ClientCmdID and/or // TxnID (if applicable). Begin a Trace which follows this request. trace := tc.tracer.NewTrace(tracer.Coord, &ba) defer trace.Finalize() defer trace.Epoch("sending batch")() ctx = tracer.ToCtx(ctx, trace) var id string // optional transaction ID if ba.Txn != nil { // If this request is part of a transaction... id = string(ba.Txn.ID) // Verify that if this Transaction is not read-only, we have it on // file. If not, refuse writes - the client must have issued a write on // another coordinator previously. if ba.Txn.Writing && ba.IsTransactionWrite() { tc.Lock() _, ok := tc.txns[id] tc.Unlock() if !ok { return nil, roachpb.NewError(util.Errorf("transaction must not write on multiple coordinators")) } } // Set the timestamp to the original timestamp for read-only // commands and to the transaction timestamp for read/write // commands. if ba.IsReadOnly() { ba.Timestamp = ba.Txn.OrigTimestamp } else { ba.Timestamp = ba.Txn.Timestamp } if rArgs, ok := ba.GetArg(roachpb.EndTransaction); ok { et := rArgs.(*roachpb.EndTransactionRequest) if len(et.Key) != 0 { return nil, roachpb.NewError(util.Errorf("EndTransaction must not have a Key set")) } et.Key = ba.Txn.Key // Remember when EndTransaction started in case we want to // be linearizable. startNS = tc.clock.PhysicalNow() if len(et.Intents) > 0 { // TODO(tschottdorf): it may be useful to allow this later. // That would be part of a possible plan to allow txns which // write on multiple coordinators. return nil, roachpb.NewError(util.Errorf("client must not pass intents to EndTransaction")) } tc.Lock() txnMeta, metaOK := tc.txns[id] if id != "" && metaOK { et.Intents = txnMeta.intents() } tc.Unlock() if intents := ba.GetIntents(); len(intents) > 0 { // Writes in Batch, so EndTransaction is fine. Should add // outstanding intents to EndTransaction, though. // TODO(tschottdorf): possible issues when the batch fails, // but the intents have been added anyways. // TODO(tschottdorf): some of these intents may be covered // by others, for example {[a,b), a}). This can lead to // some extra requests when those are non-local to the txn // record. But it doesn't seem worth optimizing now. et.Intents = append(et.Intents, intents...) } else if !metaOK { // If we don't have the transaction, then this must be a retry // by the client. We can no longer reconstruct a correct // request so we must fail. 
// // TODO(bdarnell): if we had a GetTransactionStatus API then // we could lookup the transaction and return either nil or // TransactionAbortedError instead of this ambivalent error. return nil, roachpb.NewError(util.Errorf("transaction is already committed or aborted")) } if len(et.Intents) == 0 { // If there aren't any intents, then there's factually no // transaction to end. Read-only txns have all of their state in // the client. return nil, roachpb.NewError(util.Errorf("cannot commit a read-only transaction")) } if log.V(1) { for _, intent := range et.Intents { trace.Event(fmt.Sprintf("intent: [%s,%s)", intent.Key, intent.EndKey)) } } } } // Send the command through wrapped sender, taking appropriate measures // on error. var br *roachpb.BatchResponse { var pErr *roachpb.Error br, pErr = tc.wrapped.Send(ctx, ba) if _, ok := pErr.GoError().(*roachpb.OpRequiresTxnError); ok { br, pErr = tc.resendWithTxn(ba) } if pErr := tc.updateState(ctx, ba, br, pErr); pErr != nil { return nil, pErr } } if br.Txn == nil { return br, nil } if _, ok := ba.GetArg(roachpb.EndTransaction); !ok { return br, nil } // If the --linearizable flag is set, we want to make sure that // all the clocks in the system are past the commit timestamp // of the transaction. This is guaranteed if either // - the commit timestamp is MaxOffset behind startNS // - MaxOffset ns were spent in this function // when returning to the client. Below we choose the option // that involves less waiting, which is likely the first one // unless a transaction commits with an odd timestamp. if tsNS := br.Txn.Timestamp.WallTime; startNS > tsNS { startNS = tsNS } sleepNS := tc.clock.MaxOffset() - time.Duration(tc.clock.PhysicalNow()-startNS) if tc.linearizable && sleepNS > 0 { defer func() { if log.V(1) { log.Infof("%v: waiting %s on EndTransaction for linearizability", br.Txn.Short(), util.TruncateDuration(sleepNS, time.Millisecond)) } time.Sleep(sleepNS) }() } if br.Txn.Status != roachpb.PENDING { tc.cleanupTxn(trace, *br.Txn) } return br, nil }
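// Self-contained sketch of the linearizability wait computed at the end of
// Send above: sleep until MaxOffset has elapsed since the earlier of (a)
// when EndTransaction started and (b) the commit timestamp, so that every
// clock in the cluster has passed the commit timestamp before the client
// learns of the commit. The helper and parameter names are illustrative;
// the "time" import is assumed.
func linearizableSleepSketch(maxOffset time.Duration, nowNanos, startNanos, commitWallNanos int64) time.Duration {
	if startNanos > commitWallNanos {
		startNanos = commitWallNanos
	}
	sleep := maxOffset - time.Duration(nowNanos-startNanos)
	if sleep < 0 {
		return 0
	}
	return sleep
}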
// Send implements the batch.Sender interface. It subdivides // the Batch into batches admissible for sending (preventing certain // illegal mixtures of requests), executes each individual part // (which may span multiple ranges), and recombines the response. // When the request spans ranges, it is split up and the corresponding // ranges queried serially, in ascending order. // In particular, the first write in a transaction may not be part of the first // request sent. This is relevant since the first write is a BeginTransaction // request, thus opening up a window of time during which there may be intents // of a transaction, but no entry. Pushing such a transaction will succeed, and // may lead to the transaction being aborted early. func (ds *DistSender) Send(ctx context.Context, ba roachpb.BatchRequest) (*roachpb.BatchResponse, *roachpb.Error) { tracing.AnnotateTrace() // In the event that timestamp isn't set and read consistency isn't // required, set the timestamp using the local clock. if ba.ReadConsistency == roachpb.INCONSISTENT && ba.Timestamp.Equal(hlc.ZeroTimestamp) { ba.Timestamp = ds.clock.Now() } if ba.Txn != nil { // Make a copy here since the code below modifies it in different places. // TODO(tschottdorf): be smarter about this - no need to do it for // requests that don't get split. txnClone := ba.Txn.Clone() ba.Txn = &txnClone if len(ba.Txn.ObservedTimestamps) == 0 { // Ensure the local NodeID is marked as free from clock offset; // the transaction's timestamp was taken off the local clock. if nDesc := ds.getNodeDescriptor(); nDesc != nil { // TODO(tschottdorf): future refactoring should move this to txn // creation in TxnCoordSender, which is currently unaware of the // NodeID (and wraps *DistSender through client.Sender since it // also needs test compatibility with *LocalSender). // // Taking care below to not modify any memory referenced from // our BatchRequest which may be shared with others. // // We already have a clone of our txn (see above), so we can // modify it freely. // // Zero the existing data. That makes sure that if we had // something of size zero but with capacity, we don't re-use the // existing space (which others may also use). This is just to // satisfy paranoia/OCD and not expected to matter in practice. ba.Txn.ResetObservedTimestamps() // OrigTimestamp is the HLC timestamp at which the Txn started, so // this effectively means no more uncertainty on this node. ba.Txn.UpdateObservedTimestamp(nDesc.NodeID, ba.Txn.OrigTimestamp) } } } if len(ba.Requests) < 1 { panic("empty batch") } if ba.MaxSpanRequestKeys != 0 { // Verify that the batch contains only specific range requests or the // Begin/EndTransactionRequest. Verify that a batch with a ReverseScan // only contains ReverseScan range requests. isReverse := ba.IsReverse() for _, req := range ba.Requests { inner := req.GetInner() switch inner.(type) { case *roachpb.ScanRequest, *roachpb.DeleteRangeRequest: // Accepted range requests. All other range requests are still // not supported. // TODO(vivek): don't enumerate all range requests. 
if isReverse { return nil, roachpb.NewErrorf("batch with limit contains both forward and reverse scans") } case *roachpb.BeginTransactionRequest, *roachpb.EndTransactionRequest, *roachpb.ReverseScanRequest: continue default: return nil, roachpb.NewErrorf("batch with limit contains %T request", inner) } } } var rplChunks []*roachpb.BatchResponse parts := ba.Split(false /* don't split ET */) if len(parts) > 1 && ba.MaxSpanRequestKeys != 0 { // We already verified above that the batch contains only scan requests of the same type. // Such a batch should never need splitting. panic("batch with MaxSpanRequestKeys needs splitting") } for len(parts) > 0 { part := parts[0] ba.Requests = part rpl, pErr, shouldSplitET := ds.sendChunk(ctx, ba) if shouldSplitET { // If we tried to send a single round-trip EndTransaction but // it looks like it's going to hit multiple ranges, split it // here and try again. if len(parts) != 1 { panic("EndTransaction not in last chunk of batch") } parts = ba.Split(true /* split ET */) if len(parts) != 2 { panic("split of final EndTransaction chunk resulted in != 2 parts") } continue } if pErr != nil { return nil, pErr } // Propagate transaction from last reply to next request. The final // update is taken and put into the response's main header. ba.UpdateTxn(rpl.Txn) rplChunks = append(rplChunks, rpl) parts = parts[1:] } reply := rplChunks[0] for _, rpl := range rplChunks[1:] { reply.Responses = append(reply.Responses, rpl.Responses...) reply.CollectedSpans = append(reply.CollectedSpans, rpl.CollectedSpans...) } reply.BatchResponse_Header = rplChunks[len(rplChunks)-1].BatchResponse_Header return reply, nil }
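// Sketch of the limit-compatibility check above factored into a standalone
// predicate: with MaxSpanRequestKeys set, only Scan, ReverseScan and
// DeleteRange plus Begin/EndTransaction are admitted, and forward and
// reverse scans must not be mixed. The request types are taken from the
// switch above; the helper itself and its error texts are illustrative, and
// the "errors" and "fmt" imports are assumed.
func admitsLimitSketch(reqs []roachpb.Request, isReverse bool) error {
	for _, inner := range reqs {
		switch inner.(type) {
		case *roachpb.ScanRequest, *roachpb.DeleteRangeRequest:
			if isReverse {
				return errors.New("batch with limit contains both forward and reverse scans")
			}
		case *roachpb.BeginTransactionRequest, *roachpb.EndTransactionRequest, *roachpb.ReverseScanRequest:
			continue
		default:
			return fmt.Errorf("batch with limit contains %T request", inner)
		}
	}
	return nil
}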
// resolveIntents resolves the given intents. For those which are // local to the range, we submit directly to the local Raft instance; // all non-local intents are resolved asynchronously in a batch. If // `wait` is true, all operations are carried out synchronously and an // error is returned. Otherwise, the call returns without error as // soon as all local resolve commands have been **proposed** (not // executed). This ensures that if a waiting client retries // immediately after calling this function, it will not hit the same // intents again. func (ir *intentResolver) resolveIntents(ctx context.Context, r *Replica, intents []roachpb.Intent, wait bool, poison bool) error { // We're doing async stuff below; those need new traces. ctx, cleanup := tracing.EnsureContext(ctx, ir.store.Tracer()) defer cleanup() log.Trace(ctx, fmt.Sprintf("resolving intents [wait=%t]", wait)) var reqsRemote []roachpb.Request baLocal := roachpb.BatchRequest{} baLocal.Timestamp = ir.store.Clock().Now() for i := range intents { intent := intents[i] // avoids a race in `i, intent := range ...` var resolveArgs roachpb.Request var local bool // whether this intent lives on this Range { if len(intent.EndKey) == 0 { resolveArgs = &roachpb.ResolveIntentRequest{ Span: intent.Span, IntentTxn: intent.Txn, Status: intent.Status, Poison: poison, } local = r.ContainsKey(intent.Key) } else { resolveArgs = &roachpb.ResolveIntentRangeRequest{ Span: intent.Span, IntentTxn: intent.Txn, Status: intent.Status, Poison: poison, } local = r.ContainsKeyRange(intent.Key, intent.EndKey) } } // If the intent isn't (completely) local, we'll need to send an external request. // We'll batch them all up and send at the end. if local { baLocal.Add(resolveArgs) } else { reqsRemote = append(reqsRemote, resolveArgs) } } // The local batch goes directly to Raft. var wg sync.WaitGroup if len(baLocal.Requests) > 0 { action := func() error { // Trace this under the ID of the intent owner. // Create a new span though, since we do not want to pass a span // between goroutines or we risk use-after-finish. sp := r.store.Tracer().StartSpan("resolve intents") defer sp.Finish() ctx = opentracing.ContextWithSpan(ctx, sp) // Always operate with a timeout when resolving intents: this // prevents rare shutdown timeouts in tests. ctxWithTimeout, cancel := context.WithTimeout(ctx, base.NetworkTimeout) defer cancel() _, pErr := r.addWriteCmd(ctxWithTimeout, baLocal, &wg) return pErr.GoError() } wg.Add(1) if wait || !r.store.Stopper().RunLimitedAsyncTask(ir.sem, func() { if err := action(); err != nil { log.Warningf("unable to resolve local intents; %s", err) } }) { // Still run the task when draining. Our caller already has a task and // going async here again is merely for performance, but some intents // need to be resolved because they might block other tasks. See #1684. // Note that handleSkippedIntents has a TODO in case #1684 comes back. if err := action(); err != nil { return err } } } // Resolve all of the intents which aren't local to the Range. if len(reqsRemote) > 0 { b := &client.Batch{} b.InternalAddRequest(reqsRemote...) action := func() error { // TODO(tschottdorf): no tracing here yet. return r.store.DB().Run(b).GoError() } if wait || !r.store.Stopper().RunLimitedAsyncTask(ir.sem, func() { if err := action(); err != nil { log.Warningf("unable to resolve external intents: %s", err) } }) { // As with local intents, try async to not keep the caller waiting, but // when draining just go ahead and do it synchronously. See #1684. 
if err := action(); err != nil { return err } } } // Wait until the local ResolveIntents batch has been submitted to // raft. No-op if all were non-local. wg.Wait() return nil }
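// Standalone sketch of the local/remote split performed in resolveIntents
// above: an intent is "local" iff its span lies entirely within the
// replica's range, in which case it is resolved by proposing directly to
// the local Raft group; otherwise it is routed through the DB (and hence
// the DistSender). The type and containment check below are illustrative
// stand-ins for Replica.ContainsKey / ContainsKeyRange.
type keyRangeSketch struct{ start, end string }

func (r keyRangeSketch) containsSpan(key, endKey string) bool {
	if endKey == "" { // point intent
		return key >= r.start && key < r.end
	}
	return key >= r.start && endKey <= r.end
}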
// processIntentsAsync asynchronously processes intents which were // encountered during another command but did not interfere with the // execution of that command. This occurs in two cases: inconsistent // reads and EndTransaction (which queues its own external intents for // processing via this method). The two cases are handled somewhat // differently and would be better served by different entry points, // but combining them simplifies the plumbing necessary in Replica. func (ir *intentResolver) processIntentsAsync(r *Replica, intents []intentsWithArg) { if len(intents) == 0 { return } now := r.store.Clock().Now() ctx := r.context(context.TODO()) stopper := r.store.Stopper() for _, item := range intents { if item.args.Method() != roachpb.EndTransaction { stopper.RunLimitedAsyncTask(ir.sem, func() { // Everything here is best effort; give up rather than waiting // too long (helps avoid deadlocks during test shutdown, // although this is imperfect due to the use of an // uninterruptible WaitGroup.Wait in beginCmds). ctxWithTimeout, cancel := context.WithTimeout(ctx, base.NetworkTimeout) defer cancel() h := roachpb.Header{Timestamp: now} resolveIntents, pushErr := ir.maybePushTransactions(ctxWithTimeout, item.intents, h, roachpb.PUSH_TOUCH, true /* skipInFlight */) // resolveIntents with poison=true because we're resolving // intents outside of the context of an EndTransaction. // // Naively, it doesn't seem like we need to poison the abort // cache since we're pushing with PUSH_TOUCH - meaning that // the primary way our Push leads to aborting intents is that // of the transaction having timed out (and thus presumably no // client being around any more, though at the time of writing // we don't guarantee that). But there's another path in which // the Push comes back successful, namely that of the // transaction already having been aborted by someone else, in // which case the client may still be running. Thus, we must // poison. if err := ir.resolveIntents(ctxWithTimeout, r, resolveIntents, true /* wait */, true /* poison */); err != nil { log.Warningc(ctxWithTimeout, "failed to resolve intents: %s", err) return } if pushErr != nil { log.Warningc(ctxWithTimeout, "failed to push during intent resolution: %s", pushErr) return } }) } else { // EndTransaction stopper.RunLimitedAsyncTask(ir.sem, func() { ctxWithTimeout, cancel := context.WithTimeout(ctx, base.NetworkTimeout) defer cancel() // For EndTransaction, we know the transaction is finalized so // we can skip the push and go straight to the resolve. // // This mechanism assumes that when an EndTransaction fails, // the client makes no assumptions about the result. For // example, an attempt to explicitly rollback the transaction // may succeed (triggering this code path), but the result may // not make it back to the client. if err := ir.resolveIntents(ctxWithTimeout, r, item.intents, true /* wait */, false /* !poison */); err != nil { log.Warningc(ctxWithTimeout, "failed to resolve intents: %s", err) return } // We successfully resolved the intents, so we're able to GC from // the txn span directly. 
var ba roachpb.BatchRequest ba.Timestamp = now txn := item.intents[0].Txn gcArgs := roachpb.GCRequest{ Span: roachpb.Span{ Key: r.Desc().StartKey.AsRawKey(), EndKey: r.Desc().EndKey.AsRawKey(), }, } gcArgs.Keys = append(gcArgs.Keys, roachpb.GCRequest_GCKey{ Key: keys.TransactionKey(txn.Key, txn.ID), }) ba.Add(&gcArgs) if _, pErr := r.addWriteCmd(ctxWithTimeout, ba, nil /* nil */); pErr != nil { log.Warningf("could not GC completed transaction: %s", pErr) } }) } } }
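// Sketch of the transaction-record GC step above factored into a helper:
// once a finalized transaction's intents are resolved, its record at
// keys.TransactionKey(txn.Key, txn.ID) can be deleted via a GCRequest that
// covers the range's key span. The helper name, its standalone form, and
// taking the record key as a parameter are assumptions for illustration.
func gcTxnRecordSketch(startKey, endKey, txnRecordKey roachpb.Key) roachpb.GCRequest {
	gcArgs := roachpb.GCRequest{
		Span: roachpb.Span{Key: startKey, EndKey: endKey},
	}
	gcArgs.Keys = append(gcArgs.Keys, roachpb.GCRequest_GCKey{Key: txnRecordKey})
	return gcArgs
}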
// process iterates through all keys in a replica's range, calling the garbage // collector for each key and associated set of values. GC'd keys are batched // into GC calls. Extant intents are resolved if intents are older than // intentAgeThreshold. The transaction and sequence cache records are also // scanned and old entries evicted. During normal operation, both of these // records are cleaned up when their respective transaction finishes, so the // amount of work done here is expected to be small. // // Some care needs to be taken to avoid cyclic recreation of entries during GC: // * a Push initiated due to an intent may recreate a transaction entry // * resolving an intent may write a new sequence cache entry // * obtaining the transaction for a sequence cache entry requires a Push // // The following order is taken below: // 1) collect all intents with sufficiently old txn record // 2) collect these intents' transactions // 3) scan the transaction table, collecting abandoned or completed txns // 4) push all of these transactions (possibly recreating entries) // 5) resolve all intents (unless the txn is still PENDING), which will recreate // sequence cache entries (but with the txn timestamp; i.e. likely gc'able) // 6) scan the sequence table for old entries // 7) push these transactions (again, recreating txn entries). // 8) send a GCRequest. func (gcq *gcQueue) process(now roachpb.Timestamp, repl *Replica, sysCfg config.SystemConfig) error { snap := repl.store.Engine().NewSnapshot() desc := repl.Desc() iter := newReplicaDataIterator(desc, snap, true /* replicatedOnly */) defer iter.Close() defer snap.Close() // Lookup the GC policy for the zone containing this key range. zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey) if err != nil { return util.Errorf("could not find zone config for range %s: %s", repl, err) } gc := engine.NewGarbageCollector(now, zone.GC) // Compute intent expiration (intent age at which we attempt to resolve). intentExp := now intentExp.WallTime -= intentAgeThreshold.Nanoseconds() txnExp := now txnExp.WallTime -= txnCleanupThreshold.Nanoseconds() gcArgs := &roachpb.GCRequest{} // TODO(tschottdorf): This is one of these instances in which we want // to be more careful that the request ends up on the correct Replica, // and we might have to worry about mixing range-local and global keys // in a batch which might end up spanning Ranges by the time it executes. gcArgs.Key = desc.StartKey.AsRawKey() gcArgs.EndKey = desc.EndKey.AsRawKey() var expBaseKey roachpb.Key var keys []engine.MVCCKey var vals [][]byte // Maps from txn ID to txn and intent key slice. txnMap := map[uuid.UUID]*roachpb.Transaction{} intentSpanMap := map[uuid.UUID][]roachpb.Span{} // processKeysAndValues is invoked with each key and its set of // values. Intents older than the intent age threshold are sent for // resolution and values after the MVCC metadata, and possible // intent, are sent for garbage collection. var intentCount int processKeysAndValues := func() { // If there's more than a single value for the key, possibly send for GC. if len(keys) > 1 { meta := &engine.MVCCMetadata{} if err := proto.Unmarshal(vals[0], meta); err != nil { log.Errorf("unable to unmarshal MVCC metadata for key %q: %s", keys[0], err) } else { // In the event that there's an active intent, send for // intent resolution if older than the threshold. startIdx := 1 if meta.Txn != nil { // Keep track of intent to resolve if older than the intent // expiration threshold. 
if meta.Timestamp.Less(intentExp) { txnID := *meta.Txn.ID txn := &roachpb.Transaction{ TxnMeta: *meta.Txn, } txnMap[txnID] = txn intentCount++ intentSpanMap[txnID] = append(intentSpanMap[txnID], roachpb.Span{Key: expBaseKey}) } // With an active intent, GC ignores MVCC metadata & intent value. startIdx = 2 } // See if any values may be GC'd. if gcTS := gc.Filter(keys[startIdx:], vals[startIdx:]); !gcTS.Equal(roachpb.ZeroTimestamp) { // TODO(spencer): need to split the requests up into // multiple requests in the event that more than X keys // are added to the request. gcArgs.Keys = append(gcArgs.Keys, roachpb.GCRequest_GCKey{Key: expBaseKey, Timestamp: gcTS}) } } } } // Iterate through the keys and values of this replica's range. for ; iter.Valid(); iter.Next() { iterKey := iter.Key() if !iterKey.IsValue() || !iterKey.Key.Equal(expBaseKey) { // Moving to the next key (& values). processKeysAndValues() expBaseKey = iterKey.Key if !iterKey.IsValue() { keys = []engine.MVCCKey{iter.Key()} vals = [][]byte{iter.Value()} continue } // An implicit metadata. keys = []engine.MVCCKey{engine.MakeMVCCMetadataKey(iterKey.Key)} // A nil value for the encoded MVCCMetadata. This will unmarshal to an // empty MVCCMetadata which is sufficient for processKeysAndValues to // determine that there is no intent. vals = [][]byte{nil} } keys = append(keys, iter.Key()) vals = append(vals, iter.Value()) } if iter.Error() != nil { return iter.Error() } // Handle last collected set of keys/vals. processKeysAndValues() gcq.eventLog.Infof(true, "assembled %d transactions from %d old intents; found %d gc'able keys", len(txnMap), intentCount, len(gcArgs.Keys)) txnKeys, err := gcq.processTransactionTable(repl, txnMap, txnExp) if err != nil { return err } // From now on, all newly added keys are range-local. // TODO(tschottdorf): Might need to use two requests at some point since we // hard-coded the full non-local key range in the header, but that does // not take into account the range-local keys. It will be OK as long as // we send directly to the Replica, though. gcArgs.Keys = append(gcArgs.Keys, txnKeys...) // Process push transactions in parallel. var wg sync.WaitGroup gcq.eventLog.Infof(true, "pushing %d txns", len(txnMap)) for _, txn := range txnMap { if txn.Status != roachpb.PENDING { continue } wg.Add(1) go gcq.pushTxn(repl, now, txn, roachpb.PUSH_ABORT, &wg) } wg.Wait() // Resolve all intents. var intents []roachpb.Intent for txnID, txn := range txnMap { if txn.Status != roachpb.PENDING { for _, intent := range intentSpanMap[txnID] { intents = append(intents, roachpb.Intent{Span: intent, Status: txn.Status, Txn: txn.TxnMeta}) } } } gcq.eventLog.Infof(true, "resolving %d intents", len(intents)) if pErr := repl.store.intentResolver.resolveIntents(repl.context(), repl, intents, true /* wait */, false /* !poison */); pErr != nil { return pErr.GoError() } // Deal with any leftover sequence cache keys. There shouldn't be many of // them. leftoverSeqCacheKeys := gcq.processSequenceCache(repl, now, txnExp, txnMap) gcq.eventLog.Infof(true, "collected %d leftover sequence cache keys", len(leftoverSeqCacheKeys)) gcArgs.Keys = append(gcArgs.Keys, leftoverSeqCacheKeys...) gcq.eventLog.Infof(true, "sending gc request for %d keys", len(gcArgs.Keys)) var ba roachpb.BatchRequest // Technically not needed since we're talking directly to the Range. ba.RangeID = desc.RangeID ba.Timestamp = now ba.Add(gcArgs) if _, pErr := repl.Send(repl.context(), ba); pErr != nil { return pErr.GoError() } return nil }
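// Self-contained sketch of the grouping pattern used in the iteration above:
// consecutive MVCC versions share a base key, and the accumulated values are
// flushed to a processing function whenever the base key changes, plus once
// more after the loop (the trailing processKeysAndValues() call). The types
// here are simplified stand-ins for engine.MVCCKey and the iterator.
type versionSketch struct {
	baseKey string
	value   []byte
}

func groupByBaseKeySketch(versions []versionSketch, process func(baseKey string, vals [][]byte)) {
	var curKey string
	var vals [][]byte
	flush := func() {
		if len(vals) > 0 {
			process(curKey, vals)
		}
	}
	for _, v := range versions {
		if v.baseKey != curKey {
			flush()
			curKey, vals = v.baseKey, nil
		}
		vals = append(vals, v.value)
	}
	flush()
}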
// process iterates through all keys in a replica's range, calling the garbage // collector for each key and associated set of values. GC'd keys are batched // into GC calls. Extant intents are resolved if intents are older than // intentAgeThreshold. func (gcq *gcQueue) process(now roachpb.Timestamp, repl *Replica, sysCfg *config.SystemConfig) error { snap := repl.store.Engine().NewSnapshot() desc := repl.Desc() iter := newReplicaDataIterator(desc, snap) defer iter.Close() defer snap.Close() // Lookup the GC policy for the zone containing this key range. zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey) if err != nil { return fmt.Errorf("could not find GC policy for range %s: %s", repl, err) } policy := zone.GC gcMeta := roachpb.NewGCMetadata(now.WallTime) gc := engine.NewGarbageCollector(now, *policy) // Compute intent expiration (intent age at which we attempt to resolve). intentExp := now intentExp.WallTime -= intentAgeThreshold.Nanoseconds() txnExp := now txnExp.WallTime -= txnCleanupThreshold.Nanoseconds() gcArgs := &roachpb.GCRequest{} // TODO(tschottdorf): This is one of these instances in which we want // to be more careful that the request ends up on the correct Replica, // and we might have to worry about mixing range-local and global keys // in a batch which might end up spanning Ranges by the time it executes. gcArgs.Key = desc.StartKey.AsRawKey() gcArgs.EndKey = desc.EndKey.AsRawKey() var expBaseKey roachpb.Key var keys []engine.MVCCKey var vals [][]byte // Maps from txn ID to txn and intent key slice. txnMap := map[string]*roachpb.Transaction{} intentSpanMap := map[string][]roachpb.Span{} // processKeysAndValues is invoked with each key and its set of // values. Intents older than the intent age threshold are sent for // resolution and values after the MVCC metadata, and possible // intent, are sent for garbage collection. processKeysAndValues := func() { // If there's more than a single value for the key, possibly send for GC. if len(keys) > 1 { meta := &engine.MVCCMetadata{} if err := proto.Unmarshal(vals[0], meta); err != nil { log.Errorf("unable to unmarshal MVCC metadata for key %q: %s", keys[0], err) } else { // In the event that there's an active intent, send for // intent resolution if older than the threshold. startIdx := 1 if meta.Txn != nil { // Keep track of intent to resolve if older than the intent // expiration threshold. if meta.Timestamp.Less(intentExp) { id := string(meta.Txn.ID) txnMap[id] = meta.Txn intentSpanMap[id] = append(intentSpanMap[id], roachpb.Span{Key: expBaseKey}) } // With an active intent, GC ignores MVCC metadata & intent value. startIdx = 2 } // See if any values may be GC'd. if gcTS := gc.Filter(keys[startIdx:], vals[startIdx:]); !gcTS.Equal(roachpb.ZeroTimestamp) { // TODO(spencer): need to split the requests up into // multiple requests in the event that more than X keys // are added to the request. gcArgs.Keys = append(gcArgs.Keys, roachpb.GCRequest_GCKey{Key: expBaseKey, Timestamp: gcTS}) } } } } // Iterate through the keys and values of this replica's range. for ; iter.Valid(); iter.Next() { baseKey, ts, isValue, err := engine.MVCCDecodeKey(iter.Key()) if err != nil { log.Errorf("unable to decode MVCC key: %q: %v", iter.Key(), err) continue } if !isValue { // Moving to the next key (& values). 
processKeysAndValues() expBaseKey = baseKey keys = []engine.MVCCKey{iter.Key()} vals = [][]byte{iter.Value()} } else { if !baseKey.Equal(expBaseKey) { log.Errorf("unexpectedly found a value for %q with ts=%s; expected key %q", baseKey, ts, expBaseKey) continue } keys = append(keys, iter.Key()) vals = append(vals, iter.Value()) } } if iter.Error() != nil { return iter.Error() } // Handle last collected set of keys/vals. processKeysAndValues() txnKeys, err := processTransactionTable(repl, txnMap, txnExp) if err != nil { return err } // From now on, all newly added keys are range-local. // TODO(tschottdorf): Might need to use two requests at some point since we // hard-coded the full non-local key range in the header, but that does // not take into account the range-local keys. It will be OK as long as // we send directly to the Replica, though. gcArgs.Keys = append(gcArgs.Keys, txnKeys...) // Process push transactions in parallel. var wg sync.WaitGroup for _, txn := range txnMap { if txn.Status != roachpb.PENDING { continue } wg.Add(1) go pushTxn(repl, now, txn, roachpb.ABORT_TXN, &wg) } wg.Wait() // Resolve all intents. var intents []roachpb.Intent for id, txn := range txnMap { if txn.Status != roachpb.PENDING { for _, intent := range intentSpanMap[id] { intents = append(intents, roachpb.Intent{Span: intent, Txn: *txn}) } } } if err := repl.resolveIntents(repl.context(), intents, true /* wait */, false /* !poison */); err != nil { return err } // Deal with any leftover sequence cache keys. There shouldn't be many of // them. gcArgs.Keys = append(gcArgs.Keys, processSequenceCache(repl, now, txnExp, txnMap)...) // Send GC request through range. gcArgs.GCMeta = *gcMeta var ba roachpb.BatchRequest // Technically not needed since we're talking directly to the Range. ba.RangeID = desc.RangeID ba.Timestamp = now ba.Add(gcArgs) if _, pErr := repl.Send(repl.context(), ba); pErr != nil { return pErr.GoError() } // Store current timestamp as last verification for this replica, as // we've just successfully scanned. if err := repl.SetLastVerificationTimestamp(now); err != nil { log.Errorf("failed to set last verification timestamp for replica %s: %s", repl, err) } return nil }
// InitOrJoinRequest executes a RequestLease command asynchronously and returns a // channel on which the result will be posted. If there's already a request in // progress, we join in waiting for the results of that request. // It is an error to call InitOrJoinRequest() while a request is in progress // naming another replica as lease holder. // // replica is used to schedule and execute async work (proposing a RequestLease // command). replica.mu is locked when delivering results, so calls from the // replica happen either before or after a result for a pending request has // happened. // // transfer needs to be set if the request represents a lease transfer (as // opposed to an extension, or acquiring the lease when none is held). // // Note: Once this function gets a context to be used for cancellation, instead // of replica.store.Stopper().ShouldQuiesce(), care will be needed for cancelling // the Raft command, similar to replica.addWriteCmd. func (p *pendingLeaseRequest) InitOrJoinRequest( replica *Replica, nextLeaseHolder roachpb.ReplicaDescriptor, timestamp hlc.Timestamp, startKey roachpb.Key, transfer bool, ) <-chan *roachpb.Error { if nextLease := p.RequestPending(); nextLease != nil { if nextLease.Replica.ReplicaID == nextLeaseHolder.ReplicaID { // Join a pending request asking for the same replica to become lease // holder. return p.JoinRequest() } llChan := make(chan *roachpb.Error, 1) // We can't join the request in progress. llChan <- roachpb.NewErrorf("request for different replica in progress "+ "(requesting: %+v, in progress: %+v)", nextLeaseHolder.ReplicaID, nextLease.Replica.ReplicaID) return llChan } llChan := make(chan *roachpb.Error, 1) // No request in progress. Let's propose a Lease command asynchronously. // TODO(tschottdorf): get duration from configuration, either as a // config flag or, later, dynamically adjusted. startStasis := timestamp.Add(int64(replica.store.ctx.rangeLeaseActiveDuration), 0) expiration := startStasis.Add(int64(replica.store.Clock().MaxOffset()), 0) reqSpan := roachpb.Span{ Key: startKey, } var leaseReq roachpb.Request reqLease := roachpb.Lease{ Start: timestamp, StartStasis: startStasis, Expiration: expiration, Replica: nextLeaseHolder, } if transfer { leaseReq = &roachpb.TransferLeaseRequest{ Span: reqSpan, Lease: reqLease, } } else { leaseReq = &roachpb.RequestLeaseRequest{ Span: reqSpan, Lease: reqLease, } } if replica.store.Stopper().RunAsyncTask(func() { // Propose a RequestLease command and wait for it to apply. var execPErr *roachpb.Error ba := roachpb.BatchRequest{} ba.Timestamp = replica.store.Clock().Now() ba.RangeID = replica.RangeID ba.Add(leaseReq) // Send lease request directly to raft in order to skip unnecessary // checks from normal request machinery, (e.g. the command queue). // Note that the command itself isn't traced, but usually the caller // waiting for the result has an active Trace. ch, _, err := replica.proposeRaftCommand( replica.context(context.Background()), ba) if err != nil { execPErr = roachpb.NewError(err) } else { // If the command was committed, wait for the range to apply it. select { case c := <-ch: if c.Err != nil { if log.V(1) { log.Infof("failed to acquire lease for replica %s: %s", replica.store, c.Err) } execPErr = c.Err } case <-replica.store.Stopper().ShouldQuiesce(): execPErr = roachpb.NewError( replica.newNotLeaseHolderError(nil, replica.store.StoreID(), replica.Desc())) } } // Send result of lease to all waiter channels. 
replica.mu.Lock() defer replica.mu.Unlock() for i, llChan := range p.llChans { // Don't send the same pErr object twice; this can lead to races. We could // clone every time but it's more efficient to send pErr itself to one of // the channels (the last one; if we send it earlier the race can still // happen). if i == len(p.llChans)-1 { llChan <- execPErr } else { llChan <- protoutil.Clone(execPErr).(*roachpb.Error) // works with `nil` } } p.llChans = p.llChans[:0] p.nextLease = roachpb.Lease{} }) != nil { // We failed to start the asynchronous task. Send a blank NotLeaseHolderError // back to indicate that we have no idea who the range lease holder might // be; we've withdrawn from active duty. llChan <- roachpb.NewError( replica.newNotLeaseHolderError(nil, replica.store.StoreID(), replica.mu.state.Desc)) return llChan } p.llChans = append(p.llChans, llChan) p.nextLease = reqLease return llChan }
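// Sketch of how a caller might use InitOrJoinRequest above: kick off (or
// join) the lease acquisition and then block on the returned channel for
// the outcome, where a nil *roachpb.Error signals success. The wrapping
// helper is an assumption for illustration; real callers typically also
// select on a stopper or context.
func requestLeaseSketch(
	p *pendingLeaseRequest,
	replica *Replica,
	next roachpb.ReplicaDescriptor,
	ts hlc.Timestamp,
	startKey roachpb.Key,
) *roachpb.Error {
	llChan := p.InitOrJoinRequest(replica, next, ts, startKey, false /* transfer */)
	return <-llChan
}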
// Send implements the batch.Sender interface. It subdivides // the Batch into batches admissible for sending (preventing certain // illegal mixtures of requests), executes each individual part // (which may span multiple ranges), and recombines the response. // When the request spans ranges, it is split up and the corresponding // ranges queried serially, in ascending order. // In particular, the first write in a transaction may not be part of the first // request sent. This is relevant since the first write is a BeginTransaction // request, thus opening up a window of time during which there may be intents // of a transaction, but no entry. Pushing such a transaction will succeed, and // may lead to the transaction being aborted early. func (ds *DistSender) Send(ctx context.Context, ba roachpb.BatchRequest) (*roachpb.BatchResponse, *roachpb.Error) { tracing.AnnotateTrace() // In the event that timestamp isn't set and read consistency isn't // required, set the timestamp using the local clock. if ba.ReadConsistency == roachpb.INCONSISTENT && ba.Timestamp.Equal(roachpb.ZeroTimestamp) { ba.Timestamp = ds.clock.Now() } if ba.Txn != nil && len(ba.Txn.CertainNodes.Nodes) == 0 { // Ensure the local NodeID is marked as free from clock offset; // the transaction's timestamp was taken off the local clock. if nDesc := ds.getNodeDescriptor(); nDesc != nil { // TODO(tschottdorf): future refactoring should move this to txn // creation in TxnCoordSender, which is currently unaware of the // NodeID (and wraps *DistSender through client.Sender since it // also needs test compatibility with *LocalSender). // // Taking care below to not modify any memory referenced from // our BatchRequest which may be shared with others. // First, get a shallow clone of our txn (since that holds the // NodeList struct). txnShallow := *ba.Txn // Next, zero out the NodeList pointer. That makes sure that // if we had something of size zero but with capacity, we don't // re-use the existing space (which others may also use). txnShallow.CertainNodes.Nodes = nil txnShallow.CertainNodes.Add(nDesc.NodeID) ba.Txn = &txnShallow } } if len(ba.Requests) < 1 { panic("empty batch") } var rplChunks []*roachpb.BatchResponse parts := ba.Split(false /* don't split ET */) for len(parts) > 0 { part := parts[0] ba.Requests = part rpl, pErr, shouldSplitET := ds.sendChunk(ctx, ba) if shouldSplitET { // If we tried to send a single round-trip EndTransaction but // it looks like it's going to hit multiple ranges, split it // here and try again. if len(parts) != 1 { panic("EndTransaction not in last chunk of batch") } parts = ba.Split(true /* split ET */) if len(parts) != 2 { panic("split of final EndTransaction chunk resulted in != 2 parts") } continue } if pErr != nil { return nil, pErr } // Propagate transaction from last reply to next request. The final // update is taken and put into the response's main header. ba.Txn.Update(rpl.Header().Txn) rplChunks = append(rplChunks, rpl) parts = parts[1:] } reply := rplChunks[0] for _, rpl := range rplChunks[1:] { reply.Responses = append(reply.Responses, rpl.Responses...) } *reply.Header() = rplChunks[len(rplChunks)-1].BatchResponse_Header return reply, nil }
func (tc *TxnCoordSender) heartbeat(txnID uuid.UUID, trace opentracing.Span, ctx context.Context) bool { tc.Lock() proceed := true txnMeta := tc.txns[txnID] var intentSpans []roachpb.Span // Before we send a heartbeat, determine whether this transaction // should be considered abandoned. If so, exit heartbeat. if txnMeta.hasClientAbandonedCoord(tc.clock.PhysicalNow()) { // TODO(tschottdorf): should we be more proactive here? // The client might be continuing the transaction // through another coordinator, but in the most likely // case it's just gone and the open transaction record // could block concurrent operations. if log.V(1) { log.Infof("transaction %s abandoned; stopping heartbeat", txnMeta.txn) } proceed = false // Grab the intents here to avoid potential race. intentSpans = txnMeta.intentSpans() txnMeta.keys.Clear() } // txnMeta.txn is possibly replaced concurrently, // so grab a copy before unlocking. txn := txnMeta.txn.Clone() tc.Unlock() ba := roachpb.BatchRequest{} ba.Timestamp = tc.clock.Now() ba.Txn = &txn if !proceed { // Actively abort the transaction and its intents since we assume it's abandoned. et := &roachpb.EndTransactionRequest{ Span: roachpb.Span{ Key: txn.Key, }, Commit: false, IntentSpans: intentSpans, } ba.Add(et) tc.stopper.RunAsyncTask(func() { // Use the wrapped sender since the normal Sender // does not allow clients to specify intents. // TODO(tschottdorf): not using the existing context here since that // leads to use-after-finish of the contained trace. Should fork off // before the goroutine. if _, pErr := tc.wrapped.Send(context.Background(), ba); pErr != nil { if log.V(1) { log.Warningf("abort due to inactivity failed for %s: %s ", txn, pErr) } } }) return false } hb := &roachpb.HeartbeatTxnRequest{} hb.Key = txn.Key ba.Add(hb) trace.LogEvent("heartbeat") _, err := tc.wrapped.Send(ctx, ba) // If the transaction is not in pending state, then we can stop // the heartbeat. It's either aborted or committed, and we resolve // write intents accordingly. if err != nil { log.Warningf("heartbeat to %s failed: %s", txn, err) } // TODO(bdarnell): once we have gotten a heartbeat response with // Status != PENDING, future heartbeats are useless. However, we // need to continue the heartbeatLoop until the client either // commits or abandons the transaction. We could save a little // pointless work by restructuring this loop to stop sending // heartbeats between the time that the transaction is aborted and // the client finds out. Furthermore, we could use this information // to send TransactionAbortedErrors to the client so it can restart // immediately instead of running until its EndTransaction. return true }
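// Generic sketch of the copy-under-lock pattern used above: txnMeta.txn may
// be replaced concurrently, so a clone is taken while holding the mutex and
// only the clone is used after Unlock, including from the asynchronously
// launched abort. The types below are illustrative; the "sync" import is
// assumed.
type coordSketch struct {
	mu  sync.Mutex
	txn string // stand-in for the shared transaction proto
}

func (c *coordSketch) snapshotTxn() string {
	c.mu.Lock()
	defer c.mu.Unlock()
	return c.txn // value copy; safe to use after the lock is released
}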
// Send implements the batch.Sender interface. If the request is part of a // transaction, the TxnCoordSender adds the transaction to a map of active // transactions and begins heartbeating it. Every subsequent request for the // same transaction updates the lastUpdate timestamp to prevent live // transactions from being considered abandoned and garbage collected. // Read/write mutating requests have their key or key range added to the // transaction's interval tree of key ranges for eventual cleanup via resolved // write intents; they're tagged to an outgoing EndTransaction request, with // the receiving replica in charge of resolving them. func (tc *TxnCoordSender) Send(ctx context.Context, ba roachpb.BatchRequest) (*roachpb.BatchResponse, *roachpb.Error) { // Start new or pick up active trace and embed its trace metadata into // header for use by RPC recipients. From here on, there's always an active // Trace, though its overhead is small unless it's sampled. sp, cleanupSp := tracing.SpanFromContext(opTxnCoordSender, tc.tracer, ctx) defer cleanupSp() // TODO(tschottdorf): To get rid of the spurious alloc below we need to // implement the carrier interface on ba.Header or make Span non-nullable, // both of which force all of ba on the Heap. It's already there, so may // not be a big deal, but ba should live on the stack. Also not easy to use // a buffer pool here since anything that goes into the RPC layer could be // used by goroutines we didn't wait for. if ba.Header.Trace == nil { ba.Header.Trace = &tracing.Span{} } if err := tc.tracer.Inject(sp, basictracer.Delegator, ba.Trace); err != nil { return nil, roachpb.NewError(err) } if err := tc.maybeBeginTxn(&ba); err != nil { return nil, roachpb.NewError(err) } var startNS int64 ba.SetNewRequest() // This is the earliest point at which the request has an ID (if // applicable). Begin a Trace which follows this request. ctx = opentracing.ContextWithSpan(ctx, sp) if ba.Txn != nil { // If this request is part of a transaction... txnID := *ba.Txn.ID // Verify that if this Transaction is not read-only, we have it on // file. If not, refuse writes - the client must have issued a write on // another coordinator previously. if ba.Txn.Writing && ba.IsTransactionWrite() { tc.Lock() _, ok := tc.txns[txnID] tc.Unlock() if !ok { return nil, roachpb.NewErrorf("transaction must not write on multiple coordinators") } } // Set the timestamp to the original timestamp for read-only // commands and to the transaction timestamp for read/write // commands. if ba.IsReadOnly() { ba.Timestamp = ba.Txn.OrigTimestamp } else { ba.Timestamp = ba.Txn.Timestamp } if rArgs, ok := ba.GetArg(roachpb.EndTransaction); ok { et := rArgs.(*roachpb.EndTransactionRequest) if len(et.Key) != 0 { return nil, roachpb.NewErrorf("EndTransaction must not have a Key set") } et.Key = ba.Txn.Key // Remember when EndTransaction started in case we want to // be linearizable. startNS = tc.clock.PhysicalNow() if len(et.IntentSpans) > 0 { // TODO(tschottdorf): it may be useful to allow this later. // That would be part of a possible plan to allow txns which // write on multiple coordinators. return nil, roachpb.NewErrorf("client must not pass intents to EndTransaction") } tc.Lock() txnMeta, metaOK := tc.txns[txnID] if metaOK { et.IntentSpans = txnMeta.intentSpans() } tc.Unlock() if intentSpans := ba.GetIntentSpans(); len(intentSpans) > 0 { // Writes in Batch, so EndTransaction is fine. Should add // outstanding intents to EndTransaction, though. 
// TODO(tschottdorf): possible issues when the batch fails, // but the intents have been added anyways. // TODO(tschottdorf): some of these intents may be covered // by others, for example {[a,b), a}). This can lead to // some extra requests when those are non-local to the txn // record. But it doesn't seem worth optimizing now. et.IntentSpans = append(et.IntentSpans, intentSpans...) } else if !metaOK { // If we don't have the transaction, then this must be a retry // by the client. We can no longer reconstruct a correct // request so we must fail. // // TODO(bdarnell): if we had a GetTransactionStatus API then // we could lookup the transaction and return either nil or // TransactionAbortedError instead of this ambivalent error. return nil, roachpb.NewErrorf("transaction is already committed or aborted") } if len(et.IntentSpans) == 0 { // If there aren't any intents, then there's factually no // transaction to end. Read-only txns have all of their state in // the client. return nil, roachpb.NewErrorf("cannot commit a read-only transaction") } if log.V(1) { for _, intent := range et.IntentSpans { sp.LogEvent(fmt.Sprintf("intent: [%s,%s)", intent.Key, intent.EndKey)) } } } } // Send the command through wrapped sender, taking appropriate measures // on error. var br *roachpb.BatchResponse { var pErr *roachpb.Error br, pErr = tc.wrapped.Send(ctx, ba) if _, ok := pErr.GetDetail().(*roachpb.OpRequiresTxnError); ok { // TODO(tschottdorf): needs to keep the trace. br, pErr = tc.resendWithTxn(ba) } if pErr = tc.updateState(ctx, ba, br, pErr); pErr != nil { sp.LogEvent(fmt.Sprintf("error: %s", pErr)) return nil, pErr } } if br.Txn == nil { return br, nil } if _, ok := ba.GetArg(roachpb.EndTransaction); !ok { return br, nil } // If the --linearizable flag is set, we want to make sure that // all the clocks in the system are past the commit timestamp // of the transaction. This is guaranteed if either // - the commit timestamp is MaxOffset behind startNS // - MaxOffset ns were spent in this function // when returning to the client. Below we choose the option // that involves less waiting, which is likely the first one // unless a transaction commits with an odd timestamp. if tsNS := br.Txn.Timestamp.WallTime; startNS > tsNS { startNS = tsNS } sleepNS := tc.clock.MaxOffset() - time.Duration(tc.clock.PhysicalNow()-startNS) if tc.linearizable && sleepNS > 0 { defer func() { if log.V(1) { log.Infof("%v: waiting %s on EndTransaction for linearizability", br.Txn.Short(), util.TruncateDuration(sleepNS, time.Millisecond)) } time.Sleep(sleepNS) }() } if br.Txn.Status != roachpb.PENDING { tc.cleanupTxn(sp, *br.Txn) } return br, nil }