// updateState updates the transaction state in both the success and // error cases, applying those updates to the corresponding txnMeta // object when adequate. It also updates certain errors with the // updated transaction for use by client restarts. func (tc *TxnCoordSender) updateState(ctx context.Context, ba roachpb.BatchRequest, br *roachpb.BatchResponse, pErr *roachpb.Error) *roachpb.Error { trace := tracer.FromCtx(ctx) newTxn := &roachpb.Transaction{} newTxn.Update(ba.GetTxn()) // TODO(tamird): remove this clone. It's currently needed to avoid race conditions. pErr = proto.Clone(pErr).(*roachpb.Error) err := pErr.GoError() // TODO(bdarnell): We're writing to errors here (and where using ErrorWithIndex); // since there's no concept of ownership copy-on-write is always preferable. switch t := err.(type) { case nil: newTxn.Update(br.Txn) // Move txn timestamp forward to response timestamp if applicable. // TODO(tschottdorf): see (*Replica).executeBatch and comments within. // Looks like this isn't necessary any more, nor did it prevent a bug // referenced in a TODO there. newTxn.Timestamp.Forward(br.Timestamp) case *roachpb.TransactionStatusError: // Likely already committed or more obscure errors such as epoch or // timestamp regressions; consider txn dead. defer tc.cleanupTxn(trace, t.Txn) case *roachpb.OpRequiresTxnError: panic("OpRequiresTxnError must not happen at this level") case *roachpb.ReadWithinUncertaintyIntervalError: // Mark the host as certain. See the protobuf comment for // Transaction.CertainNodes for details. if t.NodeID == 0 { panic("no replica set in header on uncertainty restart") } newTxn.Update(&t.Txn) newTxn.CertainNodes.Add(t.NodeID) // If the reader encountered a newer write within the uncertainty // interval, move the timestamp forward, just past that write or // up to MaxTimestamp, whichever comes first. candidateTS := newTxn.MaxTimestamp candidateTS.Backward(t.ExistingTimestamp.Add(0, 1)) newTxn.Timestamp.Forward(candidateTS) newTxn.Restart(ba.GetUserPriority(), newTxn.Priority, newTxn.Timestamp) t.Txn = *newTxn case *roachpb.TransactionAbortedError: trace.SetError() newTxn.Update(&t.Txn) // Increase timestamp if applicable. newTxn.Timestamp.Forward(t.Txn.Timestamp) newTxn.Priority = t.Txn.Priority t.Txn = *newTxn // Clean up the freshly aborted transaction in defer(), avoiding a // race with the state update below. defer tc.cleanupTxn(trace, t.Txn) case *roachpb.TransactionPushError: newTxn.Update(t.Txn) // Increase timestamp if applicable, ensuring that we're // just ahead of the pushee. newTxn.Timestamp.Forward(t.PusheeTxn.Timestamp.Add(0, 1)) newTxn.Restart(ba.GetUserPriority(), t.PusheeTxn.Priority-1, newTxn.Timestamp) t.Txn = newTxn case *roachpb.TransactionRetryError: newTxn.Update(&t.Txn) newTxn.Restart(ba.GetUserPriority(), t.Txn.Priority, newTxn.Timestamp) t.Txn = *newTxn case roachpb.TransactionRestartError: // Assertion: The above cases should exhaust all ErrorDetails which // carry a Transaction. if pErr.Detail != nil { panic(fmt.Sprintf("unhandled TransactionRestartError %T", err)) } default: trace.SetError() } return func() *roachpb.Error { if len(newTxn.ID) <= 0 { return pErr } id := string(newTxn.ID) tc.Lock() defer tc.Unlock() txnMeta := tc.txns[id] // For successful transactional requests, keep the written intents and // the updated transaction record to be sent along with the reply. // The transaction metadata is created with the first writing operation. // A tricky edge case is that of a transaction which "fails" on the // first writing request, but actually manages to write some intents // (for example, due to being multi-range). In this case, there will // be an error, but the transaction will be marked as Writing and the // coordinator must track the state, for the client's retry will be // performed with a Writing transaction which the coordinator rejects // unless it is tracking it (on top of it making sense to track it; // after all, it **has** laid down intents and only the coordinator // can augment a potential EndTransaction call). // consider re-using those. if intents := ba.GetIntentSpans(); len(intents) > 0 && (err == nil || newTxn.Writing) { if txnMeta == nil { if !newTxn.Writing { panic("txn with intents marked as non-writing") } txnMeta = &txnMetadata{ txn: *newTxn, keys: cache.NewIntervalCache(cache.Config{Policy: cache.CacheNone}), firstUpdateNanos: tc.clock.PhysicalNow(), lastUpdateNanos: tc.clock.PhysicalNow(), timeoutDuration: tc.clientTimeout, txnEnd: make(chan struct{}), } tc.txns[id] = txnMeta // If the transaction is already over, there's no point in // launching a one-off coordinator which will shut down right // away. If we ended up here with an error, we'll always start // the coordinator - the transaction has laid down intents, so // we expect it to be committed/aborted at some point in the // future. if _, isEnding := ba.GetArg(roachpb.EndTransaction); err != nil || !isEnding { trace.Event("coordinator spawns") if !tc.stopper.RunAsyncTask(func() { tc.heartbeatLoop(id) }) { // The system is already draining and we can't start the // heartbeat. We refuse new transactions for now because // they're likely not going to have all intents committed. // In principle, we can relax this as needed though. tc.unregisterTxnLocked(id) return roachpb.NewError(&roachpb.NodeUnavailableError{}) } } } for _, intent := range intents { txnMeta.addKeyRange(intent.Key, intent.EndKey) } } // Update our record of this transaction, even on error. if txnMeta != nil { txnMeta.txn = *newTxn if !txnMeta.txn.Writing { panic("tracking a non-writing txn") } txnMeta.setLastUpdate(tc.clock.PhysicalNow()) } if err == nil { // For successful transactional requests, always send the updated txn // record back. br.Txn = newTxn } return pErr }() }
// updateState updates the transaction state in both the success and // error cases, applying those updates to the corresponding txnMeta // object when adequate. It also updates certain errors with the // updated transaction for use by client restarts. func (tc *TxnCoordSender) updateState(ctx context.Context, ba roachpb.BatchRequest, br *roachpb.BatchResponse, pErr *roachpb.Error) *roachpb.Error { sp, cleanupSp := tracing.SpanFromContext(opTxnCoordSender, tc.tracer, ctx) defer cleanupSp() newTxn := &roachpb.Transaction{} newTxn.Update(ba.Txn) if pErr == nil { newTxn.Update(br.Txn) } else { newTxn.Update(pErr.GetTxn()) } // If the request was successful but we're in a transaction which needs to // restart but doesn't know it yet, let it restart now (as opposed to // waiting until EndTransaction). if pErr == nil && newTxn.Isolation == roachpb.SERIALIZABLE && !newTxn.OrigTimestamp.Equal(newTxn.Timestamp) { pErr = roachpb.NewErrorWithTxn(roachpb.NewTransactionRetryError(), br.Txn) br = nil } switch t := pErr.GetDetail().(type) { case nil: // Move txn timestamp forward to response timestamp if applicable. // TODO(tschottdorf): see (*Replica).executeBatch and comments within. // Looks like this isn't necessary any more, nor did it prevent a bug // referenced in a TODO there. newTxn.Timestamp.Forward(br.Timestamp) case *roachpb.TransactionStatusError: // Likely already committed or more obscure errors such as epoch or // timestamp regressions; consider txn dead. defer tc.cleanupTxn(sp, *pErr.GetTxn()) case *roachpb.OpRequiresTxnError: panic("OpRequiresTxnError must not happen at this level") case *roachpb.ReadWithinUncertaintyIntervalError: // If the reader encountered a newer write within the uncertainty // interval, we advance the txn's timestamp just past the last observed // timestamp from the node. restartTS, ok := newTxn.GetObservedTimestamp(pErr.OriginNode) if !ok { pErr = roachpb.NewError(util.Errorf("no observed timestamp for node %d found on uncertainty restart", pErr.OriginNode)) } else { newTxn.Timestamp.Forward(restartTS) newTxn.Restart(ba.UserPriority, newTxn.Priority, newTxn.Timestamp) } case *roachpb.TransactionAbortedError: // Increase timestamp if applicable. newTxn.Timestamp.Forward(pErr.GetTxn().Timestamp) newTxn.Priority = pErr.GetTxn().Priority // Clean up the freshly aborted transaction in defer(), avoiding a // race with the state update below. defer tc.cleanupTxn(sp, *newTxn) case *roachpb.TransactionPushError: // Increase timestamp if applicable, ensuring that we're // just ahead of the pushee. newTxn.Timestamp.Forward(t.PusheeTxn.Timestamp.Add(0, 1)) newTxn.Restart(ba.UserPriority, t.PusheeTxn.Priority-1, newTxn.Timestamp) case *roachpb.TransactionRetryError: newTxn.Restart(ba.UserPriority, pErr.GetTxn().Priority, newTxn.Timestamp) default: if pErr.GetTxn() != nil { if pErr.CanRetry() { panic("Retryable internal error must not happen at this level") } else { // Do not clean up the transaction here since the client might still // want to continue the transaction. For example, a client might // continue its transaction after receiving ConditionFailedError, which // can come from a unique index violation. } } } if pErr != nil && pErr.GetTxn() != nil { // Avoid changing existing errors because sometimes they escape into // goroutines and then there are races. Fairly sure there isn't one // here, but better safe than sorry. pErrShallow := *pErr pErrShallow.SetTxn(newTxn) pErr = &pErrShallow } if newTxn.ID == nil { return pErr } txnID := *newTxn.ID tc.Lock() defer tc.Unlock() txnMeta := tc.txns[txnID] // For successful transactional requests, keep the written intents and // the updated transaction record to be sent along with the reply. // The transaction metadata is created with the first writing operation. // A tricky edge case is that of a transaction which "fails" on the // first writing request, but actually manages to write some intents // (for example, due to being multi-range). In this case, there will // be an error, but the transaction will be marked as Writing and the // coordinator must track the state, for the client's retry will be // performed with a Writing transaction which the coordinator rejects // unless it is tracking it (on top of it making sense to track it; // after all, it **has** laid down intents and only the coordinator // can augment a potential EndTransaction call). See #3303. intents := ba.GetIntentSpans() if len(intents) > 0 && (pErr == nil || newTxn.Writing) { if txnMeta == nil { if !newTxn.Writing { panic("txn with intents marked as non-writing") } // If the transaction is already over, there's no point in // launching a one-off coordinator which will shut down right // away. If we ended up here with an error, we'll always start // the coordinator - the transaction has laid down intents, so // we expect it to be committed/aborted at some point in the // future. if _, isEnding := ba.GetArg(roachpb.EndTransaction); pErr != nil || !isEnding { sp.LogEvent("coordinator spawns") txnMeta = &txnMetadata{ txn: *newTxn, keys: cache.NewIntervalCache(cache.Config{Policy: cache.CacheNone}), firstUpdateNanos: tc.clock.PhysicalNow(), lastUpdateNanos: tc.clock.PhysicalNow(), timeoutDuration: tc.clientTimeout, txnEnd: make(chan struct{}), } tc.txns[txnID] = txnMeta if !tc.stopper.RunAsyncTask(func() { tc.heartbeatLoop(txnID) }) { // The system is already draining and we can't start the // heartbeat. We refuse new transactions for now because // they're likely not going to have all intents committed. // In principle, we can relax this as needed though. tc.unregisterTxnLocked(txnID) return roachpb.NewError(&roachpb.NodeUnavailableError{}) } } } } // Update our record of this transaction, even on error. if txnMeta != nil { txnMeta.txn = *newTxn if !txnMeta.txn.Writing { panic("tracking a non-writing txn") } txnMeta.setLastUpdate(tc.clock.PhysicalNow()) // Adding the intents even on error reduces the likelihood of dangling // intents blocking concurrent writers for extended periods of time. // See #3346. for _, intent := range intents { txnMeta.addKeyRange(intent.Key, intent.EndKey) } } if pErr == nil { // For successful transactional requests, always send the updated txn // record back. br.Txn = newTxn } return pErr }
// Send implements the batch.Sender interface. If the request is part of a // transaction, the TxnCoordSender adds the transaction to a map of active // transactions and begins heartbeating it. Every subsequent request for the // same transaction updates the lastUpdate timestamp to prevent live // transactions from being considered abandoned and garbage collected. // Read/write mutating requests have their key or key range added to the // transaction's interval tree of key ranges for eventual cleanup via resolved // write intents; they're tagged to an outgoing EndTransaction request, with // the receiving replica in charge of resolving them. func (tc *TxnCoordSender) Send(ctx context.Context, ba roachpb.BatchRequest) (*roachpb.BatchResponse, *roachpb.Error) { if err := tc.maybeBeginTxn(&ba); err != nil { return nil, roachpb.NewError(err) } var startNS int64 ba.SetNewRequest() // This is the earliest point at which the request has an ID (if // applicable). Begin a Trace which follows this request. trace := tc.tracer.NewTrace(tracer.Coord, &ba) defer trace.Finalize() defer trace.Epoch("sending batch")() ctx = tracer.ToCtx(ctx, trace) var id string // optional transaction ID if ba.Txn != nil { // If this request is part of a transaction... id = string(ba.Txn.ID) // Verify that if this Transaction is not read-only, we have it on // file. If not, refuse writes - the client must have issued a write on // another coordinator previously. if ba.Txn.Writing && ba.IsTransactionWrite() { tc.Lock() _, ok := tc.txns[id] tc.Unlock() if !ok { return nil, roachpb.NewError(util.Errorf("transaction must not write on multiple coordinators")) } } // Set the timestamp to the original timestamp for read-only // commands and to the transaction timestamp for read/write // commands. if ba.IsReadOnly() { ba.Timestamp = ba.Txn.OrigTimestamp } else { ba.Timestamp = ba.Txn.Timestamp } if rArgs, ok := ba.GetArg(roachpb.EndTransaction); ok { et := rArgs.(*roachpb.EndTransactionRequest) if len(et.Key) != 0 { return nil, roachpb.NewError(util.Errorf("EndTransaction must not have a Key set")) } et.Key = ba.Txn.Key // Remember when EndTransaction started in case we want to // be linearizable. startNS = tc.clock.PhysicalNow() if len(et.IntentSpans) > 0 { // TODO(tschottdorf): it may be useful to allow this later. // That would be part of a possible plan to allow txns which // write on multiple coordinators. return nil, roachpb.NewError(util.Errorf("client must not pass intents to EndTransaction")) } tc.Lock() txnMeta, metaOK := tc.txns[id] if id != "" && metaOK { et.IntentSpans = txnMeta.intentSpans() } tc.Unlock() if intentSpans := ba.GetIntentSpans(); len(intentSpans) > 0 { // Writes in Batch, so EndTransaction is fine. Should add // outstanding intents to EndTransaction, though. // TODO(tschottdorf): possible issues when the batch fails, // but the intents have been added anyways. // TODO(tschottdorf): some of these intents may be covered // by others, for example {[a,b), a}). This can lead to // some extra requests when those are non-local to the txn // record. But it doesn't seem worth optimizing now. et.IntentSpans = append(et.IntentSpans, intentSpans...) } else if !metaOK { // If we don't have the transaction, then this must be a retry // by the client. We can no longer reconstruct a correct // request so we must fail. // // TODO(bdarnell): if we had a GetTransactionStatus API then // we could lookup the transaction and return either nil or // TransactionAbortedError instead of this ambivalent error. return nil, roachpb.NewError(util.Errorf("transaction is already committed or aborted")) } if len(et.IntentSpans) == 0 { // If there aren't any intents, then there's factually no // transaction to end. Read-only txns have all of their state in // the client. return nil, roachpb.NewError(util.Errorf("cannot commit a read-only transaction")) } if log.V(1) { for _, intent := range et.IntentSpans { trace.Event(fmt.Sprintf("intent: [%s,%s)", intent.Key, intent.EndKey)) } } } } // Send the command through wrapped sender, taking appropriate measures // on error. var br *roachpb.BatchResponse { var pErr *roachpb.Error br, pErr = tc.wrapped.Send(ctx, ba) if _, ok := pErr.GoError().(*roachpb.OpRequiresTxnError); ok { br, pErr = tc.resendWithTxn(ba) } if pErr := tc.updateState(ctx, ba, br, pErr); pErr != nil { trace.Event(fmt.Sprintf("error: %s", pErr)) return nil, pErr } } if br.Txn == nil { return br, nil } if _, ok := ba.GetArg(roachpb.EndTransaction); !ok { return br, nil } // If the --linearizable flag is set, we want to make sure that // all the clocks in the system are past the commit timestamp // of the transaction. This is guaranteed if either // - the commit timestamp is MaxOffset behind startNS // - MaxOffset ns were spent in this function // when returning to the client. Below we choose the option // that involves less waiting, which is likely the first one // unless a transaction commits with an odd timestamp. if tsNS := br.Txn.Timestamp.WallTime; startNS > tsNS { startNS = tsNS } sleepNS := tc.clock.MaxOffset() - time.Duration(tc.clock.PhysicalNow()-startNS) if tc.linearizable && sleepNS > 0 { defer func() { if log.V(1) { log.Infof("%v: waiting %s on EndTransaction for linearizability", br.Txn.Short(), util.TruncateDuration(sleepNS, time.Millisecond)) } time.Sleep(sleepNS) }() } if br.Txn.Status != roachpb.PENDING { tc.cleanupTxn(trace, *br.Txn) } return br, nil }
// Send implements the batch.Sender interface. If the request is part of a // transaction, the TxnCoordSender adds the transaction to a map of active // transactions and begins heartbeating it. Every subsequent request for the // same transaction updates the lastUpdate timestamp to prevent live // transactions from being considered abandoned and garbage collected. // Read/write mutating requests have their key or key range added to the // transaction's interval tree of key ranges for eventual cleanup via resolved // write intents; they're tagged to an outgoing EndTransaction request, with // the receiving replica in charge of resolving them. func (tc *TxnCoordSender) Send(ctx context.Context, ba roachpb.BatchRequest) (*roachpb.BatchResponse, *roachpb.Error) { // Start new or pick up active trace and embed its trace metadata into // header for use by RPC recipients. From here on, there's always an active // Trace, though its overhead is small unless it's sampled. sp, cleanupSp := tracing.SpanFromContext(opTxnCoordSender, tc.tracer, ctx) defer cleanupSp() // TODO(tschottdorf): To get rid of the spurious alloc below we need to // implement the carrier interface on ba.Header or make Span non-nullable, // both of which force all of ba on the Heap. It's already there, so may // not be a big deal, but ba should live on the stack. Also not easy to use // a buffer pool here since anything that goes into the RPC layer could be // used by goroutines we didn't wait for. if ba.Header.Trace == nil { ba.Header.Trace = &tracing.Span{} } if err := tc.tracer.Inject(sp, basictracer.Delegator, ba.Trace); err != nil { return nil, roachpb.NewError(err) } if err := tc.maybeBeginTxn(&ba); err != nil { return nil, roachpb.NewError(err) } var startNS int64 ba.SetNewRequest() // This is the earliest point at which the request has an ID (if // applicable). Begin a Trace which follows this request. ctx = opentracing.ContextWithSpan(ctx, sp) if ba.Txn != nil { // If this request is part of a transaction... txnID := *ba.Txn.ID // Verify that if this Transaction is not read-only, we have it on // file. If not, refuse writes - the client must have issued a write on // another coordinator previously. if ba.Txn.Writing && ba.IsTransactionWrite() { tc.Lock() _, ok := tc.txns[txnID] tc.Unlock() if !ok { return nil, roachpb.NewErrorf("transaction must not write on multiple coordinators") } } // Set the timestamp to the original timestamp for read-only // commands and to the transaction timestamp for read/write // commands. if ba.IsReadOnly() { ba.Timestamp = ba.Txn.OrigTimestamp } else { ba.Timestamp = ba.Txn.Timestamp } if rArgs, ok := ba.GetArg(roachpb.EndTransaction); ok { et := rArgs.(*roachpb.EndTransactionRequest) if len(et.Key) != 0 { return nil, roachpb.NewErrorf("EndTransaction must not have a Key set") } et.Key = ba.Txn.Key // Remember when EndTransaction started in case we want to // be linearizable. startNS = tc.clock.PhysicalNow() if len(et.IntentSpans) > 0 { // TODO(tschottdorf): it may be useful to allow this later. // That would be part of a possible plan to allow txns which // write on multiple coordinators. return nil, roachpb.NewErrorf("client must not pass intents to EndTransaction") } tc.Lock() txnMeta, metaOK := tc.txns[txnID] if metaOK { et.IntentSpans = txnMeta.intentSpans() } tc.Unlock() if intentSpans := ba.GetIntentSpans(); len(intentSpans) > 0 { // Writes in Batch, so EndTransaction is fine. Should add // outstanding intents to EndTransaction, though. // TODO(tschottdorf): possible issues when the batch fails, // but the intents have been added anyways. // TODO(tschottdorf): some of these intents may be covered // by others, for example {[a,b), a}). This can lead to // some extra requests when those are non-local to the txn // record. But it doesn't seem worth optimizing now. et.IntentSpans = append(et.IntentSpans, intentSpans...) } else if !metaOK { // If we don't have the transaction, then this must be a retry // by the client. We can no longer reconstruct a correct // request so we must fail. // // TODO(bdarnell): if we had a GetTransactionStatus API then // we could lookup the transaction and return either nil or // TransactionAbortedError instead of this ambivalent error. return nil, roachpb.NewErrorf("transaction is already committed or aborted") } if len(et.IntentSpans) == 0 { // If there aren't any intents, then there's factually no // transaction to end. Read-only txns have all of their state in // the client. return nil, roachpb.NewErrorf("cannot commit a read-only transaction") } if log.V(1) { for _, intent := range et.IntentSpans { sp.LogEvent(fmt.Sprintf("intent: [%s,%s)", intent.Key, intent.EndKey)) } } } } // Send the command through wrapped sender, taking appropriate measures // on error. var br *roachpb.BatchResponse { var pErr *roachpb.Error br, pErr = tc.wrapped.Send(ctx, ba) if _, ok := pErr.GetDetail().(*roachpb.OpRequiresTxnError); ok { // TODO(tschottdorf): needs to keep the trace. br, pErr = tc.resendWithTxn(ba) } if pErr = tc.updateState(ctx, ba, br, pErr); pErr != nil { sp.LogEvent(fmt.Sprintf("error: %s", pErr)) return nil, pErr } } if br.Txn == nil { return br, nil } if _, ok := ba.GetArg(roachpb.EndTransaction); !ok { return br, nil } // If the --linearizable flag is set, we want to make sure that // all the clocks in the system are past the commit timestamp // of the transaction. This is guaranteed if either // - the commit timestamp is MaxOffset behind startNS // - MaxOffset ns were spent in this function // when returning to the client. Below we choose the option // that involves less waiting, which is likely the first one // unless a transaction commits with an odd timestamp. if tsNS := br.Txn.Timestamp.WallTime; startNS > tsNS { startNS = tsNS } sleepNS := tc.clock.MaxOffset() - time.Duration(tc.clock.PhysicalNow()-startNS) if tc.linearizable && sleepNS > 0 { defer func() { if log.V(1) { log.Infof("%v: waiting %s on EndTransaction for linearizability", br.Txn.Short(), util.TruncateDuration(sleepNS, time.Millisecond)) } time.Sleep(sleepNS) }() } if br.Txn.Status != roachpb.PENDING { tc.cleanupTxn(sp, *br.Txn) } return br, nil }