// TestTxnCoordSenderSingleRoundtripTxn checks that a batch which completely
// holds the writing portion of a Txn (including EndTransaction) does not
// launch a heartbeat goroutine at all.
func TestTxnCoordSenderSingleRoundtripTxn(t *testing.T) {
	defer leaktest.AfterTest(t)()
	stopper := stop.NewStopper()
	manual := hlc.NewManualClock(123)
	clock := hlc.NewClock(manual.UnixNano, 20*time.Nanosecond)

	senderFunc := func(_ context.Context, ba roachpb.BatchRequest) (*roachpb.BatchResponse, *roachpb.Error) {
		br := ba.CreateReply()
		txnClone := ba.Txn.Clone()
		br.Txn = &txnClone
		br.Txn.Writing = true
		return br, nil
	}
	ambient := log.AmbientContext{Tracer: tracing.NewTracer()}
	ts := NewTxnCoordSender(
		ambient,
		senderFn(senderFunc),
		clock,
		false,
		stopper,
		MakeTxnMetrics(metric.TestSampleInterval),
	)

	// Stop the stopper manually, prior to trying the transaction. This has the
	// effect of returning a NodeUnavailableError for any attempts at launching
	// a heartbeat goroutine.
	stopper.Stop()

	var ba roachpb.BatchRequest
	key := roachpb.Key("test")
	ba.Add(&roachpb.BeginTransactionRequest{Span: roachpb.Span{Key: key}})
	ba.Add(&roachpb.PutRequest{Span: roachpb.Span{Key: key}})
	ba.Add(&roachpb.EndTransactionRequest{})
	ba.Txn = &roachpb.Transaction{Name: "test"}
	_, pErr := ts.Send(context.Background(), ba)
	if pErr != nil {
		t.Fatal(pErr)
	}
}
// TestTxnCoordSenderErrorWithIntent validates that if a transactional request
// returns an error but also indicates a Writing transaction, the coordinator
// tracks it just like a successful request.
func TestTxnCoordSenderErrorWithIntent(t *testing.T) {
	defer leaktest.AfterTest(t)()
	stopper := stop.NewStopper()
	defer stopper.Stop()
	manual := hlc.NewManualClock(0)
	clock := hlc.NewClock(manual.UnixNano)
	clock.SetMaxOffset(20)

	testCases := []struct {
		roachpb.Error
		errMsg string
	}{
		{*roachpb.NewError(roachpb.NewTransactionRetryError()), "retry txn"},
		{*roachpb.NewError(roachpb.NewTransactionPushError(roachpb.Transaction{
			TxnMeta: enginepb.TxnMeta{
				ID: uuid.NewV4(),
			}})), "failed to push"},
		{*roachpb.NewErrorf("testError"), "testError"},
	}
	for i, test := range testCases {
		func() {
			senderFunc := func(_ context.Context, ba roachpb.BatchRequest) (*roachpb.BatchResponse, *roachpb.Error) {
				txn := ba.Txn.Clone()
				txn.Writing = true
				pErr := &roachpb.Error{}
				*pErr = test.Error
				pErr.SetTxn(&txn)
				return nil, pErr
			}
			ambient := log.AmbientContext{Tracer: tracing.NewTracer()}
			ts := NewTxnCoordSender(
				ambient,
				senderFn(senderFunc),
				clock,
				false,
				stopper,
				MakeTxnMetrics(metric.TestSampleInterval),
			)

			var ba roachpb.BatchRequest
			key := roachpb.Key("test")
			ba.Add(&roachpb.BeginTransactionRequest{Span: roachpb.Span{Key: key}})
			ba.Add(&roachpb.PutRequest{Span: roachpb.Span{Key: key}})
			ba.Add(&roachpb.EndTransactionRequest{})
			ba.Txn = &roachpb.Transaction{Name: "test"}
			_, pErr := ts.Send(context.Background(), ba)
			if !testutils.IsPError(pErr, test.errMsg) {
				t.Errorf("%d: error did not match %s: %v", i, test.errMsg, pErr)
			}

			defer teardownHeartbeats(ts)
			ts.Lock()
			defer ts.Unlock()
			if len(ts.txns) != 1 {
				t.Errorf("%d: expected transaction to be tracked", i)
			}
		}()
	}
}
func TestBatchRequestString(t *testing.T) {
	br := roachpb.BatchRequest{}
	br.Txn = new(roachpb.Transaction)
	for i := 0; i < 100; i++ {
		br.Requests = append(br.Requests, roachpb.RequestUnion{Get: &roachpb.GetRequest{}})
	}
	br.Requests = append(br.Requests, roachpb.RequestUnion{EndTransaction: &roachpb.EndTransactionRequest{}})

	e := `[txn: <nil>], Get [/Min,/Min), Get [/Min,/Min), Get [/Min,/Min), Get [/Min,/Min), Get [/Min,/Min), Get [/Min,/Min), Get [/Min,/Min), Get [/Min,/Min), Get [/Min,/Min), Get [/Min,/Min), Get [/Min,/Min), Get [/Min,/Min), Get [/Min,/Min), Get [/Min,/Min), Get [/Min,/Min), Get [/Min,/Min), Get [/Min,/Min), Get [/Min,/Min), Get [/Min,/Min), Get [/Min,/Min), ... 76 skipped ..., Get [/Min,/Min), Get [/Min,/Min), Get [/Min,/Min), Get [/Min,/Min), EndTransaction [/Min,/Min)`
	if e != br.String() {
		t.Fatalf("e = %s, v = %s", e, br.String())
	}
}
// Send implements the client.Sender interface. The store is looked up from the
// store map if specified by the request; otherwise, the command is being
// executed locally, and the replica is determined via lookup through each
// store's LookupRange method. The latter path is taken only by unit tests.
func (ls *Stores) Send(
	ctx context.Context, ba roachpb.BatchRequest,
) (*roachpb.BatchResponse, *roachpb.Error) {
	// If we aren't given a Replica, we have to do a little bending over
	// backwards here. This case applies exclusively to unittests.
	if ba.RangeID == 0 || ba.Replica.StoreID == 0 {
		rs, err := keys.Range(ba)
		if err != nil {
			return nil, roachpb.NewError(err)
		}
		rangeID, repDesc, err := ls.LookupReplica(rs.Key, rs.EndKey)
		if err != nil {
			return nil, roachpb.NewError(err)
		}
		ba.RangeID = rangeID
		ba.Replica = repDesc
	}

	store, err := ls.GetStore(ba.Replica.StoreID)
	if err != nil {
		return nil, roachpb.NewError(err)
	}

	if ba.Txn != nil {
		// For calls that read data within a txn, we keep track of timestamps
		// observed from the various participating nodes' HLC clocks. If we have
		// a timestamp on file for this Node which is smaller than MaxTimestamp,
		// we can lower MaxTimestamp accordingly. If MaxTimestamp drops below
		// OrigTimestamp, we effectively can't see uncertainty restarts any
		// more.
		// Note that it's not an issue if MaxTimestamp propagates back out to
		// the client via a returned Transaction update - when updating a Txn
		// from another, the larger MaxTimestamp wins.
		if maxTS, ok := ba.Txn.GetObservedTimestamp(ba.Replica.NodeID); ok && maxTS.Less(ba.Txn.MaxTimestamp) {
			// Copy-on-write to protect others we might be sharing the Txn with.
			shallowTxn := *ba.Txn
			// The uncertainty window is [OrigTimestamp, maxTS), so if that window
			// is empty, there won't be any uncertainty restarts.
			if !ba.Txn.OrigTimestamp.Less(maxTS) {
				log.Event(ctx, "read has no clock uncertainty")
			}
			shallowTxn.MaxTimestamp.Backward(maxTS)
			ba.Txn = &shallowTxn
		}
	}
	br, pErr := store.Send(ctx, ba)
	if br != nil && br.Error != nil {
		panic(roachpb.ErrorUnexpectedlySet(store, br))
	}
	return br, pErr
}
// maybeBeginTxn begins a new transaction if a txn has been specified
// in the request but has a nil ID. The new transaction is initialized
// using the name and isolation in the otherwise uninitialized txn.
// The Priority, if non-zero, is used as a minimum.
//
// No transactional writes are allowed unless preceded by a begin
// transaction request within the same batch. The exception is if the
// transaction is already in state txn.Writing=true.
func (tc *TxnCoordSender) maybeBeginTxn(ba *roachpb.BatchRequest) error {
	if len(ba.Requests) == 0 {
		return errors.Errorf("empty batch with txn")
	}

	if ba.Txn.ID == nil {
		// Create transaction without a key. The key is set when a begin
		// transaction request is received.

		// The initial timestamp may be communicated by a higher layer.
		// If so, use that. Otherwise make up a new one.
		timestamp := ba.Txn.OrigTimestamp
		if timestamp == hlc.ZeroTimestamp {
			timestamp = tc.clock.Now()
		}

		newTxn := roachpb.NewTransaction(ba.Txn.Name, nil, ba.UserPriority,
			ba.Txn.Isolation, timestamp, tc.clock.MaxOffset().Nanoseconds())
		// Use existing priority as a minimum. This is used on transaction
		// aborts to ratchet priority when creating successor transaction.
		if newTxn.Priority < ba.Txn.Priority {
			newTxn.Priority = ba.Txn.Priority
		}
		ba.Txn = newTxn
	}

	// Check for a begin transaction to set txn key based on the key of
	// the first transactional write. Also enforce that no transactional
	// writes occur before a begin transaction.
	var haveBeginTxn bool
	for _, req := range ba.Requests {
		args := req.GetInner()
		if bt, ok := args.(*roachpb.BeginTransactionRequest); ok {
			if haveBeginTxn || ba.Txn.Writing {
				return errors.Errorf("begin transaction requested twice in the same transaction: %s", ba.Txn)
			}
			haveBeginTxn = true
			if ba.Txn.Key == nil {
				ba.Txn.Key = bt.Key
			}
		}
		if roachpb.IsTransactionWrite(args) && !haveBeginTxn && !ba.Txn.Writing {
			return errors.Errorf("transactional write before begin transaction")
		}
	}
	return nil
}
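// The following is a hypothetical sketch (not part of the original code)
// illustrating the batch shapes maybeBeginTxn accepts and rejects for a
// transaction that is not yet Writing. The helper name exampleBatches and the
// key "k" are made up for illustration; it only relies on request types
// already used elsewhere in this file (BeginTransactionRequest, PutRequest,
// EndTransactionRequest).
func exampleBatches() (accepted, rejected roachpb.BatchRequest) {
	key := roachpb.Key("k")

	// Accepted: the BeginTransaction precedes the first transactional write,
	// so maybeBeginTxn can take the txn key from the BeginTransaction request.
	accepted.Txn = &roachpb.Transaction{Name: "example"}
	accepted.Add(&roachpb.BeginTransactionRequest{Span: roachpb.Span{Key: key}})
	accepted.Add(&roachpb.PutRequest{Span: roachpb.Span{Key: key}})
	accepted.Add(&roachpb.EndTransactionRequest{})

	// Rejected: a transactional write appears before the BeginTransaction, so
	// maybeBeginTxn returns "transactional write before begin transaction".
	rejected.Txn = &roachpb.Transaction{Name: "example"}
	rejected.Add(&roachpb.PutRequest{Span: roachpb.Span{Key: key}})
	rejected.Add(&roachpb.BeginTransactionRequest{Span: roachpb.Span{Key: key}})

	return accepted, rejected
}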
// tryAsyncAbort (synchronously) grabs a copy of the txn proto and the intents
// (which it then clears from txnMeta), and asynchronously tries to abort the
// transaction.
func (tc *TxnCoordSender) tryAsyncAbort(txnID uuid.UUID) {
	tc.Lock()
	txnMeta := tc.txns[txnID]
	// Clone the intents and the txn to avoid data races.
	intentSpans, _ := roachpb.MergeSpans(append([]roachpb.Span(nil), txnMeta.keys...))
	txnMeta.keys = nil
	txn := txnMeta.txn.Clone()
	tc.Unlock()

	// Since we don't hold the lock continuously, it's possible that two aborts
	// raced here. That's fine (and probably better than the alternative, which
	// is missing new intents sometimes).
	if txn.Status != roachpb.PENDING {
		return
	}

	ba := roachpb.BatchRequest{}
	ba.Txn = &txn

	et := &roachpb.EndTransactionRequest{
		Span: roachpb.Span{
			Key: txn.Key,
		},
		Commit:      false,
		IntentSpans: intentSpans,
	}
	ba.Add(et)
	ctx := tc.AnnotateCtx(context.TODO())
	if err := tc.stopper.RunAsyncTask(ctx, func(ctx context.Context) {
		// Use the wrapped sender since the normal Sender does not allow
		// clients to specify intents.
		if _, pErr := tc.wrapped.Send(ctx, ba); pErr != nil {
			if log.V(1) {
				log.Warningf(ctx, "abort due to inactivity failed for %s: %s", txn, pErr)
			}
		}
	}); err != nil {
		log.Warning(ctx, err)
	}
}
// TestTxnCoordSenderHeartbeat verifies periodic heartbeat of the
// transaction record.
func TestTxnCoordSenderHeartbeat(t *testing.T) {
	defer leaktest.AfterTest(t)()
	s, sender := createTestDB(t)
	defer s.Stop()
	defer teardownHeartbeats(sender)

	// Set heartbeat interval to 1ms for testing.
	sender.heartbeatInterval = 1 * time.Millisecond

	initialTxn := client.NewTxn(context.Background(), *s.DB)
	if err := initialTxn.Put(roachpb.Key("a"), []byte("value")); err != nil {
		t.Fatal(err)
	}

	// Verify 3 heartbeats.
	var heartbeatTS hlc.Timestamp
	for i := 0; i < 3; i++ {
		util.SucceedsSoon(t, func() error {
			txn, pErr := getTxn(sender, &initialTxn.Proto)
			if pErr != nil {
				t.Fatal(pErr)
			}
			// Advance clock by 1ns.
			// Locking the TxnCoordSender to prevent a data race.
			sender.Lock()
			s.Manual.Increment(1)
			sender.Unlock()
			if txn.LastHeartbeat != nil && heartbeatTS.Less(*txn.LastHeartbeat) {
				heartbeatTS = *txn.LastHeartbeat
				return nil
			}
			return errors.Errorf("expected heartbeat")
		})
	}

	// Sneakily send an ABORT right to DistSender (bypassing TxnCoordSender).
	{
		var ba roachpb.BatchRequest
		ba.Add(&roachpb.EndTransactionRequest{
			Commit: false,
			Span:   roachpb.Span{Key: initialTxn.Proto.Key},
		})
		ba.Txn = &initialTxn.Proto
		if _, pErr := sender.wrapped.Send(context.Background(), ba); pErr != nil {
			t.Fatal(pErr)
		}
	}

	util.SucceedsSoon(t, func() error {
		sender.Lock()
		defer sender.Unlock()
		if txnMeta, ok := sender.txns[*initialTxn.Proto.ID]; !ok {
			t.Fatal("transaction unregistered prematurely")
		} else if txnMeta.txn.Status != roachpb.ABORTED {
			return fmt.Errorf("transaction is not aborted")
		}
		return nil
	})

	// Trying to do something else should give us a TransactionAbortedError.
	_, err := initialTxn.Get("a")
	assertTransactionAbortedError(t, err)
}
// divideAndSendBatchToRanges sends the supplied batch to all of the
// ranges which comprise the span specified by rs. The batch request
// is trimmed against each range which is part of the span and sent
// either serially or in parallel, if possible. isFirst indicates
// whether this is the first time this method has been called on the
// batch. It's specified false where this method is invoked recursively.
func (ds *DistSender) divideAndSendBatchToRanges(
	ctx context.Context, ba roachpb.BatchRequest, rs roachpb.RSpan, isFirst bool,
) (br *roachpb.BatchResponse, pErr *roachpb.Error) {
	// This function builds a channel of responses for each range
	// implicated in the span (rs) and combines them into a single
	// BatchResponse when finished.
	var responseChs []chan response
	defer func() {
		for _, responseCh := range responseChs {
			resp := <-responseCh
			if resp.pErr != nil {
				if pErr == nil {
					pErr = resp.pErr
				}
				continue
			}
			if br == nil {
				// First response from a Range.
				br = resp.reply
			} else {
				// This was the second or later call in a cross-Range request.
				// Combine the new response with the existing one.
				if err := br.Combine(resp.reply); err != nil {
					pErr = roachpb.NewError(err)
					return
				}
				br.Txn.Update(resp.reply.Txn)
			}
		}

		// If we experienced an error, don't neglect to update the error's
		// attached transaction with any responses which were received.
		if pErr != nil {
			if br != nil {
				pErr.UpdateTxn(br.Txn)
			}
		}
	}()

	// Get initial seek key depending on direction of iteration.
	var seekKey roachpb.RKey
	isReverse := ba.IsReverse()
	if isReverse {
		seekKey = rs.EndKey
	} else {
		seekKey = rs.Key
	}

	// Send the request to one range per iteration.
	ri := NewRangeIterator(ds, isReverse)
	for ri.Seek(ctx, seekKey); ri.Valid(); ri.Seek(ctx, seekKey) {
		// Increase the sequence counter only once before sending RPCs to
		// the ranges involved in this chunk of the batch (as opposed to
		// for each RPC individually). On RPC errors, there's no guarantee
		// that the request hasn't made its way to the target regardless
		// of the error; we'd like the second execution to be caught by
		// the sequence cache if that happens. There is a small chance
		// that we address a range twice in this chunk (stale/suboptimal
		// descriptors due to splits/merges) which leads to a transaction
		// retry.
		//
		// TODO(tschottdorf): it's possible that if we don't evict from
		// the cache we could be in for a busy loop.
		ba.SetNewRequest()

		responseCh := make(chan response, 1)
		responseChs = append(responseChs, responseCh)

		if isFirst && ri.NeedAnother(rs) {
			// TODO(tschottdorf): we should have a mechanism for discovering
			// range merges (descriptor staleness will mostly go unnoticed),
			// or we'll be turning single-range queries into multi-range
			// queries for no good reason.
			//
			// If there's no transaction and op spans ranges, possibly
			// re-run as part of a transaction for consistency. The
			// case where we don't need to re-run is if the read
			// consistency is not required.
			if ba.Txn == nil && ba.IsPossibleTransaction() && ba.ReadConsistency != roachpb.INCONSISTENT {
				responseCh <- response{pErr: roachpb.NewError(&roachpb.OpRequiresTxnError{})}
				return
			}
			// If the request is more than but ends with EndTransaction, we
			// want the caller to come again with the EndTransaction in an
			// extra call.
			if l := len(ba.Requests) - 1; l > 0 && ba.Requests[l].GetInner().Method() == roachpb.EndTransaction {
				responseCh <- response{pErr: errNo1PCTxn}
				return
			}
		}

		// Determine next seek key, taking a potentially sparse batch into
		// consideration.
		var err error
		nextRS := rs
		if isReverse {
			// In next iteration, query previous range.
			// We use the StartKey of the current descriptor as opposed to the
			// EndKey of the previous one since that doesn't have bugs when
			// stale descriptors come into play.
			seekKey, err = prev(ba, ri.Desc().StartKey)
			nextRS.EndKey = seekKey
		} else {
			// In next iteration, query next range.
			// It's important that we use the EndKey of the current descriptor
			// as opposed to the StartKey of the next one: if the former is stale,
			// it's possible that the next range has since merged the subsequent
			// one, and unless both descriptors are stale, the next descriptor's
			// StartKey would move us to the beginning of the current range,
			// resulting in a duplicate scan.
			seekKey, err = next(ba, ri.Desc().EndKey)
			nextRS.Key = seekKey
		}
		if err != nil {
			responseCh <- response{pErr: roachpb.NewError(err)}
			return
		}

		// Send the next partial batch to the first range in the "rs" span.
		// If we're not handling a request which limits responses and we
		// can reserve one of the limited goroutines available for parallel
		// batch RPCs, send asynchronously.
		if ba.MaxSpanRequestKeys == 0 && ri.NeedAnother(rs) && ds.rpcContext != nil &&
			ds.sendPartialBatchAsync(ctx, ba, rs, ri.Desc(), ri.Token(), isFirst, responseCh) {
			// Note that we pass the batch request by value to the parallel
			// goroutine to avoid using the cloned txn.

			// Clone the txn to preserve the current txn sequence for the async call.
			if ba.Txn != nil {
				txnClone := ba.Txn.Clone()
				ba.Txn = &txnClone
			}
		} else {
			// Send synchronously if there is no parallel capacity left, there's a
			// max results limit, or this is the final request in the span.
			resp := ds.sendPartialBatch(ctx, ba, rs, ri.Desc(), ri.Token(), isFirst)
			responseCh <- resp
			if resp.pErr != nil {
				return
			}
			ba.UpdateTxn(resp.reply.Txn)

			// Check whether we've received enough responses to exit query loop.
			if ba.MaxSpanRequestKeys > 0 {
				var numResults int64
				for _, r := range resp.reply.Responses {
					numResults += r.GetInner().Header().NumKeys
				}
				if numResults > ba.MaxSpanRequestKeys {
					panic(fmt.Sprintf("received %d results, limit was %d", numResults, ba.MaxSpanRequestKeys))
				}
				ba.MaxSpanRequestKeys -= numResults
				// Exiting; fill in missing responses.
				if ba.MaxSpanRequestKeys == 0 {
					fillSkippedResponses(ba, resp.reply, seekKey)
					return
				}
			}
		}

		// Check for completion.
		if !ri.NeedAnother(rs) {
			return
		}
		isFirst = false // next range will not be first!
		rs = nextRS
	}

	// We've exited early. Return the range iterator error.
	responseCh := make(chan response, 1)
	responseCh <- response{pErr: ri.Error()}
	responseChs = append(responseChs, responseCh)
	return
}
// initAndVerifyBatch initializes timestamp-related information and
// verifies batch constraints before splitting.
func (ds *DistSender) initAndVerifyBatch(
	ctx context.Context, ba *roachpb.BatchRequest,
) *roachpb.Error {
	// In the event that timestamp isn't set and read consistency isn't
	// required, set the timestamp using the local clock.
	if ba.ReadConsistency == roachpb.INCONSISTENT && ba.Timestamp.Equal(hlc.ZeroTimestamp) {
		ba.Timestamp = ds.clock.Now()
	}

	if ba.Txn != nil {
		// Make a copy here since the code below modifies it in different places.
		// TODO(tschottdorf): be smarter about this - no need to do it for
		// requests that don't get split.
		txnClone := ba.Txn.Clone()
		ba.Txn = &txnClone

		if len(ba.Txn.ObservedTimestamps) == 0 {
			// Ensure the local NodeID is marked as free from clock offset;
			// the transaction's timestamp was taken off the local clock.
			if nDesc := ds.getNodeDescriptor(); nDesc != nil {
				// TODO(tschottdorf): future refactoring should move this to txn
				// creation in TxnCoordSender, which is currently unaware of the
				// NodeID (and wraps *DistSender through client.Sender since it
				// also needs test compatibility with *LocalSender).
				//
				// Taking care below to not modify any memory referenced from
				// our BatchRequest which may be shared with others.
				//
				// We already have a clone of our txn (see above), so we can
				// modify it freely.
				//
				// Zero the existing data. That makes sure that if we had
				// something of size zero but with capacity, we don't re-use the
				// existing space (which others may also use). This is just to
				// satisfy paranoia/OCD and not expected to matter in practice.
				ba.Txn.ResetObservedTimestamps()
				// OrigTimestamp is the HLC timestamp at which the Txn started, so
				// this effectively means no more uncertainty on this node.
				ba.Txn.UpdateObservedTimestamp(nDesc.NodeID, ba.Txn.OrigTimestamp)
			}
		}
	}

	if len(ba.Requests) < 1 {
		return roachpb.NewErrorf("empty batch")
	}

	if ba.MaxSpanRequestKeys != 0 {
		// Verify that the batch contains only specific range requests or the
		// Begin/EndTransactionRequest. Verify that a batch with a ReverseScan
		// only contains ReverseScan range requests.
		isReverse := ba.IsReverse()
		for _, req := range ba.Requests {
			inner := req.GetInner()
			switch inner.(type) {
			case *roachpb.ScanRequest, *roachpb.DeleteRangeRequest:
				// Accepted range requests. All other range requests are still
				// not supported.
				// TODO(vivek): don't enumerate all range requests.
				if isReverse {
					return roachpb.NewErrorf("batch with limit contains both forward and reverse scans")
				}

			case *roachpb.BeginTransactionRequest, *roachpb.EndTransactionRequest, *roachpb.ReverseScanRequest:
				continue

			default:
				return roachpb.NewErrorf("batch with limit contains %T request", inner)
			}
		}
	}
	return nil
}
// sendInternal sends the batch and updates the transaction on error. Depending
// on the error type, the transaction might be replaced by a new one.
func (txn *Txn) sendInternal(ba roachpb.BatchRequest) (*roachpb.BatchResponse, *roachpb.Error) {
	if len(ba.Requests) == 0 {
		return nil, nil
	}
	if pErr := txn.db.prepareToSend(&ba); pErr != nil {
		return nil, pErr
	}

	// Send call through the DB's sender.
	ba.Txn = &txn.Proto
	// For testing purposes, txn.UserPriority can be a negative value (see
	// MakePriority).
	if txn.UserPriority != 0 {
		ba.UserPriority = txn.UserPriority
	}

	// TODO(radu): when db.send supports a context, we can just use that (and
	// remove the prepareToSend call above).
	br, pErr := txn.db.sender.Send(txn.Context, ba)
	if br != nil && br.Error != nil {
		panic(roachpb.ErrorUnexpectedlySet(txn.db.sender, br))
	}

	if br != nil {
		for _, encSp := range br.CollectedSpans {
			var newSp basictracer.RawSpan
			if err := tracing.DecodeRawSpan(encSp, &newSp); err != nil {
				return nil, roachpb.NewError(err)
			}
			txn.CollectedSpans = append(txn.CollectedSpans, newSp)
		}
	}

	// Only successful requests can carry an updated Txn in their response
	// header. Any error (e.g. a restart) can have a Txn attached to it as
	// well; those update our local state in the same way for the next attempt.
	// The exception is if our transaction was aborted and needs to restart
	// from scratch, in which case we do just that.
	if pErr == nil {
		txn.Proto.Update(br.Txn)
		return br, nil
	}

	if log.V(1) {
		log.Infof(txn.Context, "failed batch: %s", pErr)
	}

	if _, ok := pErr.GetDetail().(*roachpb.TransactionAbortedError); ok {
		// On Abort, reset the transaction so we start anew on restart.
		txn.Proto = roachpb.Transaction{
			TxnMeta: enginepb.TxnMeta{
				Isolation: txn.Proto.Isolation,
			},
			Name: txn.Proto.Name,
		}
		// Acts as a minimum priority on restart.
		if pErr.GetTxn() != nil {
			txn.Proto.Priority = pErr.GetTxn().Priority
		}
	} else if pErr.TransactionRestart != roachpb.TransactionRestart_NONE {
		txn.Proto.Update(pErr.GetTxn())
	}
	return nil, pErr
}
func (tc *TxnCoordSender) heartbeat(ctx context.Context, txnID uuid.UUID) bool {
	tc.Lock()
	txnMeta := tc.txns[txnID]
	txn := txnMeta.txn.Clone()
	hasAbandoned := txnMeta.hasClientAbandonedCoord(tc.clock.PhysicalNow())
	tc.Unlock()

	if txn.Status != roachpb.PENDING {
		// A previous iteration has already determined that the transaction is
		// already finalized, so we wait for the client to realize that and
		// want to keep our state for the time being (to dish out the right
		// error once it returns).
		return true
	}

	// Before we send a heartbeat, determine whether this transaction should be
	// considered abandoned. If so, exit heartbeat. If ctx.Done() is not nil, then
	// it is a cancellable Context and we skip this check and use the ctx lifetime
	// instead of a timeout.
	//
	// TODO(andrei): We should disallow non-cancellable contexts in the heartbeat
	// goroutine and enforce that our kv client cancels the context when it's
	// done. We get non-cancellable contexts from remote clients
	// (roachpb.ExternalClient) because we override the gRPC context to make it
	// non-cancellable in DBServer.Batch (as that context is not tied to a txn
	// lifetime).
	// Further note that, unfortunately, the Sender interface generally makes it
	// difficult for the TxnCoordSender to get a context with the same lifetime as
	// the transaction (the TxnCoordSender associates the context of the txn's
	// first write with the txn). We should move to using only local clients
	// (i.e. merge, or at least co-locate client.Txn and the TxnCoordSender). At
	// that point, we probably don't even need to deal with context cancellation
	// any more; the client will be trusted to always send an EndRequest when it's
	// done with a transaction.
	if ctx.Done() == nil && hasAbandoned {
		if log.V(1) {
			log.Infof(ctx, "transaction %s abandoned; stopping heartbeat", txnMeta.txn)
		}
		tc.tryAsyncAbort(txnID)
		return false
	}

	ba := roachpb.BatchRequest{}
	ba.Txn = &txn

	hb := &roachpb.HeartbeatTxnRequest{
		Now: tc.clock.Now(),
	}
	hb.Key = txn.Key
	ba.Add(hb)

	log.Event(ctx, "heartbeat")
	br, pErr := tc.wrapped.Send(ctx, ba)

	// Correctness mandates that when we can't heartbeat the transaction, we
	// make sure the client doesn't keep going. This is particularly relevant
	// in the case of an ABORTED transaction, but if we can't reach the
	// transaction record at all, we're going to have to assume we're aborted
	// as well.
	if pErr != nil {
		log.Warningf(ctx, "heartbeat to %s failed: %s", txn, pErr)
		// We're not going to let the client carry out additional requests, so
		// try to clean up.
		tc.tryAsyncAbort(*txn.ID)
		txn.Status = roachpb.ABORTED
	} else {
		txn.Update(br.Responses[0].GetInner().(*roachpb.HeartbeatTxnResponse).Txn)
	}

	// Give the news to the txn in the txns map. This will update long-running
	// transactions (which may find out that they have to restart in that way),
	// but in particular makes sure that they notice when they've been aborted
	// (in which case we'll give them an error on their next request).
	tc.Lock()
	tc.txns[txnID].txn.Update(&txn)
	tc.Unlock()

	return true
}