// Send implements the client.Sender interface. The store is looked up from the
// store map if specified by the request; otherwise, the command is being
// executed locally, and the replica is determined via lookup through each
// store's LookupRange method. The latter path is taken only by unit tests.
func (ls *Stores) Send(ctx context.Context, ba roachpb.BatchRequest) (*roachpb.BatchResponse, *roachpb.Error) {
	var store *Store
	var err error

	// If we aren't given a Replica, we have to bend over backwards a little
	// here. This case applies exclusively to unit tests.
	if ba.RangeID == 0 || ba.Replica.StoreID == 0 {
		var repl *roachpb.ReplicaDescriptor
		var rangeID roachpb.RangeID
		rs := keys.Range(ba)
		rangeID, repl, err = ls.lookupReplica(rs.Key, rs.EndKey)
		if err == nil {
			ba.RangeID = rangeID
			ba.Replica = *repl
		}
	}

	ctx = log.Add(ctx, log.RangeID, ba.RangeID)

	if err == nil {
		store, err = ls.GetStore(ba.Replica.StoreID)
	}
	if err != nil {
		return nil, roachpb.NewError(err)
	}

	sp, cleanupSp := tracing.SpanFromContext(opStores, store.Tracer(), ctx)
	defer cleanupSp()

	if ba.Txn != nil {
		// For calls that read data within a txn, we keep track of timestamps
		// observed from the various participating nodes' HLC clocks. If we
		// have a timestamp on file for this Node which is smaller than
		// MaxTimestamp, we can lower MaxTimestamp accordingly. If MaxTimestamp
		// drops below OrigTimestamp, we effectively can't see uncertainty
		// restarts any more.
		// Note that it's not an issue if MaxTimestamp propagates back out to
		// the client via a returned Transaction update - when updating a Txn
		// from another, the larger MaxTimestamp wins.
		if maxTS, ok := ba.Txn.GetObservedTimestamp(ba.Replica.NodeID); ok && maxTS.Less(ba.Txn.MaxTimestamp) {
			// Copy-on-write to protect others we might be sharing the Txn with.
			shallowTxn := *ba.Txn
			// The uncertainty window is [OrigTimestamp, maxTS), so if that
			// window is empty, there won't be any uncertainty restarts.
			if !ba.Txn.OrigTimestamp.Less(maxTS) {
				sp.LogEvent("read has no clock uncertainty")
			}
			shallowTxn.MaxTimestamp.Backward(maxTS)
			ba.Txn = &shallowTxn
		}
	}
	br, pErr := store.Send(ctx, ba)
	if br != nil && br.Error != nil {
		panic(roachpb.ErrorUnexpectedlySet(store, br))
	}
	return br, pErr
}
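// Below is a minimal, standalone sketch of the MaxTimestamp-lowering step
// above. The Timestamp type and its methods are simplified stand-ins for the
// real hlc/roachpb types, and clampMaxTimestamp is a hypothetical helper,
// not part of the actual codebase.
type Timestamp struct {
	WallTime int64
	Logical  int32
}

func (t Timestamp) Less(s Timestamp) bool {
	return t.WallTime < s.WallTime || (t.WallTime == s.WallTime && t.Logical < s.Logical)
}

// Backward lowers t to the minimum of t and s.
func (t *Timestamp) Backward(s Timestamp) {
	if s.Less(*t) {
		*t = s
	}
}

// clampMaxTimestamp lowers the uncertainty ceiling to a previously observed
// node timestamp and reports whether the resulting uncertainty window
// [orig, newMax) is empty, in which case the read cannot hit an uncertainty
// restart at all.
func clampMaxTimestamp(orig, max, observed Timestamp) (newMax Timestamp, certain bool) {
	newMax = max
	newMax.Backward(observed)
	return newMax, !orig.Less(newMax)
}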
// Send implements the client.Sender interface. The store is looked up from the
// store map if specified by the request; otherwise, the command is being
// executed locally, and the replica is determined via lookup through each
// store's LookupRange method. The latter path is taken only by unit tests.
func (ls *Stores) Send(ctx context.Context, ba roachpb.BatchRequest) (*roachpb.BatchResponse, *roachpb.Error) {
	sp := tracing.SpanFromContext(ctx)
	var store *Store
	var pErr *roachpb.Error

	// If we aren't given a Replica, we have to bend over backwards a little
	// here. This case applies exclusively to unit tests.
	if ba.RangeID == 0 || ba.Replica.StoreID == 0 {
		var repl *roachpb.ReplicaDescriptor
		var rangeID roachpb.RangeID
		rs := keys.Range(ba)
		rangeID, repl, pErr = ls.lookupReplica(rs.Key, rs.EndKey)
		if pErr == nil {
			ba.RangeID = rangeID
			ba.Replica = *repl
		}
	}

	ctx = log.Add(ctx, log.RangeID, ba.RangeID)

	if pErr == nil {
		store, pErr = ls.GetStore(ba.Replica.StoreID)
	}

	var br *roachpb.BatchResponse
	if pErr != nil {
		return nil, pErr
	}

	// For calls that read data within a txn, we can avoid uncertainty
	// related retries in certain situations. If the node is in
	// "CertainNodes", we need not worry about uncertain reads any
	// more. Setting MaxTimestamp=OrigTimestamp for the operation
	// accomplishes that. See roachpb.Transaction.CertainNodes for details.
	if ba.Txn != nil && ba.Txn.CertainNodes.Contains(ba.Replica.NodeID) {
		// MaxTimestamp = OrigTimestamp corresponds to no clock uncertainty.
		sp.LogEvent("read has no clock uncertainty")
		// Copy-on-write to protect others we might be sharing the Txn with.
		shallowTxn := *ba.Txn
		// We set to OrigTimestamp because that works for both SNAPSHOT and
		// SERIALIZABLE: If we used Timestamp instead, we could run into
		// unnecessary retries at SNAPSHOT. For example, a SNAPSHOT txn at
		// OrigTimestamp = 1000.0, Timestamp = 2000.0, MaxTimestamp = 3000.0
		// will always read at 1000, so a MaxTimestamp of 2000 will still let
		// it restart with uncertainty when it finds a value in (1000, 2000).
		shallowTxn.MaxTimestamp = ba.Txn.OrigTimestamp
		ba.Txn = &shallowTxn
	}
	br, pErr = store.Send(ctx, ba)
	if br != nil && br.Error != nil {
		panic(roachpb.ErrorUnexpectedlySet(store, br))
	}
	return br, pErr
}
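// A worked trace of the numeric example in the comment above, reusing the toy
// Timestamp from the earlier sketch. snapshotClampExample is a hypothetical
// illustration, not real retry logic: a SNAPSHOT txn always reads at
// OrigTimestamp, so clamping MaxTimestamp to Timestamp (2000) instead of
// OrigTimestamp (1000) leaves a non-empty uncertainty window.
func snapshotClampExample() (windowIfClampedToTimestamp, windowIfClampedToOrig bool) {
	orig := Timestamp{WallTime: 1000}
	ts := Timestamp{WallTime: 2000}
	// Clamped to Timestamp: reads at 1000 can still find values in
	// (1000, 2000) and must restart with uncertainty.
	windowIfClampedToTimestamp = orig.Less(ts) // true: window non-empty
	// Clamped to OrigTimestamp: the window [1000, 1000) is empty, so no
	// uncertainty restart is possible.
	windowIfClampedToOrig = orig.Less(orig) // false: window empty
	return windowIfClampedToTimestamp, windowIfClampedToOrig
}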
// Send implements the client.Sender interface. The store is looked up from the
// store map if specified by the request; otherwise, the command is being
// executed locally, and the replica is determined via lookup through each
// store's LookupRange method. The latter path is taken only by unit tests.
func (ls *Stores) Send(ctx context.Context, ba roachpb.BatchRequest) (*roachpb.BatchResponse, *roachpb.Error) {
	sp := tracing.SpanFromContext(ctx)
	var store *Store
	var pErr *roachpb.Error

	// If we aren't given a Replica, we have to bend over backwards a little
	// here. This case applies exclusively to unit tests.
	if ba.RangeID == 0 || ba.Replica.StoreID == 0 {
		var repl *roachpb.ReplicaDescriptor
		var rangeID roachpb.RangeID
		rs := keys.Range(ba)
		rangeID, repl, pErr = ls.lookupReplica(rs.Key, rs.EndKey)
		if pErr == nil {
			ba.RangeID = rangeID
			ba.Replica = *repl
		}
	}

	ctx = log.Add(ctx, log.RangeID, ba.RangeID)

	if pErr == nil {
		store, pErr = ls.GetStore(ba.Replica.StoreID)
	}

	var br *roachpb.BatchResponse
	if pErr != nil {
		return nil, pErr
	}

	// For calls that read data within a txn, we can avoid uncertainty
	// related retries in certain situations. If the node is in
	// "CertainNodes", we need not worry about uncertain reads any
	// more. Setting MaxTimestamp=Timestamp for the operation
	// accomplishes that. See roachpb.Transaction.CertainNodes for details.
	if ba.Txn != nil && ba.Txn.CertainNodes.Contains(ba.Replica.NodeID) {
		// MaxTimestamp = Timestamp corresponds to no clock uncertainty.
		sp.LogEvent("read has no clock uncertainty")
		// Copy-on-write to protect others we might be sharing the Txn with.
		shallowTxn := *ba.Txn
		shallowTxn.MaxTimestamp = ba.Txn.Timestamp
		ba.Txn = &shallowTxn
	}
	br, pErr = store.Send(ctx, ba)
	if br != nil && br.Error != nil {
		panic(roachpb.ErrorUnexpectedlySet(store, br))
	}
	return br, pErr
}
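// All three versions above share the copy-on-write idiom: ba.Txn may be
// shared with concurrent users, so rather than mutating it in place, a
// shallow copy is modified and swapped into the request. A standalone sketch
// of just that idiom, with toy types (toyTxn, toyRequest, and
// lowerMaxTimestamp are hypothetical):
type toyTxn struct {
	MaxTimestamp Timestamp
}

type toyRequest struct {
	Txn *toyTxn
}

// lowerMaxTimestamp installs a lowered uncertainty ceiling without mutating
// the (possibly shared) transaction in place.
func lowerMaxTimestamp(req *toyRequest, to Timestamp) {
	shallow := *req.Txn // copy-on-write: others holding req.Txn see no change
	shallow.MaxTimestamp = to
	req.Txn = &shallow
}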
// updateState updates the transaction state in both the success and
// error cases, applying those updates to the corresponding txnMeta
// object where appropriate. It also updates certain errors with the
// updated transaction for use by client restarts.
func (tc *TxnCoordSender) updateState(ctx context.Context, ba roachpb.BatchRequest, br *roachpb.BatchResponse, pErr *roachpb.Error) *roachpb.Error {
	sp, cleanupSp := tracing.SpanFromContext(opTxnCoordSender, tc.tracer, ctx)
	defer cleanupSp()

	newTxn := &roachpb.Transaction{}
	newTxn.Update(ba.Txn)
	if pErr == nil {
		newTxn.Update(br.Txn)
	} else {
		newTxn.Update(pErr.GetTxn())
	}

	// If the request was successful but we're in a transaction which needs to
	// restart but doesn't know it yet, let it restart now (as opposed to
	// waiting until EndTransaction).
	if pErr == nil && newTxn.Isolation == roachpb.SERIALIZABLE &&
		!newTxn.OrigTimestamp.Equal(newTxn.Timestamp) {
		pErr = roachpb.NewErrorWithTxn(roachpb.NewTransactionRetryError(), br.Txn)
		br = nil
	}

	switch t := pErr.GetDetail().(type) {
	case *roachpb.TransactionStatusError:
		// Likely already committed or more obscure errors such as epoch or
		// timestamp regressions; consider txn dead.
		defer tc.cleanupTxn(sp, *pErr.GetTxn())
	case *roachpb.OpRequiresTxnError:
		panic("OpRequiresTxnError must not happen at this level")
	case *roachpb.ReadWithinUncertaintyIntervalError:
		// If the reader encountered a newer write within the uncertainty
		// interval, we advance the txn's timestamp just past the last observed
		// timestamp from the node.
		restartTS, ok := newTxn.GetObservedTimestamp(pErr.OriginNode)
		if !ok {
			pErr = roachpb.NewError(util.Errorf("no observed timestamp for node %d found on uncertainty restart", pErr.OriginNode))
		} else {
			newTxn.Timestamp.Forward(restartTS)
			newTxn.Restart(ba.UserPriority, newTxn.Priority, newTxn.Timestamp)
		}
	case *roachpb.TransactionAbortedError:
		// Increase timestamp if applicable.
		newTxn.Timestamp.Forward(pErr.GetTxn().Timestamp)
		newTxn.Priority = pErr.GetTxn().Priority
		// Clean up the freshly aborted transaction in defer(), avoiding a
		// race with the state update below.
		defer tc.cleanupTxn(sp, *newTxn)
	case *roachpb.TransactionPushError:
		// Increase timestamp if applicable, ensuring that we're
		// just ahead of the pushee.
		newTxn.Timestamp.Forward(t.PusheeTxn.Timestamp.Add(0, 1))
		newTxn.Restart(ba.UserPriority, t.PusheeTxn.Priority-1, newTxn.Timestamp)
	case *roachpb.TransactionRetryError:
		newTxn.Restart(ba.UserPriority, pErr.GetTxn().Priority, newTxn.Timestamp)
	case nil:
		// Nothing to do here, avoid the default case.
	default:
		if pErr.GetTxn() != nil {
			if pErr.CanRetry() {
				panic("Retryable internal error must not happen at this level")
			} else {
				// Do not clean up the transaction here since the client might still
				// want to continue the transaction. For example, a client might
				// continue its transaction after receiving ConditionFailedError, which
				// can come from a unique index violation.
			}
		}
	}

	if pErr != nil && pErr.GetTxn() != nil {
		// Avoid changing existing errors because sometimes they escape into
		// goroutines and then there are races. Fairly sure there isn't one
		// here, but better safe than sorry.
		pErrShallow := *pErr
		pErrShallow.SetTxn(newTxn)
		pErr = &pErrShallow
	}

	if newTxn.ID == nil {
		return pErr
	}
	txnID := *newTxn.ID
	tc.Lock()
	defer tc.Unlock()
	txnMeta := tc.txns[txnID]
	// For successful transactional requests, keep the written intents and
	// the updated transaction record to be sent along with the reply.
	// The transaction metadata is created with the first writing operation.
	// A tricky edge case is that of a transaction which "fails" on the
	// first writing request, but actually manages to write some intents
	// (for example, due to being multi-range). In this case, there will
	// be an error, but the transaction will be marked as Writing and the
	// coordinator must track the state, for the client's retry will be
	// performed with a Writing transaction which the coordinator rejects
	// unless it is tracking it (on top of it making sense to track it;
	// after all, it **has** laid down intents and only the coordinator
	// can augment a potential EndTransaction call). See #3303.
	var intents []roachpb.Span
	// TODO(nvanbenschoten): Iterating here to put the intents in a slice for
	// the sole purpose of later iterating again and calling addKeyRange is a
	// little wasteful and can likely be avoided.
	ba.IntentSpanIterate(func(key, endKey roachpb.Key) {
		intents = append(intents, roachpb.Span{Key: key, EndKey: endKey})
	})
	if len(intents) > 0 && (pErr == nil || newTxn.Writing) {
		if txnMeta == nil {
			if !newTxn.Writing {
				panic("txn with intents marked as non-writing")
			}
			// If the transaction is already over, there's no point in
			// launching a one-off coordinator which will shut down right
			// away. If we ended up here with an error, we'll always start
			// the coordinator - the transaction has laid down intents, so
			// we expect it to be committed/aborted at some point in the
			// future.
			if _, isEnding := ba.GetArg(roachpb.EndTransaction); pErr != nil || !isEnding {
				sp.LogEvent("coordinator spawns")
				txnMeta = &txnMetadata{
					txn:              *newTxn,
					keys:             interval.NewRangeTree(),
					firstUpdateNanos: tc.clock.PhysicalNow(),
					lastUpdateNanos:  tc.clock.PhysicalNow(),
					timeoutDuration:  tc.clientTimeout,
					txnEnd:           make(chan struct{}),
				}
				tc.txns[txnID] = txnMeta

				if !tc.stopper.RunAsyncTask(func() {
					tc.heartbeatLoop(txnID)
				}) {
					// The system is already draining and we can't start the
					// heartbeat. We refuse new transactions for now because
					// they're likely not going to have all intents committed.
					// In principle, we can relax this as needed though.
					tc.unregisterTxnLocked(txnID)
					return roachpb.NewError(&roachpb.NodeUnavailableError{})
				}
			}
		}
	}
	// Update our record of this transaction, even on error.
	if txnMeta != nil {
		txnMeta.txn = *newTxn
		if !txnMeta.txn.Writing {
			panic("tracking a non-writing txn")
		}
		txnMeta.setLastUpdate(tc.clock.PhysicalNow())
		// Adding the intents even on error reduces the likelihood of dangling
		// intents blocking concurrent writers for extended periods of time.
		// See #3346.
		for _, intent := range intents {
			addKeyRange(txnMeta.keys, intent.Key, intent.EndKey)
		}
	}

	if pErr == nil {
		// For successful transactional requests, always send the updated txn
		// record back.
		br.Txn = newTxn
	}

	return pErr
}
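// updateState leans on Timestamp.Forward to ratchet the transaction
// timestamp monotonically, and on a (0, 1) logical bump to land just ahead
// of a pushee. A sketch of those helpers on the toy Timestamp above; Add's
// signature is a simplified stand-in and restartTimestampAfterPush is
// hypothetical.
// Forward raises t to the maximum of t and s.
func (t *Timestamp) Forward(s Timestamp) {
	if t.Less(s) {
		*t = s
	}
}

// Add returns the timestamp advanced by a wall/logical delta.
func (t Timestamp) Add(wall int64, logical int32) Timestamp {
	return Timestamp{WallTime: t.WallTime + wall, Logical: t.Logical + logical}
}

// restartTimestampAfterPush mirrors the TransactionPushError case: the
// restarted txn's timestamp ends up just past the pushee's.
func restartTimestampAfterPush(cur, pushee Timestamp) Timestamp {
	cur.Forward(pushee.Add(0, 1))
	return cur
}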
// Send implements the batch.Sender interface. If the request is part of a
// transaction, the TxnCoordSender adds the transaction to a map of active
// transactions and begins heartbeating it. Every subsequent request for the
// same transaction updates the lastUpdate timestamp to prevent live
// transactions from being considered abandoned and garbage collected.
// Read/write mutating requests have their key or key range added to the
// transaction's interval tree of key ranges for eventual cleanup via resolved
// write intents; they're tagged to an outgoing EndTransaction request, with
// the receiving replica in charge of resolving them.
func (tc *TxnCoordSender) Send(ctx context.Context, ba roachpb.BatchRequest) (*roachpb.BatchResponse, *roachpb.Error) {
	// Start new or pick up active trace and embed its trace metadata into
	// header for use by RPC recipients. From here on, there's always an active
	// Trace, though its overhead is small unless it's sampled.
	sp, cleanupSp := tracing.SpanFromContext(opTxnCoordSender, tc.tracer, ctx)
	defer cleanupSp()
	// TODO(tschottdorf): To get rid of the spurious alloc below we need to
	// implement the carrier interface on ba.Header or make Span non-nullable,
	// both of which force all of ba on the Heap. It's already there, so may
	// not be a big deal, but ba should live on the stack. Also not easy to use
	// a buffer pool here since anything that goes into the RPC layer could be
	// used by goroutines we didn't wait for.
	if ba.Header.Trace == nil {
		ba.Header.Trace = &tracing.Span{}
	}
	if err := tc.tracer.Inject(sp, basictracer.Delegator, ba.Trace); err != nil {
		return nil, roachpb.NewError(err)
	}

	if err := tc.maybeBeginTxn(&ba); err != nil {
		return nil, roachpb.NewError(err)
	}
	var startNS int64
	ba.SetNewRequest()

	// This is the earliest point at which the request has an ID (if
	// applicable). Begin a Trace which follows this request.
	ctx = opentracing.ContextWithSpan(ctx, sp)

	if ba.Txn != nil {
		// If this request is part of a transaction...
		txnID := *ba.Txn.ID
		// Verify that if this Transaction is not read-only, we have it on
		// file. If not, refuse writes - the client must have issued a write on
		// another coordinator previously.
		if ba.Txn.Writing && ba.IsTransactionWrite() {
			tc.Lock()
			_, ok := tc.txns[txnID]
			tc.Unlock()
			if !ok {
				return nil, roachpb.NewErrorf("transaction must not write on multiple coordinators")
			}
		}

		if rArgs, ok := ba.GetArg(roachpb.EndTransaction); ok {
			et := rArgs.(*roachpb.EndTransactionRequest)
			if len(et.Key) != 0 {
				return nil, roachpb.NewErrorf("EndTransaction must not have a Key set")
			}
			et.Key = ba.Txn.Key

			// Remember when EndTransaction started in case we want to
			// be linearizable.
			startNS = tc.clock.PhysicalNow()
			if len(et.IntentSpans) > 0 {
				// TODO(tschottdorf): it may be useful to allow this later.
				// That would be part of a possible plan to allow txns which
				// write on multiple coordinators.
				return nil, roachpb.NewErrorf("client must not pass intents to EndTransaction")
			}

			tc.Lock()
			txnMeta, metaOK := tc.txns[txnID]
			{
				// Populate et.IntentSpans, taking into account both existing
				// writes (if any) and new writes in this batch, and taking
				// care to perform proper deduplication.
				var keys interval.RangeGroup
				if metaOK {
					keys = txnMeta.keys
				} else {
					keys = interval.NewRangeTree()
				}
				ba.IntentSpanIterate(func(key, endKey roachpb.Key) {
					addKeyRange(keys, key, endKey)
				})
				et.IntentSpans = collectIntentSpans(keys)
			}
			tc.Unlock()

			if len(et.IntentSpans) > 0 {
				// All good, proceed.
			} else if !metaOK {
				// If we don't have the transaction, then this must be a retry
				// by the client. We can no longer reconstruct a correct
				// request so we must fail.
				//
				// TODO(bdarnell): if we had a GetTransactionStatus API then
				// we could look up the transaction and return either nil or
				// TransactionAbortedError instead of this ambivalent error.
				return nil, roachpb.NewErrorf("transaction is already committed or aborted")
			}
			if len(et.IntentSpans) == 0 {
				// If there aren't any intents, then there's factually no
				// transaction to end. Read-only txns have all of their state in
				// the client.
				return nil, roachpb.NewErrorf("cannot commit a read-only transaction")
			}
			if log.V(1) {
				for _, intent := range et.IntentSpans {
					sp.LogEvent(fmt.Sprintf("intent: [%s,%s)", intent.Key, intent.EndKey))
				}
			}
		}
	}

	// Send the command through wrapped sender, taking appropriate measures
	// on error.
	var br *roachpb.BatchResponse
	{
		var pErr *roachpb.Error
		br, pErr = tc.wrapped.Send(ctx, ba)

		if _, ok := pErr.GetDetail().(*roachpb.OpRequiresTxnError); ok {
			// TODO(tschottdorf): needs to keep the trace.
			br, pErr = tc.resendWithTxn(ba)
		}

		if pErr = tc.updateState(ctx, ba, br, pErr); pErr != nil {
			sp.LogEvent(fmt.Sprintf("error: %s", pErr))
			return nil, pErr
		}
	}

	if br.Txn == nil {
		return br, nil
	}

	if _, ok := ba.GetArg(roachpb.EndTransaction); !ok {
		return br, nil
	}

	// If the --linearizable flag is set, we want to make sure that
	// all the clocks in the system are past the commit timestamp
	// of the transaction. This is guaranteed if either
	// - the commit timestamp is MaxOffset behind startNS
	// - MaxOffset ns were spent in this function
	// when returning to the client. Below we choose the option
	// that involves less waiting, which is likely the first one
	// unless a transaction commits with an odd timestamp.
	if tsNS := br.Txn.Timestamp.WallTime; startNS > tsNS {
		startNS = tsNS
	}
	sleepNS := tc.clock.MaxOffset() - time.Duration(tc.clock.PhysicalNow()-startNS)
	if tc.linearizable && sleepNS > 0 {
		defer func() {
			if log.V(1) {
				log.Infof("%v: waiting %s on EndTransaction for linearizability", br.Txn.Short(), util.TruncateDuration(sleepNS, time.Millisecond))
			}
			time.Sleep(sleepNS)
		}()
	}
	if br.Txn.Status != roachpb.PENDING {
		tc.cleanupTxn(sp, *br.Txn)
	}
	return br, nil
}
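// The linearizability wait at the end of Send reduces to a small
// computation: sleep until MaxOffset has elapsed since the earlier of the
// commit timestamp and the moment EndTransaction started. linearizableSleep
// is a hypothetical standalone distillation of that arithmetic, not the real
// clock plumbing.
func linearizableSleep(maxOffset time.Duration, startNS, commitNS, nowNS int64) time.Duration {
	if startNS > commitNS {
		// The commit timestamp is older; measure from it instead.
		startNS = commitNS
	}
	sleep := maxOffset - time.Duration(nowNS-startNS)
	if sleep < 0 {
		return 0
	}
	return sleep
}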
// updateState updates the transaction state in both the success and
// error cases, applying those updates to the corresponding txnMeta
// object where appropriate. It also updates certain errors with the
// updated transaction for use by client restarts.
func (tc *TxnCoordSender) updateState(ctx context.Context, ba roachpb.BatchRequest, br *roachpb.BatchResponse, pErr *roachpb.Error) *roachpb.Error {
	sp := tracing.SpanFromContext(ctx)
	newTxn := &roachpb.Transaction{}
	newTxn.Update(ba.Txn)
	// If the request was successful but we're in a transaction which needs to
	// restart but doesn't know it yet, let it restart now (as opposed to
	// waiting until EndTransaction).
	if pErr == nil && br.Txn != nil && br.Txn.Isolation == roachpb.SERIALIZABLE &&
		!br.Txn.OrigTimestamp.Equal(br.Txn.Timestamp) {
		pErr = roachpb.NewErrorWithTxn(roachpb.NewTransactionRetryError(), br.Txn)
		br = nil
	}

	// TODO(bdarnell): We're writing to errors here (and where using
	// ErrorWithIndex); since there's no concept of ownership, copy-on-write
	// is always preferable.
	switch t := pErr.GetDetail().(type) {
	case nil:
		newTxn.Update(br.Txn)
		// Move txn timestamp forward to response timestamp if applicable.
		// TODO(tschottdorf): see (*Replica).executeBatch and comments within.
		// Looks like this isn't necessary any more, nor did it prevent a bug
		// referenced in a TODO there.
		newTxn.Timestamp.Forward(br.Timestamp)
	case *roachpb.TransactionStatusError:
		// Likely already committed or more obscure errors such as epoch or
		// timestamp regressions; consider txn dead.
		pErrTxn := pErr.GetTxn().Clone()
		defer tc.cleanupTxn(sp, pErrTxn)
	case *roachpb.OpRequiresTxnError:
		panic("OpRequiresTxnError must not happen at this level")
	case *roachpb.ReadWithinUncertaintyIntervalError:
		// Mark the host as certain. See the protobuf comment for
		// Transaction.CertainNodes for details.
		if t.NodeID == 0 {
			panic("no replica set in header on uncertainty restart")
		}
		newTxn.Update(pErr.GetTxn())
		newTxn.CertainNodes.Add(t.NodeID)
		// If the reader encountered a newer write within the uncertainty
		// interval, move the timestamp forward, just past that write or
		// up to MaxTimestamp, whichever comes first.
		candidateTS := newTxn.MaxTimestamp
		candidateTS.Backward(t.ExistingTimestamp.Add(0, 1))
		newTxn.Timestamp.Forward(candidateTS)
		newTxn.Restart(ba.UserPriority, newTxn.Priority, newTxn.Timestamp)
		pErr.SetTxn(newTxn)
	case *roachpb.TransactionAbortedError:
		newTxn.Update(pErr.GetTxn())
		// Increase timestamp if applicable.
		newTxn.Timestamp.Forward(pErr.GetTxn().Timestamp)
		newTxn.Priority = pErr.GetTxn().Priority
		pErr.SetTxn(newTxn)
		// Clean up the freshly aborted transaction in defer(), avoiding a
		// race with the state update below.
		defer tc.cleanupTxn(sp, *pErr.GetTxn())
	case *roachpb.TransactionPushError:
		newTxn.Update(pErr.GetTxn())
		// Increase timestamp if applicable, ensuring that we're
		// just ahead of the pushee.
		newTxn.Timestamp.Forward(t.PusheeTxn.Timestamp.Add(0, 1))
		newTxn.Restart(ba.UserPriority, t.PusheeTxn.Priority-1, newTxn.Timestamp)
		pErr.SetTxn(newTxn)
	case *roachpb.TransactionRetryError:
		newTxn.Update(pErr.GetTxn())
		newTxn.Restart(ba.UserPriority, pErr.GetTxn().Priority, newTxn.Timestamp)
		pErr.SetTxn(newTxn)
	}

	if newTxn.ID == nil {
		return pErr
	}
	txnID := *newTxn.ID
	tc.Lock()
	defer tc.Unlock()
	txnMeta := tc.txns[txnID]
	// For successful transactional requests, keep the written intents and
	// the updated transaction record to be sent along with the reply.
	// The transaction metadata is created with the first writing operation.
	// A tricky edge case is that of a transaction which "fails" on the
	// first writing request, but actually manages to write some intents
	// (for example, due to being multi-range). In this case, there will
	// be an error, but the transaction will be marked as Writing and the
	// coordinator must track the state, for the client's retry will be
	// performed with a Writing transaction which the coordinator rejects
	// unless it is tracking it (on top of it making sense to track it;
	// after all, it **has** laid down intents and only the coordinator
	// can augment a potential EndTransaction call). See #3303.
	intents := ba.GetIntentSpans()
	if len(intents) > 0 && (pErr == nil || newTxn.Writing) {
		if txnMeta == nil {
			if !newTxn.Writing {
				panic("txn with intents marked as non-writing")
			}
			// If the transaction is already over, there's no point in
			// launching a one-off coordinator which will shut down right
			// away. If we ended up here with an error, we'll always start
			// the coordinator - the transaction has laid down intents, so
			// we expect it to be committed/aborted at some point in the
			// future.
			if _, isEnding := ba.GetArg(roachpb.EndTransaction); pErr != nil || !isEnding {
				sp.LogEvent("coordinator spawns")
				txnMeta = &txnMetadata{
					txn:              *newTxn,
					keys:             cache.NewIntervalCache(cache.Config{Policy: cache.CacheNone}),
					firstUpdateNanos: tc.clock.PhysicalNow(),
					lastUpdateNanos:  tc.clock.PhysicalNow(),
					timeoutDuration:  tc.clientTimeout,
					txnEnd:           make(chan struct{}),
				}
				tc.txns[txnID] = txnMeta

				if !tc.stopper.RunAsyncTask(func() {
					tc.heartbeatLoop(txnID)
				}) {
					// The system is already draining and we can't start the
					// heartbeat. We refuse new transactions for now because
					// they're likely not going to have all intents committed.
					// In principle, we can relax this as needed though.
					tc.unregisterTxnLocked(txnID)
					return roachpb.NewError(&roachpb.NodeUnavailableError{})
				}
			}
		}
	}
	// Update our record of this transaction, even on error.
	if txnMeta != nil {
		txnMeta.txn = *newTxn
		if !txnMeta.txn.Writing {
			panic("tracking a non-writing txn")
		}
		txnMeta.setLastUpdate(tc.clock.PhysicalNow())
		// Adding the intents even on error reduces the likelihood of dangling
		// intents blocking concurrent writers for extended periods of time.
		// See #3346.
		for _, intent := range intents {
			txnMeta.addKeyRange(intent.Key, intent.EndKey)
		}
	}

	if pErr == nil {
		// For successful transactional requests, always send the updated txn
		// record back.
		br.Txn = newTxn
	}

	return pErr
}
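// Both updateState variants fold the batch's intent spans into an interval
// structure so overlapping writes are deduplicated before EndTransaction.
// Below is a naive standalone stand-in for that bookkeeping (the real code
// uses an interval tree or interval cache); span and mergeKeyRange are
// hypothetical and quadratic, for illustration only.
type span struct {
	key, endKey string
}

// mergeKeyRange merges [key, endKey) into spans, coalescing any spans that
// overlap or touch it.
func mergeKeyRange(spans []span, key, endKey string) []span {
	if endKey == "" {
		endKey = key + "\x00" // point write: treat as [key, key.Next())
	}
	merged := span{key: key, endKey: endKey}
	var out []span
	for _, s := range spans {
		if s.endKey < merged.key || merged.endKey < s.key {
			out = append(out, s) // disjoint: keep as-is
			continue
		}
		// Overlapping or adjacent: grow the merged span.
		if s.key < merged.key {
			merged.key = s.key
		}
		if s.endKey > merged.endKey {
			merged.endKey = s.endKey
		}
	}
	return append(out, merged)
}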
// sendChunk is in charge of sending an "admissible" piece of batch, i.e. one
// which doesn't need to be subdivided further before going to a range (so no
// mixing of forward and reverse scans, etc). The parameters and return values
// correspond to client.Sender with the exception of the returned boolean,
// which is true when indicating that the caller should retry but needs to send
// EndTransaction in a separate request.
func (ds *DistSender) sendChunk(ctx context.Context, ba roachpb.BatchRequest) (*roachpb.BatchResponse, *roachpb.Error, bool) {
	isReverse := ba.IsReverse()

	sp := tracing.SpanFromContext(ctx)

	// The minimal key range encompassing all requests contained within.
	// Local addressing has already been resolved.
	// TODO(tschottdorf): consider rudimentary validation of the batch here
	// (for example, non-range requests with EndKey, or empty key ranges).
	rs := keys.Range(ba)
	var br *roachpb.BatchResponse

	// Send the request to one range per iteration.
	for {
		considerIntents := false
		var curReply *roachpb.BatchResponse
		var desc *roachpb.RangeDescriptor
		var needAnother bool
		var pErr *roachpb.Error
		var finished bool
		for r := retry.Start(ds.rpcRetryOptions); r.Next(); {
			// Get range descriptor (or, when spanning range, descriptors). Our
			// error handling below may clear them on certain errors, so we
			// refresh (likely from the cache) on every retry.
			sp.LogEvent("meta descriptor lookup")
			var evictDesc func()
			desc, needAnother, evictDesc, pErr = ds.getDescriptors(rs, considerIntents, isReverse)

			// getDescriptors may fail retryably if the first range isn't
			// available via Gossip.
			if pErr != nil {
				if pErr.Retryable {
					if log.V(1) {
						log.Warning(pErr)
					}
					continue
				}
				break
			}

			if needAnother && br == nil {
				// TODO(tschottdorf): we should have a mechanism for discovering
				// range merges (descriptor staleness will mostly go unnoticed),
				// or we'll be turning single-range queries into multi-range
				// queries for no good reason.

				// If there's no transaction and op spans ranges, possibly
				// re-run as part of a transaction for consistency. The
				// case where we don't need to re-run is if the read
				// consistency is not required.
				if ba.Txn == nil && ba.IsPossibleTransaction() && ba.ReadConsistency != roachpb.INCONSISTENT {
					return nil, roachpb.NewError(&roachpb.OpRequiresTxnError{}), false
				}
				// If the batch contains more than just an EndTransaction but
				// ends with one, we want the caller to come again with the
				// EndTransaction in an extra call.
				if l := len(ba.Requests) - 1; l > 0 && ba.Requests[l].GetInner().Method() == roachpb.EndTransaction {
					return nil, roachpb.NewError(errors.New("cannot send 1PC txn to multiple ranges")), true /* shouldSplitET */
				}
			}

			// It's possible that the returned descriptor misses parts of the
			// keys it's supposed to scan after it's truncated to match the
			// descriptor. Example: a reverse scan over [a,g) where the first
			// descriptor lookup for "g" returns descriptor [c,d); [d,g) would
			// never be scanned. We evict and retry in such a case.
			if (isReverse && !desc.ContainsKeyRange(desc.StartKey, rs.EndKey)) || (!isReverse && !desc.ContainsKeyRange(rs.Key, desc.EndKey)) {
				evictDesc()
				continue
			}

			curReply, pErr = func() (*roachpb.BatchResponse, *roachpb.Error) {
				// Truncate the request to our current key range.
				intersected, iErr := rs.Intersect(desc)
				if iErr != nil {
					return nil, roachpb.NewError(iErr)
				}
				truncBA, numActive, trErr := truncate(ba, intersected)
				if numActive == 0 && trErr == nil {
					// This shouldn't happen in the wild, but some tests
					// exercise it.
					return nil, roachpb.NewErrorf("truncation resulted in empty batch on [%s,%s): %s", rs.Key, rs.EndKey, ba)
				}
				if trErr != nil {
					return nil, roachpb.NewError(trErr)
				}
				return ds.sendSingleRange(sp, truncBA, desc)
			}()
			// If sending succeeded, break this loop.
			if pErr == nil {
				finished = true
				break
			}

			if log.V(1) {
				log.Warningf("failed to invoke %s: %s", ba, pErr)
			}
			sp.LogEvent(fmt.Sprintf("reply error: %T", pErr.GetDetail()))

			// Error handling below.
			// If retryable, allow retry. For range not found or range
			// key mismatch errors, we don't backoff on the retry,
			// but reset the backoff loop so we can retry immediately.
			switch tErr := pErr.GetDetail().(type) {
			case *roachpb.SendError:
				// For an RPC error to occur, we must've been unable to contact
				// any replicas. In this case, likely all nodes are down (or
				// not getting back to us within a reasonable amount of time).
				// We may simply not be trying to talk to the up-to-date
				// replicas, so clearing the descriptor here should be a good
				// idea.
				// TODO(tschottdorf): If a replica group goes dead, this
				// will cause clients to put high read pressure on the first
				// range, so there should be some rate limiting here.
				evictDesc()
				if tErr.CanRetry() {
					continue
				}
			case *roachpb.RangeNotFoundError, *roachpb.RangeKeyMismatchError:
				// Range descriptor might be out of date - evict it.
				evictDesc()
				// On addressing errors, don't backoff; retry immediately.
				r.Reset()
				if log.V(1) {
					log.Warning(tErr)
				}
				// On retries, allow [uncommitted] intents on range descriptor
				// lookups to be returned 50% of the time in order to succeed
				// at finding the transaction record pointed to by the intent
				// itself. The 50% probability of returning either the current
				// intent or the previously committed value balances between
				// the two cases where the intent's txn hasn't yet been
				// committed (the previous value is correct), or the intent's
				// txn has been committed (the intent value is correct).
				considerIntents = true
				continue
			case *roachpb.NotLeaderError:
				newLeader := tErr.Leader
				if newLeader != nil {
					// Verify that leader is a known replica according to the
					// descriptor. If not, we've got a stale range descriptor;
					// evict cache.
					if i, _ := desc.FindReplica(newLeader.StoreID); i == -1 {
						if log.V(1) {
							log.Infof("error indicates unknown leader %s, expunging descriptor %s", newLeader, desc)
						}
						evictDesc()
					}
				} else {
					// If the new leader is unknown, we were talking to a
					// replica that is partitioned away from the majority. Our
					// range descriptor may be stale, so clear the cache.
					//
					// TODO(bdarnell): An unknown-leader error doesn't
					// necessarily mean our descriptor is stale. Ideally we
					// would treat these errors more like SendError: retry on
					// another node (at a lower level), and then if it reaches
					// this level then we know we've exhausted our options and
					// must clear the cache.
					evictDesc()
					newLeader = &roachpb.ReplicaDescriptor{}
				}
				// Next, cache the new leader.
				ds.updateLeaderCache(roachpb.RangeID(desc.RangeID), *newLeader)
				if log.V(1) {
					log.Warning(tErr)
				}
				r.Reset()
				continue
			case retry.Retryable:
				if tErr.CanRetry() {
					if log.V(1) {
						log.Warning(tErr)
					}
					continue
				}
			}
			break
		}

		// Immediately return if querying a range failed non-retryably.
		if pErr != nil {
			return nil, pErr, false
		} else if !finished {
			select {
			case <-ds.rpcRetryOptions.Closer:
				return nil, roachpb.NewError(&roachpb.NodeUnavailableError{}), false
			default:
				log.Fatal("exited retry loop with nil error but finished=false")
			}
		}

		ba.Txn.Update(curReply.Txn)

		if br == nil {
			// First response from a Range.
			br = curReply
		} else {
			// This was the second or later call in a cross-Range request.
			// Combine the new response with the existing one.
			if err := br.Combine(curReply); err != nil {
				return nil, roachpb.NewError(err), false
			}
		}

		// If this request has a bound (such as MaxResults in
		// ScanRequest) and we are going to query at least one more range,
		// check whether enough rows have been retrieved.
		// TODO(tschottdorf): need tests for executing a multi-range batch
		// with various bounded requests which saturate at different times.
		if needAnother {
			// Start with the assumption that all requests are saturated.
			// Below, we look at each and decide whether that's true.
			// Everything that is indeed saturated is "masked out" from the
			// batch request; only if that's all requests does needAnother
			// remain false.
			needAnother = false
			if br == nil {
				// Clone ba.Requests. This is because we're multi-range, and
				// some requests may be bounded, which could lead to them being
				// masked out once they're saturated. We don't want to risk
				// removing requests that way in the "master copy" since that
				// could lead to omitting requests in certain retry scenarios.
				ba.Requests = append([]roachpb.RequestUnion(nil), ba.Requests...)
			}
			for i, union := range ba.Requests {
				args := union.GetInner()
				if _, ok := args.(*roachpb.NoopRequest); ok {
					// NoopRequests are skipped.
					continue
				}
				boundedArg, ok := args.(roachpb.Bounded)
				if !ok {
					// Non-bounded request. We will have to query all ranges.
					needAnother = true
					continue
				}
				prevBound := boundedArg.GetBound()
				cReply, ok := curReply.Responses[i].GetInner().(roachpb.Countable)
				if !ok || prevBound <= 0 {
					// Request bounded, but without max results. Again, will
					// need to query everything we can. The case in which the
					// reply isn't countable occurs when the request wasn't
					// active for that range (since it didn't apply to it), so
					// the response is a NoopResponse.
					needAnother = true
					continue
				}
				nextBound := prevBound - cReply.Count()
				if nextBound <= 0 {
					// We've hit max results for this piece of the batch. Mask
					// it out (we've copied the requests slice above, so this
					// is kosher).
					ba.Requests[i].Reset() // necessary (no one-of?)
					if !ba.Requests[i].SetValue(&roachpb.NoopRequest{}) {
						panic("RequestUnion excludes NoopRequest")
					}
					continue
				}
				// The request isn't saturated yet.
				needAnother = true
				boundedArg.SetBound(nextBound)
			}
		}

		// If this was the last range accessed by this call, exit loop.
		if !needAnother {
			return br, nil, false
		}

		if isReverse {
			// In next iteration, query previous range.
			// We use the StartKey of the current descriptor as opposed to the
			// EndKey of the previous one since that doesn't have bugs when
			// stale descriptors come into play.
			rs.EndKey = prev(ba, desc.StartKey)
		} else {
			// In next iteration, query next range.
			// It's important that we use the EndKey of the current descriptor
			// as opposed to the StartKey of the next one: if the former is stale,
			// it's possible that the next range has since merged the subsequent
			// one, and unless both descriptors are stale, the next descriptor's
			// StartKey would move us to the beginning of the current range,
			// resulting in a duplicate scan.
			rs.Key = next(ba, desc.EndKey)
		}
		sp.LogEvent("querying next range")
	}
}
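// The bound bookkeeping in sendChunk comes down to per-request budget
// subtraction: each range's response shrinks a bounded request's remaining
// budget, and a request whose budget reaches zero is masked out of later
// iterations. A standalone sketch of that loop; boundedReq and maskSaturated
// are hypothetical simplifications of the Bounded/Countable interfaces.
type boundedReq struct {
	bound int64 // remaining MaxResults budget; <= 0 means unbounded
	noop  bool  // masked out once saturated
}

// maskSaturated applies one range's per-request row counts (aligned by index
// with reqs) and reports whether another range must be queried.
func maskSaturated(reqs []boundedReq, counts []int64) (needAnother bool) {
	for i := range reqs {
		if reqs[i].noop {
			continue // already saturated on an earlier range
		}
		if reqs[i].bound <= 0 {
			needAnother = true // unbounded: must visit every range
			continue
		}
		reqs[i].bound -= counts[i]
		if reqs[i].bound <= 0 {
			reqs[i].noop = true // saturated: mask out of future iterations
			continue
		}
		needAnother = true // still has budget left
	}
	return needAnother
}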
// resolveIntents resolves the given intents. For those which are
// local to the range, we submit directly to the local Raft instance;
// all non-local intents are resolved asynchronously in a batch. If
// `wait` is true, all operations are carried out synchronously and an
// error is returned. Otherwise, the call returns without error as
// soon as all local resolve commands have been **proposed** (not
// executed). This ensures that if a waiting client retries
// immediately after calling this function, it will not hit the same
// intents again.
func (ir *intentResolver) resolveIntents(ctx context.Context, r *Replica, intents []roachpb.Intent, wait bool, poison bool) *roachpb.Error {
	sp, cleanupSp := tracing.SpanFromContext(opReplica, ir.store.Tracer(), ctx)
	defer cleanupSp()

	// We're doing async stuff below; those need new traces.
	ctx = opentracing.ContextWithSpan(ctx, nil)
	sp.LogEvent(fmt.Sprintf("resolving intents [wait=%t]", wait))

	var reqsRemote []roachpb.Request
	baLocal := roachpb.BatchRequest{}
	for i := range intents {
		intent := intents[i] // avoids a race in `i, intent := range ...`
		var resolveArgs roachpb.Request
		var local bool // whether this intent lives on this Range
		{
			if len(intent.EndKey) == 0 {
				resolveArgs = &roachpb.ResolveIntentRequest{
					Span:      intent.Span,
					IntentTxn: intent.Txn,
					Status:    intent.Status,
					Poison:    poison,
				}
				local = r.ContainsKey(intent.Key)
			} else {
				resolveArgs = &roachpb.ResolveIntentRangeRequest{
					Span:      intent.Span,
					IntentTxn: intent.Txn,
					Status:    intent.Status,
					Poison:    poison,
				}
				local = r.ContainsKeyRange(intent.Key, intent.EndKey)
			}
		}

		// If the intent isn't (completely) local, we'll need to send an
		// external request. We'll batch them all up and send at the end.
		if local {
			baLocal.Add(resolveArgs)
		} else {
			reqsRemote = append(reqsRemote, resolveArgs)
		}
	}

	// The local batch goes directly to Raft.
	var wg sync.WaitGroup
	if len(baLocal.Requests) > 0 {
		action := func() *roachpb.Error {
			// Trace this under the ID of the intent owner.
			sp := r.store.Tracer().StartSpan("resolve intents")
			defer sp.Finish()
			ctx = opentracing.ContextWithSpan(ctx, sp)
			// Always operate with a timeout when resolving intents: this
			// prevents rare shutdown timeouts in tests.
			ctxWithTimeout, cancel := context.WithTimeout(ctx, base.NetworkTimeout)
			defer cancel()
			_, pErr := r.addWriteCmd(ctxWithTimeout, baLocal, &wg)
			return pErr
		}
		wg.Add(1)
		if wait || !r.store.Stopper().RunLimitedAsyncTask(ir.sem, func() {
			if err := action(); err != nil {
				log.Warningf("unable to resolve local intents; %s", err)
			}
		}) {
			// Still run the task when draining. Our caller already has a task and
			// going async here again is merely for performance, but some intents
			// need to be resolved because they might block other tasks. See #1684.
			// Note that handleSkippedIntents has a TODO in case #1684 comes back.
			if err := action(); err != nil {
				return err
			}
		}
	}

	// Resolve all of the intents which aren't local to the Range.
	if len(reqsRemote) > 0 {
		b := &client.Batch{}
		b.InternalAddRequest(reqsRemote...)
		action := func() *roachpb.Error {
			// TODO(tschottdorf): no tracing here yet.
			return r.store.DB().Run(b)
		}
		if wait || !r.store.Stopper().RunLimitedAsyncTask(ir.sem, func() {
			if err := action(); err != nil {
				log.Warningf("unable to resolve external intents: %s", err)
			}
		}) {
			// As with local intents, try async to not keep the caller waiting,
			// but when draining just go ahead and do it synchronously. See #1684.
			if err := action(); err != nil {
				return err
			}
		}
	}

	// Wait until the local ResolveIntents batch has been submitted to
	// raft. No-op if all were non-local.
	wg.Wait()
	return nil
}
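// resolveIntents applies the same concurrency idiom twice: try to run work
// asynchronously, but run it inline when the caller wants to wait or when
// the stopper is draining and refuses new tasks. A standalone sketch with a
// plain callback in place of the stopper; runMaybeAsync is hypothetical.
// runMaybeAsync runs work inline (returning its error) when wait is true or
// spawn declines the task; otherwise work runs on a goroutine and errors are
// the async task's to handle, as in the code above.
func runMaybeAsync(wait bool, spawn func(func()) bool, work func() error) error {
	if wait || !spawn(func() {
		_ = work() // async: the real code merely logs failures here
	}) {
		return work()
	}
	return nil
}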
// maybePushTransactions tries to push the conflicting transaction(s)
// responsible for the given intents: either moving their timestamps
// forward on a read/write conflict, aborting them on a write/write
// conflict, or doing nothing if the transaction is no longer
// pending.
//
// Returns a slice of intents which can now be resolved, and an error.
// The returned intents should be resolved via
// intentResolver.resolveIntents regardless of any error returned by
// maybePushTransactions, but if the error is non-nil then some of the
// conflicting transactions may still be pending.
//
// If skipIfInFlight is true, then no PushTxns will be sent and no
// intents will be returned for any transaction for which there is
// another push in progress. This should only be used by callers who
// are not relying on the side effect of a push (i.e. only
// pushType==PUSH_TOUCH), and who also don't need to synchronize with
// the resolution of those intents (e.g. asynchronous resolutions of
// intents skipped on inconsistent reads).
//
// Callers are involved with
// a) conflict resolution for commands being executed at the Store with the
//    client waiting,
// b) resolving intents encountered during inconsistent operations, and
// c) resolving intents upon EndTransaction which are not local to the given
//    range. This is the only path in which the transaction is going to be
//    in non-pending state and doesn't require a push.
func (ir *intentResolver) maybePushTransactions(ctx context.Context, intents []roachpb.Intent, h roachpb.Header, pushType roachpb.PushTxnType, skipIfInFlight bool) ([]roachpb.Intent, *roachpb.Error) {
	now := ir.store.Clock().Now()

	pusherTxn := h.Txn
	// If there's no pusher, we communicate a priority by sending an empty
	// txn with only the priority set.
	if pusherTxn == nil {
		pusherTxn = &roachpb.Transaction{
			Priority: roachpb.MakePriority(h.UserPriority),
		}
	}

	sp, cleanupSp := tracing.SpanFromContext(opStore, ir.store.Tracer(), ctx)
	defer cleanupSp()
	sp.LogEvent("intent resolution")

	// Split intents into those we need to push and those which are good to
	// resolve.
	ir.mu.Lock()
	// TODO(tschottdorf): can optimize this and use same underlying slice.
	var pushIntents, resolveIntents []roachpb.Intent
	for _, intent := range intents {
		if intent.Status != roachpb.PENDING {
			// The current intent does not need conflict resolution
			// because the transaction is already finalized.
			// TODO(bdarnell): can this happen any more?
			resolveIntents = append(resolveIntents, intent)
		} else if _, ok := ir.mu.inFlight[*intent.Txn.ID]; ok && skipIfInFlight {
			// Another goroutine is working on this transaction so we can
			// skip it.
			if log.V(1) {
				log.Infof("skipping PushTxn for %s; attempt already in flight", intent.Txn.ID)
			}
			continue
		} else {
			pushIntents = append(pushIntents, intent)
			ir.mu.inFlight[*intent.Txn.ID]++
		}
	}
	ir.mu.Unlock()

	// Attempt to push the transaction(s) which created the conflicting intent(s).
	var pushReqs []roachpb.Request
	for _, intent := range pushIntents {
		pushReqs = append(pushReqs, &roachpb.PushTxnRequest{
			Span: roachpb.Span{
				Key: intent.Txn.Key,
			},
			PusherTxn: *pusherTxn,
			PusheeTxn: intent.Txn,
			PushTo:    h.Timestamp,
			// The timestamp is used by PushTxn for figuring out whether the
			// transaction is abandoned. If we used the argument's timestamp
			// here, we would run into busy loops because that timestamp
			// usually stays fixed among retries, so it will never realize
			// that a transaction has timed out. See #877.
			Now:      now,
			PushType: pushType,
		})
	}
	// TODO(kaneda): Set the transaction in the header so that the
	// txn is correctly propagated in an error response.
	b := &client.Batch{}
	b.InternalAddRequest(pushReqs...)
	br, err := ir.store.db.RunWithResponse(b)
	ir.mu.Lock()
	for _, intent := range pushIntents {
		ir.mu.inFlight[*intent.Txn.ID]--
		if ir.mu.inFlight[*intent.Txn.ID] == 0 {
			delete(ir.mu.inFlight, *intent.Txn.ID)
		}
	}
	ir.mu.Unlock()
	if err != nil {
		// TODO(bdarnell): return resolveIntents even on error.
		return nil, err
	}

	for i, intent := range pushIntents {
		pushee := br.Responses[i].GetInner().(*roachpb.PushTxnResponse).PusheeTxn
		intent.Txn = pushee.TxnMeta
		intent.Status = pushee.Status
		resolveIntents = append(resolveIntents, intent)
	}
	return resolveIntents, nil
}
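// The inFlight map above is reference-counted deduplication: pushes for a
// transaction already being pushed can be skipped, and counts are
// decremented (and deleted at zero) once the push batch returns. A
// standalone sketch with string txn IDs standing in for UUIDs; pushTracker
// and its methods are hypothetical.
type pushTracker struct {
	mu       sync.Mutex
	inFlight map[string]int
}

func newPushTracker() *pushTracker {
	return &pushTracker{inFlight: make(map[string]int)}
}

// claim registers a push for txnID unless one is already in flight and
// skipIfInFlight is set; it reports whether the caller should push.
func (pt *pushTracker) claim(txnID string, skipIfInFlight bool) bool {
	pt.mu.Lock()
	defer pt.mu.Unlock()
	if _, ok := pt.inFlight[txnID]; ok && skipIfInFlight {
		return false
	}
	pt.inFlight[txnID]++
	return true
}

// release undoes a claim once the push RPC has returned.
func (pt *pushTracker) release(txnID string) {
	pt.mu.Lock()
	defer pt.mu.Unlock()
	pt.inFlight[txnID]--
	if pt.inFlight[txnID] == 0 {
		delete(pt.inFlight, txnID)
	}
}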