// divideAndSendBatchToRanges sends the supplied batch to all of the
// ranges which comprise the span specified by rs. The batch request
// is trimmed against each range which is part of the span and sent
// either serially or in parallel, if possible. isFirst indicates
// whether this is the first time this method has been called on the
// batch. It's specified false where this method is invoked recursively.
func (ds *DistSender) divideAndSendBatchToRanges(
	ctx context.Context, ba roachpb.BatchRequest, rs roachpb.RSpan, isFirst bool,
) (br *roachpb.BatchResponse, pErr *roachpb.Error) {
	// This function builds a channel of responses for each range
	// implicated in the span (rs) and combines them into a single
	// BatchResponse when finished.
	var responseChs []chan response
	defer func() {
		for _, responseCh := range responseChs {
			resp := <-responseCh
			if resp.pErr != nil {
				if pErr == nil {
					pErr = resp.pErr
				}
				continue
			}
			if br == nil {
				// First response from a Range.
				br = resp.reply
			} else {
				// This was the second or later call in a cross-Range request.
				// Combine the new response with the existing one.
				if err := br.Combine(resp.reply); err != nil {
					pErr = roachpb.NewError(err)
					return
				}
				br.Txn.Update(resp.reply.Txn)
			}
		}
		// If we experienced an error, don't neglect to update the error's
		// attached transaction with any responses which were received.
		if pErr != nil {
			if br != nil {
				pErr.UpdateTxn(br.Txn)
			}
		}
	}()

	// Get the initial seek key depending on the direction of iteration.
	var seekKey roachpb.RKey
	isReverse := ba.IsReverse()
	if isReverse {
		seekKey = rs.EndKey
	} else {
		seekKey = rs.Key
	}

	// Send the request to one range per iteration.
	ri := NewRangeIterator(ds, isReverse)
	for ri.Seek(ctx, seekKey); ri.Valid(); ri.Seek(ctx, seekKey) {
		// Increase the sequence counter only once before sending RPCs to
		// the ranges involved in this chunk of the batch (as opposed to
		// for each RPC individually). On RPC errors, there's no guarantee
		// that the request hasn't made its way to the target regardless
		// of the error; we'd like the second execution to be caught by
		// the sequence cache if that happens. There is a small chance
		// that we address a range twice in this chunk (stale/suboptimal
		// descriptors due to splits/merges) which leads to a transaction
		// retry.
		//
		// TODO(tschottdorf): it's possible that if we don't evict from
		// the cache we could be in for a busy loop.
		ba.SetNewRequest()

		responseCh := make(chan response, 1)
		responseChs = append(responseChs, responseCh)

		if isFirst && ri.NeedAnother(rs) {
			// TODO(tschottdorf): we should have a mechanism for discovering
			// range merges (descriptor staleness will mostly go unnoticed),
			// or we'll be turning single-range queries into multi-range
			// queries for no good reason.
			//
			// If there's no transaction and the operation spans ranges, we
			// possibly re-run it as part of a transaction for consistency.
			// The one case in which we don't need to re-run is when
			// inconsistent reads are requested.
			if ba.Txn == nil && ba.IsPossibleTransaction() && ba.ReadConsistency != roachpb.INCONSISTENT {
				responseCh <- response{pErr: roachpb.NewError(&roachpb.OpRequiresTxnError{})}
				return
			}
			// If the batch contains more than just an EndTransaction but
			// ends with one, we want the caller to come back with the
			// EndTransaction in a separate call.
			if l := len(ba.Requests) - 1; l > 0 && ba.Requests[l].GetInner().Method() == roachpb.EndTransaction {
				responseCh <- response{pErr: errNo1PCTxn}
				return
			}
		}

		// Determine the next seek key, taking a potentially sparse batch
		// into consideration.
		var err error
		nextRS := rs
		if isReverse {
			// In the next iteration, query the previous range.
			// We use the StartKey of the current descriptor as opposed to
			// the EndKey of the previous one since that doesn't have bugs
			// when stale descriptors come into play.
			seekKey, err = prev(ba, ri.Desc().StartKey)
			nextRS.EndKey = seekKey
		} else {
			// In the next iteration, query the next range.
			// It's important that we use the EndKey of the current descriptor
			// as opposed to the StartKey of the next one: if the former is
			// stale, it's possible that the next range has since merged the
			// subsequent one, and unless both descriptors are stale, the next
			// descriptor's StartKey would move us to the beginning of the
			// current range, resulting in a duplicate scan.
			seekKey, err = next(ba, ri.Desc().EndKey)
			nextRS.Key = seekKey
		}
		if err != nil {
			responseCh <- response{pErr: roachpb.NewError(err)}
			return
		}

		// Send the next partial batch to the first range in the "rs" span.
		// If we're not handling a request which limits responses and we
		// can reserve one of the limited goroutines available for parallel
		// batch RPCs, send asynchronously.
		if ba.MaxSpanRequestKeys == 0 && ri.NeedAnother(rs) && ds.rpcContext != nil &&
			ds.sendPartialBatchAsync(ctx, ba, rs, ri.Desc(), ri.Token(), isFirst, responseCh) {
			// Note that we pass the batch request by value to the parallel
			// goroutine to avoid using the cloned txn.

			// Clone the txn to preserve the current txn sequence for the async call.
			if ba.Txn != nil {
				txnClone := ba.Txn.Clone()
				ba.Txn = &txnClone
			}
		} else {
			// Send synchronously if there is no parallel capacity left, there's
			// a max results limit, or this is the final request in the span.
			resp := ds.sendPartialBatch(ctx, ba, rs, ri.Desc(), ri.Token(), isFirst)
			responseCh <- resp
			if resp.pErr != nil {
				return
			}
			ba.UpdateTxn(resp.reply.Txn)

			// Check whether we've received enough responses to exit the query loop.
			if ba.MaxSpanRequestKeys > 0 {
				var numResults int64
				for _, r := range resp.reply.Responses {
					numResults += r.GetInner().Header().NumKeys
				}
				if numResults > ba.MaxSpanRequestKeys {
					panic(fmt.Sprintf("received %d results, limit was %d", numResults, ba.MaxSpanRequestKeys))
				}
				ba.MaxSpanRequestKeys -= numResults
				// Exiting; fill in missing responses.
				if ba.MaxSpanRequestKeys == 0 {
					fillSkippedResponses(ba, resp.reply, seekKey)
					return
				}
			}
		}

		// Check for completion.
		if !ri.NeedAnother(rs) {
			return
		}
		isFirst = false // next range will not be first!
		rs = nextRS
	}

	// We've exited early. Return the range iterator error.
	responseCh := make(chan response, 1)
	responseCh <- response{pErr: ri.Error()}
	responseChs = append(responseChs, responseCh)
	return
}
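
// combineRangeResponses is an illustrative sketch (not referenced anywhere
// in this file) of the drain-and-combine pattern used by the deferred
// closure in divideAndSendBatchToRanges above: each addressed range gets
// its own buffered channel that receives exactly one response, and the
// channels are drained in range-address order so that Combine sees partial
// responses in the order in which the ranges were visited. It is simplified
// relative to the real defer, which keeps draining the remaining channels
// after an error and also folds the per-range transaction updates into the
// result.
func combineRangeResponses(responseChs []chan response) (*roachpb.BatchResponse, *roachpb.Error) {
	var br *roachpb.BatchResponse
	for _, responseCh := range responseChs {
		resp := <-responseCh
		if resp.pErr != nil {
			// Simplification: report the first error and stop draining.
			return nil, resp.pErr
		}
		if br == nil {
			// The first response from a range becomes the base reply.
			br = resp.reply
			continue
		}
		// Later responses are folded into the base reply.
		if err := br.Combine(resp.reply); err != nil {
			return nil, roachpb.NewError(err)
		}
	}
	return br, nil
}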
// Send implements the batch.Sender interface. It subdivides the Batch
// into batches admissible for sending (preventing certain illegal
// mixtures of requests), executes each individual part (which may
// span multiple ranges), and recombines the response.
//
// When the request spans ranges, it is split by range and a partial
// subset of the batch request is sent to affected ranges in parallel.
//
// The first write in a transaction is not guaranteed to arrive before
// writes to other ranges. This is relevant in the case of a
// BeginTransaction request. Intents written to other ranges before
// the transaction record is created will cause the transaction to
// abort early.
func (ds *DistSender) Send(
	ctx context.Context, ba roachpb.BatchRequest,
) (*roachpb.BatchResponse, *roachpb.Error) {
	tracing.AnnotateTrace()

	if pErr := ds.initAndVerifyBatch(ctx, &ba); pErr != nil {
		return nil, pErr
	}

	ctx = ds.AnnotateCtx(ctx)
	ctx, cleanup := tracing.EnsureContext(ctx, ds.AmbientContext.Tracer)
	defer cleanup()

	var rplChunks []*roachpb.BatchResponse
	parts := ba.Split(false /* don't split ET */)
	if len(parts) > 1 && ba.MaxSpanRequestKeys != 0 {
		// We already verified above that the batch contains only scan
		// requests of the same type. Such a batch should never need
		// splitting.
		panic("batch with MaxSpanRequestKeys needs splitting")
	}
	for len(parts) > 0 {
		part := parts[0]
		ba.Requests = part
		// The minimal key range encompassing all requests contained within.
		// Local addressing has already been resolved.
		// TODO(tschottdorf): consider rudimentary validation of the batch
		// here (for example, non-range requests with EndKey, or empty key
		// ranges).
		rs, err := keys.Range(ba)
		if err != nil {
			return nil, roachpb.NewError(err)
		}
		rpl, pErr := ds.divideAndSendBatchToRanges(ctx, ba, rs, true /* isFirst */)

		if pErr == errNo1PCTxn {
			// If we tried to send a single round-trip EndTransaction but
			// it looks like it's going to hit multiple ranges, split it
			// here and try again.
			if len(parts) != 1 {
				panic("EndTransaction not in last chunk of batch")
			}
			parts = ba.Split(true /* split ET */)
			if len(parts) != 2 {
				panic("split of final EndTransaction chunk resulted in != 2 parts")
			}
			continue
		}
		if pErr != nil {
			return nil, pErr
		}

		// Propagate the transaction from the last reply to the next request.
		// The final update is taken and put into the response's main header.
		ba.UpdateTxn(rpl.Txn)
		rplChunks = append(rplChunks, rpl)
		parts = parts[1:]
	}

	reply := rplChunks[0]
	for _, rpl := range rplChunks[1:] {
		reply.Responses = append(reply.Responses, rpl.Responses...)
		reply.CollectedSpans = append(reply.CollectedSpans, rpl.CollectedSpans...)
	}
	reply.BatchResponse_Header = rplChunks[len(rplChunks)-1].BatchResponse_Header
	return reply, nil
}
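
// exampleMultiRangeScan is an illustrative sketch, not used elsewhere in
// this file, of how a caller drives Send with a scan that may cross range
// boundaries. It assumes a request constructor of the form
// roachpb.NewScan(start, end) and that MaxSpanRequestKeys can be set
// directly on the batch header; both are assumptions about the roachpb API
// rather than guarantees. The point it shows is the contract used above:
// Send takes the BatchRequest by value and reports failures through
// *roachpb.Error rather than a plain error.
func exampleMultiRangeScan(ctx context.Context, ds *DistSender) (*roachpb.BatchResponse, *roachpb.Error) {
	var ba roachpb.BatchRequest
	// A scan across ["a", "z") is trimmed to each overlapping range and the
	// partial responses are recombined by divideAndSendBatchToRanges.
	ba.Add(roachpb.NewScan(roachpb.Key("a"), roachpb.Key("z")))
	// A key limit forces the per-range batches to be sent serially so the
	// remaining limit can be decremented as each response comes back.
	ba.MaxSpanRequestKeys = 100
	return ds.Send(ctx, ba)
}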