// fillSkippedResponses fills in responses for requests which were skipped
// after the batch key max limit was met for range requests. It also sets
// the ResumeSpan on each range response so that the client can continue
// from where the limit cut the batch off.
func fillSkippedResponses(
	ba roachpb.BatchRequest, br *roachpb.BatchResponse, nextKey roachpb.RKey,
) {
	// Some requests might have NoopResponses; we must replace them with empty
	// responses of the proper type.
	for i, req := range ba.Requests {
		if _, ok := br.Responses[i].GetInner().(*roachpb.NoopResponse); !ok {
			continue
		}
		var reply roachpb.Response
		switch t := req.GetInner().(type) {
		case *roachpb.ScanRequest:
			reply = &roachpb.ScanResponse{}
		case *roachpb.ReverseScanRequest:
			reply = &roachpb.ReverseScanResponse{}
		case *roachpb.DeleteRangeRequest:
			reply = &roachpb.DeleteRangeResponse{}
		case *roachpb.BeginTransactionRequest, *roachpb.EndTransactionRequest:
			continue
		default:
			panic(fmt.Sprintf("bad type %T", t))
		}
		union := roachpb.ResponseUnion{}
		union.MustSetInner(reply)
		br.Responses[i] = union
	}
	// Set the ResumeSpan for future batch requests.
	isReverse := ba.IsReverse()
	for i, resp := range br.Responses {
		req := ba.Requests[i].GetInner()
		if !roachpb.IsRange(req) {
			continue
		}
		hdr := resp.GetInner().Header()
		origSpan := req.Header()
		if isReverse {
			if hdr.ResumeSpan != nil {
				// The ResumeSpan.Key might be set to the StartKey of a range;
				// correctly set it to the Key of the original request span.
				hdr.ResumeSpan.Key = origSpan.Key
			} else if roachpb.RKey(origSpan.Key).Less(nextKey) {
				// Some keys have yet to be processed.
				hdr.ResumeSpan = &origSpan
				if nextKey.Less(roachpb.RKey(origSpan.EndKey)) {
					// The original span has been partially processed.
					hdr.ResumeSpan.EndKey = nextKey.AsRawKey()
				}
			}
		} else {
			if hdr.ResumeSpan != nil {
				// The ResumeSpan.EndKey might be set to the EndKey of a
				// range; correctly set it to the EndKey of the original
				// request span.
				hdr.ResumeSpan.EndKey = origSpan.EndKey
			} else if nextKey.Less(roachpb.RKey(origSpan.EndKey)) {
				// Some keys have yet to be processed.
				hdr.ResumeSpan = &origSpan
				if roachpb.RKey(origSpan.Key).Less(nextKey) {
					// The original span has been partially processed.
					hdr.ResumeSpan.Key = nextKey.AsRawKey()
				}
			}
		}
		br.Responses[i].GetInner().SetHeader(hdr)
	}
}
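// To make the ResumeSpan contract above concrete, here is a minimal sketch
// of how a client-side caller might drain a key-limited scan: it re-issues
// the request over the returned ResumeSpan until that span is nil. This is
// illustrative only and not part of DistSender; exampleSender and
// exampleScanAll are hypothetical names.
type exampleSender func(context.Context, roachpb.BatchRequest) (*roachpb.BatchResponse, *roachpb.Error)

func exampleScanAll(
	ctx context.Context, send exampleSender, span roachpb.Span, limit int64,
) ([]roachpb.KeyValue, error) {
	var rows []roachpb.KeyValue
	for {
		var ba roachpb.BatchRequest
		ba.MaxSpanRequestKeys = limit
		ba.Add(&roachpb.ScanRequest{Span: span})
		br, pErr := send(ctx, ba)
		if pErr != nil {
			return nil, pErr.GoError()
		}
		resp := br.Responses[0].GetInner().(*roachpb.ScanResponse)
		rows = append(rows, resp.Rows...)
		if resp.ResumeSpan == nil {
			// fillSkippedResponses guarantees a nil ResumeSpan only once
			// the entire original span has been processed.
			return rows, nil
		}
		// Continue exactly where the key limit cut the scan off.
		span = *resp.ResumeSpan
	}
}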
// divideAndSendBatchToRanges sends the supplied batch to all of the
// ranges which comprise the span specified by rs. The batch request
// is trimmed against each range which is part of the span and sent
// either serially or in parallel, if possible. isFirst indicates
// whether this is the first time this method has been called on the
// batch; it is set to false when this method is invoked recursively.
func (ds *DistSender) divideAndSendBatchToRanges(
	ctx context.Context, ba roachpb.BatchRequest, rs roachpb.RSpan, isFirst bool,
) (br *roachpb.BatchResponse, pErr *roachpb.Error) {
	// This function builds a channel of responses for each range
	// implicated in the span (rs) and combines them into a single
	// BatchResponse when finished.
	var responseChs []chan response
	defer func() {
		for _, responseCh := range responseChs {
			resp := <-responseCh
			if resp.pErr != nil {
				if pErr == nil {
					pErr = resp.pErr
				}
				continue
			}
			if br == nil {
				// First response from a Range.
				br = resp.reply
			} else {
				// This was the second or later call in a cross-Range request.
				// Combine the new response with the existing one.
				if err := br.Combine(resp.reply); err != nil {
					pErr = roachpb.NewError(err)
					return
				}
				br.Txn.Update(resp.reply.Txn)
			}
		}
		// If we experienced an error, don't neglect to update the error's
		// attached transaction with any responses which were received.
		if pErr != nil && br != nil {
			pErr.UpdateTxn(br.Txn)
		}
	}()

	// Get the initial seek key depending on the direction of iteration.
	var seekKey roachpb.RKey
	isReverse := ba.IsReverse()
	if isReverse {
		seekKey = rs.EndKey
	} else {
		seekKey = rs.Key
	}

	// Send the request to one range per iteration.
	ri := NewRangeIterator(ds, isReverse)
	for ri.Seek(ctx, seekKey); ri.Valid(); ri.Seek(ctx, seekKey) {
		// Increase the sequence counter only once before sending RPCs to
		// the ranges involved in this chunk of the batch (as opposed to
		// for each RPC individually). On RPC errors, there's no guarantee
		// that the request hasn't made its way to the target regardless
		// of the error; we'd like the second execution to be caught by
		// the sequence cache if that happens. There is a small chance
		// that we address a range twice in this chunk (stale/suboptimal
		// descriptors due to splits/merges), which leads to a transaction
		// retry.
		//
		// TODO(tschottdorf): it's possible that if we don't evict from
		// the cache we could be in for a busy loop.
		ba.SetNewRequest()

		responseCh := make(chan response, 1)
		responseChs = append(responseChs, responseCh)

		if isFirst && ri.NeedAnother(rs) {
			// TODO(tschottdorf): we should have a mechanism for discovering
			// range merges (descriptor staleness will mostly go unnoticed),
			// or we'll be turning single-range queries into multi-range
			// queries for no good reason.
			//
			// If there's no transaction and the op spans ranges, possibly
			// re-run as part of a transaction for consistency. The one
			// case where we don't need to re-run is if inconsistent reads
			// were requested.
			if ba.Txn == nil && ba.IsPossibleTransaction() && ba.ReadConsistency != roachpb.INCONSISTENT {
				responseCh <- response{pErr: roachpb.NewError(&roachpb.OpRequiresTxnError{})}
				return
			}
			// If the batch contains more than just an EndTransaction and
			// ends with one, we want the caller to come again with the
			// EndTransaction in an extra call.
			if l := len(ba.Requests) - 1; l > 0 && ba.Requests[l].GetInner().Method() == roachpb.EndTransaction {
				responseCh <- response{pErr: errNo1PCTxn}
				return
			}
		}

		// Determine the next seek key, taking a potentially sparse batch
		// into consideration.
		var err error
		nextRS := rs
		if isReverse {
			// In the next iteration, query the previous range.
			// We use the StartKey of the current descriptor as opposed to
			// the EndKey of the previous one since that doesn't have bugs
			// when stale descriptors come into play.
			seekKey, err = prev(ba, ri.Desc().StartKey)
			nextRS.EndKey = seekKey
		} else {
			// In the next iteration, query the next range.
			// It's important that we use the EndKey of the current
			// descriptor as opposed to the StartKey of the next one: if the
			// former is stale, it's possible that the next range has since
			// merged the subsequent one, and unless both descriptors are
			// stale, the next descriptor's StartKey would move us to the
			// beginning of the current range, resulting in a duplicate scan.
			seekKey, err = next(ba, ri.Desc().EndKey)
			nextRS.Key = seekKey
		}
		if err != nil {
			responseCh <- response{pErr: roachpb.NewError(err)}
			return
		}

		// Send the next partial batch to the first range in the "rs" span.
		// If we're not handling a request which limits responses and we
		// can reserve one of the limited goroutines available for parallel
		// batch RPCs, send asynchronously.
		if ba.MaxSpanRequestKeys == 0 && ri.NeedAnother(rs) && ds.rpcContext != nil &&
			ds.sendPartialBatchAsync(ctx, ba, rs, ri.Desc(), ri.Token(), isFirst, responseCh) {
			// The batch was sent asynchronously; the goroutine received the
			// BatchRequest by value. Clone the txn so that mutations below
			// don't race with the async call's view of the current txn
			// sequence.
			if ba.Txn != nil {
				txnClone := ba.Txn.Clone()
				ba.Txn = &txnClone
			}
		} else {
			// Send synchronously if there is no parallel capacity left,
			// there's a max results limit, or this is the final request in
			// the span.
			resp := ds.sendPartialBatch(ctx, ba, rs, ri.Desc(), ri.Token(), isFirst)
			responseCh <- resp
			if resp.pErr != nil {
				return
			}
			ba.UpdateTxn(resp.reply.Txn)

			// Check whether we've received enough responses to exit the
			// query loop.
			if ba.MaxSpanRequestKeys > 0 {
				var numResults int64
				for _, r := range resp.reply.Responses {
					numResults += r.GetInner().Header().NumKeys
				}
				if numResults > ba.MaxSpanRequestKeys {
					panic(fmt.Sprintf("received %d results, limit was %d", numResults, ba.MaxSpanRequestKeys))
				}
				ba.MaxSpanRequestKeys -= numResults
				// Exiting; fill in missing responses.
				if ba.MaxSpanRequestKeys == 0 {
					fillSkippedResponses(ba, resp.reply, seekKey)
					return
				}
			}
		}

		// Check for completion.
		if !ri.NeedAnother(rs) {
			return
		}
		isFirst = false // the next range will not be the first!
		rs = nextRS
	}

	// We've exited early. Return the range iterator error.
	responseCh := make(chan response, 1)
	responseCh <- response{pErr: ri.Error()}
	responseChs = append(responseChs, responseCh)
	return
}
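// The fan-out above (one buffered channel appended per chunk, all drained
// in append order by the deferred combiner) is what lets parallel sends
// complete out of order while the combined BatchResponse is still assembled
// in range order. The standalone sketch below shows the same pattern in
// miniature; exampleFanOut and its work functions are hypothetical.
func exampleFanOut(work []func() string) []string {
	var chs []chan string
	for _, w := range work {
		ch := make(chan string, 1) // buffered so the sender never blocks
		chs = append(chs, ch)
		go func(w func() string, ch chan<- string) {
			ch <- w()
		}(w, ch)
	}
	// Draining in append order restores ordering regardless of goroutine
	// completion order.
	var results []string
	for _, ch := range chs {
		results = append(results, <-ch)
	}
	return results
}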
// sendPartialBatch sends the supplied batch to the range specified by
// desc. The batch request is first truncated so that it contains only
// requests which intersect the range descriptor, and the keys for each
// request are limited to the range's key span. The send occurs in a
// retry loop to handle send failures. On failure to send to any
// replicas, we back off and retry by refetching the range
// descriptor. If the underlying range seems to have split, we
// recursively invoke divideAndSendBatchToRanges to re-enumerate the
// ranges in the span and resend to each.
func (ds *DistSender) sendPartialBatch(
	ctx context.Context,
	ba roachpb.BatchRequest,
	rs roachpb.RSpan,
	desc *roachpb.RangeDescriptor,
	evictToken *EvictionToken,
	isFirst bool,
) response {
	var reply *roachpb.BatchResponse
	var pErr *roachpb.Error
	isReverse := ba.IsReverse()

	// Truncate the request to the range descriptor.
	intersected, err := rs.Intersect(desc)
	if err != nil {
		return response{pErr: roachpb.NewError(err)}
	}
	truncBA, numActive, err := truncate(ba, intersected)
	if numActive == 0 && err == nil {
		// This shouldn't happen in the wild, but some tests exercise it.
		return response{
			pErr: roachpb.NewErrorf("truncation resulted in empty batch on %s: %s", intersected, ba),
		}
	}
	if err != nil {
		return response{pErr: roachpb.NewError(err)}
	}

	// Start a retry loop for sending the batch to the range.
	for r := retry.StartWithCtx(ctx, ds.rpcRetryOptions); r.Next(); {
		// If we've cleared the descriptor on a send failure, re-lookup.
		if desc == nil {
			var descKey roachpb.RKey
			if isReverse {
				descKey = intersected.EndKey
			} else {
				descKey = intersected.Key
			}
			desc, evictToken, err = ds.getDescriptor(ctx, descKey, nil, isReverse)
			if err != nil {
				log.ErrEventf(ctx, "range descriptor re-lookup failed: %s", err)
				continue
			}
		}

		reply, pErr = ds.sendSingleRange(ctx, truncBA, desc)

		// If sending succeeded, return immediately.
		if pErr == nil {
			return response{reply: reply}
		}

		log.ErrEventf(ctx, "reply error %s: %s", ba, pErr)

		// Error handling: If the error indicates that our range
		// descriptor is out of date, evict it from the cache and try
		// again. Errors that apply only to a single replica were
		// handled in send().
		//
		// TODO(bdarnell): Don't retry endlessly. If we fail twice in a
		// row and the range descriptor hasn't changed, return the error
		// to our caller.
		switch tErr := pErr.GetDetail().(type) {
		case *roachpb.SendError:
			// We've tried all the replicas without success. Either
			// they're all down, or we're using an out-of-date range
			// descriptor. Invalidate the cache and try again with the new
			// metadata.
			log.Event(ctx, "evicting range descriptor on send error and backoff for re-lookup")
			if err := evictToken.Evict(ctx); err != nil {
				return response{pErr: roachpb.NewError(err)}
			}
			// Clear the descriptor to reload on the next attempt.
			desc = nil
			continue
		case *roachpb.RangeKeyMismatchError:
			// The range descriptor might be out of date; evict it. This is
			// likely the result of a range split. If we have new range
			// descriptors, insert them instead, as long as they differ
			// from the last descriptor to avoid endless loops.
			var replacements []roachpb.RangeDescriptor
			different := func(rd *roachpb.RangeDescriptor) bool {
				return !desc.RSpan().Equal(rd.RSpan())
			}
			if tErr.MismatchedRange != nil && different(tErr.MismatchedRange) {
				replacements = append(replacements, *tErr.MismatchedRange)
			}
			if tErr.SuggestedRange != nil && different(tErr.SuggestedRange) {
				if includesFrontOfCurSpan(isReverse, tErr.SuggestedRange, rs) {
					replacements = append(replacements, *tErr.SuggestedRange)
				}
			}
			// Same as Evict() if replacements is empty.
			if err := evictToken.EvictAndReplace(ctx, replacements...); err != nil {
				return response{pErr: roachpb.NewError(err)}
			}
			// On addressing errors (likely a split), we need to re-invoke
			// the range descriptor lookup machinery, so we recurse by
			// sending the batch to just the partial span this descriptor
			// was supposed to cover.
			log.VEventf(ctx, 1, "likely split; resending batch to span: %s", tErr)
			reply, pErr = ds.divideAndSendBatchToRanges(ctx, ba, intersected, isFirst)
			return response{reply: reply, pErr: pErr}
		}
		break
	}

	// Propagate the error if either the retry closer or context done
	// channels were closed.
	if pErr == nil {
		if pErr = ds.deduceRetryEarlyExitError(ctx); pErr == nil {
			log.Fatal(ctx, "exited retry loop without an error")
		}
	}

	return response{pErr: pErr}
}
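// The retry loop above follows a common shape: on a send error, evict the
// cached descriptor, clear it, and let the next iteration re-fetch it before
// resending. The condensed sketch below isolates that shape; it is an
// illustration only, and the lookup/send parameters are hypothetical
// stand-ins for getDescriptor and sendSingleRange.
func exampleSendWithRelookup(
	ctx context.Context,
	opts retry.Options,
	lookup func(context.Context) (*roachpb.RangeDescriptor, error),
	send func(context.Context, *roachpb.RangeDescriptor) *roachpb.Error,
) *roachpb.Error {
	var desc *roachpb.RangeDescriptor
	for r := retry.StartWithCtx(ctx, opts); r.Next(); {
		if desc == nil {
			var err error
			if desc, err = lookup(ctx); err != nil {
				continue // back off, then re-lookup
			}
		}
		pErr := send(ctx, desc)
		if pErr == nil {
			return nil
		}
		if _, ok := pErr.GetDetail().(*roachpb.SendError); ok {
			desc = nil // descriptor suspected stale; re-lookup next attempt
			continue
		}
		return pErr // an error we don't handle by retrying
	}
	return roachpb.NewErrorf("retry loop exited early: %v", ctx.Err())
}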
// initAndVerifyBatch initializes timestamp-related information and
// verifies batch constraints before splitting.
func (ds *DistSender) initAndVerifyBatch(
	ctx context.Context, ba *roachpb.BatchRequest,
) *roachpb.Error {
	// In the event that the timestamp isn't set and read consistency isn't
	// required, set the timestamp using the local clock.
	if ba.ReadConsistency == roachpb.INCONSISTENT && ba.Timestamp.Equal(hlc.ZeroTimestamp) {
		ba.Timestamp = ds.clock.Now()
	}

	if ba.Txn != nil {
		// Make a copy here since the code below modifies it in different
		// places.
		// TODO(tschottdorf): be smarter about this - no need to do it for
		// requests that don't get split.
		txnClone := ba.Txn.Clone()
		ba.Txn = &txnClone

		if len(ba.Txn.ObservedTimestamps) == 0 {
			// Ensure the local NodeID is marked as free from clock offset;
			// the transaction's timestamp was taken off the local clock.
			if nDesc := ds.getNodeDescriptor(); nDesc != nil {
				// TODO(tschottdorf): future refactoring should move this to
				// txn creation in TxnCoordSender, which is currently unaware
				// of the NodeID (and wraps *DistSender through client.Sender
				// since it also needs test compatibility with *LocalSender).
				//
				// Taking care below to not modify any memory referenced from
				// our BatchRequest which may be shared with others.
				//
				// We already have a clone of our txn (see above), so we can
				// modify it freely.
				//
				// Zero the existing data. That makes sure that if we had
				// something of size zero but with capacity, we don't re-use
				// the existing space (which others may also use). This is
				// just to satisfy paranoia/OCD and not expected to matter in
				// practice.
				ba.Txn.ResetObservedTimestamps()
				// OrigTimestamp is the HLC timestamp at which the Txn
				// started, so this effectively means no more uncertainty on
				// this node.
				ba.Txn.UpdateObservedTimestamp(nDesc.NodeID, ba.Txn.OrigTimestamp)
			}
		}
	}

	if len(ba.Requests) < 1 {
		return roachpb.NewErrorf("empty batch")
	}

	if ba.MaxSpanRequestKeys != 0 {
		// Verify that the batch contains only specific range requests or the
		// Begin/EndTransactionRequest. Verify that a batch with a ReverseScan
		// only contains ReverseScan range requests.
		isReverse := ba.IsReverse()
		for _, req := range ba.Requests {
			inner := req.GetInner()
			switch inner.(type) {
			case *roachpb.ScanRequest, *roachpb.DeleteRangeRequest:
				// Accepted range requests. All other range requests are still
				// not supported.
				// TODO(vivek): don't enumerate all range requests.
				if isReverse {
					return roachpb.NewErrorf("batch with limit contains both forward and reverse scans")
				}
			case *roachpb.BeginTransactionRequest, *roachpb.EndTransactionRequest, *roachpb.ReverseScanRequest:
				continue
			default:
				return roachpb.NewErrorf("batch with limit contains %T request", inner)
			}
		}
	}
	return nil
}
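// To make the MaxSpanRequestKeys constraints above concrete: a limited batch
// may mix Begin/EndTransaction with range requests, but all scans must run
// in a single direction. Below is a minimal sketch of a batch that passes
// verification; exampleLimitedBatch is a hypothetical helper.
func exampleLimitedBatch(from, to roachpb.Key) roachpb.BatchRequest {
	var ba roachpb.BatchRequest
	ba.MaxSpanRequestKeys = 100 // at most 100 keys across all range requests
	// Accepted: forward scans and DeleteRange share a direction.
	ba.Add(&roachpb.ScanRequest{Span: roachpb.Span{Key: from, EndKey: to}})
	ba.Add(&roachpb.DeleteRangeRequest{Span: roachpb.Span{Key: from, EndKey: to}})
	// Adding a ReverseScanRequest to this batch would make
	// initAndVerifyBatch fail with "batch with limit contains both
	// forward and reverse scans".
	return ba
}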