func testRangeCacheHandleDoubleSplit(t *testing.T, useReverseScan bool) { db := initTestDescriptorDB(t) db.disablePrefetch = true // A request initially looks up the range descriptor ["a"-"b"). doLookup(t, db.cache, "aa") db.assertLookupCountEq(t, 2, "aa") // A split breaks up the range into ["a"-"an"), ["an"-"at"), ["at"-"b"). db.splitRange(t, roachpb.RKey("an")) db.splitRange(t, roachpb.RKey("at")) // A request is sent to the stale descriptor on the right half // such that a RangeKeyMismatchError is returned. _, evictToken := doLookup(t, db.cache, "az") // mismatchErrRange mocks out a RangeKeyMismatchError.Range response. ranges, _, pErr := db.getDescriptors(roachpb.RKey("aa"), false, false) if pErr != nil { t.Fatal(pErr) } mismatchErrRange := ranges[0] // The stale descriptor is evicted, the new descriptor from the error is // replaced, and a new lookup is initialized. if err := evictToken.EvictAndReplace(context.Background(), mismatchErrRange); err != nil { t.Fatal(err) } // Requests to all parts of the split are sent: // [reverse case] // - "aa" and "an" will hit the cache // - all others will join a coalesced request to "az" // + will lookup the meta2 desc // + will lookup the ["at"-"b") desc // - "az" will get the right range back // - "at" will make a second lookup // + will lookup the ["an"-"at") desc // // [non-reverse case] // - "aa" will hit the cache // - all others will join a coalesced request to "an" // + will lookup the meta2 desc // + will lookup the ["an"-"at") desc // - "an" and "ao" will get the right range back // - "at" and "az" will make a second lookup // + will lookup the ["at"-"b") desc var wg, waitJoin sync.WaitGroup db.pauseRangeLookups() numRetries := int64(0) for _, k := range []string{"aa", "an", "ao", "at", "az"} { wg.Add(1) waitJoin.Add(1) go func(key roachpb.RKey) { reqEvictToken := evictToken waitJoinCopied := &waitJoin var desc *roachpb.RangeDescriptor for { // Each request goes to a different key. var err error if desc, reqEvictToken, err = db.cache.lookupRangeDescriptorInternal( context.Background(), key, reqEvictToken, false, /* considerIntents */ useReverseScan, waitJoinCopied); err != nil { waitJoinCopied = nil atomic.AddInt64(&numRetries, 1) continue } break } if useReverseScan { if !desc.ContainsExclusiveEndKey(key) { t.Errorf("desc %s does not contain exclusive end key %s", desc, key) } } else { if !desc.ContainsKey(key) { t.Errorf("desc %s does not contain key %s", desc, key) } } wg.Done() }(roachpb.RKey(k)) } // Wait until all lookup requests hit the cache or join into a coalesced request. waitJoin.Wait() db.resumeRangeLookups() wg.Wait() db.assertLookupCountEq(t, 3, "an and az") if numRetries == 0 { t.Error("expected retry on desc lookup") } // All three descriptors are now correctly cached. doLookup(t, db.cache, "aa") db.assertLookupCountEq(t, 0, "aa") doLookup(t, db.cache, "ao") db.assertLookupCountEq(t, 0, "ao") doLookup(t, db.cache, "az") db.assertLookupCountEq(t, 0, "az") }
// sendChunk is in charge of sending an "admissible" piece of batch, i.e. one // which doesn't need to be subdivided further before going to a range (so no // mixing of forward and reverse scans, etc). The parameters and return values // correspond to client.Sender with the exception of the returned boolean, // which is true when indicating that the caller should retry but needs to send // EndTransaction in a separate request. func (ds *DistSender) sendChunk(ctx context.Context, ba roachpb.BatchRequest) (*roachpb.BatchResponse, *roachpb.Error, bool) { isReverse := ba.IsReverse() // TODO(radu): when contexts are properly plumbed, we should be able to get // the tracer from ctx, not from the DistSender. ctx, cleanup := tracing.EnsureContext(ctx, tracing.TracerFromCtx(ds.Ctx)) defer cleanup() // The minimal key range encompassing all requests contained within. // Local addressing has already been resolved. // TODO(tschottdorf): consider rudimentary validation of the batch here // (for example, non-range requests with EndKey, or empty key ranges). rs, err := keys.Range(ba) if err != nil { return nil, roachpb.NewError(err), false } var br *roachpb.BatchResponse // Send the request to one range per iteration. for { // Increase the sequence counter only once before sending RPCs to // the ranges involved in this chunk of the batch (as opposed to for // each RPC individually). On RPC errors, there's no guarantee that // the request hasn't made its way to the target regardless of the // error; we'd like the second execution to be caught by the sequence // cache if that happens. There is a small chance that that we address // a range twice in this chunk (stale/suboptimal descriptors due to // splits/merges) which leads to a transaction retry. // TODO(tschottdorf): it's possible that if we don't evict from the // cache we could be in for a busy loop. ba.SetNewRequest() var curReply *roachpb.BatchResponse var desc *roachpb.RangeDescriptor var evictToken *evictionToken var needAnother bool var pErr *roachpb.Error var finished bool var numAttempts int for r := retry.StartWithCtx(ctx, ds.rpcRetryOptions); r.Next(); { numAttempts++ { const magicLogCurAttempt = 20 var seq int32 if ba.Txn != nil { seq = ba.Txn.Sequence } if numAttempts%magicLogCurAttempt == 0 || seq%magicLogCurAttempt == 0 { // Log a message if a request appears to get stuck for a long // time or, potentially, forever. See #8975. // The local counter captures this loop here; the Sequence number // should capture anything higher up (as it needs to be // incremented every time this method is called). log.Warningf( ctx, "%d retries for an RPC at sequence %d, last error was: %s, remaining key ranges %s: %s", numAttempts, seq, pErr, rs, ba, ) } } // Get range descriptor (or, when spanning range, descriptors). Our // error handling below may clear them on certain errors, so we // refresh (likely from the cache) on every retry. log.Trace(ctx, "meta descriptor lookup") var err error desc, needAnother, evictToken, err = ds.getDescriptors(ctx, rs, evictToken, isReverse) // getDescriptors may fail retryably if, for example, the first // range isn't available via Gossip. Assume that all errors at // this level are retryable. Non-retryable errors would be for // things like malformed requests which we should have checked // for before reaching this point. if err != nil { log.Trace(ctx, "range descriptor lookup failed: "+err.Error()) if log.V(1) { log.Warning(ctx, err) } pErr = roachpb.NewError(err) continue } if needAnother && br == nil { // TODO(tschottdorf): we should have a mechanism for discovering // range merges (descriptor staleness will mostly go unnoticed), // or we'll be turning single-range queries into multi-range // queries for no good reason. // If there's no transaction and op spans ranges, possibly // re-run as part of a transaction for consistency. The // case where we don't need to re-run is if the read // consistency is not required. if ba.Txn == nil && ba.IsPossibleTransaction() && ba.ReadConsistency != roachpb.INCONSISTENT { return nil, roachpb.NewError(&roachpb.OpRequiresTxnError{}), false } // If the request is more than but ends with EndTransaction, we // want the caller to come again with the EndTransaction in an // extra call. if l := len(ba.Requests) - 1; l > 0 && ba.Requests[l].GetInner().Method() == roachpb.EndTransaction { return nil, roachpb.NewError(errors.New("cannot send 1PC txn to multiple ranges")), true /* shouldSplitET */ } } // It's possible that the returned descriptor misses parts of the // keys it's supposed to scan after it's truncated to match the // descriptor. Example revscan [a,g), first desc lookup for "g" // returns descriptor [c,d) -> [d,g) is never scanned. // We evict and retry in such a case. includesFrontOfCurSpan := func(rd *roachpb.RangeDescriptor) bool { if isReverse { return desc.ContainsExclusiveEndKey(rs.EndKey) } return desc.ContainsKey(rs.Key) } if !includesFrontOfCurSpan(desc) { if err := evictToken.Evict(ctx); err != nil { return nil, roachpb.NewError(err), false } // On addressing errors, don't backoff; retry immediately. r.Reset() continue } curReply, pErr = func() (*roachpb.BatchResponse, *roachpb.Error) { // Truncate the request to our current key range. intersected, iErr := rs.Intersect(desc) if iErr != nil { return nil, roachpb.NewError(iErr) } truncBA, numActive, trErr := truncate(ba, intersected) if numActive == 0 && trErr == nil { // This shouldn't happen in the wild, but some tests // exercise it. return nil, roachpb.NewErrorf("truncation resulted in empty batch on [%s,%s): %s", rs.Key, rs.EndKey, ba) } if trErr != nil { return nil, roachpb.NewError(trErr) } return ds.sendSingleRange(ctx, truncBA, desc) }() // If sending succeeded, break this loop. if pErr == nil { finished = true break } log.VTracef(1, ctx, "reply error %s: %s", ba, pErr) // Error handling: If the error indicates that our range // descriptor is out of date, evict it from the cache and try // again. Errors that apply only to a single replica were // handled in send(). // // TODO(bdarnell): Don't retry endlessly. If we fail twice in a // row and the range descriptor hasn't changed, return the error // to our caller. switch tErr := pErr.GetDetail().(type) { case *roachpb.SendError: // We've tried all the replicas without success. Either // they're all down, or we're using an out-of-date range // descriptor. Invalidate the cache and try again with the new // metadata. if err := evictToken.Evict(ctx); err != nil { return nil, roachpb.NewError(err), false } continue case *roachpb.RangeKeyMismatchError: // Range descriptor might be out of date - evict it. This is // likely the result of a range split. If we have new range // descriptors, insert them instead as long as they are different // from the last descriptor to avoid endless loops. var replacements []roachpb.RangeDescriptor different := func(rd *roachpb.RangeDescriptor) bool { return !desc.RSpan().Equal(rd.RSpan()) } if tErr.MismatchedRange != nil && different(tErr.MismatchedRange) { replacements = append(replacements, *tErr.MismatchedRange) } if tErr.SuggestedRange != nil && different(tErr.SuggestedRange) { if includesFrontOfCurSpan(tErr.SuggestedRange) { replacements = append(replacements, *tErr.SuggestedRange) } } // Same as Evict() if replacements is empty. if err := evictToken.EvictAndReplace(ctx, replacements...); err != nil { return nil, roachpb.NewError(err), false } // On addressing errors, don't backoff; retry immediately. r.Reset() if log.V(1) { log.Warning(ctx, tErr) } continue } break } // Immediately return if querying a range failed non-retryably. if pErr != nil { return nil, pErr, false } else if !finished { select { case <-ds.rpcRetryOptions.Closer: return nil, roachpb.NewError(&roachpb.NodeUnavailableError{}), false case <-ctx.Done(): return nil, roachpb.NewError(ctx.Err()), false default: log.Fatal(ctx, "exited retry loop with nil error but finished=false") } } ba.UpdateTxn(curReply.Txn) if br == nil { // First response from a Range. br = curReply } else { // This was the second or later call in a cross-Range request. // Combine the new response with the existing one. if err := br.Combine(curReply); err != nil { return nil, roachpb.NewError(err), false } } if isReverse { // In next iteration, query previous range. // We use the StartKey of the current descriptor as opposed to the // EndKey of the previous one since that doesn't have bugs when // stale descriptors come into play. rs.EndKey, err = prev(ba, desc.StartKey) } else { // In next iteration, query next range. // It's important that we use the EndKey of the current descriptor // as opposed to the StartKey of the next one: if the former is stale, // it's possible that the next range has since merged the subsequent // one, and unless both descriptors are stale, the next descriptor's // StartKey would move us to the beginning of the current range, // resulting in a duplicate scan. rs.Key, err = next(ba, desc.EndKey) } if err != nil { return nil, roachpb.NewError(err), false } if ba.MaxSpanRequestKeys > 0 { // Count how many results we received. var numResults int64 for _, resp := range curReply.Responses { numResults += resp.GetInner().Header().NumKeys } if numResults > ba.MaxSpanRequestKeys { panic(fmt.Sprintf("received %d results, limit was %d", numResults, ba.MaxSpanRequestKeys)) } ba.MaxSpanRequestKeys -= numResults if ba.MaxSpanRequestKeys == 0 { // prepare the batch response after meeting the max key limit. fillSkippedResponses(ba, br, rs) // done, exit loop. return br, nil, false } } // If this was the last range accessed by this call, exit loop. if !needAnother { return br, nil, false } // key cannot be less that the end key. if !rs.Key.Less(rs.EndKey) { panic(fmt.Sprintf("start key %s is less than %s", rs.Key, rs.EndKey)) } log.Trace(ctx, "querying next range") } }
// sendChunk is in charge of sending an "admissible" piece of batch, i.e. one // which doesn't need to be subdivided further before going to a range (so no // mixing of forward and reverse scans, etc). The parameters and return values // correspond to client.Sender with the exception of the returned boolean, // which is true when indicating that the caller should retry but needs to send // EndTransaction in a separate request. func (ds *DistSender) sendChunk(ctx context.Context, ba roachpb.BatchRequest) (*roachpb.BatchResponse, *roachpb.Error, bool) { isReverse := ba.IsReverse() ctx, cleanup := tracing.EnsureContext(ctx, ds.Tracer) defer cleanup() // The minimal key range encompassing all requests contained within. // Local addressing has already been resolved. // TODO(tschottdorf): consider rudimentary validation of the batch here // (for example, non-range requests with EndKey, or empty key ranges). rs, err := keys.Range(ba) if err != nil { return nil, roachpb.NewError(err), false } var br *roachpb.BatchResponse // Send the request to one range per iteration. for { // Increase the sequence counter only once before sending RPCs to // the ranges involved in this chunk of the batch (as opposed to for // each RPC individually). On RPC errors, there's no guarantee that // the request hasn't made its way to the target regardless of the // error; we'd like the second execution to be caught by the sequence // cache if that happens. There is a small chance that that we address // a range twice in this chunk (stale/suboptimal descriptors due to // splits/merges) which leads to a transaction retry. // TODO(tschottdorf): it's possible that if we don't evict from the // cache we could be in for a busy loop. ba.SetNewRequest() var curReply *roachpb.BatchResponse var desc *roachpb.RangeDescriptor var evictToken *evictionToken var needAnother bool var pErr *roachpb.Error var finished bool for r := retry.Start(ds.rpcRetryOptions); r.Next(); { // Get range descriptor (or, when spanning range, descriptors). Our // error handling below may clear them on certain errors, so we // refresh (likely from the cache) on every retry. log.Trace(ctx, "meta descriptor lookup") var err error desc, needAnother, evictToken, err = ds.getDescriptors(ctx, rs, evictToken, isReverse) // getDescriptors may fail retryably if, for example, the first // range isn't available via Gossip. Assume that all errors at // this level are retryable. Non-retryable errors would be for // things like malformed requests which we should have checked // for before reaching this point. if err != nil { log.Trace(ctx, "range descriptor lookup failed: "+err.Error()) if log.V(1) { log.Warning(err) } pErr = roachpb.NewError(err) continue } if needAnother && br == nil { // TODO(tschottdorf): we should have a mechanism for discovering // range merges (descriptor staleness will mostly go unnoticed), // or we'll be turning single-range queries into multi-range // queries for no good reason. // If there's no transaction and op spans ranges, possibly // re-run as part of a transaction for consistency. The // case where we don't need to re-run is if the read // consistency is not required. if ba.Txn == nil && ba.IsPossibleTransaction() && ba.ReadConsistency != roachpb.INCONSISTENT { return nil, roachpb.NewError(&roachpb.OpRequiresTxnError{}), false } // If the request is more than but ends with EndTransaction, we // want the caller to come again with the EndTransaction in an // extra call. if l := len(ba.Requests) - 1; l > 0 && ba.Requests[l].GetInner().Method() == roachpb.EndTransaction { return nil, roachpb.NewError(errors.New("cannot send 1PC txn to multiple ranges")), true /* shouldSplitET */ } } // It's possible that the returned descriptor misses parts of the // keys it's supposed to scan after it's truncated to match the // descriptor. Example revscan [a,g), first desc lookup for "g" // returns descriptor [c,d) -> [d,g) is never scanned. // We evict and retry in such a case. includesFrontOfCurSpan := func(rd *roachpb.RangeDescriptor) bool { if isReverse { return desc.ContainsExclusiveEndKey(rs.EndKey) } return desc.ContainsKey(rs.Key) } if !includesFrontOfCurSpan(desc) { if err := evictToken.Evict(ctx); err != nil { return nil, roachpb.NewError(err), false } // On addressing errors, don't backoff; retry immediately. r.Reset() continue } curReply, pErr = func() (*roachpb.BatchResponse, *roachpb.Error) { // Truncate the request to our current key range. intersected, iErr := rs.Intersect(desc) if iErr != nil { return nil, roachpb.NewError(iErr) } truncBA, numActive, trErr := truncate(ba, intersected) if numActive == 0 && trErr == nil { // This shouldn't happen in the wild, but some tests // exercise it. return nil, roachpb.NewErrorf("truncation resulted in empty batch on [%s,%s): %s", rs.Key, rs.EndKey, ba) } if trErr != nil { return nil, roachpb.NewError(trErr) } return ds.sendSingleRange(ctx, truncBA, desc) }() // If sending succeeded, break this loop. if pErr == nil { finished = true break } if log.V(1) { log.Warningf("failed to invoke %s: %s", ba, pErr) } log.Trace(ctx, fmt.Sprintf("reply error: %s", pErr)) // Error handling: If the error indicates that our range // descriptor is out of date, evict it from the cache and try // again. Errors that apply only to a single replica were // handled in send(). // // TODO(bdarnell): Don't retry endlessly. If we fail twice in a // row and the range descriptor hasn't changed, return the error // to our caller. switch tErr := pErr.GetDetail().(type) { case *roachpb.SendError: // We've tried all the replicas without success. Either // they're all down, or we're using an out-of-date range // descriptor. Invalidate the cache and try again with the new // metadata. if err := evictToken.Evict(ctx); err != nil { return nil, roachpb.NewError(err), false } continue case *roachpb.RangeKeyMismatchError: // Range descriptor might be out of date - evict it. This is // likely the result of a range split. If we have new range // descriptors, insert them instead as long as they are different // from the last descriptor to avoid endless loops. var replacements []roachpb.RangeDescriptor different := func(rd *roachpb.RangeDescriptor) bool { return !desc.RSpan().Equal(rd.RSpan()) } if tErr.MismatchedRange != nil && different(tErr.MismatchedRange) { replacements = append(replacements, *tErr.MismatchedRange) } if tErr.SuggestedRange != nil && different(tErr.SuggestedRange) { if includesFrontOfCurSpan(tErr.SuggestedRange) { replacements = append(replacements, *tErr.SuggestedRange) } } // Same as Evict() if replacements is empty. if err := evictToken.EvictAndReplace(ctx, replacements...); err != nil { return nil, roachpb.NewError(err), false } // On addressing errors, don't backoff; retry immediately. r.Reset() if log.V(1) { log.Warning(tErr) } continue } break } // Immediately return if querying a range failed non-retryably. if pErr != nil { return nil, pErr, false } else if !finished { select { case <-ds.rpcRetryOptions.Closer: return nil, roachpb.NewError(&roachpb.NodeUnavailableError{}), false default: log.Fatal("exited retry loop with nil error but finished=false") } } ba.Txn.Update(curReply.Txn) if br == nil { // First response from a Range. br = curReply } else { // This was the second or later call in a cross-Range request. // Combine the new response with the existing one. if err := br.Combine(curReply); err != nil { return nil, roachpb.NewError(err), false } } if ba.MaxScanResults > 0 { // Count how many results we received. var numResults int64 for _, resp := range curReply.Responses { if cResp, ok := resp.GetInner().(roachpb.Countable); ok { numResults += cResp.Count() } } if numResults > ba.MaxScanResults { panic(fmt.Sprintf("received %d results, limit was %d", numResults, ba.MaxScanResults)) } ba.MaxScanResults -= numResults if ba.MaxScanResults == 0 { // We are done with this batch. Some requests might have NoopResponses; we must // replace them with empty responses of the proper type. for i, req := range ba.Requests { if _, ok := br.Responses[i].GetInner().(*roachpb.NoopResponse); !ok { continue } union := roachpb.ResponseUnion{} var reply roachpb.Response if _, ok := req.GetInner().(*roachpb.ScanRequest); ok { reply = &roachpb.ScanResponse{} } else { _ = req.GetInner().(*roachpb.ReverseScanRequest) reply = &roachpb.ReverseScanResponse{} } union.MustSetInner(reply) br.Responses[i] = union } return br, nil, false } } // If this request has a bound (such as MaxResults in // ScanRequest) and we are going to query at least one more range, // check whether enough rows have been retrieved. // TODO(tschottdorf): need tests for executing a multi-range batch // with various bounded requests which saturate at different times. if needAnother { // Start with the assumption that all requests are saturated. // Below, we look at each and decide whether that's true. // Everything that is indeed saturated is "masked out" from the // batch request; only if that's all requests does needAnother // remain false. needAnother = false if br == nil { // Clone ba.Requests. This is because we're multi-range, and // some requests may be bounded, which could lead to them being // masked out once they're saturated. We don't want to risk // removing requests that way in the "master copy" since that // could lead to omitting requests in certain retry scenarios. ba.Requests = append([]roachpb.RequestUnion(nil), ba.Requests...) } for i, union := range ba.Requests { args := union.GetInner() if _, ok := args.(*roachpb.NoopRequest); ok { // NoopRequests are skipped. continue } boundedArg, ok := args.(roachpb.Bounded) if !ok { // Non-bounded request. We will have to query all ranges. needAnother = true continue } prevBound := boundedArg.GetBound() cReply, ok := curReply.Responses[i].GetInner().(roachpb.Countable) if !ok || prevBound <= 0 { // Request bounded, but without max results. Again, will // need to query everything we can. The case in which the reply // isn't countable occurs when the request wasn't active for // that range (since it didn't apply to it), so the response // is a NoopResponse. needAnother = true continue } nextBound := prevBound - cReply.Count() if nextBound <= 0 { // We've hit max results for this piece of the batch. Mask // it out (we've copied the requests slice above, so this // is kosher). union := &ba.Requests[i] // avoid working on copy union.MustSetInner(&noopRequest) continue } // The request isn't saturated yet. needAnother = true boundedArg.SetBound(nextBound) } } // If this was the last range accessed by this call, exit loop. if !needAnother { return br, nil, false } if isReverse { // In next iteration, query previous range. // We use the StartKey of the current descriptor as opposed to the // EndKey of the previous one since that doesn't have bugs when // stale descriptors come into play. rs.EndKey, err = prev(ba, desc.StartKey) } else { // In next iteration, query next range. // It's important that we use the EndKey of the current descriptor // as opposed to the StartKey of the next one: if the former is stale, // it's possible that the next range has since merged the subsequent // one, and unless both descriptors are stale, the next descriptor's // StartKey would move us to the beginning of the current range, // resulting in a duplicate scan. rs.Key, err = next(ba, desc.EndKey) } if err != nil { return nil, roachpb.NewError(err), false } log.Trace(ctx, "querying next range") } }