// sendChunk is in charge of sending an "admissible" piece of batch, i.e. one
// which doesn't need to be subdivided further before going to a range (so no
// mixing of forward and reverse scans, etc).
func (ds *DistSender) sendChunk(ctx context.Context, ba proto.BatchRequest) (*proto.BatchResponse, error) {
	// TODO(tschottdorf): prepare for removing Key and EndKey from BatchRequest,
	// making sure that anything that relies on them goes bust.
	ba.Key, ba.EndKey = nil, nil

	isReverse := ba.IsReverse()

	trace := tracer.FromCtx(ctx)

	// The minimal key range encompassing all requests contained within.
	// Local addressing has already been resolved.
	// TODO(tschottdorf): consider rudimentary validation of the batch here
	// (for example, non-range requests with EndKey, or empty key ranges).
	from, to := keys.Range(ba)
	var br *proto.BatchResponse
	// Send the request to one range per iteration.
	for {
		options := lookupOptions{
			useReverseScan: isReverse,
		}

		var curReply *proto.BatchResponse
		var desc *proto.RangeDescriptor
		var needAnother bool
		var err error
		for r := retry.Start(ds.rpcRetryOptions); r.Next(); {
			// Get the range descriptor (or, when spanning ranges, descriptors).
			// Our error handling below may clear them on certain errors, so we
			// refresh (likely from the cache) on every retry.
			descDone := trace.Epoch("meta descriptor lookup")
			var evictDesc func()
			desc, needAnother, evictDesc, err = ds.getDescriptors(from, to, options)
			descDone()

			// getDescriptors may fail retryably if the first range isn't
			// available via Gossip.
			if err != nil {
				if rErr, ok := err.(retry.Retryable); ok && rErr.CanRetry() {
					if log.V(1) {
						log.Warning(err)
					}
					continue
				}
				break
			}

			// If there's no transaction and the op spans ranges, possibly
			// re-run as part of a transaction for consistency. The one case
			// where we don't need to re-run is if the read consistency is
			// INCONSISTENT.
			if needAnother && ba.Txn == nil && ba.IsRange() &&
				ba.ReadConsistency != proto.INCONSISTENT {
				return nil, &proto.OpRequiresTxnError{}
			}

			// It's possible that the returned descriptor misses parts of the
			// keys it's supposed to scan after it's truncated to match the
			// descriptor. Example: for a revscan of [a,g), the first descriptor
			// lookup for "g" may return the descriptor [c,d), in which case
			// [d,g) would never be scanned. We evict and retry in such a case.
			if (isReverse && !desc.ContainsKeyRange(desc.StartKey, to)) || (!isReverse && !desc.ContainsKeyRange(from, desc.EndKey)) {
				evictDesc()
				continue
			}

			curReply, err = func() (*proto.BatchResponse, error) {
				// Truncate the request to our current key range.
				untruncate, numActive, trErr := truncate(&ba, desc, from, to)
				if numActive == 0 {
					untruncate()
					// This shouldn't happen in the wild, but some tests
					// exercise it.
					return nil, util.Errorf("truncation resulted in empty batch on [%s,%s): %s",
						from, to, ba)
				}
				defer untruncate()
				if trErr != nil {
					return nil, trErr
				}
				// TODO(tschottdorf): make key range on batch redundant. The
				// requests within dictate it anyways.
				ba.Key, ba.EndKey = keys.Range(ba)
				reply, err := ds.sendAttempt(trace, ba, desc)
				ba.Key, ba.EndKey = nil, nil

				if err != nil {
					if log.V(0 /* TODO(tschottdorf): 1 */) {
						log.Warningf("failed to invoke %s: %s", ba, err)
					}
				}
				return reply, err
			}()
			// If sending succeeded, break this loop.
			if err == nil {
				break
			}

			// Error handling below.
			// If retryable, allow retry. For range not found or range
			// key mismatch errors, we don't back off before retrying,
			// but reset the backoff loop so we can retry immediately.
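			// The switch below distinguishes transport-level failures
			// (SendError), stale range addressing (RangeNotFoundError,
			// RangeKeyMismatchError), stale leadership information
			// (NotLeaderError), and other retryable errors; anything not
			// retried via one of the cases falls through to the break
			// below and exits the retry loop.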
			switch tErr := err.(type) {
			case *rpc.SendError:
				// For an RPC error to occur, we must've been unable to contact
				// any replicas. In this case, likely all nodes are down (or
				// not getting back to us within a reasonable amount of time).
				// We may simply not be trying to talk to the up-to-date
				// replicas, so clearing the descriptor here should be a good
				// idea.
				// TODO(tschottdorf): If a replica group goes dead, this
				// will cause clients to put high read pressure on the first
				// range, so there should be some rate limiting here.
				evictDesc()
				if tErr.CanRetry() {
					continue
				}
			case *proto.RangeNotFoundError, *proto.RangeKeyMismatchError:
				trace.Event(fmt.Sprintf("reply error: %T", err))
				// The range descriptor might be out of date; evict it.
				evictDesc()
				// On addressing errors, don't back off; retry immediately.
				r.Reset()
				if log.V(1) {
					log.Warning(err)
				}
				// For the remainder of this call, we'll assume that intents
				// are fair game. This replaces more complex logic based on
				// the type of request.
				options.considerIntents = true
				continue
			case *proto.NotLeaderError:
				trace.Event(fmt.Sprintf("reply error: %T", err))
				newLeader := tErr.GetLeader()
				// Verify that the leader is a known replica according to the
				// descriptor. If not, we've got a stale descriptor; evict it
				// from the cache. Next, cache the new leader.
				if newLeader != nil {
					if i, _ := desc.FindReplica(newLeader.StoreID); i == -1 {
						if log.V(1) {
							log.Infof("error indicates unknown leader %s, expunging descriptor %s", newLeader, desc)
						}
						evictDesc()
					}
				} else {
					newLeader = &proto.Replica{}
				}
				ds.updateLeaderCache(proto.RangeID(desc.RangeID), *newLeader)
				if log.V(1) {
					log.Warning(err)
				}
				r.Reset()
				continue
			case retry.Retryable:
				if tErr.CanRetry() {
					if log.V(1) {
						log.Warning(err)
					}
					trace.Event(fmt.Sprintf("reply error: %T", err))
					continue
				}
			}
			break
		}

		// Immediately return if querying a range failed non-retryably.
		if err != nil {
			return nil, err
		}

		first := br == nil
		if first {
			// First response from a Range.
			br = curReply
		} else {
			// This was the second or later call in a cross-Range request.
			// Combine the new response with the existing one.
			if err := br.Combine(curReply); err != nil {
				panic(err) // TODO(tschottdorf): return nil, err
			}
		}

		// If this request has a bound (such as MaxResults in
		// ScanRequest) and we are going to query at least one more range,
		// check whether enough rows have been retrieved.
		// TODO(tschottdorf): need tests for executing a multi-range batch
		// with various bounded requests which saturate at different times.
		if needAnother {
			// Start with the assumption that all requests are saturated.
			// Below, we look at each and decide whether that's true.
			// Everything that is indeed saturated is "masked out" from the
			// batch request; only if that's all requests does needAnother
			// remain false.
			needAnother = false
			if first {
				// Clone ba.Requests. This is because we're multi-range, and
				// some requests may be bounded, which could lead to them being
				// masked out once they're saturated. We don't want to risk
				// removing requests that way in the "master copy" since that
				// could lead to omitting requests in certain retry scenarios.
				ba.Requests = append([]proto.RequestUnion(nil), ba.Requests...)
			}
			for i, union := range ba.Requests {
				args := union.GetValue()
				if _, ok := args.(*proto.NoopRequest); ok {
					// NoopRequests are skipped.
					continue
				}
				boundedArg, ok := args.(proto.Bounded)
				if !ok {
					// Non-bounded request. We will have to query all ranges.
					needAnother = true
					continue
				}
				prevBound := boundedArg.GetBound()
				cReply, ok := curReply.Responses[i].GetValue().(proto.Countable)
				if !ok || prevBound <= 0 {
					// Request bounded, but without max results. Again, will
					// need to query everything we can. The case in which the reply
					// isn't countable occurs when the request wasn't active for
					// that range (since it didn't apply to it), so the response
					// is a NoopResponse.
					needAnother = true
					continue
				}
				nextBound := prevBound - cReply.Count()
				if nextBound <= 0 {
					// We've hit max results for this piece of the batch. Mask
					// it out (we've copied the requests slice above, so this
					// is kosher).
					ba.Requests[i].Reset() // necessary (no one-of?)
					if !ba.Requests[i].SetValue(&proto.NoopRequest{}) {
						panic("RequestUnion excludes NoopRequest")
					}
					continue
				}
				// The request isn't saturated yet.
				needAnother = true
				boundedArg.SetBound(nextBound)
			}
		}

		// If this was the last range accessed by this call, exit loop.
		if !needAnother {
			return br, nil
		}

		if isReverse {
			// In next iteration, query previous range.
			// We use the StartKey of the current descriptor as opposed to the
			// EndKey of the previous one since that doesn't have bugs when
			// stale descriptors come into play.
			to = prev(ba, desc.StartKey)
		} else {
			// In next iteration, query next range.
			// It's important that we use the EndKey of the current descriptor
			// as opposed to the StartKey of the next one: if the former is stale,
			// it's possible that the next range has since merged the subsequent
			// one, and unless both descriptors are stale, the next descriptor's
			// StartKey would move us to the beginning of the current range,
			// resulting in a duplicate scan.
			from = next(ba, desc.EndKey)
		}
		trace.Event("querying next range")
	}
}
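
// boundedBatchExample is a self-contained sketch, not part of DistSender, of
// the bound bookkeeping performed in sendChunk above: after each range
// replies, the remaining bound of a bounded request is decremented by that
// reply's count; once it reaches zero the request is considered saturated and
// would be masked out (replaced by a NoopRequest) for all later ranges. The
// function name and signature are hypothetical and exist only to illustrate
// the arithmetic in isolation.
func boundedBatchExample(bound int64, countsPerRange []int64) (saturatedAtRange int) {
	for i, c := range countsPerRange {
		bound -= c
		if bound <= 0 {
			// Saturated: later ranges need not serve this request anymore.
			return i
		}
	}
	// Never saturated: every range had to be queried for this request.
	return -1
}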