// Send implements Sender.
// TODO(tschottdorf): We actually don't want to chop EndTransaction off for
// single-range requests (but that happens now since EndTransaction has the
// isAlone flag). Whether it is one or not is unknown right now (you can only
// find out after you've sent to the Range/looked up a descriptor that suggests
// that you're multi-range). In those cases, the wrapped sender should return an
// error so that we split and retry once the chunk which contains
// EndTransaction (i.e. the last one).
func (cs *chunkingSender) Send(ctx context.Context, ba roachpb.BatchRequest) (*roachpb.BatchResponse, *roachpb.Error) {
	if len(ba.Requests) < 1 {
		panic("empty batch")
	}

	parts := ba.Split()
	var rplChunks []*roachpb.BatchResponse
	for _, part := range parts {
		ba.Requests = part
		// Increase the sequence counter to account for the fact that while
		// chunking, we're likely sending multiple requests to the same Replica.
		ba.SetNewRequest()
		rpl, err := cs.f(ctx, ba)
		if err != nil {
			return nil, err
		}
		// Propagate transaction from last reply to next request. The final
		// update is taken and put into the response's main header.
		ba.Txn.Update(rpl.Header().Txn)
		rplChunks = append(rplChunks, rpl)
	}

	reply := rplChunks[0]
	for _, rpl := range rplChunks[1:] {
		reply.Responses = append(reply.Responses, rpl.Responses...)
	}
	lastHeader := rplChunks[len(rplChunks)-1].BatchResponse_Header
	reply.Error = lastHeader.Error
	reply.Timestamp = lastHeader.Timestamp
	reply.Txn = ba.Txn
	return reply, nil
}
// sendAndFill is a helper which sends the given batch and fills its results,
// returning the appropriate error which is either from the first failing call,
// or an "internal" error.
func sendAndFill(
	send func(roachpb.BatchRequest) (*roachpb.BatchResponse, *roachpb.Error),
	b *Batch,
) error {
	// Errors here will be attached to the results, so we will get them from
	// the call to fillResults in the regular case in which an individual call
	// fails. But send() also returns its own errors, so there's some dancing
	// here to do because we want to run fillResults() so that the individual
	// result gets initialized with an error from the corresponding call.
	var ba roachpb.BatchRequest
	// TODO(tschottdorf): this nonsensical copy is required since (at least at
	// the time of writing) the chunking and masking in DistSender operates on
	// the original data (as attested to by a whole bunch of test failures).
	ba.Requests = append([]roachpb.RequestUnion(nil), b.reqs...)
	ba.Header = b.Header
	b.response, b.pErr = send(ba)
	if b.pErr != nil {
		// Discard errors from fillResults.
		_ = b.fillResults()
		return b.pErr.GoError()
	}
	if err := b.fillResults(); err != nil {
		b.pErr = roachpb.NewError(err)
		return err
	}
	return nil
}
// Send implements Sender.
// TODO(tschottdorf): We actually don't want to chop EndTransaction off for
// single-range requests (but that happens now since EndTransaction has the
// isAlone flag). Whether it is one or not is unknown right now (you can only
// find out after you've sent to the Range/looked up a descriptor that suggests
// that you're multi-range). In those cases, the wrapped sender should return an
// error so that we split and retry once the chunk which contains
// EndTransaction (i.e. the last one).
func (cs *chunkingSender) Send(ctx context.Context, ba roachpb.BatchRequest) (*roachpb.BatchResponse, *roachpb.Error) {
	if len(ba.Requests) < 1 {
		panic("empty batch")
	}

	// Deterministically create ClientCmdIDs for all parts of the batch if
	// a CmdID is already set (otherwise, leave them empty).
	var nextID func() roachpb.ClientCmdID
	empty := roachpb.ClientCmdID{}
	if empty == ba.CmdID {
		nextID = func() roachpb.ClientCmdID {
			return empty
		}
	} else {
		rng := rand.New(rand.NewSource(ba.CmdID.Random))
		id := ba.CmdID
		nextID = func() roachpb.ClientCmdID {
			curID := id             // copy
			id.Random = rng.Int63() // adjust for next call
			return curID
		}
	}

	parts := ba.Split()
	var rplChunks []*roachpb.BatchResponse
	for _, part := range parts {
		ba.Requests = part
		ba.CmdID = nextID()
		rpl, err := cs.f(ctx, ba)
		if err != nil {
			return nil, err
		}
		// Propagate transaction from last reply to next request. The final
		// update is taken and put into the response's main header.
		ba.Txn.Update(rpl.Header().Txn)
		rplChunks = append(rplChunks, rpl)
	}

	reply := rplChunks[0]
	for _, rpl := range rplChunks[1:] {
		reply.Responses = append(reply.Responses, rpl.Responses...)
	}
	lastHeader := rplChunks[len(rplChunks)-1].BatchResponse_Header
	reply.Error = lastHeader.Error
	reply.Timestamp = lastHeader.Timestamp
	reply.Txn = ba.Txn
	return reply, nil
}
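// The two chunkingSender.Send variants above follow the same overall shape:
// split the batch into admissible parts, send each part through a wrapped
// sender, thread state forward between parts, then stitch the per-chunk
// responses back into a single reply. The following is a minimal,
// self-contained sketch of that shape using hypothetical batch/response
// types (not the roachpb API), included only to illustrate the pattern.
package main

import "fmt"

// Hypothetical stand-ins for the real request/response types.
type batch struct{ reqs []string }
type response struct{ results []string }

// sendChunked splits a batch into single-request chunks, sends each one
// through send, and recombines the per-chunk results into one response.
func sendChunked(b batch, send func(batch) (response, error)) (response, error) {
	var combined response
	for _, r := range b.reqs {
		rpl, err := send(batch{reqs: []string{r}}) // one chunk per request
		if err != nil {
			return response{}, err
		}
		combined.results = append(combined.results, rpl.results...)
	}
	return combined, nil
}

func main() {
	echo := func(b batch) (response, error) {
		return response{results: b.reqs}, nil
	}
	rpl, _ := sendChunked(batch{reqs: []string{"get a", "put b"}}, echo)
	fmt.Println(rpl.results) // [get a put b]
}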
// sendChunk is in charge of sending an "admissible" piece of batch, i.e. one
// which doesn't need to be subdivided further before going to a range (so no
// mixing of forward and reverse scans, etc). The parameters and return values
// correspond to client.Sender with the exception of the returned boolean,
// which is true when indicating that the caller should retry but needs to send
// EndTransaction in a separate request.
func (ds *DistSender) sendChunk(ctx context.Context, ba roachpb.BatchRequest) (*roachpb.BatchResponse, *roachpb.Error, bool) {
	isReverse := ba.IsReverse()

	trace := tracer.FromCtx(ctx)

	// The minimal key range encompassing all requests contained within.
	// Local addressing has already been resolved.
	// TODO(tschottdorf): consider rudimentary validation of the batch here
	// (for example, non-range requests with EndKey, or empty key ranges).
	rs := keys.Range(ba)
	var br *roachpb.BatchResponse

	// Send the request to one range per iteration.
	for {
		considerIntents := false
		var curReply *roachpb.BatchResponse
		var desc *roachpb.RangeDescriptor
		var needAnother bool
		var pErr *roachpb.Error
		for r := retry.Start(ds.rpcRetryOptions); r.Next(); {
			// Get range descriptor (or, when spanning range, descriptors). Our
			// error handling below may clear them on certain errors, so we
			// refresh (likely from the cache) on every retry.
			descDone := trace.Epoch("meta descriptor lookup")
			var evictDesc func()
			desc, needAnother, evictDesc, pErr = ds.getDescriptors(rs, considerIntents, isReverse)
			descDone()

			// getDescriptors may fail retryably if the first range isn't
			// available via Gossip.
			if pErr != nil {
				if pErr.Retryable {
					if log.V(1) {
						log.Warning(pErr)
					}
					continue
				}
				break
			}

			if needAnother && br == nil {
				// TODO(tschottdorf): we should have a mechanism for discovering
				// range merges (descriptor staleness will mostly go unnoticed),
				// or we'll be turning single-range queries into multi-range
				// queries for no good reason.

				// If there's no transaction and op spans ranges, possibly
				// re-run as part of a transaction for consistency. The
				// case where we don't need to re-run is if the read
				// consistency is not required.
				if ba.Txn == nil && ba.IsPossibleTransaction() && ba.ReadConsistency != roachpb.INCONSISTENT {
					return nil, roachpb.NewError(&roachpb.OpRequiresTxnError{}), false
				}
				// If the request is more than but ends with EndTransaction, we
				// want the caller to come again with the EndTransaction in an
				// extra call.
				if l := len(ba.Requests) - 1; l > 0 && ba.Requests[l].GetInner().Method() == roachpb.EndTransaction {
					return nil, roachpb.NewError(errors.New("cannot send 1PC txn to multiple ranges")), true /* shouldSplitET */
				}
			}

			// It's possible that the returned descriptor misses parts of the
			// keys it's supposed to scan after it's truncated to match the
			// descriptor. Example revscan [a,g), first desc lookup for "g"
			// returns descriptor [c,d) -> [d,g) is never scanned.
			// We evict and retry in such a case.
			if (isReverse && !desc.ContainsKeyRange(desc.StartKey, rs.EndKey)) || (!isReverse && !desc.ContainsKeyRange(rs.Key, desc.EndKey)) {
				evictDesc()
				continue
			}

			curReply, pErr = func() (*roachpb.BatchResponse, *roachpb.Error) {
				// Truncate the request to our current key range.
				intersected, iErr := rs.Intersect(desc)
				if iErr != nil {
					return nil, roachpb.NewError(iErr)
				}
				truncBA, numActive, trErr := truncate(ba, intersected)
				if numActive == 0 && trErr == nil {
					// This shouldn't happen in the wild, but some tests
					// exercise it.
					return nil, roachpb.NewErrorf("truncation resulted in empty batch on [%s,%s): %s", rs.Key, rs.EndKey, ba)
				}
				if trErr != nil {
					return nil, roachpb.NewError(trErr)
				}
				return ds.sendSingleRange(trace, truncBA, desc)
			}()

			// If sending succeeded, break this loop.
			if pErr == nil {
				break
			}

			if log.V(1) {
				log.Warningf("failed to invoke %s: %s", ba, pErr)
			}
			trace.Event(fmt.Sprintf("reply error: %T", pErr.GoError()))

			// Error handling below.
			// If retryable, allow retry. For range not found or range
			// key mismatch errors, we don't backoff on the retry,
			// but reset the backoff loop so we can retry immediately.
			switch tErr := pErr.GoError().(type) {
			case *roachpb.SendError:
				// For an RPC error to occur, we must've been unable to contact
				// any replicas. In this case, likely all nodes are down (or
				// not getting back to us within a reasonable amount of time).
				// We may simply not be trying to talk to the up-to-date
				// replicas, so clearing the descriptor here should be a good
				// idea.
				// TODO(tschottdorf): If a replica group goes dead, this
				// will cause clients to put high read pressure on the first
				// range, so there should be some rate limiting here.
				evictDesc()
				if tErr.CanRetry() {
					continue
				}
			case *roachpb.RangeNotFoundError, *roachpb.RangeKeyMismatchError:
				// Range descriptor might be out of date - evict it.
				evictDesc()
				// On addressing errors, don't backoff; retry immediately.
				r.Reset()
				if log.V(1) {
					log.Warning(tErr)
				}
				// On retries, allow [uncommitted] intents on range descriptor
				// lookups to be returned 50% of the time in order to succeed
				// at finding the transaction record pointed to by the intent
				// itself. The 50% probability of returning either the current
				// intent or the previously committed value balances between
				// the two cases where the intent's txn hasn't yet been
				// committed (the previous value is correct), or the intent's
				// txn has been committed (the intent value is correct).
				considerIntents = true
				continue
			case *roachpb.NotLeaderError:
				newLeader := tErr.Leader
				// Verify that leader is a known replica according to the
				// descriptor. If not, we've got a stale replica; evict cache.
				// Next, cache the new leader.
				if newLeader != nil {
					if i, _ := desc.FindReplica(newLeader.StoreID); i == -1 {
						if log.V(1) {
							log.Infof("error indicates unknown leader %s, expunging descriptor %s", newLeader, desc)
						}
						evictDesc()
					}
				} else {
					newLeader = &roachpb.ReplicaDescriptor{}
				}
				ds.updateLeaderCache(roachpb.RangeID(desc.RangeID), *newLeader)
				if log.V(1) {
					log.Warning(tErr)
				}
				r.Reset()
				continue
			case retry.Retryable:
				if tErr.CanRetry() {
					if log.V(1) {
						log.Warning(tErr)
					}
					continue
				}
			}
			break
		}

		// Immediately return if querying a range failed non-retryably.
		if pErr != nil {
			return nil, pErr, false
		}

		ba.Txn.Update(curReply.Txn)

		if br == nil {
			// First response from a Range.
			br = curReply
		} else {
			// This was the second or later call in a cross-Range request.
			// Combine the new response with the existing one.
			if err := br.Combine(curReply); err != nil {
				return nil, roachpb.NewError(err), false
			}
		}

		// If this request has a bound (such as MaxResults in
		// ScanRequest) and we are going to query at least one more range,
		// check whether enough rows have been retrieved.
		// TODO(tschottdorf): need tests for executing a multi-range batch
		// with various bounded requests which saturate at different times.
		if needAnother {
			// Start with the assumption that all requests are saturated.
			// Below, we look at each and decide whether that's true.
			// Everything that is indeed saturated is "masked out" from the
			// batch request; only if that's all requests does needAnother
			// remain false.
			needAnother = false

			if br == nil {
				// Clone ba.Requests. This is because we're multi-range, and
				// some requests may be bounded, which could lead to them being
				// masked out once they're saturated. We don't want to risk
				// removing requests that way in the "master copy" since that
				// could lead to omitting requests in certain retry scenarios.
				ba.Requests = append([]roachpb.RequestUnion(nil), ba.Requests...)
			}

			for i, union := range ba.Requests {
				args := union.GetInner()
				if _, ok := args.(*roachpb.NoopRequest); ok {
					// NoopRequests are skipped.
					continue
				}
				boundedArg, ok := args.(roachpb.Bounded)
				if !ok {
					// Non-bounded request. We will have to query all ranges.
					needAnother = true
					continue
				}
				prevBound := boundedArg.GetBound()
				cReply, ok := curReply.Responses[i].GetInner().(roachpb.Countable)
				if !ok || prevBound <= 0 {
					// Request bounded, but without max results. Again, will
					// need to query everything we can. The case in which the reply
					// isn't countable occurs when the request wasn't active for
					// that range (since it didn't apply to it), so the response
					// is a NoopResponse.
					needAnother = true
					continue
				}
				nextBound := prevBound - cReply.Count()
				if nextBound <= 0 {
					// We've hit max results for this piece of the batch. Mask
					// it out (we've copied the requests slice above, so this
					// is kosher).
					ba.Requests[i].Reset() // necessary (no one-of?)
					if !ba.Requests[i].SetValue(&roachpb.NoopRequest{}) {
						panic("RequestUnion excludes NoopRequest")
					}
					continue
				}
				// The request isn't saturated yet.
				needAnother = true
				boundedArg.SetBound(nextBound)
			}
		}

		// If this was the last range accessed by this call, exit loop.
		if !needAnother {
			return br, nil, false
		}

		if isReverse {
			// In next iteration, query previous range.
			// We use the StartKey of the current descriptor as opposed to the
			// EndKey of the previous one since that doesn't have bugs when
			// stale descriptors come into play.
			rs.EndKey = prev(ba, desc.StartKey)
		} else {
			// In next iteration, query next range.
			// It's important that we use the EndKey of the current descriptor
			// as opposed to the StartKey of the next one: if the former is stale,
			// it's possible that the next range has since merged the subsequent
			// one, and unless both descriptors are stale, the next descriptor's
			// StartKey would move us to the beginning of the current range,
			// resulting in a duplicate scan.
			rs.Key = next(ba, desc.EndKey)
		}
		trace.Event("querying next range")
	}
}
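// The saturation bookkeeping in the loop above — subtract each range's result
// count from the remaining bound and mask a request out once its bound reaches
// zero — can be illustrated with a small self-contained sketch. The types below
// are hypothetical placeholders, not the roachpb Bounded/Countable interfaces.
package main

import "fmt"

// req is a hypothetical bounded request.
type req struct {
	name  string
	bound int64 // remaining max results; <=0 means unbounded here
	done  bool  // masked out once saturated
}

// applyCounts subtracts the rows returned by one range from each request's
// bound and reports whether another range still needs to be queried.
func applyCounts(reqs []req, counts []int64) (needAnother bool) {
	for i := range reqs {
		if reqs[i].done {
			continue // already saturated on an earlier range
		}
		if reqs[i].bound <= 0 {
			needAnother = true // unbounded: always keep going
			continue
		}
		reqs[i].bound -= counts[i]
		if reqs[i].bound <= 0 {
			reqs[i].done = true // saturated: mask out for later ranges
			continue
		}
		needAnother = true
	}
	return needAnother
}

func main() {
	reqs := []req{{name: "scan a-z", bound: 10}, {name: "scan 0-9", bound: 3}}
	fmt.Println(applyCounts(reqs, []int64{4, 3})) // true: first scan still has 6 results left
	fmt.Println(applyCounts(reqs, []int64{6, 0})) // false: everything saturated/masked
}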
// Send implements the batch.Sender interface. It subdivides
// the Batch into batches admissible for sending (preventing certain
// illegal mixtures of requests), executes each individual part
// (which may span multiple ranges), and recombines the response.
// When the request spans ranges, it is split up and the corresponding
// ranges queried serially, in ascending order.
// In particular, the first write in a transaction may not be part of the first
// request sent. This is relevant since the first write is a BeginTransaction
// request, thus opening up a window of time during which there may be intents
// of a transaction, but no entry. Pushing such a transaction will succeed, and
// may lead to the transaction being aborted early.
func (ds *DistSender) Send(ctx context.Context, ba roachpb.BatchRequest) (*roachpb.BatchResponse, *roachpb.Error) {
	// In the event that timestamp isn't set and read consistency isn't
	// required, set the timestamp using the local clock.
	if ba.ReadConsistency == roachpb.INCONSISTENT && ba.Timestamp.Equal(roachpb.ZeroTimestamp) {
		ba.Timestamp = ds.clock.Now()
	}

	if ba.Txn != nil && len(ba.Txn.CertainNodes.Nodes) == 0 {
		// Ensure the local NodeID is marked as free from clock offset;
		// the transaction's timestamp was taken off the local clock.
		if nDesc := ds.getNodeDescriptor(); nDesc != nil {
			// TODO(tschottdorf): bad style to assume that ba.Txn is ours.
			// No race here, but should have a better way of doing this.
			// TODO(tschottdorf): future refactoring should move this to txn
			// creation in TxnCoordSender, which is currently unaware of the
			// NodeID (and wraps *DistSender through client.Sender since it
			// also needs test compatibility with *LocalSender).
			ba.Txn.CertainNodes.Add(nDesc.NodeID)
		}
	}

	if len(ba.Requests) < 1 {
		panic("empty batch")
	}

	var rplChunks []*roachpb.BatchResponse
	parts := ba.Split(false /* don't split ET */)
	for len(parts) > 0 {
		part := parts[0]
		ba.Requests = part
		rpl, pErr, shouldSplitET := ds.sendChunk(ctx, ba)
		if shouldSplitET {
			// If we tried to send a single round-trip EndTransaction but
			// it looks like it's going to hit multiple ranges, split it
			// here and try again.
			if len(parts) != 1 {
				panic("EndTransaction not in last chunk of batch")
			}
			parts = ba.Split(true /* split ET */)
			if len(parts) != 2 {
				panic("split of final EndTransaction chunk resulted in != 2 parts")
			}
			continue
		}
		if pErr != nil {
			return nil, pErr
		}
		// Propagate transaction from last reply to next request. The final
		// update is taken and put into the response's main header.
		ba.Txn.Update(rpl.Header().Txn)
		rplChunks = append(rplChunks, rpl)
		parts = parts[1:]
	}

	reply := rplChunks[0]
	for _, rpl := range rplChunks[1:] {
		reply.Responses = append(reply.Responses, rpl.Responses...)
	}
	*reply.Header() = rplChunks[len(rplChunks)-1].BatchResponse_Header
	return reply, nil
}
// Send implements the batch.Sender interface. It subdivides
// the Batch into batches admissible for sending (preventing certain
// illegal mixtures of requests), executes each individual part
// (which may span multiple ranges), and recombines the response.
// When the request spans ranges, it is split up and the corresponding
// ranges queried serially, in ascending order.
// In particular, the first write in a transaction may not be part of the first
// request sent. This is relevant since the first write is a BeginTransaction
// request, thus opening up a window of time during which there may be intents
// of a transaction, but no entry. Pushing such a transaction will succeed, and
// may lead to the transaction being aborted early.
func (ds *DistSender) Send(ctx context.Context, ba roachpb.BatchRequest) (*roachpb.BatchResponse, *roachpb.Error) {
	tracing.AnnotateTrace()

	// In the event that timestamp isn't set and read consistency isn't
	// required, set the timestamp using the local clock.
	if ba.ReadConsistency == roachpb.INCONSISTENT && ba.Timestamp.Equal(hlc.ZeroTimestamp) {
		ba.Timestamp = ds.clock.Now()
	}

	if ba.Txn != nil {
		// Make a copy here since the code below modifies it in different places.
		// TODO(tschottdorf): be smarter about this - no need to do it for
		// requests that don't get split.
		txnClone := ba.Txn.Clone()
		ba.Txn = &txnClone

		if len(ba.Txn.ObservedTimestamps) == 0 {
			// Ensure the local NodeID is marked as free from clock offset;
			// the transaction's timestamp was taken off the local clock.
			if nDesc := ds.getNodeDescriptor(); nDesc != nil {
				// TODO(tschottdorf): future refactoring should move this to txn
				// creation in TxnCoordSender, which is currently unaware of the
				// NodeID (and wraps *DistSender through client.Sender since it
				// also needs test compatibility with *LocalSender).
				//
				// Taking care below to not modify any memory referenced from
				// our BatchRequest which may be shared with others.
				//
				// We already have a clone of our txn (see above), so we can
				// modify it freely.
				//
				// Zero the existing data. That makes sure that if we had
				// something of size zero but with capacity, we don't re-use the
				// existing space (which others may also use). This is just to
				// satisfy paranoia/OCD and not expected to matter in practice.
				ba.Txn.ResetObservedTimestamps()
				// OrigTimestamp is the HLC timestamp at which the Txn started, so
				// this effectively means no more uncertainty on this node.
				ba.Txn.UpdateObservedTimestamp(nDesc.NodeID, ba.Txn.OrigTimestamp)
			}
		}
	}

	if len(ba.Requests) < 1 {
		panic("empty batch")
	}

	if ba.MaxSpanRequestKeys != 0 {
		// Verify that the batch contains only specific range requests or the
		// Begin/EndTransactionRequest. Verify that a batch with a ReverseScan
		// only contains ReverseScan range requests.
		isReverse := ba.IsReverse()
		for _, req := range ba.Requests {
			inner := req.GetInner()
			switch inner.(type) {
			case *roachpb.ScanRequest, *roachpb.DeleteRangeRequest:
				// Accepted range requests. All other range requests are still
				// not supported.
				// TODO(vivek): don't enumerate all range requests.
				if isReverse {
					return nil, roachpb.NewErrorf("batch with limit contains both forward and reverse scans")
				}

			case *roachpb.BeginTransactionRequest, *roachpb.EndTransactionRequest, *roachpb.ReverseScanRequest:
				continue

			default:
				return nil, roachpb.NewErrorf("batch with limit contains %T request", inner)
			}
		}
	}

	var rplChunks []*roachpb.BatchResponse
	parts := ba.Split(false /* don't split ET */)
	if len(parts) > 1 && ba.MaxSpanRequestKeys != 0 {
		// We already verified above that the batch contains only scan requests of the same type.
		// Such a batch should never need splitting.
		panic("batch with MaxSpanRequestKeys needs splitting")
	}
	for len(parts) > 0 {
		part := parts[0]
		ba.Requests = part
		rpl, pErr, shouldSplitET := ds.sendChunk(ctx, ba)
		if shouldSplitET {
			// If we tried to send a single round-trip EndTransaction but
			// it looks like it's going to hit multiple ranges, split it
			// here and try again.
			if len(parts) != 1 {
				panic("EndTransaction not in last chunk of batch")
			}
			parts = ba.Split(true /* split ET */)
			if len(parts) != 2 {
				panic("split of final EndTransaction chunk resulted in != 2 parts")
			}
			continue
		}
		if pErr != nil {
			return nil, pErr
		}
		// Propagate transaction from last reply to next request. The final
		// update is taken and put into the response's main header.
		ba.UpdateTxn(rpl.Txn)
		rplChunks = append(rplChunks, rpl)
		parts = parts[1:]
	}

	reply := rplChunks[0]
	for _, rpl := range rplChunks[1:] {
		reply.Responses = append(reply.Responses, rpl.Responses...)
		reply.CollectedSpans = append(reply.CollectedSpans, rpl.CollectedSpans...)
	}
	reply.BatchResponse_Header = rplChunks[len(rplChunks)-1].BatchResponse_Header
	return reply, nil
}
// truncate restricts all contained requests to the given key range
// and returns a new BatchRequest.
// All requests contained in that batch are "truncated" to the given
// span, inserting NoopRequest appropriately to replace requests which
// are left without a key range to operate on. The number of non-noop
// requests after truncation is returned.
func truncate(ba roachpb.BatchRequest, rs roachpb.RSpan) (roachpb.BatchRequest, int, error) {
	truncateOne := func(args roachpb.Request) (bool, roachpb.Span, error) {
		if _, ok := args.(*roachpb.NoopRequest); ok {
			return true, emptySpan, nil
		}
		header := args.Header()
		if !roachpb.IsRange(args) {
			// This is a point request.
			if len(header.EndKey) > 0 {
				return false, emptySpan, errors.Errorf("%T is not a range command, but EndKey is set", args)
			}
			keyAddr, err := keys.Addr(header.Key)
			if err != nil {
				return false, emptySpan, err
			}
			if !rs.ContainsKey(keyAddr) {
				return false, emptySpan, nil
			}
			return true, header, nil
		}
		// We're dealing with a range-spanning request.
		local := false
		keyAddr, err := keys.Addr(header.Key)
		if err != nil {
			return false, emptySpan, err
		}
		endKeyAddr, err := keys.Addr(header.EndKey)
		if err != nil {
			return false, emptySpan, err
		}
		if l, r := !keyAddr.Equal(header.Key), !endKeyAddr.Equal(header.EndKey); l || r {
			if !l || !r {
				return false, emptySpan, errors.Errorf("local key mixed with global key in range")
			}
			local = true
		}
		if keyAddr.Less(rs.Key) {
			// rs.Key can't be local because it contains range split points, which
			// are never local.
			if !local {
				header.Key = rs.Key.AsRawKey()
			} else {
				// The local start key should be truncated to the boundary of local keys which
				// address to rs.Key.
				header.Key = keys.MakeRangeKeyPrefix(rs.Key)
			}
		}
		if !endKeyAddr.Less(rs.EndKey) {
			// rs.EndKey can't be local because it contains range split points, which
			// are never local.
			if !local {
				header.EndKey = rs.EndKey.AsRawKey()
			} else {
				// The local end key should be truncated to the boundary of local keys which
				// address to rs.EndKey.
				header.EndKey = keys.MakeRangeKeyPrefix(rs.EndKey)
			}
		}
		// Check whether the truncation has left any keys in the range. If not,
		// we need to cut it out of the request.
		if header.Key.Compare(header.EndKey) >= 0 {
			return false, emptySpan, nil
		}
		return true, header, nil
	}

	var numNoop int
	origRequests := ba.Requests
	ba.Requests = make([]roachpb.RequestUnion, len(ba.Requests))
	for pos, arg := range origRequests {
		hasRequest, newHeader, err := truncateOne(arg.GetInner())
		if !hasRequest {
			// We omit this one, i.e. replace it with a Noop.
			numNoop++
			union := roachpb.RequestUnion{}
			union.MustSetInner(&noopRequest)
			ba.Requests[pos] = union
		} else {
			// Keep the old one. If we must adjust the header, must copy.
			if inner := origRequests[pos].GetInner(); newHeader.Equal(inner.Header()) {
				ba.Requests[pos] = origRequests[pos]
			} else {
				shallowCopy := inner.ShallowCopy()
				shallowCopy.SetHeader(newHeader)
				union := &ba.Requests[pos] // avoid operating on copy
				union.MustSetInner(shallowCopy)
			}
		}
		if err != nil {
			return roachpb.BatchRequest{}, 0, err
		}
	}
	return ba, len(ba.Requests) - numNoop, nil
}
// sendChunk is in charge of sending an "admissible" piece of batch, i.e. one
// which doesn't need to be subdivided further before going to a range (so no
// mixing of forward and reverse scans, etc). The parameters and return values
// correspond to client.Sender with the exception of the returned boolean,
// which is true when indicating that the caller should retry but needs to send
// EndTransaction in a separate request.
func (ds *DistSender) sendChunk(ctx context.Context, ba roachpb.BatchRequest) (*roachpb.BatchResponse, *roachpb.Error, bool) {
	isReverse := ba.IsReverse()

	ctx, cleanup := tracing.EnsureContext(ctx, ds.Tracer)
	defer cleanup()

	// The minimal key range encompassing all requests contained within.
	// Local addressing has already been resolved.
	// TODO(tschottdorf): consider rudimentary validation of the batch here
	// (for example, non-range requests with EndKey, or empty key ranges).
	rs, err := keys.Range(ba)
	if err != nil {
		return nil, roachpb.NewError(err), false
	}

	var br *roachpb.BatchResponse

	// Send the request to one range per iteration.
	for {
		// Increase the sequence counter only once before sending RPCs to
		// the ranges involved in this chunk of the batch (as opposed to for
		// each RPC individually). On RPC errors, there's no guarantee that
		// the request hasn't made its way to the target regardless of the
		// error; we'd like the second execution to be caught by the sequence
		// cache if that happens. There is a small chance that we address
		// a range twice in this chunk (stale/suboptimal descriptors due to
		// splits/merges) which leads to a transaction retry.
		// TODO(tschottdorf): it's possible that if we don't evict from the
		// cache we could be in for a busy loop.
		ba.SetNewRequest()

		var curReply *roachpb.BatchResponse
		var desc *roachpb.RangeDescriptor
		var evictToken evictionToken
		var needAnother bool
		var pErr *roachpb.Error
		var finished bool
		for r := retry.Start(ds.rpcRetryOptions); r.Next(); {
			// Get range descriptor (or, when spanning range, descriptors). Our
			// error handling below may clear them on certain errors, so we
			// refresh (likely from the cache) on every retry.
			log.Trace(ctx, "meta descriptor lookup")
			desc, needAnother, evictToken, pErr = ds.getDescriptors(rs, evictToken, isReverse)

			// getDescriptors may fail retryably if the first range isn't
			// available via Gossip.
			if pErr != nil {
				log.Trace(ctx, "range descriptor lookup failed: "+pErr.String())
				if pErr.Retryable {
					if log.V(1) {
						log.Warning(pErr)
					}
					continue
				}
				break
			} else {
				log.Trace(ctx, "looked up range descriptor")
			}

			if needAnother && br == nil {
				// TODO(tschottdorf): we should have a mechanism for discovering
				// range merges (descriptor staleness will mostly go unnoticed),
				// or we'll be turning single-range queries into multi-range
				// queries for no good reason.

				// If there's no transaction and op spans ranges, possibly
				// re-run as part of a transaction for consistency. The
				// case where we don't need to re-run is if the read
				// consistency is not required.
				if ba.Txn == nil && ba.IsPossibleTransaction() && ba.ReadConsistency != roachpb.INCONSISTENT {
					return nil, roachpb.NewError(&roachpb.OpRequiresTxnError{}), false
				}
				// If the request is more than but ends with EndTransaction, we
				// want the caller to come again with the EndTransaction in an
				// extra call.
				if l := len(ba.Requests) - 1; l > 0 && ba.Requests[l].GetInner().Method() == roachpb.EndTransaction {
					return nil, roachpb.NewError(errors.New("cannot send 1PC txn to multiple ranges")), true /* shouldSplitET */
				}
			}

			// It's possible that the returned descriptor misses parts of the
			// keys it's supposed to scan after it's truncated to match the
			// descriptor. Example revscan [a,g), first desc lookup for "g"
			// returns descriptor [c,d) -> [d,g) is never scanned.
			// We evict and retry in such a case.
			includesFrontOfCurSpan := func(rd *roachpb.RangeDescriptor) bool {
				if isReverse {
					// This approach is needed because rs.EndKey is exclusive.
					return desc.ContainsKeyRange(desc.StartKey, rs.EndKey)
				}
				return desc.ContainsKey(rs.Key)
			}

			if !includesFrontOfCurSpan(desc) {
				if err := evictToken.Evict(); err != nil {
					return nil, roachpb.NewError(err), false
				}
				// On addressing errors, don't backoff; retry immediately.
				r.Reset()
				continue
			}

			curReply, pErr = func() (*roachpb.BatchResponse, *roachpb.Error) {
				// Truncate the request to our current key range.
				intersected, iErr := rs.Intersect(desc)
				if iErr != nil {
					return nil, roachpb.NewError(iErr)
				}
				truncBA, numActive, trErr := truncate(ba, intersected)
				if numActive == 0 && trErr == nil {
					// This shouldn't happen in the wild, but some tests
					// exercise it.
					return nil, roachpb.NewErrorf("truncation resulted in empty batch on [%s,%s): %s", rs.Key, rs.EndKey, ba)
				}
				if trErr != nil {
					return nil, roachpb.NewError(trErr)
				}
				return ds.sendSingleRange(ctx, truncBA, desc)
			}()

			// If sending succeeded, break this loop.
			if pErr == nil {
				finished = true
				break
			}

			if log.V(1) {
				log.Warningf("failed to invoke %s: %s", ba, pErr)
			}
			log.Trace(ctx, fmt.Sprintf("reply error: %T", pErr.GetDetail()))

			// Error handling below.
			// If retryable, allow retry. For range not found or range
			// key mismatch errors, we don't backoff on the retry,
			// but reset the backoff loop so we can retry immediately.
			switch tErr := pErr.GetDetail().(type) {
			case *roachpb.SendError:
				// For an RPC error to occur, we must've been unable to contact
				// any replicas. In this case, likely all nodes are down (or
				// not getting back to us within a reasonable amount of time).
				// We may simply not be trying to talk to the up-to-date
				// replicas, so clearing the descriptor here should be a good
				// idea.
				if err := evictToken.Evict(); err != nil {
					return nil, roachpb.NewError(err), false
				}
				if tErr.CanRetry() {
					continue
				}
			case *roachpb.RangeNotFoundError:
				// Range descriptor might be out of date - evict it. This is
				// likely the result of a rebalance.
				if err := evictToken.Evict(); err != nil {
					return nil, roachpb.NewError(err), false
				}
				// On addressing errors, don't backoff; retry immediately.
				r.Reset()
				if log.V(1) {
					log.Warning(tErr)
				}
				continue
			case *roachpb.RangeKeyMismatchError:
				// Range descriptor might be out of date - evict it. This is
				// likely the result of a range split. If we have new range
				// descriptors, insert them instead as long as they are different
				// from the last descriptor to avoid endless loops.
				var replacements []roachpb.RangeDescriptor
				different := func(rd *roachpb.RangeDescriptor) bool {
					return !desc.RSpan().Equal(rd.RSpan())
				}
				if tErr.MismatchedRange != nil && different(tErr.MismatchedRange) {
					replacements = append(replacements, *tErr.MismatchedRange)
				}
				if tErr.SuggestedRange != nil && different(tErr.SuggestedRange) {
					if includesFrontOfCurSpan(tErr.SuggestedRange) {
						replacements = append(replacements, *tErr.SuggestedRange)
					}
				}
				// Same as Evict() if replacements is empty.
				if err := evictToken.EvictAndReplace(replacements...); err != nil {
					return nil, roachpb.NewError(err), false
				}
				// On addressing errors, don't backoff; retry immediately.
				r.Reset()
				if log.V(1) {
					log.Warning(tErr)
				}
				continue
			case *roachpb.NotLeaderError:
				newLeader := tErr.Leader
				if newLeader != nil {
					// Verify that leader is a known replica according to the
					// descriptor. If not, we've got a stale range descriptor;
					// evict cache.
					if i, _ := desc.FindReplica(newLeader.StoreID); i == -1 {
						if log.V(1) {
							log.Infof("error indicates unknown leader %s, expunging descriptor %s", newLeader, desc)
						}
						if err := evictToken.Evict(); err != nil {
							return nil, roachpb.NewError(err), false
						}
					}
				} else {
					// If the new leader is unknown, we were talking to a
					// replica that is partitioned away from the majority. Our
					// range descriptor may be stale, so clear the cache.
					//
					// TODO(bdarnell): An unknown-leader error doesn't
					// necessarily mean our descriptor is stale. Ideally we
					// would treat these errors more like SendError: retry on
					// another node (at a lower level), and then if it reaches
					// this level then we know we've exhausted our options and
					// must clear the cache.
					if err := evictToken.Evict(); err != nil {
						return nil, roachpb.NewError(err), false
					}
					newLeader = &roachpb.ReplicaDescriptor{}
				}
				// Next, cache the new leader.
				ds.updateLeaderCache(roachpb.RangeID(desc.RangeID), *newLeader)
				if log.V(1) {
					log.Warning(tErr)
				}
				r.Reset()
				continue
			case retry.Retryable:
				if tErr.CanRetry() {
					if log.V(1) {
						log.Warning(tErr)
					}
					continue
				}
			}
			break
		}

		// Immediately return if querying a range failed non-retryably.
		if pErr != nil {
			return nil, pErr, false
		} else if !finished {
			select {
			case <-ds.rpcRetryOptions.Closer:
				return nil, roachpb.NewError(&roachpb.NodeUnavailableError{}), false
			default:
				log.Fatal("exited retry loop with nil error but finished=false")
			}
		}

		ba.Txn.Update(curReply.Txn)

		if br == nil {
			// First response from a Range.
			br = curReply
		} else {
			// This was the second or later call in a cross-Range request.
			// Combine the new response with the existing one.
			if err := br.Combine(curReply); err != nil {
				return nil, roachpb.NewError(err), false
			}
		}

		if ba.MaxScanResults > 0 {
			// Count how many results we received.
			var numResults int64
			for _, resp := range curReply.Responses {
				if cResp, ok := resp.GetInner().(roachpb.Countable); ok {
					numResults += cResp.Count()
				}
			}
			if numResults > ba.MaxScanResults {
				panic(fmt.Sprintf("received %d results, limit was %d", numResults, ba.MaxScanResults))
			}
			ba.MaxScanResults -= numResults
			if ba.MaxScanResults == 0 {
				// We are done with this batch. Some requests might have NoopResponses; we must
				// replace them with empty responses of the proper type.
				for i, req := range ba.Requests {
					if _, ok := br.Responses[i].GetInner().(*roachpb.NoopResponse); !ok {
						continue
					}
					union := roachpb.ResponseUnion{}
					var reply roachpb.Response
					if _, ok := req.GetInner().(*roachpb.ScanRequest); ok {
						reply = &roachpb.ScanResponse{}
					} else {
						_ = req.GetInner().(*roachpb.ReverseScanRequest)
						reply = &roachpb.ReverseScanResponse{}
					}
					union.MustSetInner(reply)
					br.Responses[i] = union
				}
				return br, nil, false
			}
		}

		// If this request has a bound (such as MaxResults in
		// ScanRequest) and we are going to query at least one more range,
		// check whether enough rows have been retrieved.
		// TODO(tschottdorf): need tests for executing a multi-range batch
		// with various bounded requests which saturate at different times.
		if needAnother {
			// Start with the assumption that all requests are saturated.
			// Below, we look at each and decide whether that's true.
			// Everything that is indeed saturated is "masked out" from the
			// batch request; only if that's all requests does needAnother
			// remain false.
			needAnother = false

			if br == nil {
				// Clone ba.Requests. This is because we're multi-range, and
				// some requests may be bounded, which could lead to them being
				// masked out once they're saturated. We don't want to risk
				// removing requests that way in the "master copy" since that
				// could lead to omitting requests in certain retry scenarios.
				ba.Requests = append([]roachpb.RequestUnion(nil), ba.Requests...)
			}

			for i, union := range ba.Requests {
				args := union.GetInner()
				if _, ok := args.(*roachpb.NoopRequest); ok {
					// NoopRequests are skipped.
					continue
				}
				boundedArg, ok := args.(roachpb.Bounded)
				if !ok {
					// Non-bounded request. We will have to query all ranges.
					needAnother = true
					continue
				}
				prevBound := boundedArg.GetBound()
				cReply, ok := curReply.Responses[i].GetInner().(roachpb.Countable)
				if !ok || prevBound <= 0 {
					// Request bounded, but without max results. Again, will
					// need to query everything we can. The case in which the reply
					// isn't countable occurs when the request wasn't active for
					// that range (since it didn't apply to it), so the response
					// is a NoopResponse.
					needAnother = true
					continue
				}
				nextBound := prevBound - cReply.Count()
				if nextBound <= 0 {
					// We've hit max results for this piece of the batch. Mask
					// it out (we've copied the requests slice above, so this
					// is kosher).
					union := &ba.Requests[i] // avoid working on copy
					union.MustSetInner(&noopRequest)
					continue
				}
				// The request isn't saturated yet.
				needAnother = true
				boundedArg.SetBound(nextBound)
			}
		}

		// If this was the last range accessed by this call, exit loop.
		if !needAnother {
			return br, nil, false
		}

		if isReverse {
			// In next iteration, query previous range.
			// We use the StartKey of the current descriptor as opposed to the
			// EndKey of the previous one since that doesn't have bugs when
			// stale descriptors come into play.
			rs.EndKey, err = prev(ba, desc.StartKey)
		} else {
			// In next iteration, query next range.
			// It's important that we use the EndKey of the current descriptor
			// as opposed to the StartKey of the next one: if the former is stale,
			// it's possible that the next range has since merged the subsequent
			// one, and unless both descriptors are stale, the next descriptor's
			// StartKey would move us to the beginning of the current range,
			// resulting in a duplicate scan.
			rs.Key, err = next(ba, desc.EndKey)
		}
		if err != nil {
			return nil, roachpb.NewError(err), false
		}
		log.Trace(ctx, "querying next range")
	}
}
// send runs the specified calls synchronously in a single batch and
// returns any errors. If the transaction is read-only or has already
// been successfully committed or aborted, a potential trailing
// EndTransaction call is silently dropped, allowing the caller to
// always commit or clean-up explicitly even when that may not be
// required (or even erroneous). Returns (nil, nil) for an empty batch.
func (txn *Txn) send(ba roachpb.BatchRequest) (*roachpb.BatchResponse, *roachpb.Error) {
	if txn.Proto.Status != roachpb.PENDING || txn.IsFinalized() {
		return nil, roachpb.NewErrorf("attempting to use transaction with wrong status or finalized: %s", txn.Proto.Status)
	}

	// It doesn't make sense to use inconsistent reads in a transaction. However,
	// we still need to accept it as a parameter for this to compile.
	if ba.ReadConsistency != roachpb.CONSISTENT {
		return nil, roachpb.NewErrorf("cannot use %s ReadConsistency in txn", ba.ReadConsistency)
	}

	lastIndex := len(ba.Requests) - 1
	if lastIndex < 0 {
		return nil, nil
	}

	// firstWriteIndex is set to the index of the first command which is
	// a transactional write. If != -1, this indicates an intention to
	// write. This is in contrast to txn.Proto.Writing, which is set by
	// the coordinator when the first intent has been created, and which
	// lives for the life of the transaction.
	firstWriteIndex := -1
	var firstWriteKey roachpb.Key

	for i, ru := range ba.Requests {
		args := ru.GetInner()
		if i < lastIndex {
			if _, ok := args.(*roachpb.EndTransactionRequest); ok {
				return nil, roachpb.NewErrorf("%s sent as non-terminal call", args.Method())
			}
		}
		if roachpb.IsTransactionWrite(args) && firstWriteIndex == -1 {
			firstWriteKey = args.Header().Key
			firstWriteIndex = i
		}
	}

	haveTxnWrite := firstWriteIndex != -1
	endTxnRequest, haveEndTxn := ba.Requests[lastIndex].GetInner().(*roachpb.EndTransactionRequest)
	needBeginTxn := !txn.Proto.Writing && haveTxnWrite
	needEndTxn := txn.Proto.Writing || haveTxnWrite
	elideEndTxn := haveEndTxn && !needEndTxn

	// If we're not yet writing in this txn, but intend to, insert a
	// begin transaction request before the first write command.
	if needBeginTxn {
		// If the transaction already has a key (we're in a restart), make
		// sure we set the key in the begin transaction request to the original.
		bt := &roachpb.BeginTransactionRequest{
			Span: roachpb.Span{
				Key: firstWriteKey,
			},
		}
		if txn.Proto.Key != nil {
			bt.Key = txn.Proto.Key
		}
		// Inject the new request before position firstWriteIndex, taking
		// care to avoid unnecessary allocations.
		oldRequests := ba.Requests
		ba.Requests = make([]roachpb.RequestUnion, len(ba.Requests)+1)
		copy(ba.Requests, oldRequests[:firstWriteIndex])
		ba.Requests[firstWriteIndex].MustSetInner(bt)
		copy(ba.Requests[firstWriteIndex+1:], oldRequests[firstWriteIndex:])
	}

	if elideEndTxn {
		ba.Requests = ba.Requests[:lastIndex]
	}

	br, pErr := txn.db.send(ba)
	if elideEndTxn && pErr == nil {
		// Check that read only transactions do not violate their deadline. This can NOT
		// happen since the txn deadline is normally updated when it is about to expire
		// or expired. We will just keep the code for safety (see TestReacquireLeaseOnRestart).
		if endTxnRequest.Deadline != nil {
			if endTxnRequest.Deadline.Less(txn.Proto.Timestamp) {
				return nil, roachpb.NewErrorWithTxn(roachpb.NewTransactionAbortedError(), &txn.Proto)
			}
		}
		// This normally happens on the server and sent back in response
		// headers, but this transaction was optimized away. The caller may
		// still inspect the transaction struct, so we manually update it
		// here to emulate a true transaction.
		if endTxnRequest.Commit {
			txn.Proto.Status = roachpb.COMMITTED
		} else {
			txn.Proto.Status = roachpb.ABORTED
		}
		txn.finalized = true
	}

	// If we inserted a begin transaction request, remove it here.
	if needBeginTxn {
		if br != nil && br.Responses != nil {
			br.Responses = append(br.Responses[:firstWriteIndex], br.Responses[firstWriteIndex+1:]...)
		}
		// Handle case where inserted begin txn confused an indexed error.
		if pErr != nil && pErr.Index != nil {
			idx := pErr.Index.Index
			if idx == int32(firstWriteIndex) {
				// An error was encountered on begin txn; disallow the indexing.
				pErr.Index = nil
			} else if idx > int32(firstWriteIndex) {
				// An error was encountered after begin txn; decrement index.
				pErr.SetErrorIndex(idx - 1)
			}
		}
	}
	return br, pErr
}
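// The bookkeeping at the end of Txn.send — inject a synthetic request before
// the first write, then strip its response and shift any error index back down
// on the way out — is easy to get wrong by one. Below is a minimal,
// self-contained sketch of just that index arithmetic, using hypothetical
// string slices in place of the roachpb request/response types.
package main

import "fmt"

// injectAt returns reqs with extra inserted before index i, mirroring how a
// BeginTransaction request is spliced in before the first write.
func injectAt(reqs []string, i int, extra string) []string {
	out := make([]string, len(reqs)+1)
	copy(out, reqs[:i])
	out[i] = extra
	copy(out[i+1:], reqs[i:])
	return out
}

// adjustErrIndex maps an error index on the expanded batch back onto the
// caller's original batch; the injected slot has no caller-visible index.
func adjustErrIndex(idx, injectedAt int) (int, bool) {
	if idx == injectedAt {
		return 0, false // error belongs to the injected request itself
	}
	if idx > injectedAt {
		return idx - 1, true // shift back past the injected slot
	}
	return idx, true
}

func main() {
	reqs := injectAt([]string{"get a", "put b", "put c"}, 1, "begin txn")
	fmt.Println(reqs)                 // [get a begin txn put b put c]
	fmt.Println(adjustErrIndex(3, 1)) // 2 true: error on "put c" maps to caller index 2
	fmt.Println(adjustErrIndex(1, 1)) // 0 false: error on the injected request
}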
// Send implements the batch.Sender interface. It subdivides
// the Batch into batches admissible for sending (preventing certain
// illegal mixtures of requests), executes each individual part
// (which may span multiple ranges), and recombines the response.
// When the request spans ranges, it is split up and the corresponding
// ranges queried serially, in ascending order.
// In particular, the first write in a transaction may not be part of the first
// request sent. This is relevant since the first write is a BeginTransaction
// request, thus opening up a window of time during which there may be intents
// of a transaction, but no entry. Pushing such a transaction will succeed, and
// may lead to the transaction being aborted early.
func (ds *DistSender) Send(ctx context.Context, ba roachpb.BatchRequest) (*roachpb.BatchResponse, *roachpb.Error) {
	tracing.AnnotateTrace()

	// In the event that timestamp isn't set and read consistency isn't
	// required, set the timestamp using the local clock.
	if ba.ReadConsistency == roachpb.INCONSISTENT && ba.Timestamp.Equal(roachpb.ZeroTimestamp) {
		ba.Timestamp = ds.clock.Now()
	}

	if ba.Txn != nil && len(ba.Txn.CertainNodes.Nodes) == 0 {
		// Ensure the local NodeID is marked as free from clock offset;
		// the transaction's timestamp was taken off the local clock.
		if nDesc := ds.getNodeDescriptor(); nDesc != nil {
			// TODO(tschottdorf): future refactoring should move this to txn
			// creation in TxnCoordSender, which is currently unaware of the
			// NodeID (and wraps *DistSender through client.Sender since it
			// also needs test compatibility with *LocalSender).
			//
			// Taking care below to not modify any memory referenced from
			// our BatchRequest which may be shared with others.
			// First, get a shallow clone of our txn (since that holds the
			// NodeList struct).
			txnShallow := *ba.Txn
			// Next, zero out the NodeList pointer. That makes sure that
			// if we had something of size zero but with capacity, we don't
			// re-use the existing space (which others may also use).
			txnShallow.CertainNodes.Nodes = nil
			txnShallow.CertainNodes.Add(nDesc.NodeID)
			ba.Txn = &txnShallow
		}
	}

	if len(ba.Requests) < 1 {
		panic("empty batch")
	}

	var rplChunks []*roachpb.BatchResponse
	parts := ba.Split(false /* don't split ET */)
	for len(parts) > 0 {
		part := parts[0]
		ba.Requests = part
		rpl, pErr, shouldSplitET := ds.sendChunk(ctx, ba)
		if shouldSplitET {
			// If we tried to send a single round-trip EndTransaction but
			// it looks like it's going to hit multiple ranges, split it
			// here and try again.
			if len(parts) != 1 {
				panic("EndTransaction not in last chunk of batch")
			}
			parts = ba.Split(true /* split ET */)
			if len(parts) != 2 {
				panic("split of final EndTransaction chunk resulted in != 2 parts")
			}
			continue
		}
		if pErr != nil {
			return nil, pErr
		}
		// Propagate transaction from last reply to next request. The final
		// update is taken and put into the response's main header.
		ba.Txn.Update(rpl.Header().Txn)
		rplChunks = append(rplChunks, rpl)
		parts = parts[1:]
	}

	reply := rplChunks[0]
	for _, rpl := range rplChunks[1:] {
		reply.Responses = append(reply.Responses, rpl.Responses...)
	}
	*reply.Header() = rplChunks[len(rplChunks)-1].BatchResponse_Header
	return reply, nil
}
// truncate restricts all contained requests to the given key range
// and returns a new BatchRequest.
// All requests contained in that batch are "truncated" to the given
// span, inserting NoopRequest appropriately to replace requests which
// are left without a key range to operate on. The number of non-noop
// requests after truncation is returned.
func truncate(ba roachpb.BatchRequest, rs roachpb.RSpan) (roachpb.BatchRequest, int, error) {
	truncateOne := func(args roachpb.Request) (bool, roachpb.Span, error) {
		if _, ok := args.(*roachpb.NoopRequest); ok {
			return true, emptySpan, nil
		}
		header := *args.Header()
		if !roachpb.IsRange(args) {
			// This is a point request.
			if len(header.EndKey) > 0 {
				return false, emptySpan, util.Errorf("%T is not a range command, but EndKey is set", args)
			}
			if !rs.ContainsKey(keys.Addr(header.Key)) {
				return false, emptySpan, nil
			}
			return true, header, nil
		}
		// We're dealing with a range-spanning request.
		keyAddr, endKeyAddr := keys.Addr(header.Key), keys.Addr(header.EndKey)
		if l, r := !keyAddr.Equal(header.Key), !endKeyAddr.Equal(header.EndKey); l || r {
			if !rs.ContainsKeyRange(keyAddr, endKeyAddr) {
				return false, emptySpan, util.Errorf("local key range must not span ranges")
			}
			if !l || !r {
				return false, emptySpan, util.Errorf("local key mixed with global key in range")
			}
			// Range-local local key range.
			return true, header, nil
		}
		// Below, {end,}keyAddr equals header.{End,}Key, so nothing is local.
		if keyAddr.Less(rs.Key) {
			header.Key = rs.Key.AsRawKey() // "key" can't be local
			keyAddr = rs.Key
		}
		if !endKeyAddr.Less(rs.EndKey) {
			header.EndKey = rs.EndKey.AsRawKey() // "endKey" can't be local
			endKeyAddr = rs.EndKey
		}
		// Check whether the truncation has left any keys in the range. If not,
		// we need to cut it out of the request.
		if !keyAddr.Less(endKeyAddr) {
			return false, emptySpan, nil
		}
		return true, header, nil
	}

	var numNoop int
	origRequests := ba.Requests
	ba.Requests = make([]roachpb.RequestUnion, len(ba.Requests))
	for pos, arg := range origRequests {
		hasRequest, newHeader, err := truncateOne(arg.GetInner())
		if !hasRequest {
			// We omit this one, i.e. replace it with a Noop.
			numNoop++
			nReq := roachpb.RequestUnion{}
			if !nReq.SetValue(&roachpb.NoopRequest{}) {
				panic("RequestUnion excludes NoopRequest")
			}
			ba.Requests[pos] = nReq
		} else {
			// Keep the old one. If we must adjust the header, must copy.
			// TODO(tschottdorf): this could wind up cloning big chunks of data.
			// Can optimize by creating a new Request manually, but with the old
			// data.
			if newHeader.Equal(*origRequests[pos].GetInner().Header()) {
				ba.Requests[pos] = origRequests[pos]
			} else {
				ba.Requests[pos] = *proto.Clone(&origRequests[pos]).(*roachpb.RequestUnion)
				*ba.Requests[pos].GetInner().Header() = newHeader
			}
		}
		if err != nil {
			return roachpb.BatchRequest{}, 0, err
		}
	}
	return ba, len(ba.Requests) - numNoop, nil
}
// truncate restricts all contained requests to the given key range
// and returns a new BatchRequest.
// All requests contained in that batch are "truncated" to the given
// span, inserting NoopRequest appropriately to replace requests which
// are left without a key range to operate on. The number of non-noop
// requests after truncation is returned.
func truncate(ba roachpb.BatchRequest, rs roachpb.RSpan) (roachpb.BatchRequest, int, error) {
	truncateOne := func(args roachpb.Request) (bool, roachpb.Span, error) {
		if _, ok := args.(*roachpb.NoopRequest); ok {
			return true, emptySpan, nil
		}
		header := args.Header()
		if !roachpb.IsRange(args) {
			// This is a point request.
			if len(header.EndKey) > 0 {
				return false, emptySpan, util.Errorf("%T is not a range command, but EndKey is set", args)
			}
			if !rs.ContainsKey(keys.Addr(header.Key)) {
				return false, emptySpan, nil
			}
			return true, header, nil
		}
		// We're dealing with a range-spanning request.
		keyAddr, endKeyAddr := keys.Addr(header.Key), keys.Addr(header.EndKey)
		if l, r := !keyAddr.Equal(header.Key), !endKeyAddr.Equal(header.EndKey); l || r {
			if !rs.ContainsKeyRange(keyAddr, endKeyAddr) {
				return false, emptySpan, util.Errorf("local key range must not span ranges")
			}
			if !l || !r {
				return false, emptySpan, util.Errorf("local key mixed with global key in range")
			}
			// Range-local local key range.
			return true, header, nil
		}
		// Below, {end,}keyAddr equals header.{End,}Key, so nothing is local.
		if keyAddr.Less(rs.Key) {
			header.Key = rs.Key.AsRawKey() // "key" can't be local
			keyAddr = rs.Key
		}
		if !endKeyAddr.Less(rs.EndKey) {
			header.EndKey = rs.EndKey.AsRawKey() // "endKey" can't be local
			endKeyAddr = rs.EndKey
		}
		// Check whether the truncation has left any keys in the range. If not,
		// we need to cut it out of the request.
		if !keyAddr.Less(endKeyAddr) {
			return false, emptySpan, nil
		}
		return true, header, nil
	}

	var numNoop int
	origRequests := ba.Requests
	ba.Requests = make([]roachpb.RequestUnion, len(ba.Requests))
	for pos, arg := range origRequests {
		hasRequest, newHeader, err := truncateOne(arg.GetInner())
		if !hasRequest {
			// We omit this one, i.e. replace it with a Noop.
			numNoop++
			union := roachpb.RequestUnion{}
			if !union.SetInner(&noopRequest) {
				panic(fmt.Sprintf("%T excludes %T", union, noopRequest))
			}
			ba.Requests[pos] = union
		} else {
			// Keep the old one. If we must adjust the header, must copy.
			if inner := origRequests[pos].GetInner(); newHeader.Equal(inner.Header()) {
				ba.Requests[pos] = origRequests[pos]
			} else {
				shallowCopy := inner.ShallowCopy()
				shallowCopy.SetHeader(newHeader)
				if union := &ba.Requests[pos]; !union.SetInner(shallowCopy) {
					panic(fmt.Sprintf("%T excludes %T", union, shallowCopy))
				}
			}
		}
		if err != nil {
			return roachpb.BatchRequest{}, 0, err
		}
	}
	return ba, len(ba.Requests) - numNoop, nil
}
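// The truncate variants above all reduce, for global keys, to the same clamping
// step: intersect each request's [Key, EndKey) span with the range's span and
// drop the request if nothing remains. Below is a self-contained sketch of that
// core arithmetic on plain byte-slice keys, using a hypothetical span type
// rather than the roachpb one.
package main

import (
	"bytes"
	"fmt"
)

// span is a hypothetical half-open key interval [Key, EndKey).
type span struct{ Key, EndKey []byte }

// clamp intersects req with rng and reports whether any keys remain.
// This mirrors the global-key branch of truncate: the start key is raised
// to the range start and the end key lowered to the range end.
func clamp(req, rng span) (span, bool) {
	if bytes.Compare(req.Key, rng.Key) < 0 {
		req.Key = rng.Key
	}
	if bytes.Compare(req.EndKey, rng.EndKey) > 0 {
		req.EndKey = rng.EndKey
	}
	if bytes.Compare(req.Key, req.EndKey) >= 0 {
		return span{}, false // nothing left to do on this range
	}
	return req, true
}

func main() {
	rng := span{Key: []byte("c"), EndKey: []byte("f")}

	got, ok := clamp(span{Key: []byte("a"), EndKey: []byte("z")}, rng)
	fmt.Println(string(got.Key), string(got.EndKey), ok) // c f true

	_, ok = clamp(span{Key: []byte("g"), EndKey: []byte("z")}, rng)
	fmt.Println(ok) // false: the request lies entirely outside the range
}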