// TransferRangeLease transfers the lease for a range from whoever has it to
// a particular store. That store must already have a replica of the range. If
// that replica already has the (active) lease, this method is a no-op.
func (tc *TestCluster) TransferRangeLease(
	rangeDesc *roachpb.RangeDescriptor, dest ReplicationTarget,
) error {
	destReplicaDesc, ok := rangeDesc.GetReplicaDescriptor(dest.StoreID)
	if !ok {
		log.Fatalf("Couldn't find store %d in range %+v", dest.StoreID, rangeDesc)
	}
	leaseHolderDesc, err := tc.FindRangeLeaseHolder(rangeDesc, &ReplicationTarget{
		NodeID:  destReplicaDesc.NodeID,
		StoreID: destReplicaDesc.StoreID,
	})
	if err != nil {
		return err
	}
	if leaseHolderDesc.StoreID == dest.StoreID {
		// The intended replica already has the lease. Nothing to do.
		return nil
	}
	oldStore, err := tc.findMemberStore(leaseHolderDesc.StoreID)
	if err != nil {
		return err
	}
	oldReplica, err := oldStore.GetReplica(rangeDesc.RangeID)
	if err != nil {
		return err
	}
	// Ask the lease holder to transfer the lease.
	if err := oldReplica.AdminTransferLease(destReplicaDesc); err != nil {
		return err
	}
	return nil
}
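// Illustrative sketch (not part of the original source): how a test might use
// TransferRangeLease together with FindRangeLeaseHolder (defined further down
// in this section) to move a lease and then verify the move. It assumes a
// *TestCluster tc and a *roachpb.RangeDescriptor desc have already been
// obtained by the test harness; the helper name and the target node/store IDs
// are placeholders.
func moveLeaseToStoreTwo(tc *TestCluster, desc *roachpb.RangeDescriptor) error {
	target := ReplicationTarget{NodeID: 2, StoreID: 2}
	if err := tc.TransferRangeLease(desc, target); err != nil {
		return err
	}
	// Passing the target as a hint avoids acquiring a fresh lease on some
	// other replica as a side-effect of the check.
	holder, err := tc.FindRangeLeaseHolder(desc, &target)
	if err != nil {
		return err
	}
	if holder.StoreID != target.StoreID {
		return errors.Errorf("lease holder is %+v, expected %+v", holder, target)
	}
	return nil
}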
// TestSendRPCRetry verifies that when sendRPC fails on the first address but
// succeeds on the second, the second reply is successfully returned back.
func TestSendRPCRetry(t *testing.T) {
	defer leaktest.AfterTest(t)
	g, s := makeTestGossip(t)
	defer s()
	g.SetNodeID(1)
	if err := g.SetNodeDescriptor(&roachpb.NodeDescriptor{NodeID: 1}); err != nil {
		t.Fatal(err)
	}
	// Fill RangeDescriptor with 2 replicas.
	var descriptor = roachpb.RangeDescriptor{
		RangeID:  1,
		StartKey: roachpb.RKey("a"),
		EndKey:   roachpb.RKey("z"),
	}
	for i := 1; i <= 2; i++ {
		addr := util.MakeUnresolvedAddr("tcp", fmt.Sprintf("node%d", i))
		nd := &roachpb.NodeDescriptor{
			NodeID:  roachpb.NodeID(i),
			Address: util.MakeUnresolvedAddr(addr.Network(), addr.String()),
		}
		if err := g.AddInfoProto(gossip.MakeNodeIDKey(roachpb.NodeID(i)), nd, time.Hour); err != nil {
			t.Fatal(err)
		}

		descriptor.Replicas = append(descriptor.Replicas, roachpb.ReplicaDescriptor{
			NodeID:  roachpb.NodeID(i),
			StoreID: roachpb.StoreID(i),
		})
	}
	// Define our rpcSend stub which returns success on the second address.
	var testFn rpcSendFn = func(_ rpc.Options, method string, addrs []net.Addr,
		getArgs func(addr net.Addr) proto.Message, getReply func() proto.Message,
		_ *rpc.Context) ([]proto.Message, error) {
		if method == "Node.Batch" {
			// The reply from the first address fails; discard it.
			_ = getReply()
			// The reply from the second address succeeds.
			batchReply := getReply().(*roachpb.BatchResponse)
			reply := &roachpb.ScanResponse{}
			batchReply.Add(reply)
			reply.Rows = append([]roachpb.KeyValue{}, roachpb.KeyValue{Key: roachpb.Key("b"), Value: roachpb.Value{}})
			return []proto.Message{batchReply}, nil
		}
		return nil, util.Errorf("unexpected method %v", method)
	}
	ctx := &DistSenderContext{
		RPCSend: testFn,
		RangeDescriptorDB: mockRangeDescriptorDB(func(_ roachpb.RKey, _, _ bool) ([]roachpb.RangeDescriptor, *roachpb.Error) {
			return []roachpb.RangeDescriptor{descriptor}, nil
		}),
	}
	ds := NewDistSender(ctx, g)
	scan := roachpb.NewScan(roachpb.Key("a"), roachpb.Key("d"), 1)
	sr, err := client.SendWrapped(ds, nil, scan)
	if err != nil {
		t.Fatal(err)
	}
	if l := len(sr.(*roachpb.ScanResponse).Rows); l != 1 {
		t.Fatalf("expected 1 row; got %d", l)
	}
}
// intersect returns the intersection of the current span and the
// descriptor's range.
func (rs rSpan) intersect(desc *roachpb.RangeDescriptor) rSpan {
	key := rs.key
	if !desc.ContainsKey(key) {
		key = desc.StartKey
	}
	endKey := rs.endKey
	if !desc.ContainsKeyRange(desc.StartKey, endKey) || endKey == nil {
		endKey = desc.EndKey
	}
	return rSpan{key, endKey}
}
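// Illustrative worked example (not part of the original source) of the
// clamping that intersect performs, using plain strings in place of roachpb
// keys and descriptors. For a requested span ["a","z") and a descriptor
// covering ["c","m"), the result is ["c","m"); a span already inside the
// descriptor is returned unchanged. The helper name is hypothetical.
func clampSpan(key, endKey, descStart, descEnd string) (string, string) {
	// Mirror of !desc.ContainsKey(key): key falls outside [descStart, descEnd).
	if key < descStart || key >= descEnd {
		key = descStart
	}
	// Mirror of the end-key clamp: an end key past the descriptor (or empty)
	// is pulled back to the descriptor's end.
	if endKey > descEnd || endKey == "" {
		endKey = descEnd
	}
	return key, endKey
}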
// TestSendRPCRetry verifies that when sendRPC fails on the first address but
// succeeds on the second, the second reply is successfully returned back.
func TestSendRPCRetry(t *testing.T) {
	defer leaktest.AfterTest(t)
	g, s := makeTestGossip(t)
	defer s()
	g.SetNodeID(1)
	if err := g.SetNodeDescriptor(&roachpb.NodeDescriptor{NodeID: 1}); err != nil {
		t.Fatal(err)
	}
	// Fill RangeDescriptor with 2 replicas.
	var descriptor = roachpb.RangeDescriptor{
		RangeID:  1,
		StartKey: roachpb.RKey("a"),
		EndKey:   roachpb.RKey("z"),
	}
	for i := 1; i <= 2; i++ {
		addr := util.MakeUnresolvedAddr("tcp", fmt.Sprintf("node%d", i))
		nd := &roachpb.NodeDescriptor{
			NodeID:  roachpb.NodeID(i),
			Address: util.MakeUnresolvedAddr(addr.Network(), addr.String()),
		}
		if err := g.AddInfoProto(gossip.MakeNodeIDKey(roachpb.NodeID(i)), nd, time.Hour); err != nil {
			t.Fatal(err)
		}

		descriptor.Replicas = append(descriptor.Replicas, roachpb.ReplicaDescriptor{
			NodeID:  roachpb.NodeID(i),
			StoreID: roachpb.StoreID(i),
		})
	}
	var testFn rpcSendFn = func(_ SendOptions, _ ReplicaSlice,
		args roachpb.BatchRequest, _ *rpc.Context) (proto.Message, error) {
		batchReply := &roachpb.BatchResponse{}
		reply := &roachpb.ScanResponse{}
		batchReply.Add(reply)
		reply.Rows = append([]roachpb.KeyValue{}, roachpb.KeyValue{Key: roachpb.Key("b"), Value: roachpb.Value{}})
		return batchReply, nil
	}
	ctx := &DistSenderContext{
		RPCSend: testFn,
		RangeDescriptorDB: mockRangeDescriptorDB(func(_ roachpb.RKey, _, _ bool) ([]roachpb.RangeDescriptor, *roachpb.Error) {
			return []roachpb.RangeDescriptor{descriptor}, nil
		}),
	}
	ds := NewDistSender(ctx, g)
	scan := roachpb.NewScan(roachpb.Key("a"), roachpb.Key("d"), 1)
	sr, err := client.SendWrapped(ds, nil, scan)
	if err != nil {
		t.Fatal(err)
	}
	if l := len(sr.(*roachpb.ScanResponse).Rows); l != 1 {
		t.Fatalf("expected 1 row; got %d", l)
	}
}
// FindRangeLeaseHolder returns the current lease holder for the given range.
// If there is no lease at the time of the call, a replica gets one as a
// side-effect of calling this; if hint is not nil, that replica will be the
// one.
//
// One of the Stores in the cluster is used as a Sender to send a dummy read
// command, which will either result in success (if a replica on that Node has
// the lease), in a NotLeaseHolderError pointing to the current lease holder
// (if there is an active lease), or in the replica on that store acquiring
// the lease (if there isn't an active lease).
// If an active lease existed for the range, it's extended as a side-effect.
func (tc *TestCluster) FindRangeLeaseHolder(
	rangeDesc *roachpb.RangeDescriptor, hint *ReplicationTarget,
) (ReplicationTarget, error) {
	var hintReplicaDesc roachpb.ReplicaDescriptor
	if hint != nil {
		var ok bool
		if hintReplicaDesc, ok = rangeDesc.GetReplicaDescriptor(hint.StoreID); !ok {
			return ReplicationTarget{}, errors.Errorf(
				"bad hint; store doesn't have a replica of the range")
		}
	} else {
		hint = &ReplicationTarget{
			NodeID:  rangeDesc.Replicas[0].NodeID,
			StoreID: rangeDesc.Replicas[0].StoreID}
		hintReplicaDesc = rangeDesc.Replicas[0]
	}

	// TODO(andrei): Using a dummy GetRequest for the purpose of figuring out
	// the lease holder is a hack. Instead, we should have a dedicated admin
	// command.
	getReq := roachpb.GetRequest{
		Span: roachpb.Span{
			Key: rangeDesc.StartKey.AsRawKey(),
		},
	}

	store, err := tc.findMemberStore(hint.StoreID)
	if err != nil {
		return ReplicationTarget{}, err
	}
	_, pErr := client.SendWrappedWith(
		store, nil,
		roachpb.Header{RangeID: rangeDesc.RangeID, Replica: hintReplicaDesc},
		&getReq)
	if pErr != nil {
		if nle, ok := pErr.GetDetail().(*roachpb.NotLeaseHolderError); ok {
			if nle.LeaseHolder == nil {
				return ReplicationTarget{}, errors.Errorf(
					"unexpected NotLeaseHolderError with lease holder unknown")
			}
			return ReplicationTarget{
				NodeID: nle.LeaseHolder.NodeID, StoreID: nle.LeaseHolder.StoreID}, nil
		}
		return ReplicationTarget{}, pErr.GoError()
	}
	// The replica we sent the request to either was already or just became
	// the lease holder.
	return *hint, nil
}
// TestSendRPCOrder verifies that sendRPC correctly takes into account the // leader, attributes and required consistency to determine where to send // remote requests. func TestSendRPCOrder(t *testing.T) { defer leaktest.AfterTest(t) g, s := makeTestGossip(t) defer s() rangeID := roachpb.RangeID(99) nodeAttrs := map[int32][]string{ 1: {}, // The local node, set in each test case. 2: {"us", "west", "gpu"}, 3: {"eu", "dublin", "pdu2", "gpu"}, 4: {"us", "east", "gpu"}, 5: {"us", "east", "gpu", "flaky"}, } // Gets filled below to identify the replica by its address. addrToNode := make(map[string]int32) makeVerifier := func(expOrder rpc.OrderingPolicy, expAddrs []int32) func(rpc.Options, []net.Addr) error { return func(o rpc.Options, addrs []net.Addr) error { if o.Ordering != expOrder { return util.Errorf("unexpected ordering, wanted %v, got %v", expOrder, o.Ordering) } var actualAddrs []int32 for i, a := range addrs { if len(expAddrs) <= i { return util.Errorf("got unexpected address: %s", a) } if expAddrs[i] == 0 { actualAddrs = append(actualAddrs, 0) } else { actualAddrs = append(actualAddrs, addrToNode[a.String()]) } } if !reflect.DeepEqual(expAddrs, actualAddrs) { return util.Errorf("expected %d, but found %d", expAddrs, actualAddrs) } return nil } } testCases := []struct { args roachpb.Request attrs []string order rpc.OrderingPolicy expReplica []int32 leader int32 // 0 for not caching a leader. // Naming is somewhat off, as eventually consistent reads usually // do not have to go to the leader when a node has a read lease. // Would really want CONSENSUS here, but that is not implemented. // Likely a test setup here will never have a read lease, but good // to keep in mind. consistent bool }{ // Inconsistent Scan without matching attributes. { args: &roachpb.ScanRequest{}, attrs: []string{}, order: rpc.OrderRandom, expReplica: []int32{1, 2, 3, 4, 5}, }, // Inconsistent Scan with matching attributes. // Should move the two nodes matching the attributes to the front and // go stable. { args: &roachpb.ScanRequest{}, attrs: nodeAttrs[5], order: rpc.OrderStable, // Compare only the first two resulting addresses. expReplica: []int32{5, 4, 0, 0, 0}, }, // Scan without matching attributes that requires but does not find // a leader. { args: &roachpb.ScanRequest{}, attrs: []string{}, order: rpc.OrderRandom, expReplica: []int32{1, 2, 3, 4, 5}, consistent: true, }, // Put without matching attributes that requires but does not find leader. // Should go random and not change anything. { args: &roachpb.PutRequest{}, attrs: []string{"nomatch"}, order: rpc.OrderRandom, expReplica: []int32{1, 2, 3, 4, 5}, }, // Put with matching attributes but no leader. // Should move the two nodes matching the attributes to the front and // go stable. { args: &roachpb.PutRequest{}, attrs: append(nodeAttrs[5], "irrelevant"), // Compare only the first two resulting addresses. order: rpc.OrderStable, expReplica: []int32{5, 4, 0, 0, 0}, }, // Put with matching attributes that finds the leader (node 3). // Should address the leader and the two nodes matching the attributes // (the last and second to last) in that order. { args: &roachpb.PutRequest{}, attrs: append(nodeAttrs[5], "irrelevant"), // Compare only the first resulting addresses as we have a leader // and that means we're only trying to send there. order: rpc.OrderStable, expReplica: []int32{2, 5, 4, 0, 0}, leader: 2, }, // Inconsistent Get without matching attributes but leader (node 3). Should just // go random as the leader does not matter. 
{ args: &roachpb.GetRequest{}, attrs: []string{}, order: rpc.OrderRandom, expReplica: []int32{1, 2, 3, 4, 5}, leader: 2, }, } descriptor := roachpb.RangeDescriptor{ StartKey: roachpb.RKeyMin, EndKey: roachpb.RKeyMax, RangeID: rangeID, Replicas: nil, } // Stub to be changed in each test case. var verifyCall func(rpc.Options, []net.Addr) error var testFn rpcSendFn = func(opts rpc.Options, method string, addrs []net.Addr, getArgs func(addr net.Addr) proto.Message, getReply func() proto.Message, _ *rpc.Context) ([]proto.Message, error) { if err := verifyCall(opts, addrs); err != nil { return nil, err } return []proto.Message{getArgs(addrs[0]).(*roachpb.BatchRequest).CreateReply()}, nil } ctx := &DistSenderContext{ RPCSend: testFn, RangeDescriptorDB: mockRangeDescriptorDB(func(roachpb.RKey, bool, bool) ([]roachpb.RangeDescriptor, *roachpb.Error) { return []roachpb.RangeDescriptor{descriptor}, nil }), } ds := NewDistSender(ctx, g) for n, tc := range testCases { verifyCall = makeVerifier(tc.order, tc.expReplica) descriptor.Replicas = nil // could do this once above, but more convenient here for i := int32(1); i <= 5; i++ { addr := util.MakeUnresolvedAddr("tcp", fmt.Sprintf("node%d", i)) addrToNode[addr.String()] = i nd := &roachpb.NodeDescriptor{ NodeID: roachpb.NodeID(i), Address: util.MakeUnresolvedAddr(addr.Network(), addr.String()), Attrs: roachpb.Attributes{ Attrs: nodeAttrs[i], }, } if err := g.AddInfoProto(gossip.MakeNodeIDKey(roachpb.NodeID(i)), nd, time.Hour); err != nil { t.Fatal(err) } descriptor.Replicas = append(descriptor.Replicas, roachpb.ReplicaDescriptor{ NodeID: roachpb.NodeID(i), StoreID: roachpb.StoreID(i), }) } { // The local node needs to get its attributes during sendRPC. nd := &roachpb.NodeDescriptor{ NodeID: 6, Attrs: roachpb.Attributes{ Attrs: tc.attrs, }, } g.SetNodeID(nd.NodeID) if err := g.SetNodeDescriptor(nd); err != nil { t.Fatal(err) } } ds.leaderCache.Update(roachpb.RangeID(rangeID), roachpb.ReplicaDescriptor{}) if tc.leader > 0 { ds.leaderCache.Update(roachpb.RangeID(rangeID), descriptor.Replicas[tc.leader-1]) } args := tc.args args.Header().Key = roachpb.Key("a") if roachpb.IsRange(args) { args.Header().EndKey = roachpb.Key("b") } consistency := roachpb.CONSISTENT if !tc.consistent { consistency = roachpb.INCONSISTENT } // Kill the cached NodeDescriptor, enforcing a lookup from Gossip. ds.nodeDescriptor = nil if _, err := client.SendWrappedWith(ds, nil, roachpb.Header{ RangeID: rangeID, // Not used in this test, but why not. ReadConsistency: consistency, }, args); err != nil { t.Errorf("%d: %s", n, err) } } }
// sendChunk is in charge of sending an "admissible" piece of batch, i.e. one // which doesn't need to be subdivided further before going to a range (so no // mixing of forward and reverse scans, etc). The parameters and return values // correspond to client.Sender with the exception of the returned boolean, // which is true when indicating that the caller should retry but needs to send // EndTransaction in a separate request. func (ds *DistSender) sendChunk(ctx context.Context, ba roachpb.BatchRequest) (*roachpb.BatchResponse, *roachpb.Error, bool) { isReverse := ba.IsReverse() trace := tracer.FromCtx(ctx) // The minimal key range encompassing all requests contained within. // Local addressing has already been resolved. // TODO(tschottdorf): consider rudimentary validation of the batch here // (for example, non-range requests with EndKey, or empty key ranges). rs := keys.Range(ba) var br *roachpb.BatchResponse // Send the request to one range per iteration. for { considerIntents := false var curReply *roachpb.BatchResponse var desc *roachpb.RangeDescriptor var needAnother bool var pErr *roachpb.Error for r := retry.Start(ds.rpcRetryOptions); r.Next(); { // Get range descriptor (or, when spanning range, descriptors). Our // error handling below may clear them on certain errors, so we // refresh (likely from the cache) on every retry. descDone := trace.Epoch("meta descriptor lookup") var evictDesc func() desc, needAnother, evictDesc, pErr = ds.getDescriptors(rs, considerIntents, isReverse) descDone() // getDescriptors may fail retryably if the first range isn't // available via Gossip. if pErr != nil { if pErr.Retryable { if log.V(1) { log.Warning(pErr) } continue } break } if needAnother && br == nil { // TODO(tschottdorf): we should have a mechanism for discovering // range merges (descriptor staleness will mostly go unnoticed), // or we'll be turning single-range queries into multi-range // queries for no good reason. // If there's no transaction and op spans ranges, possibly // re-run as part of a transaction for consistency. The // case where we don't need to re-run is if the read // consistency is not required. if ba.Txn == nil && ba.IsPossibleTransaction() && ba.ReadConsistency != roachpb.INCONSISTENT { return nil, roachpb.NewError(&roachpb.OpRequiresTxnError{}), false } // If the request is more than but ends with EndTransaction, we // want the caller to come again with the EndTransaction in an // extra call. if l := len(ba.Requests) - 1; l > 0 && ba.Requests[l].GetInner().Method() == roachpb.EndTransaction { return nil, roachpb.NewError(errors.New("cannot send 1PC txn to multiple ranges")), true /* shouldSplitET */ } } // It's possible that the returned descriptor misses parts of the // keys it's supposed to scan after it's truncated to match the // descriptor. Example revscan [a,g), first desc lookup for "g" // returns descriptor [c,d) -> [d,g) is never scanned. // We evict and retry in such a case. if (isReverse && !desc.ContainsKeyRange(desc.StartKey, rs.EndKey)) || (!isReverse && !desc.ContainsKeyRange(rs.Key, desc.EndKey)) { evictDesc() continue } curReply, pErr = func() (*roachpb.BatchResponse, *roachpb.Error) { // Truncate the request to our current key range. intersected, iErr := rs.Intersect(desc) if iErr != nil { return nil, roachpb.NewError(iErr) } truncBA, numActive, trErr := truncate(ba, intersected) if numActive == 0 && trErr == nil { // This shouldn't happen in the wild, but some tests // exercise it. 
return nil, roachpb.NewErrorf("truncation resulted in empty batch on [%s,%s): %s", rs.Key, rs.EndKey, ba) } if trErr != nil { return nil, roachpb.NewError(trErr) } return ds.sendSingleRange(trace, truncBA, desc) }() // If sending succeeded, break this loop. if pErr == nil { break } if log.V(1) { log.Warningf("failed to invoke %s: %s", ba, pErr) } trace.Event(fmt.Sprintf("reply error: %T", pErr.GoError())) // Error handling below. // If retryable, allow retry. For range not found or range // key mismatch errors, we don't backoff on the retry, // but reset the backoff loop so we can retry immediately. switch tErr := pErr.GoError().(type) { case *roachpb.SendError: // For an RPC error to occur, we must've been unable to contact // any replicas. In this case, likely all nodes are down (or // not getting back to us within a reasonable amount of time). // We may simply not be trying to talk to the up-to-date // replicas, so clearing the descriptor here should be a good // idea. // TODO(tschottdorf): If a replica group goes dead, this // will cause clients to put high read pressure on the first // range, so there should be some rate limiting here. evictDesc() if tErr.CanRetry() { continue } case *roachpb.RangeNotFoundError, *roachpb.RangeKeyMismatchError: // Range descriptor might be out of date - evict it. evictDesc() // On addressing errors, don't backoff; retry immediately. r.Reset() if log.V(1) { log.Warning(tErr) } // On retries, allow [uncommitted] intents on range descriptor // lookups to be returned 50% of the time in order to succeed // at finding the transaction record pointed to by the intent // itself. The 50% probability of returning either the current // intent or the previously committed value balances between // the two cases where the intent's txn hasn't yet been // committed (the previous value is correct), or the intent's // txn has been committed (the intent value is correct). considerIntents = true continue case *roachpb.NotLeaderError: newLeader := tErr.Leader // Verify that leader is a known replica according to the // descriptor. If not, we've got a stale replica; evict cache. // Next, cache the new leader. if newLeader != nil { if i, _ := desc.FindReplica(newLeader.StoreID); i == -1 { if log.V(1) { log.Infof("error indicates unknown leader %s, expunging descriptor %s", newLeader, desc) } evictDesc() } } else { newLeader = &roachpb.ReplicaDescriptor{} } ds.updateLeaderCache(roachpb.RangeID(desc.RangeID), *newLeader) if log.V(1) { log.Warning(tErr) } r.Reset() continue case retry.Retryable: if tErr.CanRetry() { if log.V(1) { log.Warning(tErr) } continue } } break } // Immediately return if querying a range failed non-retryably. if pErr != nil { return nil, pErr, false } ba.Txn.Update(curReply.Txn) if br == nil { // First response from a Range. br = curReply } else { // This was the second or later call in a cross-Range request. // Combine the new response with the existing one. if err := br.Combine(curReply); err != nil { return nil, roachpb.NewError(err), false } } // If this request has a bound (such as MaxResults in // ScanRequest) and we are going to query at least one more range, // check whether enough rows have been retrieved. // TODO(tschottdorf): need tests for executing a multi-range batch // with various bounded requests which saturate at different times. if needAnother { // Start with the assumption that all requests are saturated. // Below, we look at each and decide whether that's true. 
// Everything that is indeed saturated is "masked out" from the // batch request; only if that's all requests does needAnother // remain false. needAnother = false if br == nil { // Clone ba.Requests. This is because we're multi-range, and // some requests may be bounded, which could lead to them being // masked out once they're saturated. We don't want to risk // removing requests that way in the "master copy" since that // could lead to omitting requests in certain retry scenarios. ba.Requests = append([]roachpb.RequestUnion(nil), ba.Requests...) } for i, union := range ba.Requests { args := union.GetInner() if _, ok := args.(*roachpb.NoopRequest); ok { // NoopRequests are skipped. continue } boundedArg, ok := args.(roachpb.Bounded) if !ok { // Non-bounded request. We will have to query all ranges. needAnother = true continue } prevBound := boundedArg.GetBound() cReply, ok := curReply.Responses[i].GetInner().(roachpb.Countable) if !ok || prevBound <= 0 { // Request bounded, but without max results. Again, will // need to query everything we can. The case in which the reply // isn't countable occurs when the request wasn't active for // that range (since it didn't apply to it), so the response // is a NoopResponse. needAnother = true continue } nextBound := prevBound - cReply.Count() if nextBound <= 0 { // We've hit max results for this piece of the batch. Mask // it out (we've copied the requests slice above, so this // is kosher). ba.Requests[i].Reset() // necessary (no one-of?) if !ba.Requests[i].SetValue(&roachpb.NoopRequest{}) { panic("RequestUnion excludes NoopRequest") } continue } // The request isn't saturated yet. needAnother = true boundedArg.SetBound(nextBound) } } // If this was the last range accessed by this call, exit loop. if !needAnother { return br, nil, false } if isReverse { // In next iteration, query previous range. // We use the StartKey of the current descriptor as opposed to the // EndKey of the previous one since that doesn't have bugs when // stale descriptors come into play. rs.EndKey = prev(ba, desc.StartKey) } else { // In next iteration, query next range. // It's important that we use the EndKey of the current descriptor // as opposed to the StartKey of the next one: if the former is stale, // it's possible that the next range has since merged the subsequent // one, and unless both descriptors are stale, the next descriptor's // StartKey would move us to the beginning of the current range, // resulting in a duplicate scan. rs.Key = next(ba, desc.EndKey) } trace.Event("querying next range") } }
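// Illustrative sketch (not part of the original source) of the bound
// bookkeeping used above for multi-range batches: each range's reply
// decrements the remaining bound of a bounded request, and a request whose
// bound reaches zero is "masked out" (swapped for a NoopRequest) so later
// ranges skip it. Plain int64 values stand in for the Bounded/Countable
// interfaces here, and the helper name is hypothetical.
func remainingAfterRange(bounds, counts []int64) (remaining []int64, needAnother bool) {
	remaining = make([]int64, len(bounds))
	for i, b := range bounds {
		if b <= 0 {
			// Unbounded (or uncountable) request: the next range must still
			// be queried for it.
			remaining[i] = b
			needAnother = true
			continue
		}
		next := b - counts[i]
		if next <= 0 {
			// Saturated: this request would be replaced by a NoopRequest in
			// the batch and contributes nothing to further ranges.
			remaining[i] = 0
			continue
		}
		// Not yet saturated: carry the reduced bound into the next range.
		remaining[i] = next
		needAnother = true
	}
	return remaining, needAnother
}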
// sendChunk is in charge of sending an "admissible" piece of batch, i.e. one // which doesn't need to be subdivided further before going to a range (so no // mixing of forward and reverse scans, etc). The parameters and return values // correspond to client.Sender with the exception of the returned boolean, // which is true when indicating that the caller should retry but needs to send // EndTransaction in a separate request. func (ds *DistSender) sendChunk(ctx context.Context, ba roachpb.BatchRequest) (*roachpb.BatchResponse, *roachpb.Error, bool) { isReverse := ba.IsReverse() // TODO(radu): when contexts are properly plumbed, we should be able to get // the tracer from ctx, not from the DistSender. ctx, cleanup := tracing.EnsureContext(ctx, tracing.TracerFromCtx(ds.Ctx)) defer cleanup() // The minimal key range encompassing all requests contained within. // Local addressing has already been resolved. // TODO(tschottdorf): consider rudimentary validation of the batch here // (for example, non-range requests with EndKey, or empty key ranges). rs, err := keys.Range(ba) if err != nil { return nil, roachpb.NewError(err), false } var br *roachpb.BatchResponse // Send the request to one range per iteration. for { // Increase the sequence counter only once before sending RPCs to // the ranges involved in this chunk of the batch (as opposed to for // each RPC individually). On RPC errors, there's no guarantee that // the request hasn't made its way to the target regardless of the // error; we'd like the second execution to be caught by the sequence // cache if that happens. There is a small chance that that we address // a range twice in this chunk (stale/suboptimal descriptors due to // splits/merges) which leads to a transaction retry. // TODO(tschottdorf): it's possible that if we don't evict from the // cache we could be in for a busy loop. ba.SetNewRequest() var curReply *roachpb.BatchResponse var desc *roachpb.RangeDescriptor var evictToken *evictionToken var needAnother bool var pErr *roachpb.Error var finished bool var numAttempts int for r := retry.StartWithCtx(ctx, ds.rpcRetryOptions); r.Next(); { numAttempts++ { const magicLogCurAttempt = 20 var seq int32 if ba.Txn != nil { seq = ba.Txn.Sequence } if numAttempts%magicLogCurAttempt == 0 || seq%magicLogCurAttempt == 0 { // Log a message if a request appears to get stuck for a long // time or, potentially, forever. See #8975. // The local counter captures this loop here; the Sequence number // should capture anything higher up (as it needs to be // incremented every time this method is called). log.Warningf( ctx, "%d retries for an RPC at sequence %d, last error was: %s, remaining key ranges %s: %s", numAttempts, seq, pErr, rs, ba, ) } } // Get range descriptor (or, when spanning range, descriptors). Our // error handling below may clear them on certain errors, so we // refresh (likely from the cache) on every retry. log.Trace(ctx, "meta descriptor lookup") var err error desc, needAnother, evictToken, err = ds.getDescriptors(ctx, rs, evictToken, isReverse) // getDescriptors may fail retryably if, for example, the first // range isn't available via Gossip. Assume that all errors at // this level are retryable. Non-retryable errors would be for // things like malformed requests which we should have checked // for before reaching this point. 
if err != nil { log.Trace(ctx, "range descriptor lookup failed: "+err.Error()) if log.V(1) { log.Warning(ctx, err) } pErr = roachpb.NewError(err) continue } if needAnother && br == nil { // TODO(tschottdorf): we should have a mechanism for discovering // range merges (descriptor staleness will mostly go unnoticed), // or we'll be turning single-range queries into multi-range // queries for no good reason. // If there's no transaction and op spans ranges, possibly // re-run as part of a transaction for consistency. The // case where we don't need to re-run is if the read // consistency is not required. if ba.Txn == nil && ba.IsPossibleTransaction() && ba.ReadConsistency != roachpb.INCONSISTENT { return nil, roachpb.NewError(&roachpb.OpRequiresTxnError{}), false } // If the request is more than but ends with EndTransaction, we // want the caller to come again with the EndTransaction in an // extra call. if l := len(ba.Requests) - 1; l > 0 && ba.Requests[l].GetInner().Method() == roachpb.EndTransaction { return nil, roachpb.NewError(errors.New("cannot send 1PC txn to multiple ranges")), true /* shouldSplitET */ } } // It's possible that the returned descriptor misses parts of the // keys it's supposed to scan after it's truncated to match the // descriptor. Example revscan [a,g), first desc lookup for "g" // returns descriptor [c,d) -> [d,g) is never scanned. // We evict and retry in such a case. includesFrontOfCurSpan := func(rd *roachpb.RangeDescriptor) bool { if isReverse { return desc.ContainsExclusiveEndKey(rs.EndKey) } return desc.ContainsKey(rs.Key) } if !includesFrontOfCurSpan(desc) { if err := evictToken.Evict(ctx); err != nil { return nil, roachpb.NewError(err), false } // On addressing errors, don't backoff; retry immediately. r.Reset() continue } curReply, pErr = func() (*roachpb.BatchResponse, *roachpb.Error) { // Truncate the request to our current key range. intersected, iErr := rs.Intersect(desc) if iErr != nil { return nil, roachpb.NewError(iErr) } truncBA, numActive, trErr := truncate(ba, intersected) if numActive == 0 && trErr == nil { // This shouldn't happen in the wild, but some tests // exercise it. return nil, roachpb.NewErrorf("truncation resulted in empty batch on [%s,%s): %s", rs.Key, rs.EndKey, ba) } if trErr != nil { return nil, roachpb.NewError(trErr) } return ds.sendSingleRange(ctx, truncBA, desc) }() // If sending succeeded, break this loop. if pErr == nil { finished = true break } log.VTracef(1, ctx, "reply error %s: %s", ba, pErr) // Error handling: If the error indicates that our range // descriptor is out of date, evict it from the cache and try // again. Errors that apply only to a single replica were // handled in send(). // // TODO(bdarnell): Don't retry endlessly. If we fail twice in a // row and the range descriptor hasn't changed, return the error // to our caller. switch tErr := pErr.GetDetail().(type) { case *roachpb.SendError: // We've tried all the replicas without success. Either // they're all down, or we're using an out-of-date range // descriptor. Invalidate the cache and try again with the new // metadata. if err := evictToken.Evict(ctx); err != nil { return nil, roachpb.NewError(err), false } continue case *roachpb.RangeKeyMismatchError: // Range descriptor might be out of date - evict it. This is // likely the result of a range split. If we have new range // descriptors, insert them instead as long as they are different // from the last descriptor to avoid endless loops. 
var replacements []roachpb.RangeDescriptor different := func(rd *roachpb.RangeDescriptor) bool { return !desc.RSpan().Equal(rd.RSpan()) } if tErr.MismatchedRange != nil && different(tErr.MismatchedRange) { replacements = append(replacements, *tErr.MismatchedRange) } if tErr.SuggestedRange != nil && different(tErr.SuggestedRange) { if includesFrontOfCurSpan(tErr.SuggestedRange) { replacements = append(replacements, *tErr.SuggestedRange) } } // Same as Evict() if replacements is empty. if err := evictToken.EvictAndReplace(ctx, replacements...); err != nil { return nil, roachpb.NewError(err), false } // On addressing errors, don't backoff; retry immediately. r.Reset() if log.V(1) { log.Warning(ctx, tErr) } continue } break } // Immediately return if querying a range failed non-retryably. if pErr != nil { return nil, pErr, false } else if !finished { select { case <-ds.rpcRetryOptions.Closer: return nil, roachpb.NewError(&roachpb.NodeUnavailableError{}), false case <-ctx.Done(): return nil, roachpb.NewError(ctx.Err()), false default: log.Fatal(ctx, "exited retry loop with nil error but finished=false") } } ba.UpdateTxn(curReply.Txn) if br == nil { // First response from a Range. br = curReply } else { // This was the second or later call in a cross-Range request. // Combine the new response with the existing one. if err := br.Combine(curReply); err != nil { return nil, roachpb.NewError(err), false } } if isReverse { // In next iteration, query previous range. // We use the StartKey of the current descriptor as opposed to the // EndKey of the previous one since that doesn't have bugs when // stale descriptors come into play. rs.EndKey, err = prev(ba, desc.StartKey) } else { // In next iteration, query next range. // It's important that we use the EndKey of the current descriptor // as opposed to the StartKey of the next one: if the former is stale, // it's possible that the next range has since merged the subsequent // one, and unless both descriptors are stale, the next descriptor's // StartKey would move us to the beginning of the current range, // resulting in a duplicate scan. rs.Key, err = next(ba, desc.EndKey) } if err != nil { return nil, roachpb.NewError(err), false } if ba.MaxSpanRequestKeys > 0 { // Count how many results we received. var numResults int64 for _, resp := range curReply.Responses { numResults += resp.GetInner().Header().NumKeys } if numResults > ba.MaxSpanRequestKeys { panic(fmt.Sprintf("received %d results, limit was %d", numResults, ba.MaxSpanRequestKeys)) } ba.MaxSpanRequestKeys -= numResults if ba.MaxSpanRequestKeys == 0 { // prepare the batch response after meeting the max key limit. fillSkippedResponses(ba, br, rs) // done, exit loop. return br, nil, false } } // If this was the last range accessed by this call, exit loop. if !needAnother { return br, nil, false } // key cannot be less that the end key. if !rs.Key.Less(rs.EndKey) { panic(fmt.Sprintf("start key %s is less than %s", rs.Key, rs.EndKey)) } log.Trace(ctx, "querying next range") } }
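// Illustrative sketch (not part of the original source) of the
// MaxSpanRequestKeys accounting in the newer code path above: each range's
// responses report how many keys they returned (NumKeys), the budget is
// reduced once per range, and the chunk stops as soon as the budget reaches
// zero. The helper name and plain int64 inputs are hypothetical stand-ins.
func consumeKeyBudget(budget int64, numKeysPerResponse []int64) (remaining int64, done bool) {
	var numResults int64
	for _, n := range numKeysPerResponse {
		numResults += n
	}
	// Mirrors the sanity check above: a range must never return more keys
	// than the remaining budget allows.
	if numResults > budget {
		panic(fmt.Sprintf("received %d results, limit was %d", numResults, budget))
	}
	remaining = budget - numResults
	return remaining, remaining == 0
}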
func testRangeCacheHandleDoubleSplit(t *testing.T, useReverseScan bool) { db := initTestDescriptorDB(t) db.disablePrefetch = true // A request initially looks up the range descriptor ["a"-"b"). doLookup(t, db.cache, "aa") db.assertLookupCountEq(t, 2, "aa") // A split breaks up the range into ["a"-"an"), ["an"-"at"), ["at"-"b"). db.splitRange(t, roachpb.RKey("an")) db.splitRange(t, roachpb.RKey("at")) // A request is sent to the stale descriptor on the right half // such that a RangeKeyMismatchError is returned. _, evictToken := doLookup(t, db.cache, "az") // mismatchErrRange mocks out a RangeKeyMismatchError.Range response. ranges, _, pErr := db.getDescriptors(roachpb.RKey("aa"), false, false) if pErr != nil { t.Fatal(pErr) } mismatchErrRange := ranges[0] // The stale descriptor is evicted, the new descriptor from the error is // replaced, and a new lookup is initialized. if err := evictToken.EvictAndReplace(context.Background(), mismatchErrRange); err != nil { t.Fatal(err) } // Requests to all parts of the split are sent: // [reverse case] // - "aa" and "an" will hit the cache // - all others will join a coalesced request to "az" // + will lookup the meta2 desc // + will lookup the ["at"-"b") desc // - "az" will get the right range back // - "at" will make a second lookup // + will lookup the ["an"-"at") desc // // [non-reverse case] // - "aa" will hit the cache // - all others will join a coalesced request to "an" // + will lookup the meta2 desc // + will lookup the ["an"-"at") desc // - "an" and "ao" will get the right range back // - "at" and "az" will make a second lookup // + will lookup the ["at"-"b") desc var wg, waitJoin sync.WaitGroup db.pauseRangeLookups() numRetries := int64(0) for _, k := range []string{"aa", "an", "ao", "at", "az"} { wg.Add(1) waitJoin.Add(1) go func(key roachpb.RKey) { reqEvictToken := evictToken waitJoinCopied := &waitJoin var desc *roachpb.RangeDescriptor for { // Each request goes to a different key. var err error if desc, reqEvictToken, err = db.cache.lookupRangeDescriptorInternal( context.Background(), key, reqEvictToken, false, /* considerIntents */ useReverseScan, waitJoinCopied); err != nil { waitJoinCopied = nil atomic.AddInt64(&numRetries, 1) continue } break } if useReverseScan { if !desc.ContainsExclusiveEndKey(key) { t.Errorf("desc %s does not contain exclusive end key %s", desc, key) } } else { if !desc.ContainsKey(key) { t.Errorf("desc %s does not contain key %s", desc, key) } } wg.Done() }(roachpb.RKey(k)) } // Wait until all lookup requests hit the cache or join into a coalesced request. waitJoin.Wait() db.resumeRangeLookups() wg.Wait() db.assertLookupCountEq(t, 3, "an and az") if numRetries == 0 { t.Error("expected retry on desc lookup") } // All three descriptors are now correctly cached. doLookup(t, db.cache, "aa") db.assertLookupCountEq(t, 0, "aa") doLookup(t, db.cache, "ao") db.assertLookupCountEq(t, 0, "ao") doLookup(t, db.cache, "az") db.assertLookupCountEq(t, 0, "az") }
// truncate restricts all contained requests to the given key range.
// Even on error, the returned closure must be executed; it undoes any
// truncations performed.
// First, the boundaries of the truncation are obtained: this is the
// intersection between [from,to) and the descriptor's range.
// Secondly, all requests contained in the batch are "truncated" to
// the resulting range, inserting NoopRequest appropriately to
// replace requests which are left without a key range to operate on.
// The number of non-noop requests after truncation is returned along
// with a closure which must be executed to undo the truncation, even
// in case of an error.
// TODO(tschottdorf): Consider returning a new BatchRequest, which has more
// overhead in the common case of a batch which never needs truncation but is
// less magical.
func truncate(br *roachpb.BatchRequest, desc *roachpb.RangeDescriptor, from, to roachpb.RKey) (func(), int, error) {
	if !desc.ContainsKey(from) {
		from = desc.StartKey
	}
	if !desc.ContainsKeyRange(desc.StartKey, to) || to == nil {
		to = desc.EndKey
	}
	truncateOne := func(args roachpb.Request) (bool, []func(), error) {
		if _, ok := args.(*roachpb.NoopRequest); ok {
			return true, nil, nil
		}
		header := args.Header()
		if !roachpb.IsRange(args) {
			// This is a point request.
			if len(header.EndKey) > 0 {
				return false, nil, util.Errorf("%T is not a range command, but EndKey is set", args)
			}
			if !desc.ContainsKey(keys.Addr(header.Key)) {
				return true, nil, nil
			}
			return false, nil, nil
		}
		// We're dealing with a range-spanning request.
		var undo []func()
		keyAddr, endKeyAddr := keys.Addr(header.Key), keys.Addr(header.EndKey)
		if l, r := !keyAddr.Equal(header.Key), !endKeyAddr.Equal(header.EndKey); l || r {
			if !desc.ContainsKeyRange(keyAddr, endKeyAddr) {
				return false, nil, util.Errorf("local key range must not span ranges")
			}
			if !l || !r {
				return false, nil, util.Errorf("local key mixed with global key in range")
			}
		}
		// Below, {end,}keyAddr equals header.{End,}Key, so nothing is local.
		if keyAddr.Less(from) {
			{
				origKey := header.Key
				undo = append(undo, func() { header.Key = origKey })
			}
			header.Key = from.AsRawKey() // "from" can't be local
			keyAddr = from
		}
		if !endKeyAddr.Less(to) {
			{
				origEndKey := header.EndKey
				undo = append(undo, func() { header.EndKey = origEndKey })
			}
			header.EndKey = to.AsRawKey() // "to" can't be local
			endKeyAddr = to
		}
		// Check whether the truncation has left any keys in the range. If not,
		// we need to cut it out of the request.
		return !keyAddr.Less(endKeyAddr), undo, nil
	}

	var fns []func()
	gUndo := func() {
		for _, f := range fns {
			f()
		}
	}

	var numNoop int
	for pos, arg := range br.Requests {
		omit, undo, err := truncateOne(arg.GetInner())
		if omit {
			numNoop++
			nReq := &roachpb.RequestUnion{}
			if !nReq.SetValue(&roachpb.NoopRequest{}) {
				panic("RequestUnion excludes NoopRequest")
			}
			oReq := br.Requests[pos]
			br.Requests[pos] = *nReq
			posCpy := pos // for closure
			undo = append(undo, func() {
				br.Requests[posCpy] = oReq
			})
		}
		fns = append(fns, undo...)
		if err != nil {
			return gUndo, 0, err
		}
	}
	return gUndo, len(br.Requests) - numNoop, nil
}
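// Illustrative worked example (not part of the original source) of what
// truncate does to a single range-spanning request, using plain strings for
// keys: the span is clamped to [from,to), and the request is replaced by a
// noop if nothing of it remains. For example, a Scan over ["a","z") truncated
// to ["c","m") becomes a Scan over ["c","m"), while a Scan over ["n","z")
// truncated to ["c","m") becomes a noop. The helper name is hypothetical.
func truncateSpan(key, endKey, from, to string) (newKey, newEndKey string, omit bool) {
	newKey, newEndKey = key, endKey
	if newKey < from {
		newKey = from
	}
	if newEndKey > to {
		newEndKey = to
	}
	// If the clamped span is empty, the request would be swapped for a
	// NoopRequest and not counted as active.
	return newKey, newEndKey, !(newKey < newEndKey)
}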
// sendChunk is in charge of sending an "admissible" piece of batch, i.e. one // which doesn't need to be subdivided further before going to a range (so no // mixing of forward and reverse scans, etc). The parameters and return values // correspond to client.Sender with the exception of the returned boolean, // which is true when indicating that the caller should retry but needs to send // EndTransaction in a separate request. func (ds *DistSender) sendChunk(ctx context.Context, ba roachpb.BatchRequest) (*roachpb.BatchResponse, *roachpb.Error, bool) { isReverse := ba.IsReverse() ctx, cleanup := tracing.EnsureContext(ctx, ds.Tracer) defer cleanup() // The minimal key range encompassing all requests contained within. // Local addressing has already been resolved. // TODO(tschottdorf): consider rudimentary validation of the batch here // (for example, non-range requests with EndKey, or empty key ranges). rs, err := keys.Range(ba) if err != nil { return nil, roachpb.NewError(err), false } var br *roachpb.BatchResponse // Send the request to one range per iteration. for { // Increase the sequence counter only once before sending RPCs to // the ranges involved in this chunk of the batch (as opposed to for // each RPC individually). On RPC errors, there's no guarantee that // the request hasn't made its way to the target regardless of the // error; we'd like the second execution to be caught by the sequence // cache if that happens. There is a small chance that that we address // a range twice in this chunk (stale/suboptimal descriptors due to // splits/merges) which leads to a transaction retry. // TODO(tschottdorf): it's possible that if we don't evict from the // cache we could be in for a busy loop. ba.SetNewRequest() var curReply *roachpb.BatchResponse var desc *roachpb.RangeDescriptor var evictToken evictionToken var needAnother bool var pErr *roachpb.Error var finished bool for r := retry.Start(ds.rpcRetryOptions); r.Next(); { // Get range descriptor (or, when spanning range, descriptors). Our // error handling below may clear them on certain errors, so we // refresh (likely from the cache) on every retry. log.Trace(ctx, "meta descriptor lookup") desc, needAnother, evictToken, pErr = ds.getDescriptors(rs, evictToken, isReverse) // getDescriptors may fail retryably if the first range isn't // available via Gossip. if pErr != nil { log.Trace(ctx, "range descriptor lookup failed: "+pErr.String()) if pErr.Retryable { if log.V(1) { log.Warning(pErr) } continue } break } else { log.Trace(ctx, "looked up range descriptor") } if needAnother && br == nil { // TODO(tschottdorf): we should have a mechanism for discovering // range merges (descriptor staleness will mostly go unnoticed), // or we'll be turning single-range queries into multi-range // queries for no good reason. // If there's no transaction and op spans ranges, possibly // re-run as part of a transaction for consistency. The // case where we don't need to re-run is if the read // consistency is not required. if ba.Txn == nil && ba.IsPossibleTransaction() && ba.ReadConsistency != roachpb.INCONSISTENT { return nil, roachpb.NewError(&roachpb.OpRequiresTxnError{}), false } // If the request is more than but ends with EndTransaction, we // want the caller to come again with the EndTransaction in an // extra call. 
if l := len(ba.Requests) - 1; l > 0 && ba.Requests[l].GetInner().Method() == roachpb.EndTransaction { return nil, roachpb.NewError(errors.New("cannot send 1PC txn to multiple ranges")), true /* shouldSplitET */ } } // It's possible that the returned descriptor misses parts of the // keys it's supposed to scan after it's truncated to match the // descriptor. Example revscan [a,g), first desc lookup for "g" // returns descriptor [c,d) -> [d,g) is never scanned. // We evict and retry in such a case. includesFrontOfCurSpan := func(rd *roachpb.RangeDescriptor) bool { if isReverse { // This approach is needed because rs.EndKey is exclusive. return desc.ContainsKeyRange(desc.StartKey, rs.EndKey) } return desc.ContainsKey(rs.Key) } if !includesFrontOfCurSpan(desc) { if err := evictToken.Evict(); err != nil { return nil, roachpb.NewError(err), false } // On addressing errors, don't backoff; retry immediately. r.Reset() continue } curReply, pErr = func() (*roachpb.BatchResponse, *roachpb.Error) { // Truncate the request to our current key range. intersected, iErr := rs.Intersect(desc) if iErr != nil { return nil, roachpb.NewError(iErr) } truncBA, numActive, trErr := truncate(ba, intersected) if numActive == 0 && trErr == nil { // This shouldn't happen in the wild, but some tests // exercise it. return nil, roachpb.NewErrorf("truncation resulted in empty batch on [%s,%s): %s", rs.Key, rs.EndKey, ba) } if trErr != nil { return nil, roachpb.NewError(trErr) } return ds.sendSingleRange(ctx, truncBA, desc) }() // If sending succeeded, break this loop. if pErr == nil { finished = true break } if log.V(1) { log.Warningf("failed to invoke %s: %s", ba, pErr) } log.Trace(ctx, fmt.Sprintf("reply error: %T", pErr.GetDetail())) // Error handling below. // If retryable, allow retry. For range not found or range // key mismatch errors, we don't backoff on the retry, // but reset the backoff loop so we can retry immediately. switch tErr := pErr.GetDetail().(type) { case *roachpb.SendError: // For an RPC error to occur, we must've been unable to contact // any replicas. In this case, likely all nodes are down (or // not getting back to us within a reasonable amount of time). // We may simply not be trying to talk to the up-to-date // replicas, so clearing the descriptor here should be a good // idea. if err := evictToken.Evict(); err != nil { return nil, roachpb.NewError(err), false } if tErr.CanRetry() { continue } case *roachpb.RangeNotFoundError: // Range descriptor might be out of date - evict it. This is // likely the result of a rebalance. if err := evictToken.Evict(); err != nil { return nil, roachpb.NewError(err), false } // On addressing errors, don't backoff; retry immediately. r.Reset() if log.V(1) { log.Warning(tErr) } continue case *roachpb.RangeKeyMismatchError: // Range descriptor might be out of date - evict it. This is // likely the result of a range split. If we have new range // descriptors, insert them instead as long as they are different // from the last descriptor to avoid endless loops. var replacements []roachpb.RangeDescriptor different := func(rd *roachpb.RangeDescriptor) bool { return !desc.RSpan().Equal(rd.RSpan()) } if tErr.MismatchedRange != nil && different(tErr.MismatchedRange) { replacements = append(replacements, *tErr.MismatchedRange) } if tErr.SuggestedRange != nil && different(tErr.SuggestedRange) { if includesFrontOfCurSpan(tErr.SuggestedRange) { replacements = append(replacements, *tErr.SuggestedRange) } } // Same as Evict() if replacements is empty. 
if err := evictToken.EvictAndReplace(replacements...); err != nil { return nil, roachpb.NewError(err), false } // On addressing errors, don't backoff; retry immediately. r.Reset() if log.V(1) { log.Warning(tErr) } continue case *roachpb.NotLeaderError: newLeader := tErr.Leader if newLeader != nil { // Verify that leader is a known replica according to the // descriptor. If not, we've got a stale range descriptor; // evict cache. if i, _ := desc.FindReplica(newLeader.StoreID); i == -1 { if log.V(1) { log.Infof("error indicates unknown leader %s, expunging descriptor %s", newLeader, desc) } if err := evictToken.Evict(); err != nil { return nil, roachpb.NewError(err), false } } } else { // If the new leader is unknown, we were talking to a // replica that is partitioned away from the majority. Our // range descriptor may be stale, so clear the cache. // // TODO(bdarnell): An unknown-leader error doesn't // necessarily mean our descriptor is stale. Ideally we // would treat these errors more like SendError: retry on // another node (at a lower level), and then if it reaches // this level then we know we've exhausted our options and // must clear the cache. if err := evictToken.Evict(); err != nil { return nil, roachpb.NewError(err), false } newLeader = &roachpb.ReplicaDescriptor{} } // Next, cache the new leader. ds.updateLeaderCache(roachpb.RangeID(desc.RangeID), *newLeader) if log.V(1) { log.Warning(tErr) } r.Reset() continue case retry.Retryable: if tErr.CanRetry() { if log.V(1) { log.Warning(tErr) } continue } } break } // Immediately return if querying a range failed non-retryably. if pErr != nil { return nil, pErr, false } else if !finished { select { case <-ds.rpcRetryOptions.Closer: return nil, roachpb.NewError(&roachpb.NodeUnavailableError{}), false default: log.Fatal("exited retry loop with nil error but finished=false") } } ba.Txn.Update(curReply.Txn) if br == nil { // First response from a Range. br = curReply } else { // This was the second or later call in a cross-Range request. // Combine the new response with the existing one. if err := br.Combine(curReply); err != nil { return nil, roachpb.NewError(err), false } } if ba.MaxScanResults > 0 { // Count how many results we received. var numResults int64 for _, resp := range curReply.Responses { if cResp, ok := resp.GetInner().(roachpb.Countable); ok { numResults += cResp.Count() } } if numResults > ba.MaxScanResults { panic(fmt.Sprintf("received %d results, limit was %d", numResults, ba.MaxScanResults)) } ba.MaxScanResults -= numResults if ba.MaxScanResults == 0 { // We are done with this batch. Some requests might have NoopResponses; we must // replace them with empty responses of the proper type. for i, req := range ba.Requests { if _, ok := br.Responses[i].GetInner().(*roachpb.NoopResponse); !ok { continue } union := roachpb.ResponseUnion{} var reply roachpb.Response if _, ok := req.GetInner().(*roachpb.ScanRequest); ok { reply = &roachpb.ScanResponse{} } else { _ = req.GetInner().(*roachpb.ReverseScanRequest) reply = &roachpb.ReverseScanResponse{} } union.MustSetInner(reply) br.Responses[i] = union } return br, nil, false } } // If this request has a bound (such as MaxResults in // ScanRequest) and we are going to query at least one more range, // check whether enough rows have been retrieved. // TODO(tschottdorf): need tests for executing a multi-range batch // with various bounded requests which saturate at different times. if needAnother { // Start with the assumption that all requests are saturated. 
// Below, we look at each and decide whether that's true. // Everything that is indeed saturated is "masked out" from the // batch request; only if that's all requests does needAnother // remain false. needAnother = false if br == nil { // Clone ba.Requests. This is because we're multi-range, and // some requests may be bounded, which could lead to them being // masked out once they're saturated. We don't want to risk // removing requests that way in the "master copy" since that // could lead to omitting requests in certain retry scenarios. ba.Requests = append([]roachpb.RequestUnion(nil), ba.Requests...) } for i, union := range ba.Requests { args := union.GetInner() if _, ok := args.(*roachpb.NoopRequest); ok { // NoopRequests are skipped. continue } boundedArg, ok := args.(roachpb.Bounded) if !ok { // Non-bounded request. We will have to query all ranges. needAnother = true continue } prevBound := boundedArg.GetBound() cReply, ok := curReply.Responses[i].GetInner().(roachpb.Countable) if !ok || prevBound <= 0 { // Request bounded, but without max results. Again, will // need to query everything we can. The case in which the reply // isn't countable occurs when the request wasn't active for // that range (since it didn't apply to it), so the response // is a NoopResponse. needAnother = true continue } nextBound := prevBound - cReply.Count() if nextBound <= 0 { // We've hit max results for this piece of the batch. Mask // it out (we've copied the requests slice above, so this // is kosher). union := &ba.Requests[i] // avoid working on copy union.MustSetInner(&noopRequest) continue } // The request isn't saturated yet. needAnother = true boundedArg.SetBound(nextBound) } } // If this was the last range accessed by this call, exit loop. if !needAnother { return br, nil, false } if isReverse { // In next iteration, query previous range. // We use the StartKey of the current descriptor as opposed to the // EndKey of the previous one since that doesn't have bugs when // stale descriptors come into play. rs.EndKey, err = prev(ba, desc.StartKey) } else { // In next iteration, query next range. // It's important that we use the EndKey of the current descriptor // as opposed to the StartKey of the next one: if the former is stale, // it's possible that the next range has since merged the subsequent // one, and unless both descriptors are stale, the next descriptor's // StartKey would move us to the beginning of the current range, // resulting in a duplicate scan. rs.Key, err = next(ba, desc.EndKey) } if err != nil { return nil, roachpb.NewError(err), false } log.Trace(ctx, "querying next range") } }
// truncate restricts all contained requests to the given key range.
// Even on error, the returned closure must be executed; it undoes any
// truncations performed.
// First, the boundaries of the truncation are obtained: this is the
// intersection between [from,to) and the descriptor's range.
// Secondly, all requests contained in the batch are "truncated" to
// the resulting range, inserting NoopRequest appropriately to
// replace requests which are left without a key range to operate on.
// The number of non-noop requests after truncation is returned along
// with a closure which must be executed to undo the truncation, even
// in case of an error.
// TODO(tschottdorf): Consider returning a new BatchRequest, which has more
// overhead in the common case of a batch which never needs truncation but is
// less magical.
func truncate(br *roachpb.BatchRequest, desc *roachpb.RangeDescriptor, from, to roachpb.Key) (func(), int, error) {
	if !desc.ContainsKey(from) {
		from = desc.StartKey
	}
	if !desc.ContainsKeyRange(desc.StartKey, to) || to == nil {
		to = desc.EndKey
	}
	truncateOne := func(args roachpb.Request) (bool, []func(), error) {
		if _, ok := args.(*roachpb.NoopRequest); ok {
			return true, nil, nil
		}
		header := args.Header()
		if !roachpb.IsRange(args) {
			if len(header.EndKey) > 0 {
				return false, nil, util.Errorf("%T is not a range command, but EndKey is set", args)
			}
			if !desc.ContainsKey(keys.KeyAddress(header.Key)) {
				return true, nil, nil
			}
			return false, nil, nil
		}
		var undo []func()
		key, endKey := header.Key, header.EndKey
		keyAddr, endKeyAddr := keys.KeyAddress(key), keys.KeyAddress(endKey)
		if keyAddr.Less(from) {
			undo = append(undo, func() { header.Key = key })
			header.Key = from
			keyAddr = from
		}
		if !endKeyAddr.Less(to) {
			undo = append(undo, func() { header.EndKey = endKey })
			header.EndKey = to
			endKeyAddr = to
		}
		// Check whether the truncation has left any keys in the range. If not,
		// we need to cut it out of the request.
		return !keyAddr.Less(endKeyAddr), undo, nil
	}

	var fns []func()
	gUndo := func() {
		for _, f := range fns {
			f()
		}
	}

	var numNoop int
	for pos, arg := range br.Requests {
		omit, undo, err := truncateOne(arg.GetInner())
		if omit {
			numNoop++
			nReq := &roachpb.RequestUnion{}
			if !nReq.SetValue(&roachpb.NoopRequest{}) {
				panic("RequestUnion excludes NoopRequest")
			}
			oReq := br.Requests[pos]
			br.Requests[pos] = *nReq
			posCpy := pos // for closure
			undo = append(undo, func() {
				br.Requests[posCpy] = oReq
			})
		}
		fns = append(fns, undo...)
		if err != nil {
			return gUndo, 0, err
		}
	}
	return gUndo, len(br.Requests) - numNoop, nil
}