// TestSendRPCRetry verifies that when sendRPC fails on the first address
// but succeeds on the second, the second reply is returned successfully.
func TestSendRPCRetry(t *testing.T) {
	defer leaktest.AfterTest(t)
	g, s := makeTestGossip(t)
	defer s()
	if err := g.SetNodeDescriptor(&proto.NodeDescriptor{NodeID: 1}); err != nil {
		t.Fatal(err)
	}
	// Fill RangeDescriptor with 2 replicas.
	var descriptor = proto.RangeDescriptor{
		RaftID:   1,
		StartKey: proto.Key("a"),
		EndKey:   proto.Key("z"),
	}
	for i := 1; i <= 2; i++ {
		addr := util.MakeUnresolvedAddr("tcp", fmt.Sprintf("node%d", i))
		nd := &proto.NodeDescriptor{
			NodeID: proto.NodeID(i),
			Address: proto.Addr{
				Network: addr.Network(),
				Address: addr.String(),
			},
		}
		if err := g.AddInfo(gossip.MakeNodeIDKey(proto.NodeID(i)), nd, time.Hour); err != nil {
			t.Fatal(err)
		}

		descriptor.Replicas = append(descriptor.Replicas, proto.Replica{
			NodeID:  proto.NodeID(i),
			StoreID: proto.StoreID(i),
		})
	}
	// Define our rpcSend stub which returns success on the second address.
	var testFn rpcSendFn = func(_ rpc.Options, method string, addrs []net.Addr,
		getArgs func(addr net.Addr) interface{}, getReply func() interface{},
		_ *rpc.Context) ([]interface{}, error) {
		if method == "Node.Scan" {
			// The reply from the first address fails...
			_ = getReply()
			// ...and the reply from the second address succeeds.
			reply := getReply()
			reply.(*proto.ScanResponse).Rows = append([]proto.KeyValue{}, proto.KeyValue{Key: proto.Key("b"), Value: proto.Value{}})
			return []interface{}{reply}, nil
		}
		return nil, util.Errorf("unexpected method %v", method)
	}
	ctx := &DistSenderContext{
		rpcSend: testFn,
		rangeDescriptorDB: mockRangeDescriptorDB(func(_ proto.Key, _ lookupOptions) ([]proto.RangeDescriptor, error) {
			return []proto.RangeDescriptor{descriptor}, nil
		}),
	}
	ds := NewDistSender(ctx, g)
	call := proto.ScanCall(proto.Key("a"), proto.Key("d"), 1)
	sr := call.Reply.(*proto.ScanResponse)
	ds.Send(context.Background(), call)
	if err := sr.GoError(); err != nil {
		t.Fatal(err)
	}
	if l := len(sr.Rows); l != 1 {
		t.Fatalf("expected 1 row; got %d", l)
	}
}
// Send implements the client.Sender interface. It verifies
// permissions and looks up the appropriate range based on the
// supplied key and sends the RPC according to the specified options.
//
// If the request spans multiple ranges (which is possible for Scan or
// DeleteRange requests), Send sends requests to the individual ranges
// sequentially and combines the results transparently.
//
// This may temporarily adjust the request headers, so the proto.Call
// must not be used concurrently until Send has returned.
func (ds *DistSender) Send(_ context.Context, call proto.Call) {
	args := call.Args
	finalReply := call.Reply

	// Verify permissions.
	if err := ds.verifyPermissions(call.Args); err != nil {
		call.Reply.Header().SetGoError(err)
		return
	}

	// In the event that timestamp isn't set and read consistency isn't
	// required, set the timestamp using the local clock.
	if args.Header().ReadConsistency == proto.INCONSISTENT && args.Header().Timestamp.Equal(proto.ZeroTimestamp) {
		// Make sure that after the call, args hasn't changed.
		defer func(timestamp proto.Timestamp) {
			args.Header().Timestamp = timestamp
		}(args.Header().Timestamp)
		args.Header().Timestamp = ds.clock.Now()
	}

	// If this is a bounded request, we will change its bound as we receive
	// replies. This undoes that when we return.
	boundedArgs, argsBounded := args.(proto.Bounded)

	if argsBounded {
		defer func(bound int64) {
			boundedArgs.SetBound(bound)
		}(boundedArgs.GetBound())
	}

	defer func(key proto.Key) {
		args.Header().Key = key
	}(args.Header().Key)

	// Retry logic for lookup of range by key and RPCs to range replicas.
	curReply := finalReply
	for {
		call.Reply = curReply
		curReply.Header().Reset()

		var desc, descNext *proto.RangeDescriptor
		var err error
		for r := retry.Start(ds.rpcRetryOptions); r.Next(); {
			// Get range descriptor (or, when spanning range, descriptors).
			// sendAttempt below may clear them on certain errors, so we
			// refresh (likely from the cache) on every retry.
			desc, descNext, err = ds.getDescriptors(call)
			// getDescriptors may fail retryably if the first range isn't
			// available via Gossip.
			if err != nil {
				if rErr, ok := err.(util.Retryable); ok && rErr.CanRetry() {
					if log.V(1) {
						log.Warning(err)
					}
					continue
				}
				break
			}
			err = func() error {
				// Truncate the request to our current range, making sure not to
				// touch it unless we have to (it is illegal to send EndKey on
				// commands which do not operate on ranges).
				if descNext != nil {
					defer func(endKey proto.Key) {
						args.Header().EndKey = endKey
					}(args.Header().EndKey)
					args.Header().EndKey = desc.EndKey
				}
				leader := ds.leaderCache.Lookup(proto.RaftID(desc.RaftID))

				// Try to send the call.
				replicas := newReplicaSlice(ds.gossip, desc)

				// Rearrange the replicas so that those replicas with long common
				// prefix of attributes end up first. If there's no prefix, this is a
				// no-op.
				order := ds.optimizeReplicaOrder(replicas)

				// If this request needs to go to a leader and we know who that is, move
				// it to the front.
				if !(proto.IsRead(args) && args.Header().ReadConsistency == proto.INCONSISTENT) &&
					leader.StoreID > 0 {
					if i := replicas.FindReplica(leader.StoreID); i >= 0 {
						replicas.MoveToFront(i)
						order = rpc.OrderStable
					}
				}
				return ds.sendRPC(desc.RaftID, replicas, order, args, curReply)
			}()

			if err != nil {
				// For an RPC error to occur, we must've been unable to contact any
				// replicas. In this case, likely all nodes are down (or not getting back
				// to us within a reasonable amount of time).
				// We may simply not be trying to talk to the up-to-date replicas, so
				// clearing the descriptor here should be a good idea.
				// TODO(tschottdorf): If a replica group goes dead, this will cause clients
				// to put high read pressure on the first range, so there should be some
				// rate limiting here.
				ds.rangeCache.EvictCachedRangeDescriptor(args.Header().Key, desc)
			} else {
				err = curReply.Header().GoError()
			}

			if err != nil {
				if log.V(1) {
					log.Warningf("failed to invoke %s: %s", call.Method(), err)
				}

				// If retryable, allow retry. For range not found or range
				// key mismatch errors, we don't backoff on the retry,
				// but reset the backoff loop so we can retry immediately.
				switch tErr := err.(type) {
				case *proto.RangeNotFoundError, *proto.RangeKeyMismatchError:
					// Range descriptor might be out of date - evict it.
					ds.rangeCache.EvictCachedRangeDescriptor(args.Header().Key, desc)
					// On addressing errors, don't backoff; retry immediately.
					r.Reset()
					if log.V(1) {
						log.Warning(err)
					}
					continue
				case *proto.NotLeaderError:
					newLeader := tErr.GetLeader()
					// Verify that leader is a known replica according to the
					// descriptor. If not, we've got a stale replica; evict cache.
					// Next, cache the new leader.
					if newLeader != nil {
						if i, _ := desc.FindReplica(newLeader.StoreID); i == -1 {
							if log.V(1) {
								log.Infof("error indicates unknown leader %s, expunging descriptor %s", newLeader, desc)
							}
							ds.rangeCache.EvictCachedRangeDescriptor(args.Header().Key, desc)
						}
					} else {
						newLeader = &proto.Replica{}
					}
					ds.updateLeaderCache(proto.RaftID(desc.RaftID), *newLeader)
					if log.V(1) {
						log.Warning(err)
					}
					r.Reset()
					continue
				case util.Retryable:
					if tErr.CanRetry() {
						if log.V(1) {
							log.Warning(err)
						}
						continue
					}
				}
			}
			break
		}

		// Immediately return if querying a range failed non-retryably.
		// For multi-range requests, we return the failing range's reply.
		if err != nil {
			call.Reply.Header().SetGoError(err)
			return
		}

		if finalReply != curReply {
			// This was the second or later call in a multi-range request.
			// Combine the new response with the existing one.
			if cFinalReply, ok := finalReply.(proto.Combinable); ok {
				cFinalReply.Combine(curReply)
			} else {
				// This should never apply in practice, as we'll only end up here
				// for range-spanning requests.
				call.Reply.Header().SetGoError(util.Errorf("multi-range request with non-combinable response type"))
				return
			}
		}

		// If this request has a bound, such as MaxResults in
		// ScanRequest, check whether enough rows have been retrieved.
		if argsBounded {
			if prevBound := boundedArgs.GetBound(); prevBound > 0 {
				if cReply, ok := curReply.(proto.Countable); ok {
					if nextBound := prevBound - cReply.Count(); nextBound > 0 {
						// Update bound for the next round.
						// We've deferred restoring the original bound earlier.
						boundedArgs.SetBound(nextBound)
					} else {
						// Set flag to break the loop.
						descNext = nil
					}
				}
			}
		}

		// If this was the last range accessed by this call, exit loop.
		if descNext == nil {
			break
		}
		// In next iteration, query next range.
		// It's important that we use the EndKey of the current descriptor
		// as opposed to the StartKey of the next one: if the former is stale,
		// it's possible that the next range has since merged the subsequent
		// one, and unless both descriptors are stale, the next descriptor's
		// StartKey would move us to the beginning of the current range,
		// resulting in a duplicate scan.
		args.Header().Key = desc.EndKey

		// This is a multi-range request, make a new reply object for
		// subsequent iterations of the loop.
		curReply = args.CreateReply()
	}
	call.Reply = finalReply
}
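// The header save/restore pattern used throughout Send above (capture the
// original value as a defer argument, mutate the header freely, and let the
// defer put it back on any exit path) is worth seeing in isolation. The
// sketch below is illustrative only; it uses a hypothetical local header
// type and is not part of the DistSender API.
func exampleRestoreHeaderField() {
	type header struct {
		Key string
	}
	h := &header{Key: "a"}

	process := func() {
		// Capture the current value now; restore it when we return,
		// no matter how the function exits.
		defer func(key string) {
			h.Key = key
		}(h.Key)

		// Temporarily advance the key while working on a sub-range.
		h.Key = "m"
		// ... send RPCs, possibly return early on error ...
	}

	process()
	// h.Key is "a" again here, even if process returned early.
}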
func containsKeyRange(desc proto.RangeDescriptor, start, end proto.Key) bool {
	return desc.ContainsKeyRange(keys.KeyAddress(start), keys.KeyAddress(end))
}
func containsKey(desc proto.RangeDescriptor, key proto.Key) bool {
	return desc.ContainsKey(keys.KeyAddress(key))
}
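// The helpers above delegate to the descriptor's containment checks, which
// treat a range as the half-open key interval [StartKey, EndKey). A minimal
// sketch of that semantics, using plain string keys and a hypothetical
// rangeBounds type rather than proto.RangeDescriptor:
type rangeBounds struct {
	start, end string
}

func (r rangeBounds) containsKey(key string) bool {
	// A key belongs to the range if start <= key < end.
	return key >= r.start && key < r.end
}

func (r rangeBounds) containsKeyRange(start, end string) bool {
	// An empty end is treated as a point lookup at start (an assumption of
	// this sketch, mirroring how single-key requests carry no EndKey).
	if end == "" {
		end = start
	}
	// The queried span must sit entirely inside [start, end).
	return start >= r.start && end <= r.end
}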
// truncate restricts all requests contained in the batch to the given key
// range. First, the boundaries of the truncation are obtained: this is the
// intersection between [from, to) and the descriptor's key range. Second,
// all requests contained in the batch are "truncated" to the resulting
// range, inserting NoopRequest appropriately to replace requests which are
// left without a key range to operate on. The number of non-noop requests
// after truncation is returned along with a closure which must be executed
// to undo the truncation, even in case of an error.
// TODO(tschottdorf): Consider returning a new BatchRequest, which has more
// overhead in the common case of a batch which never needs truncation but is
// less magical.
func truncate(br *proto.BatchRequest, desc *proto.RangeDescriptor, from, to proto.Key) (func(), int, error) {
	if !desc.ContainsKey(from) {
		from = desc.StartKey
	}
	if !desc.ContainsKeyRange(desc.StartKey, to) || to == nil {
		to = desc.EndKey
	}
	truncateOne := func(args proto.Request) (bool, []func(), error) {
		if _, ok := args.(*proto.NoopRequest); ok {
			return true, nil, nil
		}
		header := args.Header()
		if !proto.IsRange(args) {
			if len(header.EndKey) > 0 {
				return false, nil, util.Errorf("%T is not a range command, but EndKey is set", args)
			}
			if !desc.ContainsKey(keys.KeyAddress(header.Key)) {
				return true, nil, nil
			}
			return false, nil, nil
		}
		var undo []func()
		key, endKey := header.Key, header.EndKey
		keyAddr, endKeyAddr := keys.KeyAddress(key), keys.KeyAddress(endKey)
		if keyAddr.Less(from) {
			undo = append(undo, func() { header.Key = key })
			header.Key = from
			keyAddr = from
		}
		if !endKeyAddr.Less(to) {
			undo = append(undo, func() { header.EndKey = endKey })
			header.EndKey = to
			endKeyAddr = to
		}
		// Check whether the truncation has left any keys in the range. If not,
		// we need to cut it out of the request.
		return !keyAddr.Less(endKeyAddr), undo, nil
	}

	var fns []func()
	gUndo := func() {
		for _, f := range fns {
			f()
		}
	}

	var numNoop int
	for pos, arg := range br.Requests {
		omit, undo, err := truncateOne(arg.GetValue().(proto.Request))
		if omit {
			numNoop++
			nReq := &proto.RequestUnion{}
			nReq.SetValue(&proto.NoopRequest{})
			oReq := br.Requests[pos]
			br.Requests[pos] = *nReq
			posCpy := pos // for closure
			undo = append(undo, func() {
				br.Requests[posCpy] = oReq
			})
		}
		fns = append(fns, undo...)
		if err != nil {
			return gUndo, 0, err
		}
	}
	return gUndo, len(br.Requests) - numNoop, nil
}
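// truncate's undo mechanism (collect one closure per mutation and return a
// single function that replays them all) is easy to demonstrate in
// isolation. The sketch below clamps a slice of spans to [from, to) using
// hypothetical local types; names like span and clampSpans are not part of
// this package.
type span struct {
	key, endKey string
}

// clampSpans narrows each span to [from, to) and returns a closure which
// restores every span it touched, mirroring the shape of truncate above.
func clampSpans(spans []span, from, to string) func() {
	var undo []func()
	for i := range spans {
		s := spans[i] // copy the original values for the closures below
		if spans[i].key < from {
			idx := i
			undo = append(undo, func() { spans[idx].key = s.key })
			spans[idx].key = from
		}
		if spans[i].endKey > to {
			idx := i
			undo = append(undo, func() { spans[idx].endKey = s.endKey })
			spans[idx].endKey = to
		}
	}
	return func() {
		for _, f := range undo {
			f()
		}
	}
}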
// sendChunk is in charge of sending an "admissible" piece of batch, i.e. one
// which doesn't need to be subdivided further before going to a range (so no
// mixing of forward and reverse scans, etc).
func (ds *DistSender) sendChunk(ctx context.Context, ba proto.BatchRequest) (*proto.BatchResponse, error) {
	// TODO(tschottdorf): prepare for removing Key and EndKey from BatchRequest,
	// making sure that anything that relies on them goes bust.
	ba.Key, ba.EndKey = nil, nil

	isReverse := ba.IsReverse()

	trace := tracer.FromCtx(ctx)

	// The minimal key range encompassing all requests contained within.
	// Local addressing has already been resolved.
	// TODO(tschottdorf): consider rudimentary validation of the batch here
	// (for example, non-range requests with EndKey, or empty key ranges).
	from, to := keys.Range(ba)
	var br *proto.BatchResponse

	// Send the request to one range per iteration.
	for {
		options := lookupOptions{
			useReverseScan: isReverse,
		}

		var curReply *proto.BatchResponse
		var desc *proto.RangeDescriptor
		var needAnother bool
		var err error
		for r := retry.Start(ds.rpcRetryOptions); r.Next(); {
			// Get range descriptor (or, when spanning range, descriptors). Our
			// error handling below may clear them on certain errors, so we
			// refresh (likely from the cache) on every retry.
			descDone := trace.Epoch("meta descriptor lookup")
			var evictDesc func()

			desc, needAnother, evictDesc, err = ds.getDescriptors(from, to, options)
			descDone()

			// getDescriptors may fail retryably if the first range isn't
			// available via Gossip.
			if err != nil {
				if rErr, ok := err.(retry.Retryable); ok && rErr.CanRetry() {
					if log.V(1) {
						log.Warning(err)
					}
					continue
				}
				break
			}

			// If there's no transaction and op spans ranges, possibly
			// re-run as part of a transaction for consistency. The
			// case where we don't need to re-run is if the read
			// consistency is not required.
			if needAnother && ba.Txn == nil && ba.IsRange() &&
				ba.ReadConsistency != proto.INCONSISTENT {
				return nil, &proto.OpRequiresTxnError{}
			}

			// It's possible that the returned descriptor misses parts of the
			// keys it's supposed to scan after it's truncated to match the
			// descriptor. Example revscan [a,g), first desc lookup for "g"
			// returns descriptor [c,d) -> [d,g) is never scanned.
			// We evict and retry in such a case.
			if (isReverse && !desc.ContainsKeyRange(desc.StartKey, to)) || (!isReverse && !desc.ContainsKeyRange(from, desc.EndKey)) {
				evictDesc()
				continue
			}

			curReply, err = func() (*proto.BatchResponse, error) {
				// Truncate the request to our current key range.
				untruncate, numActive, trErr := truncate(&ba, desc, from, to)
				if numActive == 0 {
					untruncate()
					// This shouldn't happen in the wild, but some tests
					// exercise it.
					return nil, util.Errorf("truncation resulted in empty batch on [%s,%s): %s",
						from, to, ba)
				}
				defer untruncate()
				if trErr != nil {
					return nil, trErr
				}
				// TODO(tschottdorf): make key range on batch redundant. The
				// requests within dictate it anyways.
				ba.Key, ba.EndKey = keys.Range(ba)
				reply, err := ds.sendAttempt(trace, ba, desc)
				ba.Key, ba.EndKey = nil, nil

				if err != nil {
					if log.V(0 /* TODO(tschottdorf): 1 */) {
						log.Warningf("failed to invoke %s: %s", ba, err)
					}
				}
				return reply, err
			}()
			// If sending succeeded, break this loop.
			if err == nil {
				break
			}

			// Error handling below.
			// If retryable, allow retry. For range not found or range
			// key mismatch errors, we don't backoff on the retry,
			// but reset the backoff loop so we can retry immediately.
			switch tErr := err.(type) {
			case *rpc.SendError:
				// For an RPC error to occur, we must've been unable to contact
				// any replicas. In this case, likely all nodes are down (or
				// not getting back to us within a reasonable amount of time).
				// We may simply not be trying to talk to the up-to-date
				// replicas, so clearing the descriptor here should be a good
				// idea.
				// TODO(tschottdorf): If a replica group goes dead, this
				// will cause clients to put high read pressure on the first
				// range, so there should be some rate limiting here.
				evictDesc()
				if tErr.CanRetry() {
					continue
				}
			case *proto.RangeNotFoundError, *proto.RangeKeyMismatchError:
				trace.Event(fmt.Sprintf("reply error: %T", err))
				// Range descriptor might be out of date - evict it.
				evictDesc()
				// On addressing errors, don't backoff; retry immediately.
				r.Reset()
				if log.V(1) {
					log.Warning(err)
				}
				// For the remainder of this call, we'll assume that intents
				// are fair game. This replaces more complex logic based on
				// the type of request.
				options.considerIntents = true
				continue
			case *proto.NotLeaderError:
				trace.Event(fmt.Sprintf("reply error: %T", err))
				newLeader := tErr.GetLeader()
				// Verify that leader is a known replica according to the
				// descriptor. If not, we've got a stale replica; evict cache.
				// Next, cache the new leader.
				if newLeader != nil {
					if i, _ := desc.FindReplica(newLeader.StoreID); i == -1 {
						if log.V(1) {
							log.Infof("error indicates unknown leader %s, expunging descriptor %s", newLeader, desc)
						}
						evictDesc()
					}
				} else {
					newLeader = &proto.Replica{}
				}
				ds.updateLeaderCache(proto.RangeID(desc.RangeID), *newLeader)
				if log.V(1) {
					log.Warning(err)
				}
				r.Reset()
				continue
			case retry.Retryable:
				if tErr.CanRetry() {
					if log.V(1) {
						log.Warning(err)
					}
					trace.Event(fmt.Sprintf("reply error: %T", err))
					continue
				}
			}
			break
		}

		// Immediately return if querying a range failed non-retryably.
		if err != nil {
			return nil, err
		}

		first := br == nil
		if first {
			// First response from a Range.
			br = curReply
		} else {
			// This was the second or later call in a cross-Range request.
			// Combine the new response with the existing one.
			if err := br.Combine(curReply); err != nil {
				panic(err)
				// TODO(tschottdorf): return nil, err
			}
		}

		// If this request has a bound (such as MaxResults in
		// ScanRequest) and we are going to query at least one more range,
		// check whether enough rows have been retrieved.
		// TODO(tschottdorf): need tests for executing a multi-range batch
		// with various bounded requests which saturate at different times.
		if needAnother {
			// Start with the assumption that all requests are saturated.
			// Below, we look at each and decide whether that's true.
			// Everything that is indeed saturated is "masked out" from the
			// batch request; only if that's all requests does needAnother
			// remain false.
			needAnother = false
			if first {
				// Clone ba.Requests. This is because we're multi-range, and
				// some requests may be bounded, which could lead to them being
				// masked out once they're saturated. We don't want to risk
				// removing requests that way in the "master copy" since that
				// could lead to omitting requests in certain retry scenarios.
				ba.Requests = append([]proto.RequestUnion(nil), ba.Requests...)
			}
			for i, union := range ba.Requests {
				args := union.GetValue()
				if _, ok := args.(*proto.NoopRequest); ok {
					// NoopRequests are skipped.
					continue
				}
				boundedArg, ok := args.(proto.Bounded)
				if !ok {
					// Non-bounded request. We will have to query all ranges.
					needAnother = true
					continue
				}
				prevBound := boundedArg.GetBound()
				cReply, ok := curReply.Responses[i].GetValue().(proto.Countable)
				if !ok || prevBound <= 0 {
					// Request bounded, but without max results. Again, will
					// need to query everything we can. The case in which the reply
					// isn't countable occurs when the request wasn't active for
					// that range (since it didn't apply to it), so the response
					// is a NoopResponse.
					needAnother = true
					continue
				}
				nextBound := prevBound - cReply.Count()
				if nextBound <= 0 {
					// We've hit max results for this piece of the batch. Mask
					// it out (we've copied the requests slice above, so this
					// is kosher).
					ba.Requests[i].Reset() // necessary (no one-of?)
					if !ba.Requests[i].SetValue(&proto.NoopRequest{}) {
						panic("RequestUnion excludes NoopRequest")
					}
					continue
				}
				// The request isn't saturated yet.
				needAnother = true
				boundedArg.SetBound(nextBound)
			}
		}

		// If this was the last range accessed by this call, exit loop.
		if !needAnother {
			return br, nil
		}

		if isReverse {
			// In next iteration, query previous range.
			// We use the StartKey of the current descriptor as opposed to the
			// EndKey of the previous one since that doesn't have bugs when
			// stale descriptors come into play.
			to = prev(ba, desc.StartKey)
		} else {
			// In next iteration, query next range.
			// It's important that we use the EndKey of the current descriptor
			// as opposed to the StartKey of the next one: if the former is stale,
			// it's possible that the next range has since merged the subsequent
			// one, and unless both descriptors are stale, the next descriptor's
			// StartKey would move us to the beginning of the current range,
			// resulting in a duplicate scan.
			from = next(ba, desc.EndKey)
		}
		trace.Event("querying next range")
	}
}
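// The bound bookkeeping at the end of sendChunk can be read on its own: after
// each range's results arrive, every bounded request either has its remaining
// bound decremented or, once saturated, is masked out so later ranges skip
// it. A minimal sketch with hypothetical local types (not the proto types
// used above); counts is assumed to be parallel to reqs.
type boundedReq struct {
	remaining int64 // rows still wanted; 0 means saturated / masked out
}

// applyChunkCounts subtracts the per-request row counts returned by one range
// and reports whether any request still wants more rows.
func applyChunkCounts(reqs []boundedReq, counts []int64) (needAnother bool) {
	for i := range reqs {
		if reqs[i].remaining <= 0 {
			// Already saturated in an earlier iteration; skip.
			continue
		}
		reqs[i].remaining -= counts[i]
		if reqs[i].remaining > 0 {
			// Not saturated yet: another range must be queried.
			needAnother = true
		} else {
			// Saturated: mask it out for subsequent ranges.
			reqs[i].remaining = 0
		}
	}
	return needAnother
}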
// sendAttempt is invoked by Send and handles retry logic and cache eviction
// for a call sent to a single range. It returns a retry status: Break on
// success, and Break, Continue or Reset depending on the error condition
// otherwise. This method is expected to be invoked from within a backoff /
// retry loop to retry the send repeatedly (e.g. to continue processing after
// a critical node becomes available after downtime or the range descriptor
// is refreshed via lookup).
func (ds *DistSender) sendAttempt(desc *proto.RangeDescriptor, call proto.Call) (retry.Status, error) {
	leader := ds.leaderCache.Lookup(proto.RaftID(desc.RaftID))

	// Try to send the call.
	replicas := newReplicaSlice(ds.gossip, desc)

	// Rearrange the replicas so that those replicas with long common
	// prefix of attributes end up first. If there's no prefix, this is a
	// no-op.
	order := ds.optimizeReplicaOrder(replicas)

	args := call.Args
	reply := call.Reply

	// If this request needs to go to a leader and we know who that is, move
	// it to the front.
	if !(proto.IsRead(args) && args.Header().ReadConsistency == proto.INCONSISTENT) &&
		leader.StoreID > 0 {
		if i := replicas.FindReplica(leader.StoreID); i >= 0 {
			replicas.MoveToFront(i)
			order = rpc.OrderStable
		}
	}

	err := ds.sendRPC(desc.RaftID, replicas, order, args, reply)

	if err != nil {
		// For an RPC error to occur, we must've been unable to contact any
		// replicas. In this case, likely all nodes are down (or not getting back
		// to us within a reasonable amount of time).
		// We may simply not be trying to talk to the up-to-date replicas, so
		// clearing the descriptor here should be a good idea.
		// TODO(tschottdorf): If a replica group goes dead, this will cause clients
		// to put high read pressure on the first range, so there should be some
		// rate limiting here.
		ds.rangeCache.EvictCachedRangeDescriptor(args.Header().Key, desc)
	} else {
		err = reply.Header().GoError()
	}

	if err != nil {
		if log.V(1) {
			log.Warningf("failed to invoke %s: %s", call.Method(), err)
		}

		// If retryable, allow retry. For range not found or range
		// key mismatch errors, we don't backoff on the retry,
		// but reset the backoff loop so we can retry immediately.
		switch tErr := err.(type) {
		case *proto.RangeNotFoundError, *proto.RangeKeyMismatchError:
			// Range descriptor might be out of date - evict it.
			ds.rangeCache.EvictCachedRangeDescriptor(args.Header().Key, desc)
			// On addressing errors, don't backoff; retry immediately.
			return retry.Reset, err
		case *proto.NotLeaderError:
			newLeader := tErr.GetLeader()
			// Verify that leader is a known replica according to the
			// descriptor. If not, we've got a stale replica; evict cache.
			// Next, cache the new leader.
			if newLeader != nil {
				if i, _ := desc.FindReplica(newLeader.StoreID); i == -1 {
					if log.V(1) {
						log.Infof("error indicates unknown leader %s, expunging descriptor %s", newLeader, desc)
					}
					ds.rangeCache.EvictCachedRangeDescriptor(args.Header().Key, desc)
				}
			} else {
				newLeader = &proto.Replica{}
			}
			ds.updateLeaderCache(proto.RaftID(desc.RaftID), *newLeader)
			return retry.Reset, err
		case util.Retryable:
			if tErr.CanRetry() {
				return retry.Continue, err
			}
		}
		return retry.Break, err
	}
	return retry.Break, nil
}
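// The replica ordering logic in sendAttempt (and in Send above) boils down
// to: optionally move the cached leader's replica to the front and pin the
// ordering so the RPC layer tries it first. A sketch over a hypothetical
// replicaInfo type, not the rpc package's actual types:
type replicaInfo struct {
	storeID int
}

// moveLeaderFirst returns true if the replica for leaderStoreID was found and
// moved to the front, in which case callers would switch to a stable (i.e.
// non-randomized) send order.
func moveLeaderFirst(replicas []replicaInfo, leaderStoreID int) bool {
	if leaderStoreID <= 0 {
		return false
	}
	for i, r := range replicas {
		if r.storeID == leaderStoreID {
			front := replicas[i]
			// Shift the preceding replicas right by one and put the
			// leader's replica first.
			copy(replicas[1:i+1], replicas[:i])
			replicas[0] = front
			return true
		}
	}
	return false
}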
// Send implements the client.Sender interface. It verifies
// permissions and looks up the appropriate range based on the
// supplied key and sends the RPC according to the specified options.
//
// If the request spans multiple ranges (which is possible for Scan or
// DeleteRange requests), Send sends requests to the individual ranges
// sequentially and combines the results transparently.
//
// This may temporarily adjust the request headers, so the proto.Call
// must not be used concurrently until Send has returned.
func (ds *DistSender) Send(ctx context.Context, call proto.Call) {
	args := call.Args

	// Verify permissions.
	if err := ds.verifyPermissions(call.Args); err != nil {
		call.Reply.Header().SetGoError(err)
		return
	}

	trace := tracer.FromCtx(ctx)

	// In the event that timestamp isn't set and read consistency isn't
	// required, set the timestamp using the local clock.
	if args.Header().ReadConsistency == proto.INCONSISTENT && args.Header().Timestamp.Equal(proto.ZeroTimestamp) {
		// Make sure that after the call, args hasn't changed.
		defer func(timestamp proto.Timestamp) {
			args.Header().Timestamp = timestamp
		}(args.Header().Timestamp)
		args.Header().Timestamp = ds.clock.Now()
	}

	// If this is a bounded request, we will change its bound as we receive
	// replies. This undoes that when we return.
	boundedArgs, argsBounded := args.(proto.Bounded)

	if argsBounded {
		defer func(bound int64) {
			boundedArgs.SetBound(bound)
		}(boundedArgs.GetBound())
	}

	_, isReverseScan := call.Args.(*proto.ReverseScanRequest)
	// Restore to the original range if the scan/reverse_scan crosses range boundaries.
	if isReverseScan {
		defer func(key proto.Key) {
			args.Header().EndKey = key
		}(args.Header().EndKey)
	} else {
		defer func(key proto.Key) {
			args.Header().Key = key
		}(args.Header().Key)
	}

	first := true

	// Retry logic for lookup of range by key and RPCs to range replicas.
	for {
		var curReply proto.Response
		var desc, descNext *proto.RangeDescriptor
		var err error
		for r := retry.Start(ds.rpcRetryOptions); r.Next(); {
			// Get range descriptor (or, when spanning range, descriptors). Our
			// error handling below may clear them on certain errors, so we
			// refresh (likely from the cache) on every retry.
			descDone := trace.Epoch("meta descriptor lookup")
			// It is safe to pass call here (with its embedded reply) because
			// the reply is only used to check that it implements
			// proto.Combinable if the request spans multiple ranges.
			desc, descNext, err = ds.getDescriptors(call)
			descDone()
			// getDescriptors may fail retryably if the first range isn't
			// available via Gossip.
			if err != nil {
				if rErr, ok := err.(retry.Retryable); ok && rErr.CanRetry() {
					if log.V(1) {
						log.Warning(err)
					}
					continue
				}
				break
			}
			// At this point reply.Header().Error may be non-nil!
			curReply, err = ds.sendAttempt(trace, args, desc)

			descKey := args.Header().Key
			if isReverseScan {
				descKey = args.Header().EndKey
			}

			if err != nil {
				trace.Event(fmt.Sprintf("send error: %T", err))
				// For an RPC error to occur, we must've been unable to contact any
				// replicas. In this case, likely all nodes are down (or not getting back
				// to us within a reasonable amount of time).
				// We may simply not be trying to talk to the up-to-date replicas, so
				// clearing the descriptor here should be a good idea.
				// TODO(tschottdorf): If a replica group goes dead, this will cause clients
				// to put high read pressure on the first range, so there should be some
				// rate limiting here.
				ds.rangeCache.EvictCachedRangeDescriptor(descKey, desc, isReverseScan)
			} else {
				err = curReply.Header().GoError()
			}

			if err == nil {
				break
			}

			if log.V(1) {
				log.Warningf("failed to invoke %s: %s", call.Method(), err)
			}

			// If retryable, allow retry. For range not found or range
			// key mismatch errors, we don't backoff on the retry,
			// but reset the backoff loop so we can retry immediately.
			switch tErr := err.(type) {
			case *proto.RangeNotFoundError, *proto.RangeKeyMismatchError:
				trace.Event(fmt.Sprintf("reply error: %T", err))
				// Range descriptor might be out of date - evict it.
				ds.rangeCache.EvictCachedRangeDescriptor(descKey, desc, isReverseScan)
				// On addressing errors, don't backoff; retry immediately.
				r.Reset()
				if log.V(1) {
					log.Warning(err)
				}
				continue
			case *proto.NotLeaderError:
				trace.Event(fmt.Sprintf("reply error: %T", err))
				newLeader := tErr.GetLeader()
				// Verify that leader is a known replica according to the
				// descriptor. If not, we've got a stale replica; evict cache.
				// Next, cache the new leader.
				if newLeader != nil {
					if i, _ := desc.FindReplica(newLeader.StoreID); i == -1 {
						if log.V(1) {
							log.Infof("error indicates unknown leader %s, expunging descriptor %s", newLeader, desc)
						}
						ds.rangeCache.EvictCachedRangeDescriptor(descKey, desc, isReverseScan)
					}
				} else {
					newLeader = &proto.Replica{}
				}
				ds.updateLeaderCache(proto.RangeID(desc.RangeID), *newLeader)
				if log.V(1) {
					log.Warning(err)
				}
				r.Reset()
				continue
			case retry.Retryable:
				if tErr.CanRetry() {
					if log.V(1) {
						log.Warning(err)
					}
					trace.Event(fmt.Sprintf("reply error: %T", err))
					continue
				}
			}
			break
		}

		// Immediately return if querying a range failed non-retryably.
		// For multi-range requests, we return the failing range's reply.
		if err != nil {
			call.Reply.Header().SetGoError(err)
			return
		}

		if first {
			// Equivalent of `*call.Reply = curReply`. Generics!
			dst := reflect.ValueOf(call.Reply).Elem()
			dst.Set(reflect.ValueOf(curReply).Elem())
		} else {
			// This was the second or later call in a multi-range request.
			// Combine the new response with the existing one.
			if cReply, ok := call.Reply.(proto.Combinable); ok {
				cReply.Combine(curReply)
			} else {
				// This should never apply in practice, as we'll only end up here
				// for range-spanning requests.
				call.Reply.Header().SetGoError(util.Errorf("multi-range request with non-combinable response type"))
				return
			}
		}
		first = false

		// If this request has a bound, such as MaxResults in
		// ScanRequest, check whether enough rows have been retrieved.
		if argsBounded {
			if prevBound := boundedArgs.GetBound(); prevBound > 0 {
				if cReply, ok := curReply.(proto.Countable); ok {
					if nextBound := prevBound - cReply.Count(); nextBound > 0 {
						// Update bound for the next round.
						// We've deferred restoring the original bound earlier.
						boundedArgs.SetBound(nextBound)
					} else {
						// Set flag to break the loop.
						descNext = nil
					}
				}
			}
		}

		// If this was the last range accessed by this call, exit loop.
		if descNext == nil {
			break
		}

		if isReverseScan {
			// In next iteration, query previous range.
			// We use the StartKey of the current descriptor as opposed to the
			// EndKey of the previous one.
			args.Header().EndKey = desc.StartKey
		} else {
			// In next iteration, query next range.
			// It's important that we use the EndKey of the current descriptor
			// as opposed to the StartKey of the next one: if the former is stale,
			// it's possible that the next range has since merged the subsequent
			// one, and unless both descriptors are stale, the next descriptor's
			// StartKey would move us to the beginning of the current range,
			// resulting in a duplicate scan.
			args.Header().Key = desc.EndKey
		}
		trace.Event("querying next range")
	}
}
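// The loop above advances through ranges by rewriting the header key from
// the descriptor it just used: forward scans continue at the descriptor's
// EndKey, reverse scans continue at its StartKey. A sketch of that walk over
// a sorted slice of hypothetical descriptors (descBounds and rangesTouched
// are illustrative names, not part of this package):
type descBounds struct {
	start, end string // half-open interval [start, end)
}

// rangesTouched lists the descriptors a forward scan of [from, to) would
// visit, advancing with each descriptor's end key as Send does.
func rangesTouched(descs []descBounds, from, to string) []descBounds {
	var out []descBounds
	key := from
	for key < to {
		var cur *descBounds
		for i := range descs {
			if key >= descs[i].start && key < descs[i].end {
				cur = &descs[i]
				break
			}
		}
		if cur == nil {
			break // gap in the descriptors; nothing more to visit
		}
		out = append(out, *cur)
		// Continue at the current descriptor's EndKey, never the next
		// descriptor's StartKey (see the comment in Send above).
		key = cur.end
	}
	return out
}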
// TestSendRPCOrder verifies that sendRPC correctly takes into account the
// leader, attributes and required consistency to determine where to send
// remote requests.
func TestSendRPCOrder(t *testing.T) {
	defer leaktest.AfterTest(t)
	g, s := makeTestGossip(t)
	defer s()
	rangeID := proto.RangeID(99)

	nodeAttrs := map[int32][]string{
		1: {}, // The local node, set in each test case.
		2: {"us", "west", "gpu"},
		3: {"eu", "dublin", "pdu2", "gpu"},
		4: {"us", "east", "gpu"},
		5: {"us", "east", "gpu", "flaky"},
	}

	// Gets filled below to identify the replica by its address.
	addrToNode := make(map[string]int32)
	makeVerifier := func(expOrder rpc.OrderingPolicy, expAddrs []int32) func(rpc.Options, []net.Addr) error {
		return func(o rpc.Options, addrs []net.Addr) error {
			if o.Ordering != expOrder {
				return util.Errorf("unexpected ordering, wanted %v, got %v", expOrder, o.Ordering)
			}
			var actualAddrs []int32
			for i, a := range addrs {
				if len(expAddrs) <= i {
					return util.Errorf("got unexpected address: %s", a)
				}
				if expAddrs[i] == 0 {
					actualAddrs = append(actualAddrs, 0)
				} else {
					actualAddrs = append(actualAddrs, addrToNode[a.String()])
				}
			}
			if !reflect.DeepEqual(expAddrs, actualAddrs) {
				return util.Errorf("expected %d, but found %d", expAddrs, actualAddrs)
			}
			return nil
		}
	}

	testCases := []struct {
		args       proto.Request
		attrs      []string
		order      rpc.OrderingPolicy
		expReplica []int32
		leader     int32 // 0 for not caching a leader.
		// Naming is somewhat off, as eventually consistent reads usually
		// do not have to go to the leader when a node has a read lease.
		// Would really want CONSENSUS here, but that is not implemented.
		// Likely a test setup here will never have a read lease, but good
		// to keep in mind.
		consistent bool
	}{
		// Inconsistent Scan without matching attributes.
		{
			args:       &proto.ScanRequest{},
			attrs:      []string{},
			order:      rpc.OrderRandom,
			expReplica: []int32{1, 2, 3, 4, 5},
		},
		// Inconsistent Scan with matching attributes.
		// Should move the two nodes matching the attributes to the front and
		// go stable.
		{
			args:  &proto.ScanRequest{},
			attrs: nodeAttrs[5],
			order: rpc.OrderStable,
			// Compare only the first two resulting addresses.
			expReplica: []int32{5, 4, 0, 0, 0},
		},
		// Scan without matching attributes that requires but does not find
		// a leader.
		{
			args:       &proto.ScanRequest{},
			attrs:      []string{},
			order:      rpc.OrderRandom,
			expReplica: []int32{1, 2, 3, 4, 5},
			consistent: true,
		},
		// Put without matching attributes that requires but does not find leader.
		// Should go random and not change anything.
		{
			args:       &proto.PutRequest{},
			attrs:      []string{"nomatch"},
			order:      rpc.OrderRandom,
			expReplica: []int32{1, 2, 3, 4, 5},
		},
		// Put with matching attributes but no leader.
		// Should move the two nodes matching the attributes to the front and
		// go stable.
		{
			args:  &proto.PutRequest{},
			attrs: append(nodeAttrs[5], "irrelevant"),
			// Compare only the first two resulting addresses.
			order:      rpc.OrderStable,
			expReplica: []int32{5, 4, 0, 0, 0},
		},
		// Put with matching attributes that finds the leader (node 2).
		// Should address the leader and the two nodes matching the attributes
		// (the last and second to last) in that order.
		{
			args:  &proto.PutRequest{},
			attrs: append(nodeAttrs[5], "irrelevant"),
			// Compare only the first resulting addresses as we have a leader
			// and that means we're only trying to send there.
			order:      rpc.OrderStable,
			expReplica: []int32{2, 5, 4, 0, 0},
			leader:     2,
		},
		// Inconsistent Get without matching attributes but with a leader
		// (node 2). Should just go random as the leader does not matter.
		{
			args:       &proto.GetRequest{},
			attrs:      []string{},
			order:      rpc.OrderRandom,
			expReplica: []int32{1, 2, 3, 4, 5},
			leader:     2,
		},
	}

	descriptor := proto.RangeDescriptor{
		StartKey: proto.KeyMin,
		EndKey:   proto.KeyMax,
		RangeID:  rangeID,
		Replicas: nil,
	}

	// Stub to be changed in each test case.
	var verifyCall func(rpc.Options, []net.Addr) error

	var testFn rpcSendFn = func(opts rpc.Options, method string, addrs []net.Addr,
		_ func(addr net.Addr) gogoproto.Message, getReply func() gogoproto.Message,
		_ *rpc.Context) ([]gogoproto.Message, error) {
		if err := verifyCall(opts, addrs); err != nil {
			return nil, err
		}
		return []gogoproto.Message{getReply()}, nil
	}

	ctx := &DistSenderContext{
		RPCSend: testFn,
		RangeDescriptorDB: mockRangeDescriptorDB(func(proto.Key, lookupOptions) ([]proto.RangeDescriptor, error) {
			return []proto.RangeDescriptor{descriptor}, nil
		}),
	}

	ds := NewDistSender(ctx, g)

	for n, tc := range testCases {
		verifyCall = makeVerifier(tc.order, tc.expReplica)
		descriptor.Replicas = nil // could do this once above, but more convenient here
		for i := int32(1); i <= 5; i++ {
			addr := util.MakeUnresolvedAddr("tcp", fmt.Sprintf("node%d", i))
			addrToNode[addr.String()] = i
			nd := &proto.NodeDescriptor{
				NodeID:  proto.NodeID(i),
				Address: util.MakeUnresolvedAddr(addr.Network(), addr.String()),
				Attrs: proto.Attributes{
					Attrs: nodeAttrs[i],
				},
			}
			if err := g.AddInfoProto(gossip.MakeNodeIDKey(proto.NodeID(i)), nd, time.Hour); err != nil {
				t.Fatal(err)
			}

			descriptor.Replicas = append(descriptor.Replicas, proto.Replica{
				NodeID:  proto.NodeID(i),
				StoreID: proto.StoreID(i),
			})
		}

		{
			// The local node needs to get its attributes during sendRPC.
			nd := &proto.NodeDescriptor{
				NodeID: 6,
				Attrs: proto.Attributes{
					Attrs: tc.attrs,
				},
			}
			if err := g.SetNodeDescriptor(nd); err != nil {
				t.Fatal(err)
			}
		}

		ds.leaderCache.Update(proto.RangeID(rangeID), proto.Replica{})
		if tc.leader > 0 {
			ds.leaderCache.Update(proto.RangeID(rangeID), descriptor.Replicas[tc.leader-1])
		}

		args := tc.args
		args.Header().RangeID = rangeID // Not used in this test, but why not.
		args.Header().Key = proto.Key("a")
		if proto.IsRange(args) {
			args.Header().EndKey = proto.Key("b")
		}
		if !tc.consistent {
			args.Header().ReadConsistency = proto.INCONSISTENT
		}
		// Kill the cached NodeDescriptor, enforcing a lookup from Gossip.
		ds.nodeDescriptor = nil
		if _, err := batchutil.SendWrapped(ds, args); err != nil {
			t.Errorf("%d: %s", n, err)
		}
	}
}