Exemple #1
0
// sendRPC sends one or more RPCs to replicas from the supplied
// roachpb.Replica slice. Returns an RPC error if the request could
// not be sent. Note that the reply may contain a higher level error
// and must be checked in addition to the RPC error.
//
// The replicas are assumed to be ordered by preference, with closer
// ones (i.e. expected lowest latency) first.
func (ds *DistSender) sendRPC(
	ctx context.Context, rangeID roachpb.RangeID, replicas ReplicaSlice, ba roachpb.BatchRequest,
) (*roachpb.BatchResponse, error) {
	if len(replicas) == 0 {
		return nil, roachpb.NewSendError(
			fmt.Sprintf("no replica node addresses available via gossip for range %d", rangeID))
	}

	// TODO(pmattis): This needs to be tested. If it isn't set we'll
	// still route the request appropriately by key, but won't receive
	// RangeNotFoundErrors.
	ba.RangeID = rangeID

	// Set RPC opts with stipulation that one of N RPCs must succeed.
	rpcOpts := SendOptions{
		ctx:              ctx,
		SendNextTimeout:  ds.sendNextTimeout,
		transportFactory: ds.transportFactory,
	}
	tracing.AnnotateTrace()
	defer tracing.AnnotateTrace()

	reply, err := ds.sendToReplicas(rpcOpts, rangeID, replicas, ba, ds.rpcContext)
	if err != nil {
		return nil, err
	}
	return reply, nil
}
Exemple #2
0
// Test the behavior of SendNextTimeout when some servers return
// RPC errors but one succeeds.
func TestSendNext_RPCErrorThenSuccess(t *testing.T) {
	defer leaktest.AfterTest(t)()

	doneChans, sendChan, stopper := setupSendNextTest(t)
	defer stopper.Stop()

	// Now that all replicas have been contacted, let two finish with
	// retryable errors.
	for i := 1; i <= 2; i++ {
		doneChans[i] <- BatchCall{
			Reply: nil,
			Err:   roachpb.NewSendError("boom"),
		}
	}

	// The client is still waiting for the third slow RPC to complete.
	select {
	case bc := <-sendChan:
		t.Fatalf("got unexpected response %v", bc)
	default:
	}

	// Now let the final server complete the RPC successfully.
	doneChans[0] <- BatchCall{
		Reply: &roachpb.BatchResponse{},
		Err:   nil,
	}

	// The client side now completes successfully.
	bc := <-sendChan
	if bc.Err != nil {
		t.Fatal(bc.Err)
	}
}
Exemple #3
0
// sendRPC sends one or more RPCs to replicas from the supplied
// roachpb.Replica slice. Returns an RPC error if the request could
// not be sent. Note that the reply may contain a higher level error
// and must be checked in addition to the RPC error.
//
// The replicas are assumed to be ordered by preference, with closer
// ones (i.e. expected lowest latency) first.
func (ds *DistSender) sendRPC(
	ctx context.Context, rangeID roachpb.RangeID, replicas ReplicaSlice, ba roachpb.BatchRequest,
) (*roachpb.BatchResponse, error) {
	if len(replicas) == 0 {
		return nil, roachpb.NewSendError(
			fmt.Sprintf("no replica node addresses available via gossip for range %d", rangeID))
	}

	// TODO(pmattis): This needs to be tested. If it isn't set we'll
	// still route the request appropriately by key, but won't receive
	// RangeNotFoundErrors.
	ba.RangeID = rangeID

	// A given RPC may generate retries to multiple replicas, but as soon as we
	// get a response from one we want to cancel those other RPCs.
	ctx, cancel := context.WithCancel(ctx)
	defer cancel()

	// Set RPC opts with stipulation that one of N RPCs must succeed.
	rpcOpts := SendOptions{
		SendNextTimeout:  ds.sendNextTimeout,
		transportFactory: ds.transportFactory,
		metrics:          &ds.metrics,
	}
	tracing.AnnotateTrace()
	defer tracing.AnnotateTrace()

	reply, err := ds.sendToReplicas(ctx, rpcOpts, rangeID, replicas, ba, ds.rpcContext)
	if err != nil {
		return nil, err
	}
	return reply, nil
}
Exemple #4
0
func (f *firstNErrorTransport) SendNext(done chan<- BatchCall) {
	call := BatchCall{
		Reply: &roachpb.BatchResponse{},
	}
	if f.numSent < f.numErrors {
		call.Err = roachpb.NewSendError("test")
	}
	f.numSent++
	done <- call
}
Exemple #5
0
// Send implements the Sender interface.
func (s sender) Send(
	ctx context.Context, ba roachpb.BatchRequest,
) (*roachpb.BatchResponse, *roachpb.Error) {
	br, err := s.Batch(ctx, &ba)
	if err != nil {
		return nil, roachpb.NewError(roachpb.NewSendError(err.Error()))
	}
	pErr := br.Error
	br.Error = nil
	return br, pErr
}
Exemple #6
0
// sendToReplicas sends one or more RPCs to clients specified by the
// slice of replicas. On success, Send returns the first successful
// reply. If an error occurs which is not specific to a single
// replica, it's returned immediately. Otherwise, when all replicas
// have been tried and failed, returns a send error.
func (ds *DistSender) sendToReplicas(
	opts SendOptions,
	rangeID roachpb.RangeID,
	replicas ReplicaSlice,
	args roachpb.BatchRequest,
	rpcContext *rpc.Context,
) (*roachpb.BatchResponse, error) {
	if len(replicas) < 1 {
		return nil, roachpb.NewSendError(
			fmt.Sprintf("insufficient replicas (%d) to satisfy send request of %d",
				len(replicas), 1))
	}

	var ambiguousResult bool
	var haveCommit bool
	// We only check for committed txns, not aborts because aborts may
	// be retried without any risk of inconsistencies.
	if etArg, ok := args.GetArg(roachpb.EndTransaction); ok &&
		etArg.(*roachpb.EndTransactionRequest).Commit {
		haveCommit = true
	}
	done := make(chan BatchCall, len(replicas))

	transportFactory := opts.transportFactory
	if transportFactory == nil {
		transportFactory = grpcTransportFactory
	}
	transport, err := transportFactory(opts, rpcContext, replicas, args)
	if err != nil {
		return nil, err
	}
	defer transport.Close()
	if transport.IsExhausted() {
		return nil, roachpb.NewSendError(
			fmt.Sprintf("sending to all %d replicas failed", len(replicas)))
	}

	// Send the first request.
	pending := 1
	log.VEventf(opts.ctx, 2, "sending RPC for batch: %s", args.Summary())
	transport.SendNext(done)

	// Wait for completions. This loop will retry operations that fail
	// with errors that reflect per-replica state and may succeed on
	// other replicas.
	var sendNextTimer timeutil.Timer
	defer sendNextTimer.Stop()
	for {
		sendNextTimer.Reset(opts.SendNextTimeout)
		select {
		case <-sendNextTimer.C:
			sendNextTimer.Read = true
			// On successive RPC timeouts, send to additional replicas if available.
			if !transport.IsExhausted() {
				log.VEventf(opts.ctx, 2, "timeout, trying next peer")
				pending++
				transport.SendNext(done)
			}

		case call := <-done:
			pending--
			err := call.Err
			if err == nil {
				if log.V(2) {
					log.Infof(opts.ctx, "RPC reply: %s", call.Reply)
				} else if log.V(1) && call.Reply.Error != nil {
					log.Infof(opts.ctx, "application error: %s", call.Reply.Error)
				}

				if call.Reply.Error == nil {
					return call.Reply, nil
				} else if !ds.handlePerReplicaError(opts.ctx, transport, rangeID, call.Reply.Error) {
					// The error received is not specific to this replica, so we
					// should return it instead of trying other replicas. However,
					// if we're trying to commit a transaction and there are
					// still other RPCs outstanding or an ambiguous RPC error
					// was already received, we must return an ambiguous commit
					// error instead of returned error.
					if haveCommit && (pending > 0 || ambiguousResult) {
						return nil, roachpb.NewAmbiguousResultError()
					}
					return call.Reply, nil
				}

				// Extract the detail so it can be included in the error
				// message if this is our last replica.
				//
				// TODO(bdarnell): The last error is not necessarily the best
				// one to return; we may want to remember the "best" error
				// we've seen (for example, a NotLeaseHolderError conveys more
				// information than a RangeNotFound).
				err = call.Reply.Error.GoError()
			} else {
				if log.V(1) {
					log.Warningf(opts.ctx, "RPC error: %s", err)
				}
				// All connection errors except for an unavailable node (this
				// is GRPC's fail-fast error), may mean that the request
				// succeeded on the remote server, but we were unable to
				// receive the reply. Set the ambiguous commit flag.
				//
				// We retry ambiguous commit batches to avoid returning the
				// unrecoverable AmbiguousResultError. This is safe because
				// repeating an already-successfully applied batch is
				// guaranteed to return either a TransactionReplayError (in
				// case the replay happens at the original leader), or a
				// TransactionRetryError (in case the replay happens at a new
				// leader). If the original attempt merely timed out or was
				// lost, then the batch will succeed and we can be assured the
				// commit was applied just once.
				//
				// The Unavailable code is used by GRPC to indicate that a
				// request fails fast and is not sent, so we can be sure there
				// is no ambiguity on these errors. Note that these are common
				// if a node is down.
				// See https://github.com/grpc/grpc-go/blob/52f6504dc290bd928a8139ba94e3ab32ed9a6273/call.go#L182
				// See https://github.com/grpc/grpc-go/blob/52f6504dc290bd928a8139ba94e3ab32ed9a6273/stream.go#L158
				if haveCommit && grpc.Code(err) != codes.Unavailable {
					ambiguousResult = true
				}
			}

			// Send to additional replicas if available.
			if !transport.IsExhausted() {
				log.VEventf(opts.ctx, 2, "error, trying next peer: %s", err)
				pending++
				transport.SendNext(done)
			}
			if pending == 0 {
				if ambiguousResult {
					err = roachpb.NewAmbiguousResultError()
				} else {
					err = roachpb.NewSendError(
						fmt.Sprintf("sending to all %d replicas failed; last error: %v", len(replicas), err),
					)
				}
				if log.V(2) {
					log.ErrEvent(opts.ctx, err.Error())
				}
				return nil, err
			}
		}
	}
}