예제 #1
0
// updateState updates the transaction state in both the success and
// error cases, applying those updates to the corresponding txnMeta
// object when adequate. It also updates certain errors with the
// updated transaction for use by client restarts.
func (tc *TxnCoordSender) updateState(
	ctx context.Context,
	startNS int64,
	ba roachpb.BatchRequest,
	br *roachpb.BatchResponse,
	pErr *roachpb.Error,
) *roachpb.Error {

	tc.Lock()
	defer tc.Unlock()

	if ba.Txn == nil {
		// Not a transactional request.
		return pErr
	}

	var newTxn roachpb.Transaction
	newTxn.Update(ba.Txn)
	if pErr == nil {
		newTxn.Update(br.Txn)
	} else if errTxn := pErr.GetTxn(); errTxn != nil {
		newTxn.Update(errTxn)
	}

	switch t := pErr.GetDetail().(type) {
	case *roachpb.OpRequiresTxnError:
		panic("OpRequiresTxnError must not happen at this level")
	case *roachpb.ReadWithinUncertaintyIntervalError:
		// If the reader encountered a newer write within the uncertainty
		// interval, we advance the txn's timestamp just past the last observed
		// timestamp from the node.
		restartTS, ok := newTxn.GetObservedTimestamp(pErr.OriginNode)
		if !ok {
			pErr = roachpb.NewError(errors.Errorf("no observed timestamp for node %d found on uncertainty restart", pErr.OriginNode))
		} else {
			newTxn.Timestamp.Forward(restartTS)
			newTxn.Restart(ba.UserPriority, newTxn.Priority, newTxn.Timestamp)
		}
	case *roachpb.TransactionAbortedError:
		// Increase timestamp if applicable.
		newTxn.Timestamp.Forward(pErr.GetTxn().Timestamp)
		newTxn.Priority = pErr.GetTxn().Priority
		// Clean up the freshly aborted transaction in defer(), avoiding a
		// race with the state update below.
		defer tc.cleanupTxnLocked(ctx, newTxn)
	case *roachpb.TransactionPushError:
		// Increase timestamp if applicable, ensuring that we're
		// just ahead of the pushee.
		newTxn.Timestamp.Forward(t.PusheeTxn.Timestamp)
		newTxn.Restart(ba.UserPriority, t.PusheeTxn.Priority-1, newTxn.Timestamp)
	case *roachpb.TransactionRetryError:
		// Increase timestamp so on restart, we're ahead of any timestamp
		// cache entries or newer versions which caused the restart.
		newTxn.Restart(ba.UserPriority, pErr.GetTxn().Priority, newTxn.Timestamp)
	case *roachpb.WriteTooOldError:
		newTxn.Restart(ba.UserPriority, newTxn.Priority, t.ActualTimestamp)
	case nil:
		// Nothing to do here, avoid the default case.
	default:
		// Do not clean up the transaction since we're leaving cancellation of
		// the transaction up to the client. For example, on seeing an error,
		// like TransactionStatusError or ConditionFailedError, the client
		// will call Txn.CleanupOnError() which will cleanup the transaction
		// and its intents. Therefore leave the transaction in the PENDING
		// state and do not call cleanTxnLocked().
	}

	txnID := *newTxn.ID

	txnMeta := tc.txns[txnID]
	// For successful transactional requests, keep the written intents and
	// the updated transaction record to be sent along with the reply.
	// The transaction metadata is created with the first writing operation.
	// A tricky edge case is that of a transaction which "fails" on the
	// first writing request, but actually manages to write some intents
	// (for example, due to being multi-range). In this case, there will
	// be an error, but the transaction will be marked as Writing and the
	// coordinator must track the state, for the client's retry will be
	// performed with a Writing transaction which the coordinator rejects
	// unless it is tracking it (on top of it making sense to track it;
	// after all, it **has** laid down intents and only the coordinator
	// can augment a potential EndTransaction call). See #3303.
	if txnMeta != nil || pErr == nil || newTxn.Writing {
		// Adding the intents even on error reduces the likelihood of dangling
		// intents blocking concurrent writers for extended periods of time.
		// See #3346.
		var keys []roachpb.Span
		if txnMeta != nil {
			keys = txnMeta.keys
		}
		ba.IntentSpanIterate(br, func(key, endKey roachpb.Key) {
			keys = append(keys, roachpb.Span{
				Key:    key,
				EndKey: endKey,
			})
		})

		if txnMeta != nil {
			txnMeta.keys = keys
		} else if len(keys) > 0 {
			if !newTxn.Writing {
				panic("txn with intents marked as non-writing")
			}
			// If the transaction is already over, there's no point in
			// launching a one-off coordinator which will shut down right
			// away. If we ended up here with an error, we'll always start
			// the coordinator - the transaction has laid down intents, so
			// we expect it to be committed/aborted at some point in the
			// future.
			if _, isEnding := ba.GetArg(roachpb.EndTransaction); pErr != nil || !isEnding {
				log.Event(ctx, "coordinator spawns")
				txnMeta = &txnMetadata{
					txn:              newTxn,
					keys:             keys,
					firstUpdateNanos: startNS,
					lastUpdateNanos:  tc.clock.PhysicalNow(),
					timeoutDuration:  tc.clientTimeout,
					txnEnd:           make(chan struct{}),
				}
				tc.txns[txnID] = txnMeta

				if err := tc.stopper.RunAsyncTask(ctx, func(ctx context.Context) {
					tc.heartbeatLoop(ctx, txnID)
				}); err != nil {
					// The system is already draining and we can't start the
					// heartbeat. We refuse new transactions for now because
					// they're likely not going to have all intents committed.
					// In principle, we can relax this as needed though.
					tc.unregisterTxnLocked(txnID)
					return roachpb.NewError(err)
				}
			} else {
				// If this was a successful one phase commit, update stats
				// directly as they won't otherwise be updated on heartbeat
				// loop shutdown.
				etArgs, ok := br.Responses[len(br.Responses)-1].GetInner().(*roachpb.EndTransactionResponse)
				tc.updateStats(tc.clock.PhysicalNow()-startNS, 0, newTxn.Status, ok && etArgs.OnePhaseCommit)
			}
		}
	}

	// Update our record of this transaction, even on error.
	if txnMeta != nil {
		txnMeta.txn.Update(&newTxn)
		if !txnMeta.txn.Writing {
			panic("tracking a non-writing txn")
		}
		txnMeta.setLastUpdate(tc.clock.PhysicalNow())
	}

	if pErr == nil {
		// For successful transactional requests, always send the updated txn
		// record back. Note that we make sure not to share data with newTxn
		// (which may have made it into txnMeta).
		if br.Txn != nil {
			br.Txn.Update(&newTxn)
		} else {
			clonedTxn := newTxn.Clone()
			br.Txn = &clonedTxn
		}
	} else if pErr.GetTxn() != nil {
		// Avoid changing existing errors because sometimes they escape into
		// goroutines and data races can occur.
		pErrShallow := *pErr
		pErrShallow.SetTxn(&newTxn) // SetTxn clones newTxn
		pErr = &pErrShallow
	}

	return pErr
}
예제 #2
0
// sendToReplicas sends one or more RPCs to clients specified by the
// slice of replicas. On success, Send returns the first successful
// reply. If an error occurs which is not specific to a single
// replica, it's returned immediately. Otherwise, when all replicas
// have been tried and failed, returns a send error.
func (ds *DistSender) sendToReplicas(
	opts SendOptions,
	rangeID roachpb.RangeID,
	replicas ReplicaSlice,
	args roachpb.BatchRequest,
	rpcContext *rpc.Context,
) (*roachpb.BatchResponse, error) {
	if len(replicas) < 1 {
		return nil, roachpb.NewSendError(
			fmt.Sprintf("insufficient replicas (%d) to satisfy send request of %d",
				len(replicas), 1))
	}

	var ambiguousResult bool
	var haveCommit bool
	// We only check for committed txns, not aborts because aborts may
	// be retried without any risk of inconsistencies.
	if etArg, ok := args.GetArg(roachpb.EndTransaction); ok &&
		etArg.(*roachpb.EndTransactionRequest).Commit {
		haveCommit = true
	}
	done := make(chan BatchCall, len(replicas))

	transportFactory := opts.transportFactory
	if transportFactory == nil {
		transportFactory = grpcTransportFactory
	}
	transport, err := transportFactory(opts, rpcContext, replicas, args)
	if err != nil {
		return nil, err
	}
	defer transport.Close()
	if transport.IsExhausted() {
		return nil, roachpb.NewSendError(
			fmt.Sprintf("sending to all %d replicas failed", len(replicas)))
	}

	// Send the first request.
	pending := 1
	log.VEventf(opts.ctx, 2, "sending RPC for batch: %s", args.Summary())
	transport.SendNext(done)

	// Wait for completions. This loop will retry operations that fail
	// with errors that reflect per-replica state and may succeed on
	// other replicas.
	var sendNextTimer timeutil.Timer
	defer sendNextTimer.Stop()
	for {
		sendNextTimer.Reset(opts.SendNextTimeout)
		select {
		case <-sendNextTimer.C:
			sendNextTimer.Read = true
			// On successive RPC timeouts, send to additional replicas if available.
			if !transport.IsExhausted() {
				log.VEventf(opts.ctx, 2, "timeout, trying next peer")
				pending++
				transport.SendNext(done)
			}

		case call := <-done:
			pending--
			err := call.Err
			if err == nil {
				if log.V(2) {
					log.Infof(opts.ctx, "RPC reply: %s", call.Reply)
				} else if log.V(1) && call.Reply.Error != nil {
					log.Infof(opts.ctx, "application error: %s", call.Reply.Error)
				}

				if call.Reply.Error == nil {
					return call.Reply, nil
				} else if !ds.handlePerReplicaError(opts.ctx, transport, rangeID, call.Reply.Error) {
					// The error received is not specific to this replica, so we
					// should return it instead of trying other replicas. However,
					// if we're trying to commit a transaction and there are
					// still other RPCs outstanding or an ambiguous RPC error
					// was already received, we must return an ambiguous commit
					// error instead of returned error.
					if haveCommit && (pending > 0 || ambiguousResult) {
						return nil, roachpb.NewAmbiguousResultError()
					}
					return call.Reply, nil
				}

				// Extract the detail so it can be included in the error
				// message if this is our last replica.
				//
				// TODO(bdarnell): The last error is not necessarily the best
				// one to return; we may want to remember the "best" error
				// we've seen (for example, a NotLeaseHolderError conveys more
				// information than a RangeNotFound).
				err = call.Reply.Error.GoError()
			} else {
				if log.V(1) {
					log.Warningf(opts.ctx, "RPC error: %s", err)
				}
				// All connection errors except for an unavailable node (this
				// is GRPC's fail-fast error), may mean that the request
				// succeeded on the remote server, but we were unable to
				// receive the reply. Set the ambiguous commit flag.
				//
				// We retry ambiguous commit batches to avoid returning the
				// unrecoverable AmbiguousResultError. This is safe because
				// repeating an already-successfully applied batch is
				// guaranteed to return either a TransactionReplayError (in
				// case the replay happens at the original leader), or a
				// TransactionRetryError (in case the replay happens at a new
				// leader). If the original attempt merely timed out or was
				// lost, then the batch will succeed and we can be assured the
				// commit was applied just once.
				//
				// The Unavailable code is used by GRPC to indicate that a
				// request fails fast and is not sent, so we can be sure there
				// is no ambiguity on these errors. Note that these are common
				// if a node is down.
				// See https://github.com/grpc/grpc-go/blob/52f6504dc290bd928a8139ba94e3ab32ed9a6273/call.go#L182
				// See https://github.com/grpc/grpc-go/blob/52f6504dc290bd928a8139ba94e3ab32ed9a6273/stream.go#L158
				if haveCommit && grpc.Code(err) != codes.Unavailable {
					ambiguousResult = true
				}
			}

			// Send to additional replicas if available.
			if !transport.IsExhausted() {
				log.VEventf(opts.ctx, 2, "error, trying next peer: %s", err)
				pending++
				transport.SendNext(done)
			}
			if pending == 0 {
				if ambiguousResult {
					err = roachpb.NewAmbiguousResultError()
				} else {
					err = roachpb.NewSendError(
						fmt.Sprintf("sending to all %d replicas failed; last error: %v", len(replicas), err),
					)
				}
				if log.V(2) {
					log.ErrEvent(opts.ctx, err.Error())
				}
				return nil, err
			}
		}
	}
}
예제 #3
0
// Send implements the batch.Sender interface. If the request is part of a
// transaction, the TxnCoordSender adds the transaction to a map of active
// transactions and begins heartbeating it. Every subsequent request for the
// same transaction updates the lastUpdate timestamp to prevent live
// transactions from being considered abandoned and garbage collected.
// Read/write mutating requests have their key or key range added to the
// transaction's interval tree of key ranges for eventual cleanup via resolved
// write intents; they're tagged to an outgoing EndTransaction request, with
// the receiving replica in charge of resolving them.
func (tc *TxnCoordSender) Send(
	ctx context.Context, ba roachpb.BatchRequest,
) (*roachpb.BatchResponse, *roachpb.Error) {
	// Start new or pick up active trace. From here on, there's always an active
	// Trace, though its overhead is small unless it's sampled.
	sp := opentracing.SpanFromContext(ctx)
	var tracer opentracing.Tracer
	if sp == nil {
		tracer = tc.AmbientContext.Tracer
		sp = tracer.StartSpan(opTxnCoordSender)
		defer sp.Finish()
		ctx = opentracing.ContextWithSpan(ctx, sp)
	} else {
		tracer = sp.Tracer()
	}

	startNS := tc.clock.PhysicalNow()

	if ba.Txn != nil {
		// If this request is part of a transaction...
		if err := tc.maybeBeginTxn(&ba); err != nil {
			return nil, roachpb.NewError(err)
		}

		txnID := *ba.Txn.ID

		// Associate the txnID with the trace. We need to do this after the
		// maybeBeginTxn call. We set both a baggage item and a tag because only
		// tags show up in the LIghtstep UI.
		txnIDStr := txnID.String()
		sp.SetTag("txnID", txnIDStr)
		sp.SetBaggageItem("txnID", txnIDStr)

		var et *roachpb.EndTransactionRequest
		var hasET bool
		{
			var rArgs roachpb.Request
			rArgs, hasET = ba.GetArg(roachpb.EndTransaction)
			if hasET {
				et = rArgs.(*roachpb.EndTransactionRequest)
				if len(et.Key) != 0 {
					return nil, roachpb.NewErrorf("EndTransaction must not have a Key set")
				}
				et.Key = ba.Txn.Key
				if len(et.IntentSpans) > 0 {
					// TODO(tschottdorf): it may be useful to allow this later.
					// That would be part of a possible plan to allow txns which
					// write on multiple coordinators.
					return nil, roachpb.NewErrorf("client must not pass intents to EndTransaction")
				}
			}
		}

		if pErr := func() *roachpb.Error {
			tc.Lock()
			defer tc.Unlock()
			if pErr := tc.maybeRejectClientLocked(ctx, *ba.Txn); pErr != nil {
				return pErr
			}

			if !hasET {
				return nil
			}
			// Everything below is carried out only when trying to commit.

			// Populate et.IntentSpans, taking into account both any existing
			// and new writes, and taking care to perform proper deduplication.
			txnMeta := tc.txns[txnID]
			distinctSpans := true
			if txnMeta != nil {
				et.IntentSpans = txnMeta.keys
				// Defensively set distinctSpans to false if we had any previous
				// requests in this transaction. This effectively limits the distinct
				// spans optimization to 1pc transactions.
				distinctSpans = len(txnMeta.keys) == 0
			}
			// We can't pass in a batch response here to better limit the key
			// spans as we don't know what is going to be affected. This will
			// affect queries such as `DELETE FROM my.table LIMIT 10` when
			// executed as a 1PC transaction. e.g.: a (BeginTransaction,
			// DeleteRange, EndTransaction) batch.
			ba.IntentSpanIterate(nil, func(key, endKey roachpb.Key) {
				et.IntentSpans = append(et.IntentSpans, roachpb.Span{
					Key:    key,
					EndKey: endKey,
				})
			})
			// TODO(peter): Populate DistinctSpans on all batches, not just batches
			// which contain an EndTransactionRequest.
			var distinct bool
			// The request might already be used by an outgoing goroutine, so
			// we can't safely mutate anything in-place (as MergeSpans does).
			et.IntentSpans = append([]roachpb.Span(nil), et.IntentSpans...)
			et.IntentSpans, distinct = roachpb.MergeSpans(et.IntentSpans)
			ba.Header.DistinctSpans = distinct && distinctSpans
			if len(et.IntentSpans) == 0 {
				// If there aren't any intents, then there's factually no
				// transaction to end. Read-only txns have all of their state
				// in the client.
				return roachpb.NewErrorf("cannot commit a read-only transaction")
			}
			if txnMeta != nil {
				txnMeta.keys = et.IntentSpans
			}
			return nil
		}(); pErr != nil {
			return nil, pErr
		}

		if hasET && log.V(1) {
			for _, intent := range et.IntentSpans {
				log.Eventf(ctx, "intent: [%s,%s)", intent.Key, intent.EndKey)
			}
		}
	}

	// Embed the trace metadata into the header for use by RPC recipients. We need
	// to do this after the maybeBeginTxn call above.
	// TODO(tschottdorf): To get rid of the spurious alloc below we need to
	// implement the carrier interface on ba.Header or make Span non-nullable,
	// both of which force all of ba on the Heap. It's already there, so may
	// not be a big deal, but ba should live on the stack. Also not easy to use
	// a buffer pool here since anything that goes into the RPC layer could be
	// used by goroutines we didn't wait for.
	if ba.TraceContext == nil {
		ba.TraceContext = &tracing.SpanContextCarrier{}
	} else {
		// We didn't make this object but are about to mutate it, so we
		// have to take a copy - the original might already have been
		// passed to the RPC layer.
		ba.TraceContext = protoutil.Clone(ba.TraceContext).(*tracing.SpanContextCarrier)
	}
	if err := tracer.Inject(sp.Context(), basictracer.Delegator, ba.TraceContext); err != nil {
		return nil, roachpb.NewError(err)
	}

	// Send the command through wrapped sender, taking appropriate measures
	// on error.
	var br *roachpb.BatchResponse
	{
		var pErr *roachpb.Error
		br, pErr = tc.wrapped.Send(ctx, ba)

		if _, ok := pErr.GetDetail().(*roachpb.OpRequiresTxnError); ok {
			// TODO(tschottdorf): needs to keep the trace.
			br, pErr = tc.resendWithTxn(ba)
		}

		if pErr = tc.updateState(ctx, startNS, ba, br, pErr); pErr != nil {
			log.Eventf(ctx, "error: %s", pErr)
			return nil, pErr
		}
	}

	if br.Txn == nil {
		return br, nil
	}

	if _, ok := ba.GetArg(roachpb.EndTransaction); !ok {
		return br, nil
	}
	// If the --linearizable flag is set, we want to make sure that
	// all the clocks in the system are past the commit timestamp
	// of the transaction. This is guaranteed if either
	// - the commit timestamp is MaxOffset behind startNS
	// - MaxOffset ns were spent in this function
	// when returning to the client. Below we choose the option
	// that involves less waiting, which is likely the first one
	// unless a transaction commits with an odd timestamp.
	if tsNS := br.Txn.Timestamp.WallTime; startNS > tsNS {
		startNS = tsNS
	}
	sleepNS := tc.clock.MaxOffset() -
		time.Duration(tc.clock.PhysicalNow()-startNS)
	if tc.linearizable && sleepNS > 0 {
		defer func() {
			if log.V(1) {
				log.Infof(ctx, "%v: waiting %s on EndTransaction for linearizability", br.Txn.Short(), util.TruncateDuration(sleepNS, time.Millisecond))
			}
			time.Sleep(sleepNS)
		}()
	}
	if br.Txn.Status != roachpb.PENDING {
		tc.Lock()
		tc.cleanupTxnLocked(ctx, *br.Txn)
		tc.Unlock()
	}
	return br, nil
}