Exemple #1
0
// NewCommandQueue returns a new command queue.
func NewCommandQueue() *CommandQueue {
	cq := &CommandQueue{
		tree: interval.Tree{Overlapper: interval.Range.OverlapExclusive},
		wRg:  interval.NewRangeTree(),
		rwRg: interval.NewRangeTree(),
	}
	return cq
}
// NewCommandQueue returns a new command queue.
func NewCommandQueue() *CommandQueue {
	cq := &CommandQueue{
		cache: cache.NewIntervalCache(cache.Config{Policy: cache.CacheNone}),
		wRg:   interval.NewRangeTree(),
		rwRg:  interval.NewRangeTree(),
	}
	cq.cache.OnEvicted = cq.onEvicted
	return cq
}
// updateState updates the transaction state in both the success and
// error cases, applying those updates to the corresponding txnMeta
// object when adequate. It also updates certain errors with the
// updated transaction for use by client restarts.
func (tc *TxnCoordSender) updateState(
	startNS int64, ctx context.Context, ba roachpb.BatchRequest,
	br *roachpb.BatchResponse, pErr *roachpb.Error) *roachpb.Error {
	newTxn := &roachpb.Transaction{}
	newTxn.Update(ba.Txn)
	if pErr == nil {
		newTxn.Update(br.Txn)
	} else {
		newTxn.Update(pErr.GetTxn())
	}

	switch t := pErr.GetDetail().(type) {
	case *roachpb.TransactionStatusError:
		// Likely already committed or more obscure errors such as epoch or
		// timestamp regressions; consider txn dead.
		defer tc.cleanupTxn(ctx, *pErr.GetTxn())
	case *roachpb.OpRequiresTxnError:
		panic("OpRequiresTxnError must not happen at this level")
	case *roachpb.ReadWithinUncertaintyIntervalError:
		// If the reader encountered a newer write within the uncertainty
		// interval, we advance the txn's timestamp just past the last observed
		// timestamp from the node.
		restartTS, ok := newTxn.GetObservedTimestamp(pErr.OriginNode)
		if !ok {
			pErr = roachpb.NewError(util.Errorf("no observed timestamp for node %d found on uncertainty restart", pErr.OriginNode))
		} else {
			newTxn.Timestamp.Forward(restartTS)
			newTxn.Restart(ba.UserPriority, newTxn.Priority, newTxn.Timestamp)
		}
	case *roachpb.TransactionAbortedError:
		// Increase timestamp if applicable.
		newTxn.Timestamp.Forward(pErr.GetTxn().Timestamp)
		newTxn.Priority = pErr.GetTxn().Priority
		// Clean up the freshly aborted transaction in defer(), avoiding a
		// race with the state update below.
		defer tc.cleanupTxn(ctx, *newTxn)
	case *roachpb.TransactionPushError:
		// Increase timestamp if applicable, ensuring that we're
		// just ahead of the pushee.
		newTxn.Timestamp.Forward(t.PusheeTxn.Timestamp)
		newTxn.Restart(ba.UserPriority, t.PusheeTxn.Priority-1, newTxn.Timestamp)
	case *roachpb.TransactionRetryError:
		// Increase timestamp so on restart, we're ahead of any timestamp
		// cache entries or newer versions which caused the restart.
		newTxn.Restart(ba.UserPriority, pErr.GetTxn().Priority, newTxn.Timestamp)
	case *roachpb.WriteTooOldError:
		newTxn.Restart(ba.UserPriority, newTxn.Priority, t.ActualTimestamp)
	case nil:
		// Nothing to do here, avoid the default case.
	default:
		if pErr.GetTxn() != nil {
			if pErr.CanRetry() {
				panic("Retryable internal error must not happen at this level")
			} else {
				// Do not clean up the transaction here since the client might still
				// want to continue the transaction. For example, a client might
				// continue its transaction after receiving ConditionFailedError, which
				// can come from a unique index violation.
			}
		}
	}

	if pErr != nil && pErr.GetTxn() != nil {
		// Avoid changing existing errors because sometimes they escape into
		// goroutines and then there are races. Fairly sure there isn't one
		// here, but better safe than sorry.
		pErrShallow := *pErr
		pErrShallow.SetTxn(newTxn)
		pErr = &pErrShallow
	}

	if newTxn.ID == nil {
		return pErr
	}
	txnID := *newTxn.ID
	tc.Lock()
	defer tc.Unlock()
	txnMeta := tc.txns[txnID]
	// For successful transactional requests, keep the written intents and
	// the updated transaction record to be sent along with the reply.
	// The transaction metadata is created with the first writing operation.
	// A tricky edge case is that of a transaction which "fails" on the
	// first writing request, but actually manages to write some intents
	// (for example, due to being multi-range). In this case, there will
	// be an error, but the transaction will be marked as Writing and the
	// coordinator must track the state, for the client's retry will be
	// performed with a Writing transaction which the coordinator rejects
	// unless it is tracking it (on top of it making sense to track it;
	// after all, it **has** laid down intents and only the coordinator
	// can augment a potential EndTransaction call). See #3303.
	var intentGroup interval.RangeGroup
	if txnMeta != nil {
		intentGroup = txnMeta.keys
	} else if pErr == nil || newTxn.Writing {
		intentGroup = interval.NewRangeTree()
	}
	if intentGroup != nil {
		// Adding the intents even on error reduces the likelihood of dangling
		// intents blocking concurrent writers for extended periods of time.
		// See #3346.
		ba.IntentSpanIterate(func(key, endKey roachpb.Key) {
			addKeyRange(intentGroup, key, endKey)
		})

		if txnMeta == nil && intentGroup.Len() > 0 {
			if !newTxn.Writing {
				panic("txn with intents marked as non-writing")
			}
			// If the transaction is already over, there's no point in
			// launching a one-off coordinator which will shut down right
			// away. If we ended up here with an error, we'll always start
			// the coordinator - the transaction has laid down intents, so
			// we expect it to be committed/aborted at some point in the
			// future.
			if _, isEnding := ba.GetArg(roachpb.EndTransaction); pErr != nil || !isEnding {
				log.Trace(ctx, "coordinator spawns")
				txnMeta = &txnMetadata{
					txn:              *newTxn,
					keys:             intentGroup,
					firstUpdateNanos: startNS,
					lastUpdateNanos:  tc.clock.PhysicalNow(),
					timeoutDuration:  tc.clientTimeout,
					txnEnd:           make(chan struct{}),
				}
				tc.txns[txnID] = txnMeta

				if !tc.stopper.RunAsyncTask(func() {
					tc.heartbeatLoop(ctx, txnID)
				}) {
					// The system is already draining and we can't start the
					// heartbeat. We refuse new transactions for now because
					// they're likely not going to have all intents committed.
					// In principle, we can relax this as needed though.
					tc.unregisterTxnLocked(txnID)
					return roachpb.NewError(&roachpb.NodeUnavailableError{})
				}
			} else {
				// If this was a successful one phase commit, update stats
				// directly as they won't otherwise be updated on heartbeat
				// loop shutdown.
				etArgs, ok := br.Responses[len(br.Responses)-1].GetInner().(*roachpb.EndTransactionResponse)
				tc.updateStats(tc.clock.PhysicalNow()-startNS, 0, newTxn.Status, ok && etArgs.OnePhaseCommit)
			}
		}
	}

	// Update our record of this transaction, even on error.
	if txnMeta != nil {
		txnMeta.txn = *newTxn
		if !txnMeta.txn.Writing {
			panic("tracking a non-writing txn")
		}
		txnMeta.setLastUpdate(tc.clock.PhysicalNow())
	}
	if pErr == nil {
		// For successful transactional requests, always send the updated txn
		// record back.
		br.Txn = newTxn
	}
	return pErr
}
// Send implements the batch.Sender interface. If the request is part of a
// transaction, the TxnCoordSender adds the transaction to a map of active
// transactions and begins heartbeating it. Every subsequent request for the
// same transaction updates the lastUpdate timestamp to prevent live
// transactions from being considered abandoned and garbage collected.
// Read/write mutating requests have their key or key range added to the
// transaction's interval tree of key ranges for eventual cleanup via resolved
// write intents; they're tagged to an outgoing EndTransaction request, with
// the receiving replica in charge of resolving them.
func (tc *TxnCoordSender) Send(ctx context.Context, ba roachpb.BatchRequest) (*roachpb.BatchResponse, *roachpb.Error) {
	{
		// Start new or pick up active trace and embed its trace metadata into
		// header for use by RPC recipients. From here on, there's always an active
		// Trace, though its overhead is small unless it's sampled.
		sp := opentracing.SpanFromContext(ctx)
		if sp == nil {
			sp = tc.tracer.StartSpan(opTxnCoordSender)
			defer sp.Finish()
			ctx = opentracing.ContextWithSpan(ctx, sp)
		}
		// TODO(tschottdorf): To get rid of the spurious alloc below we need to
		// implement the carrier interface on ba.Header or make Span non-nullable,
		// both of which force all of ba on the Heap. It's already there, so may
		// not be a big deal, but ba should live on the stack. Also not easy to use
		// a buffer pool here since anything that goes into the RPC layer could be
		// used by goroutines we didn't wait for.
		if ba.Header.Trace == nil {
			ba.Header.Trace = &tracing.Span{}
		}
		if err := tc.tracer.Inject(sp, basictracer.Delegator, ba.Trace); err != nil {
			return nil, roachpb.NewError(err)
		}
	}

	startNS := tc.clock.PhysicalNow()

	if ba.Txn != nil {
		// If this request is part of a transaction...
		if err := tc.maybeBeginTxn(&ba); err != nil {
			return nil, roachpb.NewError(err)
		}
		txnID := *ba.Txn.ID
		// Verify that if this Transaction is not read-only, we have it on file.
		// If not, refuse further operations - the transaction was aborted due
		// to a timeout or the client must have issued a write on another
		// coordinator previously.
		if ba.Txn.Writing {
			tc.Lock()
			_, ok := tc.txns[txnID]
			tc.Unlock()
			if !ok {
				pErr := roachpb.NewErrorf("writing transaction timed out, was aborted, " +
					"or ran on multiple coordinators")
				return nil, pErr
			}
		}

		if rArgs, ok := ba.GetArg(roachpb.EndTransaction); ok {
			et := rArgs.(*roachpb.EndTransactionRequest)
			if len(et.Key) != 0 {
				return nil, roachpb.NewErrorf("EndTransaction must not have a Key set")
			}
			et.Key = ba.Txn.Key
			if len(et.IntentSpans) > 0 {
				// TODO(tschottdorf): it may be useful to allow this later.
				// That would be part of a possible plan to allow txns which
				// write on multiple coordinators.
				return nil, roachpb.NewErrorf("client must not pass intents to EndTransaction")
			}
			tc.Lock()
			txnMeta, metaOK := tc.txns[txnID]
			{
				// Populate et.IntentSpans, taking into account both existing
				// writes (if any) and new writes in this batch, and taking
				// care to perform proper deduplication.
				var keys interval.RangeGroup
				if metaOK {
					keys = txnMeta.keys
				} else {
					keys = interval.NewRangeTree()
				}
				ba.IntentSpanIterate(func(key, endKey roachpb.Key) {
					addKeyRange(keys, key, endKey)
				})
				et.IntentSpans = collectIntentSpans(keys)
			}
			tc.Unlock()

			if len(et.IntentSpans) > 0 {
				// All good, proceed.
			} else if !metaOK {
				// If we don't have the transaction, then this must be a retry
				// by the client. We can no longer reconstruct a correct
				// request so we must fail.
				//
				// TODO(bdarnell): if we had a GetTransactionStatus API then
				// we could lookup the transaction and return either nil or
				// TransactionAbortedError instead of this ambivalent error.
				return nil, roachpb.NewErrorf("transaction is already committed or aborted")
			}
			if len(et.IntentSpans) == 0 {
				// If there aren't any intents, then there's factually no
				// transaction to end. Read-only txns have all of their state in
				// the client.
				return nil, roachpb.NewErrorf("cannot commit a read-only transaction")
			}
			if log.V(1) {
				for _, intent := range et.IntentSpans {
					log.Trace(ctx, fmt.Sprintf("intent: [%s,%s)", intent.Key, intent.EndKey))
				}
			}
		}
	}

	// Send the command through wrapped sender, taking appropriate measures
	// on error.
	var br *roachpb.BatchResponse
	{
		var pErr *roachpb.Error
		br, pErr = tc.wrapped.Send(ctx, ba)

		if _, ok := pErr.GetDetail().(*roachpb.OpRequiresTxnError); ok {
			// TODO(tschottdorf): needs to keep the trace.
			br, pErr = tc.resendWithTxn(ba)
		}

		if pErr = tc.updateState(startNS, ctx, ba, br, pErr); pErr != nil {
			log.Trace(ctx, fmt.Sprintf("error: %s", pErr))
			return nil, pErr
		}
	}

	if br.Txn == nil {
		return br, nil
	}

	if _, ok := ba.GetArg(roachpb.EndTransaction); !ok {
		return br, nil
	}
	// If the --linearizable flag is set, we want to make sure that
	// all the clocks in the system are past the commit timestamp
	// of the transaction. This is guaranteed if either
	// - the commit timestamp is MaxOffset behind startNS
	// - MaxOffset ns were spent in this function
	// when returning to the client. Below we choose the option
	// that involves less waiting, which is likely the first one
	// unless a transaction commits with an odd timestamp.
	if tsNS := br.Txn.Timestamp.WallTime; startNS > tsNS {
		startNS = tsNS
	}
	sleepNS := tc.clock.MaxOffset() -
		time.Duration(tc.clock.PhysicalNow()-startNS)
	if tc.linearizable && sleepNS > 0 {
		defer func() {
			if log.V(1) {
				log.Infof("%v: waiting %s on EndTransaction for linearizability", br.Txn.ID.Short(), util.TruncateDuration(sleepNS, time.Millisecond))
			}
			time.Sleep(sleepNS)
		}()
	}
	if br.Txn.Status != roachpb.PENDING {
		tc.cleanupTxn(ctx, *br.Txn)
	}
	return br, nil
}