// NewTimestampCache returns a new timestamp cache with supplied
// hybrid clock.
func NewTimestampCache(clock *hlc.Clock) *TimestampCache {
	tc := &TimestampCache{
		rCache: cache.NewIntervalCache(cache.Config{Policy: cache.CacheFIFO}),
		wCache: cache.NewIntervalCache(cache.Config{Policy: cache.CacheFIFO}),
	}
	tc.Clear(clock)
	tc.rCache.Config.ShouldEvict = tc.shouldEvict
	tc.wCache.Config.ShouldEvict = tc.shouldEvict
	return tc
}
// newTimestampCache returns a new timestamp cache with supplied
// hybrid clock.
func newTimestampCache(clock *hlc.Clock) *timestampCache {
	tc := &timestampCache{
		rCache:                cache.NewIntervalCache(cache.Config{Policy: cache.CacheFIFO}),
		wCache:                cache.NewIntervalCache(cache.Config{Policy: cache.CacheFIFO}),
		requests:              btree.New(btreeDegree),
		evictionSizeThreshold: defaultEvictionSizeThreshold,
	}
	tc.Clear(clock)
	tc.rCache.Config.ShouldEvict = tc.shouldEvict
	tc.wCache.Config.ShouldEvict = tc.shouldEvict
	return tc
}
// NewCommandQueue returns a new command queue.
func NewCommandQueue() *CommandQueue {
	cq := &CommandQueue{
		cache: cache.NewIntervalCache(cache.Config{Policy: cache.CacheNone}),
	}
	cq.cache.OnEvicted = cq.onEvicted
	return cq
}
// sendOne sends a single call via the wrapped sender. If the call is
// part of a transaction, the TxnCoordSender adds the transaction to a
// map of active transactions and begins heartbeating it. Every
// subsequent call for the same transaction updates the lastUpdate
// timestamp to prevent live transactions from being considered
// abandoned and garbage collected. Read/write mutating requests have
// their key or key range added to the transaction's interval tree of
// key ranges for eventual cleanup via resolved write intents.
//
// On success, and if the call is part of a transaction, the affected
// key range is recorded as live intents for eventual cleanup upon
// transaction commit. Upon successful txn commit, initiates cleanup
// of intents.
func (tc *TxnCoordSender) sendOne(ctx context.Context, call proto.Call) {
	var startNS int64
	header := call.Args.Header()
	trace := tracer.FromCtx(ctx)
	var id string // optional transaction ID
	if header.Txn != nil {
		// If this call is part of a transaction...
		id = string(header.Txn.ID)
		// Verify that if this Transaction is not read-only, we have it on
		// file. If not, refuse writes - the client must have issued a write on
		// another coordinator previously.
		if header.Txn.Writing && proto.IsTransactionWrite(call.Args) {
			tc.Lock()
			_, ok := tc.txns[id]
			tc.Unlock()
			if !ok {
				call.Reply.Header().SetGoError(util.Errorf(
					"transaction must not write on multiple coordinators"))
				return
			}
		}

		// Set the timestamp to the original timestamp for read-only
		// commands and to the transaction timestamp for read/write
		// commands.
		if proto.IsReadOnly(call.Args) {
			header.Timestamp = header.Txn.OrigTimestamp
		} else {
			header.Timestamp = header.Txn.Timestamp
		}

		if args, ok := call.Args.(*proto.EndTransactionRequest); ok {
			// Remember when EndTransaction started in case we want to
			// be linearizable.
			startNS = tc.clock.PhysicalNow()
			// EndTransaction must have its key set to that of the txn.
			header.Key = header.Txn.Key
			if len(args.Intents) > 0 {
				// TODO(tschottdorf): it may be useful to allow this later.
				// That would be part of a possible plan to allow txns which
				// write on multiple coordinators.
				call.Reply.Header().SetGoError(util.Errorf(
					"client must not pass intents to EndTransaction"))
				return
			}
			tc.Lock()
			txnMeta, metaOK := tc.txns[id]
			if id != "" && metaOK {
				args.Intents = txnMeta.intents()
			}
			tc.Unlock()

			if !metaOK {
				// If we don't have the transaction, then this must be a retry
				// by the client. We can no longer reconstruct a correct
				// request so we must fail.
				//
				// TODO(bdarnell): if we had a GetTransactionStatus API then
				// we could lookup the transaction and return either nil or
				// TransactionAbortedError instead of this ambivalent error.
				call.Reply.Header().SetGoError(util.Errorf(
					"transaction is already committed or aborted"))
				return
			} else if len(args.Intents) == 0 {
				// If there aren't any intents, then there's factually no
				// transaction to end. Read-only txns have all of their state in
				// the client.
				call.Reply.Header().SetGoError(util.Errorf(
					"cannot commit a read-only transaction"))
				return
			}
		}
	}

	// Send the command through wrapped sender.
	tc.wrapped.Send(ctx, call)

	// For transactional calls, need to track & update the transaction.
	if header.Txn != nil {
		respHeader := call.Reply.Header()
		if respHeader.Txn == nil {
			// When empty, simply use the request's transaction.
			// This is expected: the Range doesn't bother copying unless the
			// object changes.
			respHeader.Txn = gogoproto.Clone(header.Txn).(*proto.Transaction)
		}
		tc.updateResponseTxn(header, respHeader)
	}

	if txn := call.Reply.Header().Txn; txn != nil {
		if !header.Txn.Equal(txn) {
			panic("transaction ID changed")
		}
		tc.Lock()
		txnMeta := tc.txns[id]
		// If this transactional command leaves transactional intents, add the key
		// or key range to the intents map. If the transaction metadata doesn't yet
		// exist, create it.
		if call.Reply.Header().GoError() == nil {
			if proto.IsTransactionWrite(call.Args) {
				if txnMeta == nil {
					txn.Writing = true
					trace.Event("coordinator spawns")
					txnMeta = &txnMetadata{
						txn:              *txn,
						keys:             cache.NewIntervalCache(cache.Config{Policy: cache.CacheNone}),
						firstUpdateNanos: tc.clock.PhysicalNow(),
						lastUpdateNanos:  tc.clock.PhysicalNow(),
						timeoutDuration:  tc.clientTimeout,
						txnEnd:           make(chan struct{}),
					}
					tc.txns[id] = txnMeta
					if !tc.stopper.RunAsyncTask(func() {
						tc.heartbeatLoop(id)
					}) {
						// The system is already draining and we can't start the
						// heartbeat. We refuse new transactions for now because
						// they're likely not going to have all intents committed.
						// In principle, we can relax this as needed though.
						call.Reply.Header().SetGoError(&proto.NodeUnavailableError{})
						tc.Unlock()
						tc.unregisterTxn(id)
						return
					}
				}
				txnMeta.addKeyRange(header.Key, header.EndKey)
			}
			// Update our record of this transaction.
			if txnMeta != nil {
				txnMeta.txn = *txn
				txnMeta.setLastUpdate(tc.clock.PhysicalNow())
			}
		}
		tc.Unlock()
	}

	// Cleanup intents and transaction map if end of transaction.
	switch t := call.Reply.Header().GoError().(type) {
	case *proto.TransactionStatusError:
		// Likely already committed or more obscure errors such as epoch or
		// timestamp regressions; consider it dead.
		tc.cleanupTxn(trace, t.Txn)
	case *proto.TransactionAbortedError:
		// If already aborted, cleanup the txn on this TxnCoordSender.
		tc.cleanupTxn(trace, t.Txn)
	case *proto.OpRequiresTxnError:
		// Run a one-off transaction with that single command.
		if log.V(1) {
			log.Infof("%s: auto-wrapping in txn and re-executing", call.Method())
		}
		// TODO(tschottdorf): this part is awkward. Consider resending here
		// without starting a new call, which is hard to trace. Plus, the
		// below depends on default configuration.
		tmpDB, err := client.Open(
			fmt.Sprintf("//%s?priority=%d",
				call.Args.Header().User, call.Args.Header().GetUserPriority()),
			client.SenderOpt(tc))
		if err != nil {
			log.Warning(err)
			return
		}
		call.Reply.Reset()
		if err := tmpDB.Txn(func(txn *client.Txn) error {
			txn.SetDebugName("auto-wrap", 0)
			b := &client.Batch{}
			b.InternalAddCall(call)
			return txn.CommitInBatch(b)
		}); err != nil {
			log.Warning(err)
		}
	case nil:
		if txn := call.Reply.Header().Txn; txn != nil {
			if _, ok := call.Args.(*proto.EndTransactionRequest); ok {
				// If the --linearizable flag is set, we want to make sure that
				// all the clocks in the system are past the commit timestamp
				// of the transaction. This is guaranteed if either
				// - the commit timestamp is MaxOffset behind startNS
				// - MaxOffset ns were spent in this function
				// when returning to the client. Below we choose the option
				// that involves less waiting, which is likely the first one
				// unless a transaction commits with an odd timestamp.
				if tsNS := txn.Timestamp.WallTime; startNS > tsNS {
					startNS = tsNS
				}
				sleepNS := tc.clock.MaxOffset() -
					time.Duration(tc.clock.PhysicalNow()-startNS)
				if tc.linearizable && sleepNS > 0 {
					defer func() {
						if log.V(1) {
							log.Infof("%v: waiting %s on EndTransaction for linearizability", txn.Short(), util.TruncateDuration(sleepNS, time.Millisecond))
						}
						time.Sleep(sleepNS)
					}()
				}
				if txn.Status != proto.PENDING {
					tc.cleanupTxn(trace, *txn)
				}

			}
		}
	}
}
Exemple #5
0
// updateState updates the transaction state in both the success and
// error cases, applying those updates to the corresponding txnMeta
// object when adequate. It also updates certain errors with the
// updated transaction for use by client restarts.
func (tc *TxnCoordSender) updateState(ctx context.Context, ba roachpb.BatchRequest, br *roachpb.BatchResponse, pErr *roachpb.Error) *roachpb.Error {
	trace := tracer.FromCtx(ctx)
	newTxn := &roachpb.Transaction{}
	newTxn.Update(ba.GetTxn())
	// TODO(tamird): remove this clone. It's currently needed to avoid race conditions.
	pErr = proto.Clone(pErr).(*roachpb.Error)
	err := pErr.GoError()
	// TODO(bdarnell): We're writing to errors here (and where using ErrorWithIndex);
	// since there's no concept of ownership copy-on-write is always preferable.
	switch t := err.(type) {
	case nil:
		newTxn.Update(br.Txn)
		// Move txn timestamp forward to response timestamp if applicable.
		// TODO(tschottdorf): see (*Replica).executeBatch and comments within.
		// Looks like this isn't necessary any more, nor did it prevent a bug
		// referenced in a TODO there.
		newTxn.Timestamp.Forward(br.Timestamp)
	case *roachpb.TransactionStatusError:
		// Likely already committed or more obscure errors such as epoch or
		// timestamp regressions; consider txn dead.
		defer tc.cleanupTxn(trace, t.Txn)
	case *roachpb.OpRequiresTxnError:
		panic("OpRequiresTxnError must not happen at this level")
	case *roachpb.ReadWithinUncertaintyIntervalError:
		// Mark the host as certain. See the protobuf comment for
		// Transaction.CertainNodes for details.
		if t.NodeID == 0 {
			panic("no replica set in header on uncertainty restart")
		}
		newTxn.Update(&t.Txn)
		newTxn.CertainNodes.Add(t.NodeID)
		// If the reader encountered a newer write within the uncertainty
		// interval, move the timestamp forward, just past that write or
		// up to MaxTimestamp, whichever comes first.
		candidateTS := newTxn.MaxTimestamp
		candidateTS.Backward(t.ExistingTimestamp.Add(0, 1))
		newTxn.Timestamp.Forward(candidateTS)
		newTxn.Restart(ba.GetUserPriority(), newTxn.Priority, newTxn.Timestamp)
		t.Txn = *newTxn
	case *roachpb.TransactionAbortedError:
		trace.SetError()
		newTxn.Update(&t.Txn)
		// Increase timestamp if applicable.
		newTxn.Timestamp.Forward(t.Txn.Timestamp)
		newTxn.Priority = t.Txn.Priority
		t.Txn = *newTxn
		// Clean up the freshly aborted transaction in defer(), avoiding a
		// race with the state update below.
		defer tc.cleanupTxn(trace, t.Txn)
	case *roachpb.TransactionPushError:
		newTxn.Update(t.Txn)
		// Increase timestamp if applicable, ensuring that we're
		// just ahead of the pushee.
		newTxn.Timestamp.Forward(t.PusheeTxn.Timestamp.Add(0, 1))
		newTxn.Restart(ba.GetUserPriority(), t.PusheeTxn.Priority-1, newTxn.Timestamp)
		t.Txn = newTxn
	case *roachpb.TransactionRetryError:
		newTxn.Update(&t.Txn)
		newTxn.Restart(ba.GetUserPriority(), t.Txn.Priority, newTxn.Timestamp)
		t.Txn = *newTxn
	case roachpb.TransactionRestartError:
		// Assertion: The above cases should exhaust all ErrorDetails which
		// carry a Transaction.
		if pErr.Detail != nil {
			panic(fmt.Sprintf("unhandled TransactionRestartError %T", err))
		}
	default:
		trace.SetError()
	}

	return func() *roachpb.Error {
		if len(newTxn.ID) <= 0 {
			return pErr
		}
		id := string(newTxn.ID)
		tc.Lock()
		defer tc.Unlock()
		txnMeta := tc.txns[id]
		// For successful transactional requests, keep the written intents and
		// the updated transaction record to be sent along with the reply.
		// The transaction metadata is created with the first writing operation.
		// A tricky edge case is that of a transaction which "fails" on the
		// first writing request, but actually manages to write some intents
		// (for example, due to being multi-range). In this case, there will
		// be an error, but the transaction will be marked as Writing and the
		// coordinator must track the state, for the client's retry will be
		// performed with a Writing transaction which the coordinator rejects
		// unless it is tracking it (on top of it making sense to track it;
		// after all, it **has** laid down intents and only the coordinator
		// can augment a potential EndTransaction call).
		// consider re-using those.
		if intents := ba.GetIntents(); len(intents) > 0 && (err == nil || newTxn.Writing) {
			if txnMeta == nil {
				if !newTxn.Writing {
					panic("txn with intents marked as non-writing")
				}
				txnMeta = &txnMetadata{
					txn:              *newTxn,
					keys:             cache.NewIntervalCache(cache.Config{Policy: cache.CacheNone}),
					firstUpdateNanos: tc.clock.PhysicalNow(),
					lastUpdateNanos:  tc.clock.PhysicalNow(),
					timeoutDuration:  tc.clientTimeout,
					txnEnd:           make(chan struct{}),
				}
				tc.txns[id] = txnMeta
				// If the transaction is already over, there's no point in
				// launching a one-off coordinator which will shut down right
				// away. If we ended up here with an error, we'll always start
				// the coordinator - the transaction has laid down intents, so
				// we expect it to be committed/aborted at some point in the
				// future.
				if _, isEnding := ba.GetArg(roachpb.EndTransaction); err != nil || !isEnding {
					trace.Event("coordinator spawns")
					if !tc.stopper.RunAsyncTask(func() {
						tc.heartbeatLoop(id)
					}) {
						// The system is already draining and we can't start the
						// heartbeat. We refuse new transactions for now because
						// they're likely not going to have all intents committed.
						// In principle, we can relax this as needed though.
						tc.unregisterTxnLocked(id)
						return roachpb.NewError(&roachpb.NodeUnavailableError{})
					}
				}
			}
			for _, intent := range intents {
				txnMeta.addKeyRange(intent.Key, intent.EndKey)
			}
		}
		// Update our record of this transaction, even on error.
		if txnMeta != nil {
			txnMeta.txn = *newTxn
			if !txnMeta.txn.Writing {
				panic("tracking a non-writing txn")
			}
			txnMeta.setLastUpdate(tc.clock.PhysicalNow())
		}
		if err == nil {
			// For successful transactional requests, always send the updated txn
			// record back.
			br.Txn = newTxn
		}
		return pErr
	}()
}
// updateState updates the transaction state in both the success and
// error cases, applying those updates to the corresponding txnMeta
// object when adequate. It also updates certain errors with the
// updated transaction for use by client restarts.
func (tc *TxnCoordSender) updateState(ctx context.Context, ba proto.BatchRequest, br *proto.BatchResponse, pErr *proto.Error) *proto.Error {
	trace := tracer.FromCtx(ctx)
	newTxn := &proto.Transaction{}
	newTxn.Update(ba.GetTxn())
	err := pErr.GoError()
	switch t := err.(type) {
	case nil:
		newTxn.Update(br.GetTxn())
		// Move txn timestamp forward to response timestamp if applicable.
		// TODO(tschottdorf): see (*Replica).executeBatch and comments within.
		// Looks like this isn't necessary any more, nor did it prevent a bug
		// referenced in a TODO there.
		newTxn.Timestamp.Forward(br.Timestamp)
	case *proto.TransactionStatusError:
		// Likely already committed or more obscure errors such as epoch or
		// timestamp regressions; consider txn dead.
		defer tc.cleanupTxn(trace, t.Txn)
	case *proto.OpRequiresTxnError:
		// TODO(tschottdorf): range-spanning autowrap currently broken.
		panic("TODO(tschottdorf): disabled")
	case *proto.ReadWithinUncertaintyIntervalError:
		// Mark the host as certain. See the protobuf comment for
		// Transaction.CertainNodes for details.
		if t.NodeID == 0 {
			panic("no replica set in header on uncertainty restart")
		}
		newTxn.CertainNodes.Add(t.NodeID)
		// If the reader encountered a newer write within the uncertainty
		// interval, move the timestamp forward, just past that write or
		// up to MaxTimestamp, whichever comes first.
		candidateTS := newTxn.MaxTimestamp
		candidateTS.Backward(t.ExistingTimestamp.Add(0, 1))
		newTxn.Timestamp.Forward(candidateTS)
		newTxn.Restart(ba.GetUserPriority(), newTxn.Priority, newTxn.Timestamp)
		t.Txn = *newTxn
	case *proto.TransactionAbortedError:
		// Increase timestamp if applicable.
		newTxn.Timestamp.Forward(t.Txn.Timestamp)
		newTxn.Priority = t.Txn.Priority
		t.Txn = *newTxn
		// Clean up the freshly aborted transaction in defer(), avoiding a
		// race with the state update below.
		defer tc.cleanupTxn(trace, t.Txn)
	case *proto.TransactionPushError:
		// Increase timestamp if applicable, ensuring that we're
		// just ahead of the pushee.
		newTxn.Timestamp.Forward(t.PusheeTxn.Timestamp.Add(0, 1))
		newTxn.Restart(ba.GetUserPriority(), t.PusheeTxn.Priority-1, newTxn.Timestamp)
		t.Txn = newTxn
	case *proto.TransactionRetryError:
		// Increase timestamp if applicable.
		newTxn.Timestamp.Forward(t.Txn.Timestamp)
		newTxn.Restart(ba.GetUserPriority(), t.Txn.Priority, newTxn.Timestamp)
		t.Txn = *newTxn
	case proto.TransactionRestartError:
		// Assertion: The above cases should exhaust all ErrorDetails which
		// carry a Transaction.
		if pErr.Detail != nil {
			panic(fmt.Sprintf("unhandled TransactionRestartError %T", err))
		}
	}

	return func() *proto.Error {
		if len(newTxn.ID) <= 0 {
			return pErr
		}
		id := string(newTxn.ID)
		tc.Lock()
		defer tc.Unlock()
		txnMeta := tc.txns[id]
		// For successful transactional requests, keep the written intents and
		// the updated transaction record to be sent along with the reply.
		// The transaction metadata is created with the first writing operation
		// TODO(tschottdorf): already computed the intents prior to sending,
		// consider re-using those.
		if intents := ba.GetIntents(); len(intents) > 0 && err == nil {
			if txnMeta == nil {
				newTxn.Writing = true
				txnMeta = &txnMetadata{
					txn:              *newTxn,
					keys:             cache.NewIntervalCache(cache.Config{Policy: cache.CacheNone}),
					firstUpdateNanos: tc.clock.PhysicalNow(),
					lastUpdateNanos:  tc.clock.PhysicalNow(),
					timeoutDuration:  tc.clientTimeout,
					txnEnd:           make(chan struct{}),
				}
				tc.txns[id] = txnMeta
				// If the transaction is already over, there's no point in
				// launching a one-off coordinator which will shut down right
				// away.
				if _, isEnding := ba.GetArg(proto.EndTransaction); !isEnding {
					trace.Event("coordinator spawns")
					if !tc.stopper.RunAsyncTask(func() {
						tc.heartbeatLoop(id)
					}) {
						// The system is already draining and we can't start the
						// heartbeat. We refuse new transactions for now because
						// they're likely not going to have all intents committed.
						// In principle, we can relax this as needed though.
						tc.unregisterTxnLocked(id)
						return proto.NewError(&proto.NodeUnavailableError{})
					}
				}
			}
			for _, intent := range intents {
				txnMeta.addKeyRange(intent.Key, intent.EndKey)
			}
		}
		// Update our record of this transaction, even on error.
		if txnMeta != nil {
			txnMeta.txn.Update(newTxn) // better to replace after #2300
			if !txnMeta.txn.Writing {
				panic("tracking a non-writing txn")
			}
			txnMeta.setLastUpdate(tc.clock.PhysicalNow())
		}
		if err == nil {
			// For successful transactional requests, always send the updated txn
			// record back.
			if br.Txn == nil {
				br.Txn = &proto.Transaction{}
			}
			*br.Txn = *newTxn
		}
		return pErr
	}()
}
// sendOne sends a single call via the wrapped sender. If the call is
// part of a transaction, the TxnCoordSender adds the transaction to a
// map of active transactions and begins heartbeating it. Every
// subsequent call for the same transaction updates the lastUpdate
// timestamp to prevent live transactions from being considered
// abandoned and garbage collected. Read/write mutating requests have
// their key or key range added to the transaction's interval tree of
// key ranges for eventual cleanup via resolved write intents.
//
// On success, and if the call is part of a transaction, the affected
// key range is recorded as live intents for eventual cleanup upon
// transaction commit. Upon successful txn commit, initiates cleanup
// of intents.
func (tc *TxnCoordSender) sendOne(call proto.Call) {
	var startNS int64
	header := call.Args.Header()
	// If this call is part of a transaction...
	if header.Txn != nil {
		// Set the timestamp to the original timestamp for read-only
		// commands and to the transaction timestamp for read/write
		// commands.
		if proto.IsReadOnly(call.Args) {
			header.Timestamp = header.Txn.OrigTimestamp
		} else {
			header.Timestamp = header.Txn.Timestamp
		}
		// EndTransaction must have its key set to that of the txn.
		if _, ok := call.Args.(*proto.EndTransactionRequest); ok {
			header.Key = header.Txn.Key
			// Remember when EndTransaction started in case we want to
			// be linearizable.
			startNS = tc.clock.PhysicalNow()
		}
	}

	// Send the command through wrapped sender.
	tc.wrapped.Send(context.TODO(), call)

	if header.Txn != nil {
		// If not already set, copy the request txn.
		if call.Reply.Header().Txn == nil {
			call.Reply.Header().Txn = gogoproto.Clone(header.Txn).(*proto.Transaction)
		}
		tc.updateResponseTxn(header, call.Reply.Header())
	}

	if txn := call.Reply.Header().Txn; txn != nil {
		tc.Lock()
		txnMeta := tc.txns[string(txn.ID)]
		// If this transactional command leaves transactional intents, add the key
		// or key range to the intents map. If the transaction metadata doesn't yet
		// exist, create it.
		if call.Reply.Header().GoError() == nil {
			if proto.IsTransactionWrite(call.Args) {
				if txnMeta == nil {
					txnMeta = &txnMetadata{
						txn:              *txn,
						keys:             cache.NewIntervalCache(cache.Config{Policy: cache.CacheNone}),
						firstUpdateNanos: tc.clock.PhysicalNow(),
						lastUpdateNanos:  tc.clock.PhysicalNow(),
						timeoutDuration:  tc.clientTimeout,
						txnEnd:           make(chan struct{}),
					}
					id := string(txn.ID)
					tc.txns[id] = txnMeta
					tc.heartbeat(id)
				}
				txnMeta.addKeyRange(header.Key, header.EndKey)
			}
			// Update our record of this transaction.
			if txnMeta != nil {
				txnMeta.txn = *txn
				txnMeta.setLastUpdate(tc.clock.PhysicalNow())
			}
		}
		tc.Unlock()
	}

	// Cleanup intents and transaction map if end of transaction.
	switch t := call.Reply.Header().GoError().(type) {
	case *proto.TransactionStatusError:
		// Likely already committed or more obscure errors such as epoch or
		// timestamp regressions; consider it dead.
		tc.cleanupTxn(t.Txn, nil)
	case *proto.TransactionAbortedError:
		// If already aborted, cleanup the txn on this TxnCoordSender.
		tc.cleanupTxn(t.Txn, nil)
	case *proto.OpRequiresTxnError:
		// Run a one-off transaction with that single command.
		if log.V(1) {
			log.Infof("%s: auto-wrapping in txn and re-executing", call.Method())
		}
		tmpDB, err := client.Open(
			fmt.Sprintf("//%s?priority=%d",
				call.Args.Header().User, call.Args.Header().GetUserPriority()),
			client.SenderOpt(tc))
		if err != nil {
			log.Warning(err)
			return
		}
		call.Reply.Reset()
		if err := tmpDB.Txn(func(txn *client.Txn) error {
			txn.SetDebugName("auto-wrap")
			b := &client.Batch{}
			b.InternalAddCall(call)
			return txn.Commit(b)
		}); err != nil {
			log.Warning(err)
		}
	case nil:
		var resolved []proto.Key
		if txn := call.Reply.Header().Txn; txn != nil {
			if _, ok := call.Args.(*proto.EndTransactionRequest); ok {
				// If the --linearizable flag is set, we want to make sure that
				// all the clocks in the system are past the commit timestamp
				// of the transaction. This is guaranteed if either
				// - the commit timestamp is MaxOffset behind startNS
				// - MaxOffset ns were spent in this function
				// when returning to the client. Below we choose the option
				// that involves less waiting, which is likely the first one
				// unless a transaction commits with an odd timestamp.
				if tsNS := txn.Timestamp.WallTime; startNS > tsNS {
					startNS = tsNS
				}
				sleepNS := tc.clock.MaxOffset() -
					time.Duration(tc.clock.PhysicalNow()-startNS)
				if tc.linearizable && sleepNS > 0 {
					defer func() {
						if log.V(1) {
							log.Infof("%v: waiting %s on EndTransaction for linearizability", txn.Short(), util.TruncateDuration(sleepNS, time.Millisecond))
						}
						time.Sleep(sleepNS)
					}()
				}
				resolved = call.Reply.(*proto.EndTransactionResponse).Resolved
				if txn.Status != proto.PENDING {
					tc.cleanupTxn(*txn, resolved)
				}

			}
		}
	}
}
// updateState updates the transaction state in both the success and
// error cases, applying those updates to the corresponding txnMeta
// object when adequate. It also updates certain errors with the
// updated transaction for use by client restarts.
func (tc *TxnCoordSender) updateState(ctx context.Context, ba roachpb.BatchRequest, br *roachpb.BatchResponse, pErr *roachpb.Error) *roachpb.Error {
	sp, cleanupSp := tracing.SpanFromContext(opTxnCoordSender, tc.tracer, ctx)
	defer cleanupSp()

	newTxn := &roachpb.Transaction{}
	newTxn.Update(ba.Txn)
	if pErr == nil {
		newTxn.Update(br.Txn)
	} else {
		newTxn.Update(pErr.GetTxn())
	}

	// If the request was successful but we're in a transaction which needs to
	// restart but doesn't know it yet, let it restart now (as opposed to
	// waiting until EndTransaction).
	if pErr == nil && newTxn.Isolation == roachpb.SERIALIZABLE &&
		!newTxn.OrigTimestamp.Equal(newTxn.Timestamp) {
		pErr = roachpb.NewErrorWithTxn(roachpb.NewTransactionRetryError(), br.Txn)
		br = nil
	}

	switch t := pErr.GetDetail().(type) {
	case nil:
		// Move txn timestamp forward to response timestamp if applicable.
		// TODO(tschottdorf): see (*Replica).executeBatch and comments within.
		// Looks like this isn't necessary any more, nor did it prevent a bug
		// referenced in a TODO there.
		newTxn.Timestamp.Forward(br.Timestamp)
	case *roachpb.TransactionStatusError:
		// Likely already committed or more obscure errors such as epoch or
		// timestamp regressions; consider txn dead.
		defer tc.cleanupTxn(sp, *pErr.GetTxn())
	case *roachpb.OpRequiresTxnError:
		panic("OpRequiresTxnError must not happen at this level")
	case *roachpb.ReadWithinUncertaintyIntervalError:
		// If the reader encountered a newer write within the uncertainty
		// interval, we advance the txn's timestamp just past the last observed
		// timestamp from the node.
		restartTS, ok := newTxn.GetObservedTimestamp(pErr.OriginNode)
		if !ok {
			pErr = roachpb.NewError(util.Errorf("no observed timestamp for node %d found on uncertainty restart", pErr.OriginNode))
		} else {
			newTxn.Timestamp.Forward(restartTS)
			newTxn.Restart(ba.UserPriority, newTxn.Priority, newTxn.Timestamp)
		}
	case *roachpb.TransactionAbortedError:
		// Increase timestamp if applicable.
		newTxn.Timestamp.Forward(pErr.GetTxn().Timestamp)
		newTxn.Priority = pErr.GetTxn().Priority
		// Clean up the freshly aborted transaction in defer(), avoiding a
		// race with the state update below.
		defer tc.cleanupTxn(sp, *newTxn)
	case *roachpb.TransactionPushError:
		// Increase timestamp if applicable, ensuring that we're
		// just ahead of the pushee.
		newTxn.Timestamp.Forward(t.PusheeTxn.Timestamp.Add(0, 1))
		newTxn.Restart(ba.UserPriority, t.PusheeTxn.Priority-1, newTxn.Timestamp)
	case *roachpb.TransactionRetryError:
		newTxn.Restart(ba.UserPriority, pErr.GetTxn().Priority, newTxn.Timestamp)
	default:
		if pErr.GetTxn() != nil {
			if pErr.CanRetry() {
				panic("Retryable internal error must not happen at this level")
			} else {
				// Do not clean up the transaction here since the client might still
				// want to continue the transaction. For example, a client might
				// continue its transaction after receiving ConditionFailedError, which
				// can come from a unique index violation.
			}
		}
	}

	if pErr != nil && pErr.GetTxn() != nil {
		// Avoid changing existing errors because sometimes they escape into
		// goroutines and then there are races. Fairly sure there isn't one
		// here, but better safe than sorry.
		pErrShallow := *pErr
		pErrShallow.SetTxn(newTxn)
		pErr = &pErrShallow
	}

	if newTxn.ID == nil {
		return pErr
	}
	txnID := *newTxn.ID
	tc.Lock()
	defer tc.Unlock()
	txnMeta := tc.txns[txnID]
	// For successful transactional requests, keep the written intents and
	// the updated transaction record to be sent along with the reply.
	// The transaction metadata is created with the first writing operation.
	// A tricky edge case is that of a transaction which "fails" on the
	// first writing request, but actually manages to write some intents
	// (for example, due to being multi-range). In this case, there will
	// be an error, but the transaction will be marked as Writing and the
	// coordinator must track the state, for the client's retry will be
	// performed with a Writing transaction which the coordinator rejects
	// unless it is tracking it (on top of it making sense to track it;
	// after all, it **has** laid down intents and only the coordinator
	// can augment a potential EndTransaction call). See #3303.
	intents := ba.GetIntentSpans()
	if len(intents) > 0 && (pErr == nil || newTxn.Writing) {
		if txnMeta == nil {
			if !newTxn.Writing {
				panic("txn with intents marked as non-writing")
			}
			// If the transaction is already over, there's no point in
			// launching a one-off coordinator which will shut down right
			// away. If we ended up here with an error, we'll always start
			// the coordinator - the transaction has laid down intents, so
			// we expect it to be committed/aborted at some point in the
			// future.
			if _, isEnding := ba.GetArg(roachpb.EndTransaction); pErr != nil || !isEnding {
				sp.LogEvent("coordinator spawns")
				txnMeta = &txnMetadata{
					txn:              *newTxn,
					keys:             cache.NewIntervalCache(cache.Config{Policy: cache.CacheNone}),
					firstUpdateNanos: tc.clock.PhysicalNow(),
					lastUpdateNanos:  tc.clock.PhysicalNow(),
					timeoutDuration:  tc.clientTimeout,
					txnEnd:           make(chan struct{}),
				}
				tc.txns[txnID] = txnMeta

				if !tc.stopper.RunAsyncTask(func() {
					tc.heartbeatLoop(txnID)
				}) {
					// The system is already draining and we can't start the
					// heartbeat. We refuse new transactions for now because
					// they're likely not going to have all intents committed.
					// In principle, we can relax this as needed though.
					tc.unregisterTxnLocked(txnID)
					return roachpb.NewError(&roachpb.NodeUnavailableError{})
				}
			}
		}
	}
	// Update our record of this transaction, even on error.
	if txnMeta != nil {
		txnMeta.txn = *newTxn
		if !txnMeta.txn.Writing {
			panic("tracking a non-writing txn")
		}
		txnMeta.setLastUpdate(tc.clock.PhysicalNow())
		// Adding the intents even on error reduces the likelihood of dangling
		// intents blocking concurrent writers for extended periods of time.
		// See #3346.
		for _, intent := range intents {
			txnMeta.addKeyRange(intent.Key, intent.EndKey)
		}
	}
	if pErr == nil {
		// For successful transactional requests, always send the updated txn
		// record back.
		br.Txn = newTxn
	}
	return pErr
}