示例#1
0
// updateState updates the transaction state in both the success and
// error cases, applying those updates to the corresponding txnMeta
// object when adequate. It also updates certain errors with the
// updated transaction for use by client restarts.
func (tc *TxnCoordSender) updateState(ctx context.Context, ba proto.BatchRequest, br *proto.BatchResponse, pErr *proto.Error) *proto.Error {
	trace := tracer.FromCtx(ctx)
	newTxn := &proto.Transaction{}
	newTxn.Update(ba.GetTxn())
	err := pErr.GoError()
	switch t := err.(type) {
	case nil:
		newTxn.Update(br.GetTxn())
		// Move txn timestamp forward to response timestamp if applicable.
		// TODO(tschottdorf): see (*Replica).executeBatch and comments within.
		// Looks like this isn't necessary any more, nor did it prevent a bug
		// referenced in a TODO there.
		newTxn.Timestamp.Forward(br.Timestamp)
	case *proto.TransactionStatusError:
		// Likely already committed or more obscure errors such as epoch or
		// timestamp regressions; consider txn dead.
		defer tc.cleanupTxn(trace, t.Txn)
	case *proto.OpRequiresTxnError:
		// TODO(tschottdorf): range-spanning autowrap currently broken.
		panic("TODO(tschottdorf): disabled")
	case *proto.ReadWithinUncertaintyIntervalError:
		// Mark the host as certain. See the protobuf comment for
		// Transaction.CertainNodes for details.
		if t.NodeID == 0 {
			panic("no replica set in header on uncertainty restart")
		}
		newTxn.CertainNodes.Add(t.NodeID)
		// If the reader encountered a newer write within the uncertainty
		// interval, move the timestamp forward, just past that write or
		// up to MaxTimestamp, whichever comes first.
		candidateTS := newTxn.MaxTimestamp
		candidateTS.Backward(t.ExistingTimestamp.Add(0, 1))
		newTxn.Timestamp.Forward(candidateTS)
		newTxn.Restart(ba.GetUserPriority(), newTxn.Priority, newTxn.Timestamp)
		t.Txn = *newTxn
	case *proto.TransactionAbortedError:
		// Increase timestamp if applicable.
		newTxn.Timestamp.Forward(t.Txn.Timestamp)
		newTxn.Priority = t.Txn.Priority
		t.Txn = *newTxn
		// Clean up the freshly aborted transaction in defer(), avoiding a
		// race with the state update below.
		defer tc.cleanupTxn(trace, t.Txn)
	case *proto.TransactionPushError:
		// Increase timestamp if applicable, ensuring that we're
		// just ahead of the pushee.
		newTxn.Timestamp.Forward(t.PusheeTxn.Timestamp.Add(0, 1))
		newTxn.Restart(ba.GetUserPriority(), t.PusheeTxn.Priority-1, newTxn.Timestamp)
		t.Txn = newTxn
	case *proto.TransactionRetryError:
		// Increase timestamp if applicable.
		newTxn.Timestamp.Forward(t.Txn.Timestamp)
		newTxn.Restart(ba.GetUserPriority(), t.Txn.Priority, newTxn.Timestamp)
		t.Txn = *newTxn
	case proto.TransactionRestartError:
		// Assertion: The above cases should exhaust all ErrorDetails which
		// carry a Transaction.
		if pErr.Detail != nil {
			panic(fmt.Sprintf("unhandled TransactionRestartError %T", err))
		}
	}

	return func() *proto.Error {
		if len(newTxn.ID) <= 0 {
			return pErr
		}
		id := string(newTxn.ID)
		tc.Lock()
		defer tc.Unlock()
		txnMeta := tc.txns[id]
		// For successful transactional requests, keep the written intents and
		// the updated transaction record to be sent along with the reply.
		// The transaction metadata is created with the first writing operation
		// TODO(tschottdorf): already computed the intents prior to sending,
		// consider re-using those.
		if intents := ba.GetIntents(); len(intents) > 0 && err == nil {
			if txnMeta == nil {
				newTxn.Writing = true
				txnMeta = &txnMetadata{
					txn:              *newTxn,
					keys:             cache.NewIntervalCache(cache.Config{Policy: cache.CacheNone}),
					firstUpdateNanos: tc.clock.PhysicalNow(),
					lastUpdateNanos:  tc.clock.PhysicalNow(),
					timeoutDuration:  tc.clientTimeout,
					txnEnd:           make(chan struct{}),
				}
				tc.txns[id] = txnMeta
				// If the transaction is already over, there's no point in
				// launching a one-off coordinator which will shut down right
				// away.
				if _, isEnding := ba.GetArg(proto.EndTransaction); !isEnding {
					trace.Event("coordinator spawns")
					if !tc.stopper.RunAsyncTask(func() {
						tc.heartbeatLoop(id)
					}) {
						// The system is already draining and we can't start the
						// heartbeat. We refuse new transactions for now because
						// they're likely not going to have all intents committed.
						// In principle, we can relax this as needed though.
						tc.unregisterTxnLocked(id)
						return proto.NewError(&proto.NodeUnavailableError{})
					}
				}
			}
			for _, intent := range intents {
				txnMeta.addKeyRange(intent.Key, intent.EndKey)
			}
		}
		// Update our record of this transaction, even on error.
		if txnMeta != nil {
			txnMeta.txn.Update(newTxn) // better to replace after #2300
			if !txnMeta.txn.Writing {
				panic("tracking a non-writing txn")
			}
			txnMeta.setLastUpdate(tc.clock.PhysicalNow())
		}
		if err == nil {
			// For successful transactional requests, always send the updated txn
			// record back.
			if br.Txn == nil {
				br.Txn = &proto.Transaction{}
			}
			*br.Txn = *newTxn
		}
		return pErr
	}()
}
示例#2
0
// Send implements the batch.Sender interface. If the request is part of a
// transaction, the TxnCoordSender adds the transaction to a map of active
// transactions and begins heartbeating it. Every subsequent request for the
// same transaction updates the lastUpdate timestamp to prevent live
// transactions from being considered abandoned and garbage collected.
// Read/write mutating requests have their key or key range added to the
// transaction's interval tree of key ranges for eventual cleanup via resolved
// write intents; they're tagged to an outgoing EndTransaction request, with
// the receiving replica in charge of resolving them.
func (tc *TxnCoordSender) Send(ctx context.Context, ba proto.BatchRequest) (*proto.BatchResponse, *proto.Error) {
	tc.maybeBeginTxn(&ba)
	ba.CmdID = ba.GetOrCreateCmdID(tc.clock.PhysicalNow())
	var startNS int64

	// This is the earliest point at which the request has a ClientCmdID and/or
	// TxnID (if applicable). Begin a Trace which follows this request.
	trace := tc.tracer.NewTrace(&ba)
	defer trace.Finalize()
	// TODO(tschottdorf): always "Batch"
	defer trace.Epoch(fmt.Sprintf("sending %s", ba.Method()))()
	ctx = tracer.ToCtx(ctx, trace)

	// TODO(tschottdorf): No looping through the batch will be necessary once
	// we've eliminated all the redundancies.
	for _, arg := range ba.Requests {
		trace.Event(fmt.Sprintf("%T", arg.GetValue()))
		if err := updateForBatch(arg.GetInner(), ba.RequestHeader); err != nil {
			return nil, proto.NewError(err)
		}
	}

	var id string // optional transaction ID
	if ba.Txn != nil {
		// If this request is part of a transaction...
		id = string(ba.Txn.ID)
		// Verify that if this Transaction is not read-only, we have it on
		// file. If not, refuse writes - the client must have issued a write on
		// another coordinator previously.
		if ba.Txn.Writing && ba.IsTransactionWrite() {
			tc.Lock()
			_, ok := tc.txns[id]
			tc.Unlock()
			if !ok {
				return nil, proto.NewError(util.Errorf("transaction must not write on multiple coordinators"))
			}
		}

		// Set the timestamp to the original timestamp for read-only
		// commands and to the transaction timestamp for read/write
		// commands.
		if ba.IsReadOnly() {
			ba.Timestamp = ba.Txn.OrigTimestamp
		} else {
			ba.Timestamp = ba.Txn.Timestamp
		}

		if rArgs, ok := ba.GetArg(proto.EndTransaction); ok {
			et := rArgs.(*proto.EndTransactionRequest)
			// Remember when EndTransaction started in case we want to
			// be linearizable.
			startNS = tc.clock.PhysicalNow()
			if len(et.Intents) > 0 {
				// TODO(tschottdorf): it may be useful to allow this later.
				// That would be part of a possible plan to allow txns which
				// write on multiple coordinators.
				return nil, proto.NewError(util.Errorf("client must not pass intents to EndTransaction"))
			}
			if len(et.Key) != 0 {
				return nil, proto.NewError(util.Errorf("EndTransaction must not have a Key set"))
			}
			et.Key = ba.Txn.Key

			tc.Lock()
			txnMeta, metaOK := tc.txns[id]
			if id != "" && metaOK {
				et.Intents = txnMeta.intents()
			}
			tc.Unlock()

			if intents := ba.GetIntents(); len(intents) > 0 {
				// Writes in Batch, so EndTransaction is fine. Should add
				// outstanding intents to EndTransaction, though.
				// TODO(tschottdorf): possible issues when the batch fails,
				// but the intents have been added anyways.
				// TODO(tschottdorf): some of these intents may be covered
				// by others, for example {[a,b), a}). This can lead to
				// some extra requests when those are non-local to the txn
				// record. But it doesn't seem worth optimizing now.
				et.Intents = append(et.Intents, intents...)
			} else if !metaOK {
				// If we don't have the transaction, then this must be a retry
				// by the client. We can no longer reconstruct a correct
				// request so we must fail.
				//
				// TODO(bdarnell): if we had a GetTransactionStatus API then
				// we could lookup the transaction and return either nil or
				// TransactionAbortedError instead of this ambivalent error.
				return nil, proto.NewError(util.Errorf("transaction is already committed or aborted"))
			}
			if len(et.Intents) == 0 {
				// If there aren't any intents, then there's factually no
				// transaction to end. Read-only txns have all of their state in
				// the client.
				return nil, proto.NewError(util.Errorf("cannot commit a read-only transaction"))
			}
			// TODO(tschottdorf): V(1)
			for _, intent := range et.Intents {
				trace.Event(fmt.Sprintf("intent: [%s,%s)", intent.Key, intent.EndKey))
			}
		}
	}

	// Send the command through wrapped sender, taking appropriate measures
	// on error.
	var br *proto.BatchResponse
	{
		var pErr *proto.Error
		br, pErr = tc.wrapped.Send(ctx, ba)

		if _, ok := pErr.GoError().(*proto.OpRequiresTxnError); ok {
			br, pErr = tc.resendWithTxn(ba)
		}

		if pErr := tc.updateState(ctx, ba, br, pErr); pErr != nil {
			return nil, pErr
		}
	}

	if br.Txn == nil {
		return br, nil
	}

	if _, ok := ba.GetArg(proto.EndTransaction); !ok {
		return br, nil
	}
	// If the --linearizable flag is set, we want to make sure that
	// all the clocks in the system are past the commit timestamp
	// of the transaction. This is guaranteed if either
	// - the commit timestamp is MaxOffset behind startNS
	// - MaxOffset ns were spent in this function
	// when returning to the client. Below we choose the option
	// that involves less waiting, which is likely the first one
	// unless a transaction commits with an odd timestamp.
	if tsNS := br.Txn.Timestamp.WallTime; startNS > tsNS {
		startNS = tsNS
	}
	sleepNS := tc.clock.MaxOffset() -
		time.Duration(tc.clock.PhysicalNow()-startNS)
	if tc.linearizable && sleepNS > 0 {
		defer func() {
			if log.V(1) {
				log.Infof("%v: waiting %s on EndTransaction for linearizability", br.Txn.Short(), util.TruncateDuration(sleepNS, time.Millisecond))
			}
			time.Sleep(sleepNS)
		}()
	}
	if br.Txn.Status != proto.PENDING {
		tc.cleanupTxn(trace, *br.Txn)
	}
	return br, nil
}