Example #1
// Send implements the client.Sender interface.
func (rls *retryableLocalSender) Send(_ context.Context, call proto.Call) {
	// Instant retry to handle the case of a range split, which is
	// exposed here as a RangeKeyMismatchError.
	retryOpts := retry.Options{
		Tag: fmt.Sprintf("routing %s locally", call.Method()),
	}
	// In local tests, the RPCs are not actually sent over the wire. We
	// need to clone the Txn in order to avoid unexpected sharing
	// between TxnCoordSender and client.Txn.
	if header := call.Args.Header(); header.Txn != nil {
		header.Txn = gogoproto.Clone(header.Txn).(*proto.Transaction)
	}
	err := retry.WithBackoff(retryOpts, func() (retry.Status, error) {
		call.Reply.Header().Error = nil
		rls.LocalSender.Send(context.TODO(), call)
		// Check for range key mismatch error (this could happen if
		// range was split between lookup and execution). In this case,
		// reset header.Replica and engage retry loop.
		if err := call.Reply.Header().GoError(); err != nil {
			if _, ok := err.(*proto.RangeKeyMismatchError); ok {
				// Clear request replica.
				call.Args.Header().Replica = proto.Replica{}
				return retry.Continue, err
			}
		}
		return retry.Break, nil
	})
	if err != nil {
		panic(fmt.Sprintf("local sender did not succeed: %s", err))
	}
}
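Example #1 leans on the retry package's contract: the closure returns retry.Continue to back off and try again, or retry.Break to stop. Below is a self-contained toy of that contract; the Status type and withBackoff helper are illustrative re-implementations, not the actual retry package.

package main

import (
	"errors"
	"fmt"
)

// Status is a toy stand-in for the retry.Status values used above.
type Status int

const (
	Break    Status = iota // stop retrying
	Continue               // back off and try again
)

// withBackoff keeps invoking fn until it returns Break, mirroring the
// shape of retry.WithBackoff in the examples (backoff timing omitted).
func withBackoff(maxAttempts int, fn func() (Status, error)) error {
	var err error
	for i := 0; i < maxAttempts; i++ {
		var status Status
		status, err = fn()
		if status == Break {
			return err
		}
		// A real implementation would sleep with exponential backoff here.
	}
	return fmt.Errorf("retry budget exhausted: %v", err)
}

func main() {
	attempts := 0
	err := withBackoff(5, func() (Status, error) {
		attempts++
		if attempts < 3 {
			return Continue, errors.New("transient RangeKeyMismatchError")
		}
		return Break, nil
	})
	fmt.Println(attempts, err) // 3 <nil>
}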
Example #2
func (db *testSender) sendOne(call proto.Call) {
	switch call.Args.(type) {
	case *proto.EndTransactionRequest:
		safeSetGoError(call.Reply, util.Errorf("%s method not supported", call.Method()))
		return
	}
	// Lookup range and direct request.
	header := call.Args.Header()
	if rng := db.store.LookupRange(header.Key, header.EndKey); rng != nil {
		header.RangeID = rng.Desc().RangeID
		replica := rng.GetReplica()
		if replica == nil {
			safeSetGoError(call.Reply, util.Errorf("own replica missing in range"))
			return
		}
		header.Replica = *replica
		reply, err := db.store.ExecuteCmd(context.Background(), call.Args)
		if reply != nil {
			gogoproto.Merge(call.Reply, reply)
		}
		if call.Reply.Header().Error != nil {
			panic(proto.ErrorUnexpectedlySet)
		}
		if err != nil {
			call.Reply.Header().SetGoError(err)
		}
	} else {
		safeSetGoError(call.Reply, proto.NewRangeKeyMismatchError(header.Key, header.EndKey, nil))
	}
}
Example #3
// Send implements the client.Sender interface. If the call is part
// of a transaction, the coordinator will initialize the transaction
// if it's not nil but has an empty ID.
func (tc *TxnCoordSender) Send(ctx context.Context, call proto.Call) {
	header := call.Args.Header()
	tc.maybeBeginTxn(header)
	header.CmdID = header.GetOrCreateCmdID(tc.clock.PhysicalNow())

	// This is the earliest point at which the request has a ClientCmdID and/or
	// TxnID (if applicable). Begin a Trace which follows this request.
	trace := tc.tracer.NewTrace(call.Args.Header())
	defer trace.Finalize()
	defer trace.Epoch(fmt.Sprintf("sending %s", call.Method()))()
	defer func() {
		if err := call.Reply.Header().GoError(); err != nil {
			trace.Event(fmt.Sprintf("reply error: %T", err))
		}
	}()
	ctx = tracer.ToCtx(ctx, trace)

	// Process batch specially; otherwise, send via wrapped sender.
	switch args := call.Args.(type) {
	case *proto.BatchRequest:
		trace.Event("batch processing")
		tc.sendBatch(ctx, args, call.Reply.(*proto.BatchResponse))
	default:
		// TODO(tschottdorf): should treat all calls as Batch. After all, that
		// will be almost all calls.
		tc.sendOne(ctx, call)
	}
}
Example #4
// sendBatch unrolls a batched command and sends each constituent
// command in parallel.
// TODO(tschottdorf): modify sendBatch so that it sends truly parallel requests
// when outside of a Transaction. This can then be used to address the TODO in
// (*TxnCoordSender).resolve().
func (tc *TxnCoordSender) sendBatch(ctx context.Context, batchArgs *proto.BatchRequest, batchReply *proto.BatchResponse) {
	// Prepare the calls by unrolling the batch. If the batchReply is
	// pre-initialized with replies, use those; otherwise create replies
	// as needed.
	// TODO(spencer): send calls in parallel.
	batchReply.Txn = batchArgs.Txn
	for i := range batchArgs.Requests {
		args := batchArgs.Requests[i].GetValue().(proto.Request)
		if err := updateForBatch(args, batchArgs.RequestHeader); err != nil {
			batchReply.Header().SetGoError(err)
			return
		}
		call := proto.Call{Args: args}
		// Create a reply from the method type and add to batch response.
		if i >= len(batchReply.Responses) {
			call.Reply = args.CreateReply()
			batchReply.Add(call.Reply)
		} else {
			call.Reply = batchReply.Responses[i].GetValue().(proto.Response)
		}
		tc.sendOne(ctx, call)
		// Amalgamate transaction updates and propagate first error, if applicable.
		if batchReply.Txn != nil {
			batchReply.Txn.Update(call.Reply.Header().Txn)
		}
		if call.Reply.Header().Error != nil {
			batchReply.Error = call.Reply.Header().Error
			return
		}
	}
}
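The unroll pattern above, reduced to a self-contained toy: send each request in turn, amalgamate transaction updates from every reply, and stop at the first error. The reply struct and string requests are stand-ins for the proto types.

package main

import (
	"errors"
	"fmt"
)

// reply is a toy stand-in for a proto.Response header.
type reply struct {
	err error
	txn string // pretend transaction state carried on the response
}

// unroll sends each request in turn, amalgamating transaction updates and
// propagating the first error, echoing the shape of sendBatch above.
func unroll(reqs []string, sendOne func(string) reply) (txn string, err error) {
	for _, req := range reqs {
		r := sendOne(req)
		if r.txn != "" {
			txn = r.txn // amalgamate transaction updates
		}
		if r.err != nil {
			return txn, r.err // propagate first error
		}
	}
	return txn, nil
}

func main() {
	txn, err := unroll([]string{"put", "get", "scan"}, func(req string) reply {
		if req == "scan" {
			return reply{err: errors.New("boom")}
		}
		return reply{txn: "ts-after-" + req}
	})
	fmt.Println(txn, err) // ts-after-get boom
}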
Example #5
// post posts the call using the HTTP client. The call's method is
// appended to KVDBEndpoint and set as the URL path. The call's arguments
// are protobuf-serialized and written as the POST body. The content
// type is set to application/x-protobuf.
//
// On success, the response body is unmarshalled into call.Reply.
func (s *httpSender) post(call proto.Call) error {
	retryOpts := s.retryOpts
	retryOpts.Tag = fmt.Sprintf("%s %s", s.context.RequestScheme(), call.Method())

	// Marshal the args into a request body.
	body, err := gogoproto.Marshal(call.Args)
	if err != nil {
		return err
	}

	url := s.context.RequestScheme() + "://" + s.server + KVDBEndpoint + call.Method().String()

	return retry.WithBackoff(retryOpts, func() (retry.Status, error) {
		req, err := http.NewRequest("POST", url, bytes.NewReader(body))
		if err != nil {
			return retry.Break, err
		}
		req.Header.Add(util.ContentTypeHeader, util.ProtoContentType)
		req.Header.Add(util.AcceptHeader, util.ProtoContentType)
		req.Header.Add(util.AcceptEncodingHeader, util.SnappyEncoding)

		resp, err := s.client.Do(req)
		if err != nil {
			return retry.Continue, err
		}

		defer resp.Body.Close()

		switch resp.StatusCode {
		case http.StatusOK:
			// We're cool.
		case http.StatusServiceUnavailable, http.StatusGatewayTimeout, StatusTooManyRequests:
			// Retry on service unavailable, gateway timeout and too many requests.
			// TODO(spencer): consider respecting the Retry-After header for
			// backoff / retry duration.
			return retry.Continue, errors.New(resp.Status)
		default:
			// Can't recover from all other errors.
			return retry.Break, errors.New(resp.Status)
		}

		if resp.Header.Get(util.ContentEncodingHeader) == util.SnappyEncoding {
			resp.Body = &snappyReader{body: resp.Body}
		}

		b, err := ioutil.ReadAll(resp.Body)
		if err != nil {
			return retry.Continue, err
		}

		if err := gogoproto.Unmarshal(b, call.Reply); err != nil {
			return retry.Continue, err
		}

		return retry.Break, nil
	})
}
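A minimal sketch of the post pattern above, assuming only the standard library: POST a body, then classify the status code as success, retryable, or fatal. Protobuf marshaling and snappy decompression are elided, and the echo server exists only for the demonstration.

package main

import (
	"bytes"
	"fmt"
	"io"
	"net/http"
	"net/http/httptest"
)

// postOnce performs one attempt of the post pattern above: POST the body,
// then classify the status code as success, retryable, or fatal.
func postOnce(client *http.Client, url string, body []byte) (retryable bool, reply []byte, err error) {
	req, err := http.NewRequest("POST", url, bytes.NewReader(body))
	if err != nil {
		return false, nil, err
	}
	req.Header.Add("Content-Type", "application/x-protobuf")
	resp, err := client.Do(req)
	if err != nil {
		return true, nil, err // transport errors are worth retrying
	}
	defer resp.Body.Close()
	switch resp.StatusCode {
	case http.StatusOK:
		// We're cool.
	case http.StatusServiceUnavailable, http.StatusGatewayTimeout, http.StatusTooManyRequests:
		return true, nil, fmt.Errorf("retryable: %s", resp.Status)
	default:
		return false, nil, fmt.Errorf("fatal: %s", resp.Status)
	}
	reply, err = io.ReadAll(resp.Body)
	return err != nil, reply, err
}

func main() {
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		io.Copy(w, r.Body) // echo the request body back as the "reply"
	}))
	defer srv.Close()
	_, reply, err := postOnce(srv.Client(), srv.URL, []byte("marshaled args"))
	fmt.Printf("%s %v\n", reply, err)
}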
Example #6
// Send implements the client.Sender interface. The store is looked
// up from the store map if specified by header.Replica; otherwise,
// the command is being executed locally, and the replica is
// determined via lookup through each store's LookupRange method.
func (ls *LocalSender) Send(ctx context.Context, call proto.Call) {
	var err error
	var store *storage.Store

	trace := tracer.FromCtx(ctx)

	// If we aren't given a Replica, we have to bend over backwards a
	// little here to look one up. This case applies exclusively to unit tests.
	header := call.Args.Header()
	if header.RaftID == 0 || header.Replica.StoreID == 0 {
		var repl *proto.Replica
		var raftID proto.RaftID
		raftID, repl, err = ls.lookupReplica(header.Key, header.EndKey)
		if err == nil {
			header.RaftID = raftID
			header.Replica = *repl
		}
	}
	ctx = log.Add(ctx,
		log.Method, call.Method(),
		log.Key, header.Key,
		log.RaftID, header.RaftID)

	if err == nil {
		store, err = ls.GetStore(header.Replica.StoreID)
	}
	var reply proto.Response
	if err == nil {
		// For calls that read data within a txn, we can avoid uncertainty
		// related retries in certain situations. If the node is in
		// "CertainNodes", we need not worry about uncertain reads any
		// more. Setting MaxTimestamp=Timestamp for the operation
		// accomplishes that. See proto.Transaction.CertainNodes for details.
		if header.Txn != nil && header.Txn.CertainNodes.Contains(header.Replica.NodeID) {
			// MaxTimestamp = Timestamp corresponds to no clock uncertainty.
			trace.Event("read has no clock uncertainty")
			header.Txn.MaxTimestamp = header.Txn.Timestamp
		}
		reply, err = store.ExecuteCmd(ctx, call.Args)
	}
	if reply != nil {
		gogoproto.Merge(call.Reply, reply)
	}
	if call.Reply.Header().Error != nil {
		panic(proto.ErrorUnexpectedlySet)
	}
	if err != nil {
		call.Reply.Header().SetGoError(err)
	}
}
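The CertainNodes branch above avoids uncertainty-related restarts by collapsing the uncertainty window. A toy illustration with integer timestamps; the restart rule shown is a simplification of the real MVCC uncertainty check.

package main

import "fmt"

// mustRestart reports whether a read at readTS with uncertainty window
// (readTS, maxTS] would have to restart after observing a value written
// at valueTS. This is a simplification of the real MVCC uncertainty check.
func mustRestart(valueTS, readTS, maxTS int64) bool {
	return valueTS > readTS && valueTS <= maxTS
}

func main() {
	readTS, maxTS := int64(100), int64(150)
	fmt.Println(mustRestart(120, readTS, maxTS)) // true: value is in the window
	maxTS = readTS                               // node is in CertainNodes
	fmt.Println(mustRestart(120, readTS, maxTS)) // false: window is empty
}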
Example #7
// sendBatch unrolls a batched command and sends each constituent
// command in parallel.
func (tc *TxnCoordSender) sendBatch(batchArgs *proto.InternalBatchRequest, batchReply *proto.InternalBatchResponse) {
	// Prepare the calls by unrolling the batch. If the batchReply is
	// pre-initialized with replies, use those; otherwise create replies
	// as needed.
	// TODO(spencer): send calls in parallel.
	batchReply.Txn = batchArgs.Txn
	for i := range batchArgs.Requests {
		args := batchArgs.Requests[i].GetValue().(proto.Request)
		call := proto.Call{Args: args}
		// Disallow transaction, user and priority on individual calls, unless
		// equal.
		if args.Header().User != "" && args.Header().User != batchArgs.User {
			batchReply.Header().SetGoError(util.Error("cannot have individual user on call in batch"))
			return
		}
		args.Header().User = batchArgs.User
		if args.Header().UserPriority != nil && args.Header().GetUserPriority() != batchArgs.GetUserPriority() {
			batchReply.Header().SetGoError(util.Error("cannot have individual user priority on call in batch"))
			return
		}
		args.Header().UserPriority = batchArgs.UserPriority
		if txn := args.Header().Txn; txn != nil && !txn.Equal(batchArgs.Txn) {
			batchReply.Header().SetGoError(util.Error("cannot have individual transactional call in batch"))
			return
		}
		// Propagate batch Txn to each call.
		args.Header().Txn = batchArgs.Txn

		// Create a reply from the method type and add to batch response.
		if i >= len(batchReply.Responses) {
			call.Reply = args.CreateReply()
			batchReply.Add(call.Reply)
		} else {
			call.Reply = batchReply.Responses[i].GetValue().(proto.Response)
		}
		tc.sendOne(call)
		// Amalgamate transaction updates and propagate first error, if applicable.
		if batchReply.Txn != nil {
			batchReply.Txn.Update(call.Reply.Header().Txn)
		}
		if call.Reply.Header().Error != nil {
			batchReply.Error = call.Reply.Header().Error
			return
		}
	}
}
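Example #8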
// Send implements the client.Sender interface. If the call is part
// of a transaction, the coordinator will initialize the transaction
// if it's not nil but has an empty ID.
func (tc *TxnCoordSender) Send(ctx context.Context, call proto.Call) {
	header := call.Args.Header()
	tc.maybeBeginTxn(header)
	header.CmdID = header.GetOrCreateCmdID(tc.clock.PhysicalNow())

	// This is the earliest point at which the request has a ClientCmdID and/or
	// TxnID (if applicable). Begin a Trace which follows this request.
	trace := tc.tracer.NewTrace(call.Args.Header())
	defer trace.Finalize()
	defer trace.Epoch(fmt.Sprintf("sending %s", call.Method()))()
	defer func() {
		if err := call.Reply.Header().GoError(); err != nil {
			trace.Event(fmt.Sprintf("reply error: %T", err))
		}
	}()
	ctx = tracer.ToCtx(ctx, trace)

	// Process batch specially; otherwise, send via wrapped sender.
	switch args := call.Args.(type) {
	case *proto.InternalBatchRequest:
		trace.Event("batch processing")
		tc.sendBatch(ctx, args, call.Reply.(*proto.InternalBatchResponse))
	case *proto.BatchRequest:
		// Convert the batch request to internal-batch request.
		internalArgs := &proto.InternalBatchRequest{RequestHeader: args.RequestHeader}
		internalReply := &proto.InternalBatchResponse{}
		for i := range args.Requests {
			internalArgs.Add(args.Requests[i].GetValue().(proto.Request))
		}
		tc.sendBatch(ctx, internalArgs, internalReply)
		reply := call.Reply.(*proto.BatchResponse)
		reply.ResponseHeader = internalReply.ResponseHeader
		// Convert from internal-batch response to batch response.
		for i := range internalReply.Responses {
			reply.Add(internalReply.Responses[i].GetValue().(proto.Response))
		}
	default:
		tc.sendOne(ctx, call)
	}
}
Example #9
// MaybeWrapCall returns a new call which wraps the original Args and Reply
// in a batch, if necessary.
// TODO(tschottdorf): will go when proto.Call does.
func MaybeWrapCall(call proto.Call) (proto.Call, func(proto.Call) proto.Call) {
	var unwrap func(proto.Response) proto.Response
	call.Args, unwrap = MaybeWrap(call.Args)
	newUnwrap := func(origReply proto.Response) func(proto.Call) proto.Call {
		return func(newCall proto.Call) proto.Call {
			origReply.Reset()
			gogoproto.Merge(origReply, unwrap(newCall.Reply))
			*origReply.Header() = *newCall.Reply.Header()
			newCall.Reply = origReply
			return newCall
		}
	}(call.Reply)
	call.Reply = call.Args.CreateReply()
	return call, newUnwrap
}
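The unwrap closure above captures the original reply and copies the wrapped sender's result back into it. The same pattern in a self-contained toy, with a plain struct standing in for proto.Response.

package main

import "fmt"

// reply is a toy stand-in for a proto.Response.
type reply struct{ rows int }

// wrap returns a fresh working reply plus an unwrap closure that copies
// the result back into the caller's original reply, echoing MaybeWrapCall.
func wrap(orig *reply) (working *reply, unwrap func()) {
	working = &reply{}
	return working, func() {
		*orig = *working // merge the wrapped result back into the original
	}
}

func main() {
	var callerReply reply
	working, unwrap := wrap(&callerReply)
	working.rows = 7 // the wrapped sender fills in the working reply
	unwrap()
	fmt.Println(callerReply.rows) // 7
}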
Example #10
// Send implements the client.Sender interface. It verifies
// permissions and looks up the appropriate range based on the
// supplied key and sends the RPC according to the specified options.
//
// If the request spans multiple ranges (which is possible for Scan or
// DeleteRange requests), Send sends requests to the individual ranges
// sequentially and combines the results transparently.
//
// This may temporarily adjust the request headers, so the proto.Call
// must not be used concurrently until Send has returned.
func (ds *DistSender) Send(_ context.Context, call proto.Call) {
	args := call.Args
	finalReply := call.Reply

	// Verify permissions.
	if err := ds.verifyPermissions(call.Args); err != nil {
		call.Reply.Header().SetGoError(err)
		return
	}

	// In the event that timestamp isn't set and read consistency isn't
	// required, set the timestamp using the local clock.
	if args.Header().ReadConsistency == proto.INCONSISTENT && args.Header().Timestamp.Equal(proto.ZeroTimestamp) {
		// Make sure that after the call, args hasn't changed.
		defer func(timestamp proto.Timestamp) {
			args.Header().Timestamp = timestamp
		}(args.Header().Timestamp)
		args.Header().Timestamp = ds.clock.Now()
	}

	// If this is a bounded request, we will change its bound as we receive
	// replies. This undoes that when we return.
	boundedArgs, argsBounded := args.(proto.Bounded)

	if argsBounded {
		defer func(bound int64) {
			boundedArgs.SetBound(bound)
		}(boundedArgs.GetBound())
	}

	defer func(key proto.Key) {
		args.Header().Key = key
	}(args.Header().Key)

	// Retry logic for lookup of range by key and RPCs to range replicas.
	curReply := finalReply
	for {
		call.Reply = curReply
		curReply.Header().Reset()

		var desc, descNext *proto.RangeDescriptor
		var err error
		for r := retry.Start(ds.rpcRetryOptions); r.Next(); {
			// Get range descriptor (or, when spanning range, descriptors).
			// sendAttempt below may clear them on certain errors, so we
			// refresh (likely from the cache) on every retry.
			desc, descNext, err = ds.getDescriptors(call)
			// getDescriptors may fail retryably if the first range isn't
			// available via Gossip.
			if err != nil {
				if rErr, ok := err.(util.Retryable); ok && rErr.CanRetry() {
					if log.V(1) {
						log.Warning(err)
					}
					continue
				}
				break
			}
			err = func() error {
				// Truncate the request to our current range, making sure not to
				// touch it unless we have to (it is illegal to send EndKey on
				// commands which do not operate on ranges).
				if descNext != nil {
					defer func(endKey proto.Key) {
						args.Header().EndKey = endKey
					}(args.Header().EndKey)
					args.Header().EndKey = desc.EndKey
				}
				leader := ds.leaderCache.Lookup(proto.RaftID(desc.RaftID))

				// Try to send the call.
				replicas := newReplicaSlice(ds.gossip, desc)

				// Rearrange the replicas so that those replicas with long common
				// prefix of attributes end up first. If there's no prefix, this is a
				// no-op.
				order := ds.optimizeReplicaOrder(replicas)

				// If this request needs to go to a leader and we know who that is, move
				// it to the front.
				if !(proto.IsRead(args) && args.Header().ReadConsistency == proto.INCONSISTENT) &&
					leader.StoreID > 0 {
					if i := replicas.FindReplica(leader.StoreID); i >= 0 {
						replicas.MoveToFront(i)
						order = rpc.OrderStable
					}
				}

				return ds.sendRPC(desc.RaftID, replicas, order, args, curReply)
			}()
			if err != nil {
				// For an RPC error to occur, we must've been unable to contact any
				// replicas. In this case, likely all nodes are down (or not getting back
				// to us within a reasonable amount of time).
				// We may simply not be trying to talk to the up-to-date replicas, so
				// clearing the descriptor here should be a good idea.
				// TODO(tschottdorf): If a replica group goes dead, this will cause clients
				// to put high read pressure on the first range, so there should be some
				// rate limiting here.
				ds.rangeCache.EvictCachedRangeDescriptor(args.Header().Key, desc)
			} else {
				err = curReply.Header().GoError()
			}

			if err != nil {
				if log.V(1) {
					log.Warningf("failed to invoke %s: %s", call.Method(), err)
				}

				// If retryable, allow retry. For range not found or range
				// key mismatch errors, we don't backoff on the retry,
				// but reset the backoff loop so we can retry immediately.
				switch tErr := err.(type) {
				case *proto.RangeNotFoundError, *proto.RangeKeyMismatchError:
					// Range descriptor might be out of date - evict it.
					ds.rangeCache.EvictCachedRangeDescriptor(args.Header().Key, desc)
					// On addressing errors, don't backoff; retry immediately.
					r.Reset()
					if log.V(1) {
						log.Warning(err)
					}
					continue
				case *proto.NotLeaderError:
					newLeader := tErr.GetLeader()
					// Verify that leader is a known replica according to the
					// descriptor. If not, we've got a stale replica; evict cache.
					// Next, cache the new leader.
					if newLeader != nil {
						if i, _ := desc.FindReplica(newLeader.StoreID); i == -1 {
							if log.V(1) {
								log.Infof("error indicates unknown leader %s, expunging descriptor %s", newLeader, desc)
							}
							ds.rangeCache.EvictCachedRangeDescriptor(args.Header().Key, desc)
						}
					} else {
						newLeader = &proto.Replica{}
					}
					ds.updateLeaderCache(proto.RaftID(desc.RaftID), *newLeader)
					if log.V(1) {
						log.Warning(err)
					}
					r.Reset()
					continue
				case util.Retryable:
					if tErr.CanRetry() {
						if log.V(1) {
							log.Warning(err)
						}
						continue
					}
				}
			}
			break
		}

		// Immediately return if querying a range failed non-retryably.
		// For multi-range requests, we return the failing range's reply.
		if err != nil {
			call.Reply.Header().SetGoError(err)
			return
		}

		if finalReply != curReply {
			// This was the second or later call in a multi-range request.
			// Combine the new response with the existing one.
			if cFinalReply, ok := finalReply.(proto.Combinable); ok {
				cFinalReply.Combine(curReply)
			} else {
				// This should never apply in practice, as we'll only end up here
				// for range-spanning requests.
				call.Reply.Header().SetGoError(util.Errorf("multi-range request with non-combinable response type"))
				return
			}
		}

		// If this request has a bound, such as MaxResults in
		// ScanRequest, check whether enough rows have been retrieved.
		if argsBounded {
			if prevBound := boundedArgs.GetBound(); prevBound > 0 {
				if cReply, ok := curReply.(proto.Countable); ok {
					if nextBound := prevBound - cReply.Count(); nextBound > 0 {
						// Update bound for the next round.
						// We've deferred restoring the original bound earlier.
						boundedArgs.SetBound(nextBound)
					} else {
						// Set flag to break the loop.
						descNext = nil
					}
				}
			}
		}

		// If this was the last range accessed by this call, exit loop.
		if descNext == nil {
			break
		}

		// In next iteration, query next range.
		// It's important that we use the EndKey of the current descriptor
		// as opposed to the StartKey of the next one: if the former is stale,
		// it's possible that the next range has since merged the subsequent
		// one, and unless both descriptors are stale, the next descriptor's
		// StartKey would move us to the beginning of the current range,
		// resulting in a duplicate scan.
		args.Header().Key = desc.EndKey

		// This is a multi-range request, make a new reply object for
		// subsequent iterations of the loop.
		curReply = args.CreateReply()
	}
	call.Reply = finalReply
}
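The loop above truncates each attempt to the current descriptor's EndKey, combines per-range replies into the final one, and advances Key to that EndKey. A toy skeleton over integer keys, with retries, caching and error handling omitted.

package main

import "fmt"

// rng is a toy range descriptor over integer keys [start, end).
type rng struct{ start, end int }

func lookup(ranges []rng, key int) rng {
	for _, r := range ranges {
		if key >= r.start && key < r.end {
			return r
		}
	}
	panic("no range for key")
}

// scan walks [key, endKey) range by range, combining per-range counts,
// mirroring the multi-range loop above.
func scan(ranges []rng, key, endKey int) int {
	total := 0
	for {
		desc := lookup(ranges, key)
		last := desc.end >= endKey
		hi := desc.end
		if last {
			hi = endKey // truncate the request to the current range
		}
		total += hi - key // "combine" this range's reply into the total
		if last {
			return total
		}
		// Advance using the current descriptor's EndKey, not the next
		// descriptor's StartKey, so stale descriptors after a merge
		// cannot rewind us into a duplicate scan.
		key = desc.end
	}
}

func main() {
	ranges := []rng{{0, 10}, {10, 25}, {25, 40}}
	fmt.Println(scan(ranges, 3, 30)) // 27 keys across three ranges
}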
Example #11
// sendOne sends a single call via the wrapped sender. If the call is
// part of a transaction, the TxnCoordSender adds the transaction to a
// map of active transactions and begins heartbeating it. Every
// subsequent call for the same transaction updates the lastUpdate
// timestamp to prevent live transactions from being considered
// abandoned and garbage collected. Read/write mutating requests have
// their key or key range added to the transaction's interval tree of
// key ranges for eventual cleanup via resolved write intents.
//
// On success, and if the call is part of a transaction, the affected
// key range is recorded as live intents for eventual cleanup upon
// transaction commit. Upon successful txn commit, initiates cleanup
// of intents.
func (tc *TxnCoordSender) sendOne(ctx context.Context, call proto.Call) {
	var startNS int64
	header := call.Args.Header()
	trace := tracer.FromCtx(ctx)
	var id string // optional transaction ID
	if header.Txn != nil {
		// If this call is part of a transaction...
		id = string(header.Txn.ID)
		// Verify that if this Transaction is not read-only, we have it on
		// file. If not, refuse writes - the client must have issued a write on
		// another coordinator previously.
		if header.Txn.Writing && proto.IsTransactionWrite(call.Args) {
			tc.Lock()
			_, ok := tc.txns[id]
			tc.Unlock()
			if !ok {
				call.Reply.Header().SetGoError(util.Errorf(
					"transaction must not write on multiple coordinators"))
				return
			}
		}

		// Set the timestamp to the original timestamp for read-only
		// commands and to the transaction timestamp for read/write
		// commands.
		if proto.IsReadOnly(call.Args) {
			header.Timestamp = header.Txn.OrigTimestamp
		} else {
			header.Timestamp = header.Txn.Timestamp
		}

		if args, ok := call.Args.(*proto.EndTransactionRequest); ok {
			// Remember when EndTransaction started in case we want to
			// be linearizable.
			startNS = tc.clock.PhysicalNow()
			// EndTransaction must have its key set to that of the txn.
			header.Key = header.Txn.Key
			if len(args.Intents) > 0 {
				// TODO(tschottdorf): it may be useful to allow this later.
				// That would be part of a possible plan to allow txns which
				// write on multiple coordinators.
				call.Reply.Header().SetGoError(util.Errorf(
					"client must not pass intents to EndTransaction"))
				return
			}
			tc.Lock()
			txnMeta, metaOK := tc.txns[id]
			if id != "" && metaOK {
				args.Intents = txnMeta.intents()
			}
			tc.Unlock()

			if !metaOK {
				// If we don't have the transaction, then this must be a retry
				// by the client. We can no longer reconstruct a correct
				// request so we must fail.
				//
				// TODO(bdarnell): if we had a GetTransactionStatus API then
				// we could look up the transaction and return either nil or
				// TransactionAbortedError instead of this ambiguous error.
				call.Reply.Header().SetGoError(util.Errorf(
					"transaction is already committed or aborted"))
				return
			} else if len(args.Intents) == 0 {
				// If there aren't any intents, then there's factually no
				// transaction to end. Read-only txns have all of their state in
				// the client.
				call.Reply.Header().SetGoError(util.Errorf(
					"cannot commit a read-only transaction"))
				return
			}
		}
	}

	// Send the command through wrapped sender.
	tc.wrapped.Send(ctx, call)

	// For transactional calls, need to track & update the transaction.
	if header.Txn != nil {
		respHeader := call.Reply.Header()
		if respHeader.Txn == nil {
			// When empty, simply use the request's transaction.
			// This is expected: the Range doesn't bother copying unless the
			// object changes.
			respHeader.Txn = gogoproto.Clone(header.Txn).(*proto.Transaction)
		}
		tc.updateResponseTxn(header, respHeader)
	}

	if txn := call.Reply.Header().Txn; txn != nil {
		if !header.Txn.Equal(txn) {
			panic("transaction ID changed")
		}
		tc.Lock()
		txnMeta := tc.txns[id]
		// If this transactional command leaves transactional intents, add the key
		// or key range to the intents map. If the transaction metadata doesn't yet
		// exist, create it.
		if call.Reply.Header().GoError() == nil {
			if proto.IsTransactionWrite(call.Args) {
				if txnMeta == nil {
					txn.Writing = true
					trace.Event("coordinator spawns")
					txnMeta = &txnMetadata{
						txn:              *txn,
						keys:             cache.NewIntervalCache(cache.Config{Policy: cache.CacheNone}),
						firstUpdateNanos: tc.clock.PhysicalNow(),
						lastUpdateNanos:  tc.clock.PhysicalNow(),
						timeoutDuration:  tc.clientTimeout,
						txnEnd:           make(chan struct{}),
					}
					tc.txns[id] = txnMeta
					if !tc.stopper.RunAsyncTask(func() {
						tc.heartbeatLoop(id)
					}) {
						// The system is already draining and we can't start the
						// heartbeat. We refuse new transactions for now because
						// they're likely not going to have all intents committed.
						// In principle, we can relax this as needed though.
						call.Reply.Header().SetGoError(&proto.NodeUnavailableError{})
						tc.Unlock()
						tc.unregisterTxn(id)
						return
					}
				}
				txnMeta.addKeyRange(header.Key, header.EndKey)
			}
			// Update our record of this transaction.
			if txnMeta != nil {
				txnMeta.txn = *txn
				txnMeta.setLastUpdate(tc.clock.PhysicalNow())
			}
		}
		tc.Unlock()
	}

	// Cleanup intents and transaction map if end of transaction.
	switch t := call.Reply.Header().GoError().(type) {
	case *proto.TransactionStatusError:
		// Likely already committed or more obscure errors such as epoch or
		// timestamp regressions; consider it dead.
		tc.cleanupTxn(trace, t.Txn)
	case *proto.TransactionAbortedError:
		// If already aborted, cleanup the txn on this TxnCoordSender.
		tc.cleanupTxn(trace, t.Txn)
	case *proto.OpRequiresTxnError:
		// Run a one-off transaction with that single command.
		if log.V(1) {
			log.Infof("%s: auto-wrapping in txn and re-executing", call.Method())
		}
		// TODO(tschottdorf): this part is awkward. Consider resending here
		// without starting a new call, which is hard to trace. Plus, the
		// below depends on default configuration.
		tmpDB, err := client.Open(
			fmt.Sprintf("//%s?priority=%d",
				call.Args.Header().User, call.Args.Header().GetUserPriority()),
			client.SenderOpt(tc))
		if err != nil {
			log.Warning(err)
			return
		}
		call.Reply.Reset()
		if err := tmpDB.Txn(func(txn *client.Txn) error {
			txn.SetDebugName("auto-wrap", 0)
			b := &client.Batch{}
			b.InternalAddCall(call)
			return txn.CommitInBatch(b)
		}); err != nil {
			log.Warning(err)
		}
	case nil:
		if txn := call.Reply.Header().Txn; txn != nil {
			if _, ok := call.Args.(*proto.EndTransactionRequest); ok {
				// If the --linearizable flag is set, we want to make sure that
				// all the clocks in the system are past the commit timestamp
				// of the transaction. This is guaranteed if either
				// - the commit timestamp is MaxOffset behind startNS
				// - MaxOffset ns were spent in this function
				// when returning to the client. Below we choose the option
				// that involves less waiting, which is likely the first one
				// unless a transaction commits with an odd timestamp.
				if tsNS := txn.Timestamp.WallTime; startNS > tsNS {
					startNS = tsNS
				}
				sleepNS := tc.clock.MaxOffset() -
					time.Duration(tc.clock.PhysicalNow()-startNS)
				if tc.linearizable && sleepNS > 0 {
					defer func() {
						if log.V(1) {
							log.Infof("%v: waiting %s on EndTransaction for linearizability", txn.Short(), util.TruncateDuration(sleepNS, time.Millisecond))
						}
						time.Sleep(sleepNS)
					}()
				}
				if txn.Status != proto.PENDING {
					tc.cleanupTxn(trace, *txn)
				}

			}
		}
	}
}
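The linearizability wait at the end of sendOne sleeps for MaxOffset minus the time already elapsed since the earlier of the EndTransaction start and the commit timestamp. A worked toy version of just that arithmetic (values are made up for the demonstration):

package main

import (
	"fmt"
	"time"
)

// linearizableSleep reproduces the arithmetic above: before returning to
// the client, ensure MaxOffset has elapsed past the commit timestamp.
func linearizableSleep(maxOffset time.Duration, startNS, commitNS, nowNS int64) time.Duration {
	if startNS > commitNS {
		startNS = commitNS // measure from the commit timestamp if earlier
	}
	sleep := maxOffset - time.Duration(nowNS-startNS)
	if sleep < 0 {
		return 0
	}
	return sleep
}

func main() {
	maxOffset := 250 * time.Millisecond
	start := int64(1_000_000_000)                // EndTransaction began at t=1s
	commit := start + int64(50*time.Millisecond) // txn committed 50ms later
	now := start + int64(100*time.Millisecond)   // 100ms spent in sendOne so far
	fmt.Println(linearizableSleep(maxOffset, start, commit, now)) // 150ms
}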
Example #12
// resolve sends resolve intent commands for all key ranges this transaction
// has covered. Any keys listed in the resolved slice have already been
// resolved and are skipped.
func (tm *txnMetadata) resolve(trace *tracer.Trace, resolved []proto.Key, sender client.Sender) {
	txn := &tm.txn
	if tm.keys.Len() > 0 {
		if log.V(2) {
			log.Infof("cleaning up %d intent(s) for transaction %s", tm.keys.Len(), txn)
		}
	}
	// TODO(tschottdorf): Should create a Batch here. However, we're resolving
	// intents and if those are on meta records, there may be a certain order
	// in which they need to be resolved so that they can get routed to the
	// correct range. Since a batch runs its commands one by one and we don't
	// know the correct order, we prefer to fire them off in parallel.
	var wg sync.WaitGroup
	for _, o := range tm.keys.GetOverlaps(proto.KeyMin, proto.KeyMax) {
		// If the op was range based, end key != start key: resolve a range.
		var call proto.Call
		key := o.Key.Start().(proto.Key)
		endKey := o.Key.End().(proto.Key)
		if !key.Next().Equal(endKey) {
			call.Args = &proto.InternalResolveIntentRangeRequest{
				RequestHeader: proto.RequestHeader{
					Timestamp: txn.Timestamp,
					Key:       key,
					EndKey:    endKey,
					User:      security.RootUser,
					Txn:       txn,
				},
			}
			call.Reply = &proto.InternalResolveIntentRangeResponse{}
		} else {
			// Check if the key has already been resolved; skip if yes.
			found := false
			for _, k := range resolved {
				if key.Equal(k) {
					if log.V(2) {
						log.Warningf("skipping previously resolved intent at %q", k)
					}
					found = true
				}
			}
			if found {
				continue
			}
			call.Args = &proto.InternalResolveIntentRequest{
				RequestHeader: proto.RequestHeader{
					Timestamp: txn.Timestamp,
					Key:       key,
					User:      security.RootUser,
					Txn:       txn,
				},
			}
			call.Reply = &proto.InternalResolveIntentResponse{}
		}
		ctx := tracer.ToCtx(context.Background(), trace.Fork())
		if log.V(2) {
			log.Infof("cleaning up intent %q for txn %s", call.Args.Header().Key, txn)
		}
		// Each operation gets its own goroutine. We only want to return to
		// the caller after all operations have finished.
		wg.Add(1)
		go func() {
			defer wg.Done()
			sender.Send(ctx, call)
			if call.Reply.Header().Error != nil {
				log.Warningf("failed to cleanup %q intent: %s", call.Args.Header().Key, call.Reply.Header().GoError())
			}
		}()
	}
	defer trace.Epoch("waiting for intent resolution")()
	wg.Wait()
	tm.keys.Clear()
}
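The fan-out in resolve is the standard sync.WaitGroup pattern. A minimal standalone version, using defer wg.Done() so the counter is released only once each goroutine has completely finished:

package main

import (
	"fmt"
	"sync"
)

func main() {
	keys := []string{"a", "b", "c"}
	var wg sync.WaitGroup
	for _, key := range keys {
		key := key // per-iteration copy for the goroutine (pre-Go 1.22)
		wg.Add(1)
		go func() {
			defer wg.Done() // released only after all work, including logging
			fmt.Println("resolving intent at", key)
		}()
	}
	wg.Wait() // only now is it safe to clear shared state (tm.keys.Clear())
}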
Example #13
// Send implements the client.Sender interface. It verifies
// permissions and looks up the appropriate range based on the
// supplied key and sends the RPC according to the specified options.
//
// If the request spans multiple ranges (which is possible for Scan or
// DeleteRange requests), Send sends requests to the individual ranges
// sequentially and combines the results transparently.
//
// This may temporarily adjust the request headers, so the proto.Call
// must not be used concurrently until Send has returned.
func (ds *DistSender) Send(_ context.Context, call proto.Call) {
	args := call.Args
	finalReply := call.Reply
	endKey := args.Header().EndKey

	// Verify permissions.
	if err := ds.verifyPermissions(call.Args); err != nil {
		call.Reply.Header().SetGoError(err)
		return
	}

	// In the event that timestamp isn't set and read consistency isn't
	// required, set the timestamp using the local clock.
	if args.Header().ReadConsistency == proto.INCONSISTENT && args.Header().Timestamp.Equal(proto.ZeroTimestamp) {
		// Make sure that after the call, args hasn't changed.
		defer func(timestamp proto.Timestamp) {
			args.Header().Timestamp = timestamp
		}(args.Header().Timestamp)
		args.Header().Timestamp = ds.clock.Now()
	}

	// If this is a bounded request, we will change its bound as we receive
	// replies. This undoes that when we return.
	boundedArgs, _ := args.(proto.Bounded)

	if boundedArgs != nil {
		defer func(n int64) {
			boundedArgs.SetBound(n)
		}(boundedArgs.GetBound())
	}

	// Retry logic for lookup of range by key and RPCs to range replicas.
	retryOpts := ds.rpcRetryOptions
	retryOpts.Tag = "routing " + call.Method().String() + " rpc"

	curReply := finalReply
	for {
		call.Reply = curReply
		curReply.Header().Reset()

		var desc, descNext *proto.RangeDescriptor
		err := retry.WithBackoff(retryOpts, func() (retry.Status, error) {
			var err error
			// Get range descriptor (or, when spanning range, descriptors).
			// sendAttempt below may clear them on certain errors, so we
			// refresh (likely from the cache) on every retry.
			desc, descNext, err = ds.getDescriptors(call)
			// getDescriptors may fail retryably if the first range isn't
			// available via Gossip.
			if err != nil {
				if rErr, ok := err.(util.Retryable); ok && rErr.CanRetry() {
					return retry.Continue, err
				}
				return retry.Break, err
			}
			// Truncate the request to our current range, making sure not to
			// touch it unless we have to (it is illegal to send EndKey on
			// commands which do not operate on ranges).
			if descNext != nil {
				args.Header().EndKey = desc.EndKey
				defer func() {
					// "Untruncate" EndKey to original.
					args.Header().EndKey = endKey
				}()
			}
			return ds.sendAttempt(desc, call)
		})

		// Immediately return if querying a range failed non-retryably.
		// For multi-range requests, we return the failing range's reply.
		if err != nil {
			call.Reply.Header().SetGoError(err)
			return
		}

		if finalReply != curReply {
			// This was the second or later call in a multi-range request.
			// Combine the new response with the existing one.
			if cFinalReply, ok := finalReply.(proto.Combinable); ok {
				cFinalReply.Combine(curReply)
			} else {
				// This should never apply in practice, as we'll only end up here
				// for range-spanning requests.
				call.Reply.Header().SetGoError(util.Errorf("multi-range request with non-combinable response type"))
				return
			}
		}

		// If this request has a bound, such as MaxResults in
		// ScanRequest, check whether enough rows have been retrieved.
		if boundedArgs != nil {
			if prevBound := boundedArgs.GetBound(); prevBound > 0 {
				if cReply, ok := curReply.(proto.Countable); ok {
					if nextBound := prevBound - cReply.Count(); nextBound > 0 {
						// Update bound for the next round.
						// We've deferred restoring the original bound earlier.
						boundedArgs.SetBound(nextBound)
					} else {
						// Set flag to break the loop.
						descNext = nil
					}
				}
			}
		}

		// If this was the last range accessed by this call, exit loop.
		if descNext == nil {
			break
		}

		if curReply == finalReply {
			// This is the end of the first iteration in a multi-range query,
			// so it's a convenient place to clean up changes to the args in
			// the case of multi-range requests.
			// Reset the original start key (EndKey is restored by the
			// defer inside the retry closure above).
			defer func(k proto.Key) {
				args.Header().Key = k
			}(args.Header().Key)
		}

		// In next iteration, query next range.
		args.Header().Key = descNext.StartKey

		// This is a multi-range request, make a new reply object for
		// subsequent iterations of the loop.
		curReply = args.CreateReply()
	}
	call.Reply = finalReply
}
Example #14
// Send sends call to Cockroach via an HTTP post. HTTP response codes
// which are retryable are retried with backoff in a loop using the
// default retry options.
func (s *httpSender) Send(_ context.Context, call proto.Call) {
	if err := HTTPPost(s.ctx, call.Args, call.Reply, call.Method()); err != nil {
		call.Reply.Header().SetGoError(err)
	}
}
Example #15
// sendAttempt is invoked by Send and handles retry logic and cache eviction
// for a call sent to a single range. It returns a retry status: Break on
// success, or Break, Continue or Reset depending on the error condition.
// This method is expected to be invoked from within a backoff / retry loop to
// retry the send repeatedly (e.g. to continue processing after a critical node
// becomes available after downtime or the range descriptor is refreshed via
// lookup).
func (ds *DistSender) sendAttempt(desc *proto.RangeDescriptor, call proto.Call) (retry.Status, error) {
	leader := ds.leaderCache.Lookup(proto.RaftID(desc.RaftID))

	// Try to send the call.
	replicas := newReplicaSlice(ds.gossip, desc)

	// Rearrange the replicas so that those replicas with long common
	// prefix of attributes end up first. If there's no prefix, this is a
	// no-op.
	order := ds.optimizeReplicaOrder(replicas)

	args := call.Args
	reply := call.Reply

	// If this request needs to go to a leader and we know who that is, move
	// it to the front.
	if !(proto.IsRead(args) && args.Header().ReadConsistency == proto.INCONSISTENT) &&
		leader.StoreID > 0 {
		if i := replicas.FindReplica(leader.StoreID); i >= 0 {
			replicas.MoveToFront(i)
			order = rpc.OrderStable
		}
	}

	err := ds.sendRPC(desc.RaftID, replicas, order, args, reply)
	if err != nil {
		// For an RPC error to occur, we must've been unable to contact any
		// replicas. In this case, likely all nodes are down (or not getting back
		// to us within a reasonable amount of time).
		// We may simply not be trying to talk to the up-to-date replicas, so
		// clearing the descriptor here should be a good idea.
		// TODO(tschottdorf): If a replica group goes dead, this will cause clients
		// to put high read pressure on the first range, so there should be some
		// rate limiting here.
		ds.rangeCache.EvictCachedRangeDescriptor(args.Header().Key, desc)
	} else {
		err = reply.Header().GoError()
	}

	if err != nil {
		if log.V(1) {
			log.Warningf("failed to invoke %s: %s", call.Method(), err)
		}

		// If retryable, allow retry. For range not found or range
		// key mismatch errors, we don't backoff on the retry,
		// but reset the backoff loop so we can retry immediately.
		switch tErr := err.(type) {
		case *proto.RangeNotFoundError, *proto.RangeKeyMismatchError:
			// Range descriptor might be out of date - evict it.
			ds.rangeCache.EvictCachedRangeDescriptor(args.Header().Key, desc)
			// On addressing errors, don't backoff; retry immediately.
			return retry.Reset, err
		case *proto.NotLeaderError:
			newLeader := tErr.GetLeader()
			// Verify that leader is a known replica according to the
			// descriptor. If not, we've got a stale replica; evict cache.
			// Next, cache the new leader.
			if newLeader != nil {
				if i, _ := desc.FindReplica(newLeader.StoreID); i == -1 {
					if log.V(1) {
						log.Infof("error indicates unknown leader %s, expunging descriptor %s", newLeader, desc)
					}
					ds.rangeCache.EvictCachedRangeDescriptor(args.Header().Key, desc)
				}
			} else {
				newLeader = &proto.Replica{}
			}
			ds.updateLeaderCache(proto.RaftID(desc.RaftID), *newLeader)
			return retry.Reset, err
		case util.Retryable:
			if tErr.CanRetry() {
				return retry.Continue, err
			}
		}
		return retry.Break, err
	}
	return retry.Break, nil
}
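replicas.MoveToFront(i) in sendAttempt is a plain reorder; one plausible order-preserving implementation, reduced to a toy with an illustrative replica type:

package main

import "fmt"

// replica is an illustrative stand-in for the proto replica type.
type replica struct{ storeID int }

// moveToFront rotates the slice so the replica at index i comes first
// while preserving the relative order of the others; one plausible
// implementation of the MoveToFront step used in sendAttempt.
func moveToFront(rs []replica, i int) {
	r := rs[i]
	copy(rs[1:i+1], rs[:i]) // copy handles the overlap correctly
	rs[0] = r
}

func main() {
	rs := []replica{{1}, {2}, {3}}
	leaderStoreID := 3
	for i := range rs {
		if rs[i].storeID == leaderStoreID {
			moveToFront(rs, i)
			break
		}
	}
	fmt.Println(rs) // [{3} {1} {2}]
}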
Example #16
// Send implements the client.Sender interface. It verifies
// permissions and looks up the appropriate range based on the
// supplied key and sends the RPC according to the specified options.
//
// If the request spans multiple ranges (which is possible for Scan or
// DeleteRange requests), Send sends requests to the individual ranges
// sequentially and combines the results transparently.
//
// This may temporarily adjust the request headers, so the proto.Call
// must not be used concurrently until Send has returned.
func (ds *DistSender) Send(ctx context.Context, call proto.Call) {
	args := call.Args

	// Verify permissions.
	if err := ds.verifyPermissions(call.Args); err != nil {
		call.Reply.Header().SetGoError(err)
		return
	}

	trace := tracer.FromCtx(ctx)

	// In the event that timestamp isn't set and read consistency isn't
	// required, set the timestamp using the local clock.
	if args.Header().ReadConsistency == proto.INCONSISTENT && args.Header().Timestamp.Equal(proto.ZeroTimestamp) {
		// Make sure that after the call, args hasn't changed.
		defer func(timestamp proto.Timestamp) {
			args.Header().Timestamp = timestamp
		}(args.Header().Timestamp)
		args.Header().Timestamp = ds.clock.Now()
	}

	// If this is a bounded request, we will change its bound as we receive
	// replies. This undoes that when we return.
	boundedArgs, argsBounded := args.(proto.Bounded)

	if argsBounded {
		defer func(bound int64) {
			boundedArgs.SetBound(bound)
		}(boundedArgs.GetBound())
	}
	_, isReverseScan := call.Args.(*proto.ReverseScanRequest)
	// Restore the original bounds when we return, since a scan/reverse scan may cross range boundaries.
	if isReverseScan {
		defer func(key proto.Key) {
			args.Header().EndKey = key
		}(args.Header().EndKey)
	} else {
		defer func(key proto.Key) {
			args.Header().Key = key
		}(args.Header().Key)
	}

	first := true

	// Retry logic for lookup of range by key and RPCs to range replicas.
	for {
		var curReply proto.Response
		var desc, descNext *proto.RangeDescriptor
		var err error
		for r := retry.Start(ds.rpcRetryOptions); r.Next(); {
			// Get range descriptor (or, when spanning range, descriptors). Our
			// error handling below may clear them on certain errors, so we
			// refresh (likely from the cache) on every retry.
			descDone := trace.Epoch("meta descriptor lookup")
			// It is safe to pass call here (with its embedded reply) because
			// the reply is only used to check that it implements
			// proto.Combinable if the request spans multiple ranges.
			desc, descNext, err = ds.getDescriptors(call)
			descDone()
			// getDescriptors may fail retryably if the first range isn't
			// available via Gossip.
			if err != nil {
				if rErr, ok := err.(retry.Retryable); ok && rErr.CanRetry() {
					if log.V(1) {
						log.Warning(err)
					}
					continue
				}
				break
			}
			// At this point reply.Header().Error may be non-nil!
			curReply, err = ds.sendAttempt(trace, args, desc)

			descKey := args.Header().Key
			if isReverseScan {
				descKey = args.Header().EndKey
			}

			if err != nil {
				trace.Event(fmt.Sprintf("send error: %T", err))
				// For an RPC error to occur, we must've been unable to contact any
				// replicas. In this case, likely all nodes are down (or not getting back
				// to us within a reasonable amount of time).
				// We may simply not be trying to talk to the up-to-date replicas, so
				// clearing the descriptor here should be a good idea.
				// TODO(tschottdorf): If a replica group goes dead, this will cause clients
				// to put high read pressure on the first range, so there should be some
				// rate limiting here.
				ds.rangeCache.EvictCachedRangeDescriptor(descKey, desc, isReverseScan)
			} else {
				err = curReply.Header().GoError()
			}

			if err == nil {
				break
			}

			if log.V(1) {
				log.Warningf("failed to invoke %s: %s", call.Method(), err)
			}

			// If retryable, allow retry. For range not found or range
			// key mismatch errors, we don't backoff on the retry,
			// but reset the backoff loop so we can retry immediately.
			switch tErr := err.(type) {
			case *proto.RangeNotFoundError, *proto.RangeKeyMismatchError:
				trace.Event(fmt.Sprintf("reply error: %T", err))
				// Range descriptor might be out of date - evict it.
				ds.rangeCache.EvictCachedRangeDescriptor(descKey, desc, isReverseScan)
				// On addressing errors, don't backoff; retry immediately.
				r.Reset()
				if log.V(1) {
					log.Warning(err)
				}
				continue
			case *proto.NotLeaderError:
				trace.Event(fmt.Sprintf("reply error: %T", err))
				newLeader := tErr.GetLeader()
				// Verify that leader is a known replica according to the
				// descriptor. If not, we've got a stale replica; evict cache.
				// Next, cache the new leader.
				if newLeader != nil {
					if i, _ := desc.FindReplica(newLeader.StoreID); i == -1 {
						if log.V(1) {
							log.Infof("error indicates unknown leader %s, expunging descriptor %s", newLeader, desc)
						}
						ds.rangeCache.EvictCachedRangeDescriptor(descKey, desc, isReverseScan)
					}
				} else {
					newLeader = &proto.Replica{}
				}
				ds.updateLeaderCache(proto.RangeID(desc.RangeID), *newLeader)
				if log.V(1) {
					log.Warning(err)
				}
				r.Reset()
				continue
			case retry.Retryable:
				if tErr.CanRetry() {
					if log.V(1) {
						log.Warning(err)
					}
					trace.Event(fmt.Sprintf("reply error: %T", err))
					continue
				}
			}
			break
		}

		// Immediately return if querying a range failed non-retryably.
		// For multi-range requests, we return the failing range's reply.
		if err != nil {
			call.Reply.Header().SetGoError(err)
			return
		}

		if first {
			// Equivalent of `*call.Reply = curReply`. Generics!
			dst := reflect.ValueOf(call.Reply).Elem()
			dst.Set(reflect.ValueOf(curReply).Elem())
		} else {
			// This was the second or later call in a multi-range request.
			// Combine the new response with the existing one.
			if cReply, ok := call.Reply.(proto.Combinable); ok {
				cReply.Combine(curReply)
			} else {
				// This should never apply in practice, as we'll only end up here
				// for range-spanning requests.
				call.Reply.Header().SetGoError(util.Errorf("multi-range request with non-combinable response type"))
				return
			}
		}

		first = false

		// If this request has a bound, such as MaxResults in
		// ScanRequest, check whether enough rows have been retrieved.
		if argsBounded {
			if prevBound := boundedArgs.GetBound(); prevBound > 0 {
				if cReply, ok := curReply.(proto.Countable); ok {
					if nextBound := prevBound - cReply.Count(); nextBound > 0 {
						// Update bound for the next round.
						// We've deferred restoring the original bound earlier.
						boundedArgs.SetBound(nextBound)
					} else {
						// Set flag to break the loop.
						descNext = nil
					}
				}
			}
		}

		// If this was the last range accessed by this call, exit loop.
		if descNext == nil {
			break
		}

		if isReverseScan {
			// In next iteration, query previous range.
			// We use the StartKey of the current descriptor as opposed to the
			// EndKey of the previous one.
			args.Header().EndKey = desc.StartKey
		} else {
			// In next iteration, query next range.
			// It's important that we use the EndKey of the current descriptor
			// as opposed to the StartKey of the next one: if the former is stale,
			// it's possible that the next range has since merged the subsequent
			// one, and unless both descriptors are stale, the next descriptor's
			// StartKey would move us to the beginning of the current range,
			// resulting in a duplicate scan.
			args.Header().Key = desc.EndKey
		}
		trace.Event("querying next range")
	}
}
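The reverse-scan branch above advances by shrinking EndKey to the current descriptor's StartKey. A toy mirror of the forward sketch following Example #10, walking the ranges from the end:

package main

import "fmt"

// rng is a toy range descriptor over integer keys [start, end).
type rng struct{ start, end int }

// lookupBefore finds the range containing endKey-1.
func lookupBefore(ranges []rng, endKey int) rng {
	for _, r := range ranges {
		if endKey > r.start && endKey <= r.end {
			return r
		}
	}
	panic("no range for key")
}

// reverseScan walks [key, endKey) from the last range backwards,
// shrinking EndKey to each descriptor's StartKey, as in the
// ReverseScanRequest branch above.
func reverseScan(ranges []rng, key, endKey int) int {
	total := 0
	for {
		desc := lookupBefore(ranges, endKey)
		first := desc.start <= key
		lo := desc.start
		if first {
			lo = key // truncate to the request bounds
		}
		total += endKey - lo
		if first {
			return total
		}
		endKey = desc.start // query the previous range next
	}
}

func main() {
	ranges := []rng{{0, 10}, {10, 25}, {25, 40}}
	fmt.Println(reverseScan(ranges, 3, 30)) // 27 keys, scanned in reverse
}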
Example #17
// sendOne sends a single call via the wrapped sender. If the call is
// part of a transaction, the TxnCoordSender adds the transaction to a
// map of active transactions and begins heartbeating it. Every
// subsequent call for the same transaction updates the lastUpdate
// timestamp to prevent live transactions from being considered
// abandoned and garbage collected. Read/write mutating requests have
// their key or key range added to the transaction's interval tree of
// key ranges for eventual cleanup via resolved write intents.
//
// On success, and if the call is part of a transaction, the affected
// key range is recorded as live intents for eventual cleanup upon
// transaction commit. Upon successful txn commit, initiates cleanup
// of intents.
func (tc *TxnCoordSender) sendOne(call proto.Call) {
	var startNS int64
	header := call.Args.Header()
	// If this call is part of a transaction...
	if header.Txn != nil {
		// Set the timestamp to the original timestamp for read-only
		// commands and to the transaction timestamp for read/write
		// commands.
		if proto.IsReadOnly(call.Args) {
			header.Timestamp = header.Txn.OrigTimestamp
		} else {
			header.Timestamp = header.Txn.Timestamp
		}
		// EndTransaction must have its key set to that of the txn.
		if _, ok := call.Args.(*proto.EndTransactionRequest); ok {
			header.Key = header.Txn.Key
			// Remember when EndTransaction started in case we want to
			// be linearizable.
			startNS = tc.clock.PhysicalNow()
		}
	}

	// Send the command through wrapped sender.
	tc.wrapped.Send(context.TODO(), call)

	if header.Txn != nil {
		// If not already set, copy the request txn.
		if call.Reply.Header().Txn == nil {
			call.Reply.Header().Txn = gogoproto.Clone(header.Txn).(*proto.Transaction)
		}
		tc.updateResponseTxn(header, call.Reply.Header())
	}

	if txn := call.Reply.Header().Txn; txn != nil {
		tc.Lock()
		txnMeta := tc.txns[string(txn.ID)]
		// If this transactional command leaves transactional intents, add the key
		// or key range to the intents map. If the transaction metadata doesn't yet
		// exist, create it.
		if call.Reply.Header().GoError() == nil {
			if proto.IsTransactionWrite(call.Args) {
				if txnMeta == nil {
					txnMeta = &txnMetadata{
						txn:              *txn,
						keys:             cache.NewIntervalCache(cache.Config{Policy: cache.CacheNone}),
						firstUpdateNanos: tc.clock.PhysicalNow(),
						lastUpdateNanos:  tc.clock.PhysicalNow(),
						timeoutDuration:  tc.clientTimeout,
						txnEnd:           make(chan struct{}),
					}
					id := string(txn.ID)
					tc.txns[id] = txnMeta
					tc.heartbeat(id)
				}
				txnMeta.addKeyRange(header.Key, header.EndKey)
			}
			// Update our record of this transaction.
			if txnMeta != nil {
				txnMeta.txn = *txn
				txnMeta.setLastUpdate(tc.clock.PhysicalNow())
			}
		}
		tc.Unlock()
	}

	// Cleanup intents and transaction map if end of transaction.
	switch t := call.Reply.Header().GoError().(type) {
	case *proto.TransactionStatusError:
		// Likely already committed or more obscure errors such as epoch or
		// timestamp regressions; consider it dead.
		tc.cleanupTxn(t.Txn, nil)
	case *proto.TransactionAbortedError:
		// If already aborted, cleanup the txn on this TxnCoordSender.
		tc.cleanupTxn(t.Txn, nil)
	case *proto.OpRequiresTxnError:
		// Run a one-off transaction with that single command.
		if log.V(1) {
			log.Infof("%s: auto-wrapping in txn and re-executing", call.Method())
		}
		tmpDB, err := client.Open(
			fmt.Sprintf("//%s?priority=%d",
				call.Args.Header().User, call.Args.Header().GetUserPriority()),
			client.SenderOpt(tc))
		if err != nil {
			log.Warning(err)
			return
		}
		call.Reply.Reset()
		if err := tmpDB.Txn(func(txn *client.Txn) error {
			txn.SetDebugName("auto-wrap")
			b := &client.Batch{}
			b.InternalAddCall(call)
			return txn.Commit(b)
		}); err != nil {
			log.Warning(err)
		}
	case nil:
		var resolved []proto.Key
		if txn := call.Reply.Header().Txn; txn != nil {
			if _, ok := call.Args.(*proto.EndTransactionRequest); ok {
				// If the --linearizable flag is set, we want to make sure that
				// all the clocks in the system are past the commit timestamp
				// of the transaction. This is guaranteed if either
				// - the commit timestamp is MaxOffset behind startNS
				// - MaxOffset ns were spent in this function
				// when returning to the client. Below we choose the option
				// that involves less waiting, which is likely the first one
				// unless a transaction commits with an odd timestamp.
				if tsNS := txn.Timestamp.WallTime; startNS > tsNS {
					startNS = tsNS
				}
				sleepNS := tc.clock.MaxOffset() -
					time.Duration(tc.clock.PhysicalNow()-startNS)
				if tc.linearizable && sleepNS > 0 {
					defer func() {
						if log.V(1) {
							log.Infof("%v: waiting %s on EndTransaction for linearizability", txn.Short(), util.TruncateDuration(sleepNS, time.Millisecond))
						}
						time.Sleep(sleepNS)
					}()
				}
				resolved = call.Reply.(*proto.EndTransactionResponse).Resolved
				if txn.Status != proto.PENDING {
					tc.cleanupTxn(*txn, resolved)
				}

			}
		}
	}
}
Example #18
// close sends resolve intent commands for all key ranges this
// transaction has covered, clears the keys cache and closes the
// metadata heartbeat. Any keys listed in the resolved slice have
// already been resolved and do not receive resolve intent commands.
func (tm *txnMetadata) close(txn *proto.Transaction, resolved []proto.Key, sender client.Sender, stopper *util.Stopper) {
	close(tm.txnEnd) // stop heartbeat
	if tm.keys.Len() > 0 {
		if log.V(2) {
			log.Infof("cleaning up %d intent(s) for transaction %s", tm.keys.Len(), txn)
		}
	}
	for _, o := range tm.keys.GetOverlaps(proto.KeyMin, proto.KeyMax) {
		// If the op was range based, end key != start key: resolve a range.
		var call proto.Call
		key := o.Key.Start().(proto.Key)
		endKey := o.Key.End().(proto.Key)
		if !key.Next().Equal(endKey) {
			call.Args = &proto.InternalResolveIntentRangeRequest{
				RequestHeader: proto.RequestHeader{
					Timestamp: txn.Timestamp,
					Key:       key,
					EndKey:    endKey,
					User:      storage.UserRoot,
					Txn:       txn,
				},
			}
			call.Reply = &proto.InternalResolveIntentRangeResponse{}
		} else {
			// Check if the key has already been resolved; skip if yes.
			found := false
			for _, k := range resolved {
				if key.Equal(k) {
					found = true
				}
			}
			if found {
				continue
			}
			call.Args = &proto.InternalResolveIntentRequest{
				RequestHeader: proto.RequestHeader{
					Timestamp: txn.Timestamp,
					Key:       key,
					User:      storage.UserRoot,
					Txn:       txn,
				},
			}
			call.Reply = &proto.InternalResolveIntentResponse{}
		}
		// We don't care about the replies; these are best
		// effort. We simply fire and forget, each in its own goroutine.
		if stopper.StartTask() {
			go func() {
				if log.V(2) {
					log.Infof("cleaning up intent %q for txn %s", call.Args.Header().Key, txn)
				}
				sender.Send(context.TODO(), call)
				if call.Reply.Header().Error != nil {
					log.Warningf("failed to cleanup %q intent: %s", call.Args.Header().Key, call.Reply.Header().GoError())
				}
				stopper.FinishTask()
			}()
		}
	}
	tm.keys.Clear()
}
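Example #19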
// close sends resolve intent commands for all key ranges this
// transaction has covered, clears the keys cache and closes the
// metadata heartbeat. Any keys listed in the resolved slice have
// already been resolved and do not receive resolve intent commands.
func (tm *txnMetadata) close(trace *tracer.Trace, txn *proto.Transaction, resolved []proto.Key, sender client.Sender, stopper *stop.Stopper) {
	close(tm.txnEnd) // stop heartbeat
	trace.Event("coordinator stops")
	if tm.keys.Len() > 0 {
		if log.V(2) {
			log.Infof("cleaning up %d intent(s) for transaction %s", tm.keys.Len(), txn)
		}
	}
	// TODO(tschottdorf): Should create a Batch here.
	for _, o := range tm.keys.GetOverlaps(proto.KeyMin, proto.KeyMax) {
		// If the op was range based, end key != start key: resolve a range.
		var call proto.Call
		key := o.Key.Start().(proto.Key)
		endKey := o.Key.End().(proto.Key)
		if !key.Next().Equal(endKey) {
			call.Args = &proto.InternalResolveIntentRangeRequest{
				RequestHeader: proto.RequestHeader{
					Timestamp: txn.Timestamp,
					Key:       key,
					EndKey:    endKey,
					User:      security.RootUser,
					Txn:       txn,
				},
			}
			call.Reply = &proto.InternalResolveIntentRangeResponse{}
		} else {
			// Check if the key has already been resolved; skip if yes.
			found := false
			for _, k := range resolved {
				if key.Equal(k) {
					found = true
				}
			}
			if found {
				continue
			}
			call.Args = &proto.InternalResolveIntentRequest{
				RequestHeader: proto.RequestHeader{
					Timestamp: txn.Timestamp,
					Key:       key,
					User:      security.RootUser,
					Txn:       txn,
				},
			}
			call.Reply = &proto.InternalResolveIntentResponse{}
		}
		// We don't care about the replies; these are best
		// effort. We simply fire and forget, each in its own goroutine.
		ctx := tracer.ToCtx(context.Background(), trace.Fork())
		stopper.RunAsyncTask(func() {
			if log.V(2) {
				log.Infof("cleaning up intent %q for txn %s", call.Args.Header().Key, txn)
			}
			sender.Send(ctx, call)
			if call.Reply.Header().Error != nil {
				log.Warningf("failed to cleanup %q intent: %s", call.Args.Header().Key, call.Reply.Header().GoError())
			}
		})
	}
	tm.keys.Clear()
}