Example #1
// updateForBatch updates the first argument (the header of a request contained
// in a batch) from the second one (the batch header), returning an error when
// inconsistencies are found.
// It is checked that the individual call does not have a User, UserPriority
// or Txn set that differs from the batch's.
//
// NOTE(review): the closing braces of the conditionals and of the function
// are not present in this excerpt; nesting below is inferred from indentation.
func updateForBatch(args proto.Request, bHeader proto.RequestHeader) error {
	// Disallow transaction, user and priority on individual calls, unless
	// equal.
	aHeader := args.Header()
	// An empty User on the request counts as "not set" and is accepted.
	if aHeader.User != "" && aHeader.User != bHeader.User {
		return util.Error("conflicting user on call in batch")
	// A priority equal to the proto default counts as "not set" and is accepted.
	if aPrio := aHeader.GetUserPriority(); aPrio != proto.Default_RequestHeader_UserPriority && aPrio != bHeader.GetUserPriority() {
		return util.Error("conflicting user priority on call in batch")
	// Overwrite the request's user and priority with the batch's values.
	aHeader.User = bHeader.User
	aHeader.UserPriority = bHeader.UserPriority
	// Only allow individual transactions on the requests of a batch if
	// - the batch is non-transactional,
	// - the individual transaction does not write intents, and
	// - the individual transaction is initialized.
	// The main usage of this is to allow mass-resolution of intents, which
	// entails sending a non-txn batch of transactional InternalResolveIntent.
	if aHeader.Txn != nil && !aHeader.Txn.Equal(bHeader.Txn) {
		// The request carries its own, different transaction: reject it when
		// that transaction has no ID, the request is a transactional write,
		// or the batch itself is transactional.
		if len(aHeader.Txn.ID) == 0 || proto.IsTransactionWrite(args) || bHeader.Txn != nil {
			return util.Error("conflicting transaction in transactional batch")
	} else {
		// The request has no transaction, or one equal to the batch's: use
		// the batch's transaction.
		aHeader.Txn = bHeader.Txn
	return nil
Example #2
// ExecuteCmd synchronously runs Store.ExecuteCmd. The store is looked
// up from the store map if specified by header.Replica; otherwise,
// the command is being executed locally, and the replica is
// determined via lookup through each of the stores.
//
// The reply value is allocated here via reflection from the type of
// replyChan.
//
// NOTE(review): this excerpt lacks closing braces, the body of the error
// branch, and everything after the reply.Verify check (including whatever
// delivers the reply to replyChan); nesting is inferred from indentation.
func (kv *LocalKV) ExecuteCmd(method string, args proto.Request, replyChan interface{}) {
	// If the replica isn't specified in the header, look it up.
	var err error
	var store *storage.Store
	// If we aren't given a Replica, then a little bending over
	// backwards here. We need to find the Store, but all we have is the
	// Key. So find its Range locally. This lets us use the same
	// codepath below (store.ExecuteCmd) for both locally and remotely
	// originated commands.
	header := args.Header()
	// A zero StoreID is treated as "no replica specified".
	if header.Replica.StoreID == 0 {
		var repl *proto.Replica
		repl, err = kv.lookupReplica(header.Key, header.EndKey)
		if err == nil {
			// Record the replica that was found in the request header.
			header.Replica = *repl
	if err == nil {
		store, err = kv.GetStore(header.Replica.StoreID)
	// Allocate a fresh reply: two Elem() calls are applied to replyChan's
	// type and a new value of the resulting type is created (presumably
	// replyChan is a channel of pointers to a response struct — TODO confirm).
	reply := reflect.New(reflect.TypeOf(replyChan).Elem().Elem()).Interface().(proto.Response)
	if err != nil {
		// NOTE(review): the error branch is empty in this excerpt; the
		// statement that reports err appears to be missing.
	} else {
		store.ExecuteCmd(method, args, reply)
		// Check the reply against the request after execution.
		if err := reply.Verify(args); err != nil {
Example #3
// ExecuteCmd synchronously runs Store.ExecuteCmd. The store is looked
// up from the store map if specified by header.Replica; otherwise,
// the command is being executed locally, and the replica is
// determined via lookup of header.Key in the ranges slice.
//
// The reply value is allocated here via reflection from the type of
// replyChan.
//
// NOTE(review): this excerpt lacks closing braces, the body of the error
// branch, and whatever delivers the reply to replyChan; nesting is inferred
// from indentation.
func (kv *LocalKV) ExecuteCmd(method string, args proto.Request, replyChan interface{}) {
	// If the replica isn't specified in the header, look it up.
	var err error
	var store *storage.Store
	// If we aren't given a Replica, then a little bending over
	// backwards here. We need to find the Store, but all we have is the
	// Key. So find its Range locally, and pull out its Replica which we
	// use to find the Store. This lets us use the same codepath below
	// (store.ExecuteCmd) for both locally and remotely originated
	// commands.
	header := args.Header()
	// A zero NodeID is treated as "no replica specified".
	if header.Replica.NodeID == 0 {
		// Here lookupReplica signals failure by returning nil.
		if repl := kv.lookupReplica(header.Key); repl != nil {
			header.Replica = *repl
		} else {
			err = util.Errorf("unable to lookup range replica for key %q", string(header.Key))
	if err == nil {
		store, err = kv.GetStore(&header.Replica)
	// Allocate a fresh reply: two Elem() calls are applied to replyChan's
	// type and a new value of the resulting type is created (presumably
	// replyChan is a channel of pointers to a response struct — TODO confirm).
	reply := reflect.New(reflect.TypeOf(replyChan).Elem().Elem()).Interface().(proto.Response)
	if err != nil {
		// NOTE(review): the error branch is empty in this excerpt; the
		// statement that reports err appears to be missing.
	} else {
		store.ExecuteCmd(method, args, reply)
Example #4
// ReadOnlyCmd updates the read timestamp cache and waits for any
// overlapping writes currently processing through Raft ahead of us to
// clear via the read queue.
//
// It returns a *proto.NotLeaderError if this range is not the leader at
// the time of the check, and otherwise the result of executeCmd.
//
// NOTE(review): closing braces are missing from this excerpt, and no
// wg.Wait() call is visible even though a WaitGroup is handed to the read
// queue — confirm against the full source that the wait actually happens.
func (r *Range) ReadOnlyCmd(method string, args proto.Request, reply proto.Response) error {
	header := args.Header()
	// Record this read's key span and timestamp in the timestamp cache.
	r.tsCache.Add(header.Key, header.EndKey, header.Timestamp)
	// Register the read with the read queue, passing it the WaitGroup.
	var wg sync.WaitGroup
	r.readQ.AddRead(header.Key, header.EndKey, &wg)

	// It's possible that arbitrary delays (e.g. major GC, VM
	// de-prioritization, etc.) could cause the execution of this read
	// command to occur AFTER the range replica has lost leadership.
	// There is a chance that we waited on writes, and although they
	// were committed to the log, they weren't successfully applied to
	// this replica's state machine. We re-verify leadership before
	// reading to make sure that all pending writes are persisted.
	// There are some elaborate cases where we might have lost
	// leadership and then regained it during the delay, but this is ok
	// because any writes during that period necessarily had higher
	// timestamps. This is because the read-timestamp-cache prevents it
	// for the active leader and leadership changes force the
	// read-timestamp-cache to reset its high water mark.
	if !r.IsLeader() {
		// TODO(spencer): when we happen to know the leader, fill it in here via replica.
		return &proto.NotLeaderError{}
	return r.executeCmd(method, args, reply)
Example #5
// ReadWriteCmd first consults the response cache to determine whether
// this command has already been sent to the range. If a response is
// found, it's returned immediately and not submitted to raft. Next,
// the timestamp cache is checked to determine if any newer accesses to
// this command's affected keys have been made. If so, this command's
// timestamp is moved forward. Finally the keys affected by this
// command are added as pending writes to the read queue and the
// command is submitted to Raft. Upon completion, the write is removed
// from the read queue and the reply is added to the response cache.
//
// NOTE(review): closing braces are missing from this excerpt. In addition,
// no unlock matching r.Lock() and no statement removing the pending write
// (wKey is otherwise unused) are visible — confirm against the full source.
func (r *Range) ReadWriteCmd(method string, args proto.Request, reply proto.Response) error {
	// Check the response cache in case this is a replay. This call
	// may block if the same command is already underway.
	header := args.Header()
	if ok, err := r.respCache.GetResponse(header.CmdID, reply); ok || err != nil {
		if ok { // this is a replay! extract error for return
			return reply.Header().GoError()
		// In this case there was an error reading from the response
		// cache. Instead of failing the request just because we can't
		// decode the reply in the response cache, we proceed as though
		// idempotence has expired.
		log.Errorf("unable to read result for %+v from the response cache: %v", args, err)

	// One of the prime invariants of Cockroach is that a mutating command
	// cannot write a key with an earlier timestamp than the most recent
	// read of the same key. So first order of business here is to check
	// the timestamp cache for reads/writes which are more recent than the
	// timestamp of this write. If more recent, we simply update the
	// write's timestamp before enqueuing it for execution. When the write
	// returns, the updated timestamp will inform the final commit
	// timestamp.
	r.Lock() // Protect access to timestamp cache and read queue.
	if ts := r.tsCache.GetMax(header.Key, header.EndKey); header.Timestamp.Less(ts) {
		if glog.V(1) {
			glog.Infof("Overriding existing timestamp %s with %s", header.Timestamp, ts)
		ts.Logical++ // increment logical component by one to differentiate.
		// Update the request timestamp.
		header.Timestamp = ts
	// Just as for reads, we update the timestamp cache with the
	// timestamp of this write. This ensures a strictly higher timestamp
	// for successive writes to the same key or key range.
	r.tsCache.Add(header.Key, header.EndKey, header.Timestamp)

	// The next step is to add the write to the read queue to inform
	// subsequent reads that there is a pending write. Reads which
	// overlap pending writes must wait for those writes to complete.
	wKey := r.readQ.AddWrite(header.Key, header.EndKey)

	// Create command and enqueue for Raft.
	cmd := &Cmd{
		Method: method,
		Args:   args,
		Reply:  reply,
		// Buffered with capacity 1.
		done:   make(chan error, 1),
	// This waits for the command to complete.
	err := r.EnqueueCmd(cmd)

	// Now that the command has completed, remove the pending write.
	// NOTE(review): the removal statement itself is not present in this excerpt.

	return err
Example #6
// maybeWrap wraps the given argument in a batch, unless it is already one.
// It returns the batch request together with a function that converts the
// corresponding batch response back into the response expected for args:
// for a request that was already a batch, that function returns the batch
// response unchanged; otherwise it unwraps the first contained response
// (or creates an empty reply via args.CreateReply if there is none) and
// copies the batch-level Txn and, if the reply has no error, the
// batch-level Error onto it.
//
// NOTE(review): closing braces are missing from this excerpt, and no
// statement adding args to the new batch is visible — confirm against the
// full source.
func maybeWrap(args proto.Request) (*proto.BatchRequest, func(*proto.BatchResponse) proto.Response) {
	if ba, ok := args.(*proto.BatchRequest); ok {
		// Already a batch: pass both request and response through as-is.
		return ba, func(br *proto.BatchResponse) proto.Response { return br }
	ba := &proto.BatchRequest{}
	// Give the batch its own copy of the request's header.
	ba.RequestHeader = *(gogoproto.Clone(args.Header()).(*proto.RequestHeader))
	return ba, func(br *proto.BatchResponse) proto.Response {
		var unwrappedReply proto.Response
		if len(br.Responses) == 0 {
			// No individual response available: fall back to an empty reply.
			unwrappedReply = args.CreateReply()
		} else {
			unwrappedReply = br.Responses[0].GetInner()
		// The ReplyTxn is propagated from one response to the next request,
		// and we adopt the mechanism that whenever the Txn changes, it needs
		// to be set in the reply, for example to ratchet up the transaction
		// timestamp on writes when necessary.
		// This is internally necessary to sequentially execute the batch,
		// so it makes some sense to take the burden of updating the Txn
		// from TxnCoordSender - it will only need to act on retries/aborts
		// in the future.
		unwrappedReply.Header().Txn = br.Txn
		// An error already set on the individual reply takes precedence over
		// the batch-level error.
		if unwrappedReply.Header().Error == nil {
			unwrappedReply.Header().Error = br.Error
		return unwrappedReply
Example #7
// executeCmd switches over the method and multiplexes to execute the
// appropriate storage API command. Each case type-asserts args and reply
// to the concrete request/response types of that method; a mismatch would
// panic since the single-value form of the assertion is used.
//
// After the command has run, the request timestamp is copied into the
// reply, the reply is stored in the response cache for non-read-only
// methods, and the error recorded in the reply header (if any) is returned.
//
// NOTE(review): closing braces are missing from this excerpt, and the
// "unrecognized command type" return presumably belongs to a `default:`
// case whose label is not present here.
func (r *Range) executeCmd(method string, args proto.Request, reply proto.Response) error {
	switch method {
	case Contains:
		r.Contains(args.(*proto.ContainsRequest), reply.(*proto.ContainsResponse))
	case Get:
		r.Get(args.(*proto.GetRequest), reply.(*proto.GetResponse))
	case Put:
		r.Put(args.(*proto.PutRequest), reply.(*proto.PutResponse))
	case ConditionalPut:
		r.ConditionalPut(args.(*proto.ConditionalPutRequest), reply.(*proto.ConditionalPutResponse))
	case Increment:
		r.Increment(args.(*proto.IncrementRequest), reply.(*proto.IncrementResponse))
	case Delete:
		r.Delete(args.(*proto.DeleteRequest), reply.(*proto.DeleteResponse))
	case DeleteRange:
		r.DeleteRange(args.(*proto.DeleteRangeRequest), reply.(*proto.DeleteRangeResponse))
	case Scan:
		r.Scan(args.(*proto.ScanRequest), reply.(*proto.ScanResponse))
	case EndTransaction:
		r.EndTransaction(args.(*proto.EndTransactionRequest), reply.(*proto.EndTransactionResponse))
	case AccumulateTS:
		r.AccumulateTS(args.(*proto.AccumulateTSRequest), reply.(*proto.AccumulateTSResponse))
	case ReapQueue:
		r.ReapQueue(args.(*proto.ReapQueueRequest), reply.(*proto.ReapQueueResponse))
	case EnqueueUpdate:
		r.EnqueueUpdate(args.(*proto.EnqueueUpdateRequest), reply.(*proto.EnqueueUpdateResponse))
	case EnqueueMessage:
		r.EnqueueMessage(args.(*proto.EnqueueMessageRequest), reply.(*proto.EnqueueMessageResponse))
	case InternalRangeLookup:
		r.InternalRangeLookup(args.(*proto.InternalRangeLookupRequest), reply.(*proto.InternalRangeLookupResponse))
	case InternalHeartbeatTxn:
		r.InternalHeartbeatTxn(args.(*proto.InternalHeartbeatTxnRequest), reply.(*proto.InternalHeartbeatTxnResponse))
	case InternalPushTxn:
		r.InternalPushTxn(args.(*proto.InternalPushTxnRequest), reply.(*proto.InternalPushTxnResponse))
	case InternalResolveIntent:
		r.InternalResolveIntent(args.(*proto.InternalResolveIntentRequest), reply.(*proto.InternalResolveIntentResponse))
	case InternalSnapshotCopy:
		r.InternalSnapshotCopy(args.(*proto.InternalSnapshotCopyRequest), reply.(*proto.InternalSnapshotCopyResponse))
		// NOTE(review): a `default:` label appears to be missing before the
		// following return.
		return util.Errorf("unrecognized command type: %s", method)

	// Propagate the request timestamp (which may have changed).
	reply.Header().Timestamp = args.Header().Timestamp

	// Add this command's result to the response cache if this is a
	// read/write method. This must be done as part of the execution of
	// raft commands so that every replica maintains the same responses
	// to continue request idempotence when leadership changes.
	if !IsReadOnly(method) {
		// A failure to write to the response cache is logged but does not
		// fail the command.
		if putErr := r.respCache.PutResponse(args.Header().CmdID, reply); putErr != nil {
			log.Errorf("unable to write result of %+v: %+v to the response cache: %v",
				args, reply, putErr)

	// Return the error (if any) set in the reply.
	return reply.Header().GoError()
Example #8
// executeCmd looks up the store specified by header.Replica, and runs
// Store.ExecuteCmd.
//
// It returns the lookup error if the store cannot be found and nil
// otherwise; nothing returned by store.ExecuteCmd is used here, so
// execution errors are presumably reported via reply — TODO confirm.
//
// NOTE(review): closing braces are missing from this excerpt.
func (n *Node) executeCmd(method string, args proto.Request, reply proto.Response) error {
	store, err := n.localKV.GetStore(&args.Header().Replica)
	if err != nil {
		return err
	store.ExecuteCmd(method, args, reply)
	return nil
Example #9
// endCmd removes a pending command from the command queue.
//
// In the code visible here, it records the command's key span, timestamp,
// transaction ID and readOnly flag in the timestamp cache, but only when
// the command succeeded (err == nil) and usesTimestampCache(args) is true.
//
// NOTE(review): closing braces are missing from this excerpt, and the
// removal from the command queue (the only plausible use of cmdKey) is not
// present here — confirm against the full source.
func (r *Range) endCmd(cmdKey interface{}, args proto.Request, err error, readOnly bool) {
	if err == nil && usesTimestampCache(args) {
		header := args.Header()
		r.tsCache.Add(header.Key, header.EndKey, header.Timestamp, header.Txn.GetID(), readOnly)
Example #10
// executeCmd allocates a reply of the type implied by replyChan, looks up
// the range covering the request's key span in the test store and, if one
// is found, fills in the request's replica from the range metadata and
// executes the command on the store. If no range is found, the reply's
// error is set to a RangeKeyMismatchError for the requested span.
//
// NOTE(review): closing braces are missing from this excerpt, and whatever
// delivers reply to replyChan is not visible here.
func (db *testDB) executeCmd(method string, args proto.Request, replyChan interface{}) {
	// Two Elem() calls are applied to replyChan's type and a new value of
	// the resulting type is created (presumably replyChan is a channel of
	// pointers to a response struct — TODO confirm).
	reply := reflect.New(reflect.TypeOf(replyChan).Elem().Elem()).Interface().(proto.Response)
	if rng := db.store.LookupRange(args.Header().Key, args.Header().EndKey); rng != nil {
		args.Header().Replica = *rng.Meta.GetReplica()
		db.store.ExecuteCmd(method, args, reply)
	} else {
		reply.Header().SetGoError(proto.NewRangeKeyMismatchError(args.Header().Key, args.Header().EndKey, nil))
Example #11
// applyRaftCommand applies a raft command from the replicated log to the
// underlying state machine (i.e. the engine).
// When certain critical operations fail, a replicaCorruptionError may be
// returned and must be handled by the caller.
//
// NOTE(review): closing braces are missing from this excerpt, and the
// statement that copies rErr into the reply header (described by the
// "ALWAYS set the reply header error" comment below) is not present here.
func (r *Range) applyRaftCommand(ctx context.Context, index uint64, originNode proto.RaftNodeID, args proto.Request, reply proto.Response) error {
	// index is unsigned, so this condition can only be true for index == 0.
	if index <= 0 {
		log.Fatalc(ctx, "raft command index is <= 0")

	// If we have an out of order index, there's corruption. No sense in trying
	// to update anything or run the command. Simply return a corruption error.
	if oldIndex := atomic.LoadUint64(&r.appliedIndex); oldIndex >= index {
		return newReplicaCorruptionError(util.Errorf("applied index moved backwards: %d >= %d", oldIndex, index))

	// Call the helper, which returns a batch containing data written
	// during command execution and any associated error.
	ms := engine.MVCCStats{}
	batch, rErr := r.applyRaftCommandInBatch(ctx, index, originNode, args, reply, &ms)
	// ALWAYS set the reply header error to the error returned by the
	// helper. This is the definitive result of the execution. The
	// error must be set before saving to the response cache.
	// TODO(tschottdorf,tamird) For #1400, want to refactor executeCmd to not
	// touch the reply header's error field.
	defer batch.Close()

	// Advance the last applied index and commit the batch.
	if err := setAppliedIndex(batch, r.Desc().RaftID, index); err != nil {
		log.Fatalc(ctx, "setting applied index in a batch should never fail: %s", err)
	if err := batch.Commit(); err != nil {
		// A failed commit is reported as replica corruption; the commit error
		// and the previous rErr are both passed to the new error.
		rErr = newReplicaCorruptionError(util.Errorf("could not commit batch"), err, rErr)
	} else {
		// Update cached appliedIndex if we were able to set the applied index on disk.
		atomic.StoreUint64(&r.appliedIndex, index)

	// On successful write commands, flush to event feed, and handle other
	// write-related triggers including splitting and config gossip updates.
	if rErr == nil && proto.IsWrite(args) {
		// Publish update to event feed.
		r.rm.EventFeed().updateRange(r, args.Method(), &ms)
		// If the commit succeeded, potentially add range to split queue.
		// Maybe update gossip configs on a put.
		switch args.(type) {
		case *proto.PutRequest, *proto.DeleteRequest, *proto.DeleteRangeRequest:
			// Only keys below keys.SystemMax are considered for config gossip.
			if key := args.Header().Key; key.Less(keys.SystemMax) {
				// We hold the lock already.
				r.maybeGossipConfigsLocked(func(configPrefix proto.Key) bool {
					return bytes.HasPrefix(key, configPrefix)

	return rErr
Example #12
// updateForBatch updates the first argument (the header of a request contained
// in a batch) from the second one (the batch header), returning an error when
// inconsistencies are found.
// It is checked that the individual call does not have a UserPriority
// set that differs from the batch's; the Txn is always taken from the batch.
// TODO(tschottdorf): will go with #2143.
//
// NOTE(review): closing braces are missing from this excerpt.
func updateForBatch(args proto.Request, bHeader proto.RequestHeader) error {
	// Disallow a priority on individual calls unless it is the default or
	// equal to the batch's.
	aHeader := args.Header()
	if aPrio := aHeader.GetUserPriority(); aPrio != proto.Default_RequestHeader_UserPriority && aPrio != bHeader.GetUserPriority() {
		return util.Errorf("conflicting user priority on call in batch")
	aHeader.UserPriority = bHeader.UserPriority
	aHeader.Txn = bHeader.Txn // reqs always take Txn from batch
	return nil
Example #13
// sendAttempt is invoked by Send. It temporarily truncates the arguments to
// match the descriptor's EndKey (if necessary) and gathers and rearranges the
// replicas before making a single attempt at sending the request. It returns
// the result of sending the RPC; a potential error contained in the reply has
// to be handled separately by the caller.
//
// NOTE(review): closing braces are missing from this excerpt, and the
// statement that actually moves the leader replica to the front of the
// slice is not visible (only the switch to rpc.OrderStable is) — confirm
// against the full source.
func (ds *DistSender) sendAttempt(trace *tracer.Trace, args proto.Request, reply proto.Response, desc *proto.RangeDescriptor) error {
	// trace.Epoch returns a function; deferring its result runs it when
	// sendAttempt returns.
	defer trace.Epoch("sending RPC")()
	// Truncate the request to our current range, making sure not to
	// touch it unless we have to (it is illegal to send EndKey on
	// commands which do not operate on ranges).
	if endKey := args.Header().EndKey; endKey != nil && !endKey.Less(desc.EndKey) {
		// Restore the caller's original EndKey when this function returns.
		defer func(k proto.Key) { args.Header().EndKey = k }(endKey)
		args.Header().EndKey = desc.EndKey
	leader := ds.leaderCache.Lookup(proto.RaftID(desc.RaftID))

	// Try to send the call.
	replicas := newReplicaSlice(ds.gossip, desc)

	// Rearrange the replicas so that those replicas with long common
	// prefix of attributes end up first. If there's no prefix, this is a
	// no-op.
	order := ds.optimizeReplicaOrder(replicas)

	// If this request needs to go to a leader and we know who that is, move
	// it to the front.
	// Inconsistent reads are the only requests exempt from this; a leader
	// StoreID of zero or less is treated as "leader unknown".
	if !(proto.IsRead(args) && args.Header().ReadConsistency == proto.INCONSISTENT) &&
		leader.StoreID > 0 {
		if i := replicas.FindReplica(leader.StoreID); i >= 0 {
			order = rpc.OrderStable

	return ds.sendRPC(trace, desc.RaftID, replicas, order, args, reply)
Example #14
// addReadOnlyCmd updates the read timestamp cache and waits for any
// overlapping writes currently processing through Raft ahead of us to
// clear via the read queue.
//
// Three paths are visible below, selected by header.ReadConsistency:
//   - INCONSISTENT reads are executed directly against the engine, without
//     the command queue or leader lease; they are rejected inside a
//     transaction.
//   - CONSENSUS reads are rejected as not implemented.
//   - All other reads go through beginCmd/endCmd and require the leader
//     lease before executing.
//
// NOTE(review): closing braces are missing from this excerpt; nesting is
// inferred from indentation.
func (r *Range) addReadOnlyCmd(ctx context.Context, args proto.Request, reply proto.Response) error {
	header := args.Header()

	if err := r.checkCmdHeader(header); err != nil {
		return err

	// If read-consistency is set to INCONSISTENT, run directly.
	if header.ReadConsistency == proto.INCONSISTENT {
		// But disallow any inconsistent reads within txns.
		if header.Txn != nil {
			reply.Header().SetGoError(util.Error("cannot allow inconsistent reads within a transaction"))
			return reply.Header().GoError()
		// A zero timestamp is replaced with the current clock reading.
		if header.Timestamp.Equal(proto.ZeroTimestamp) {
			header.Timestamp = r.rm.Clock().Now()
		intents, err := r.executeCmd(r.rm.Engine(), nil, args, reply)
		if err == nil {
			r.handleSkippedIntents(args, intents)
		return err
	} else if header.ReadConsistency == proto.CONSENSUS {
		reply.Header().SetGoError(util.Error("consensus reads not implemented"))
		return reply.Header().GoError()

	// Add the read to the command queue to gate subsequent
	// overlapping commands until this command completes.
	cmdKey := r.beginCmd(header, true)

	// This replica must have leader lease to process a consistent read.
	if err := r.redirectOnOrAcquireLeaderLease(tracer.FromCtx(ctx), header.Timestamp); err != nil {
		// End the command begun above before bailing out, passing the error.
		r.endCmd(cmdKey, args, err, true /* readOnly */)
		return err

	// Execute read-only command.
	intents, err := r.executeCmd(r.rm.Engine(), nil, args, reply)

	// Only update the timestamp cache if the command succeeded.
	r.endCmd(cmdKey, args, err, true /* readOnly */)

	if err == nil {
		r.handleSkippedIntents(args, intents)
	return err
Example #15
// applyRaftCommand applies a raft command from the replicated log to the
// underlying state machine (i.e. the engine).
// When certain critical operations fail, a replicaCorruptionError may be
// returned and must be handled by the caller.
//
// It returns the reply produced by applyRaftCommandInBatch together with
// the resulting error; when the applied index would move backwards, the
// reply is nil.
//
// NOTE(review): closing braces are missing from this excerpt; nesting is
// inferred from indentation.
func (r *Range) applyRaftCommand(ctx context.Context, index uint64, originNode proto.RaftNodeID, args proto.Request) (proto.Response, error) {
	// index is unsigned, so this condition can only be true for index == 0.
	if index <= 0 {
		log.Fatalc(ctx, "raft command index is <= 0")

	// If we have an out of order index, there's corruption. No sense in trying
	// to update anything or run the command. Simply return a corruption error.
	if oldIndex := atomic.LoadUint64(&r.appliedIndex); oldIndex >= index {
		return nil, newReplicaCorruptionError(util.Errorf("applied index moved backwards: %d >= %d", oldIndex, index))

	// Call the helper, which returns a batch containing data written
	// during command execution and any associated error.
	ms := engine.MVCCStats{}
	batch, reply, rErr := r.applyRaftCommandInBatch(ctx, index, originNode, args, &ms)
	defer batch.Close()

	// Advance the last applied index and commit the batch.
	if err := setAppliedIndex(batch, r.Desc().RaftID, index); err != nil {
		log.Fatalc(ctx, "setting applied index in a batch should never fail: %s", err)
	if err := batch.Commit(); err != nil {
		// A failed commit is reported as replica corruption; the commit error
		// and the previous rErr are both passed to the new error.
		rErr = newReplicaCorruptionError(util.Errorf("could not commit batch"), err, rErr)
	} else {
		// Update cached appliedIndex if we were able to set the applied index on disk.
		atomic.StoreUint64(&r.appliedIndex, index)

	// On successful write commands, flush to event feed, and handle other
	// write-related triggers including splitting and config gossip updates.
	if rErr == nil && proto.IsWrite(args) {
		// Publish update to event feed.
		r.rm.EventFeed().updateRange(r, args.Method(), &ms)
		// If the commit succeeded, potentially add range to split queue.
		// Maybe update gossip configs if the command is not part of a transaction.
		// If the command is part of an uncommitted transaction, we rely on the
		// periodic configGossipInterval loop since we will not see the update
		// until the transaction is committed.
		if key := args.Header().Key; key.Less(keys.SystemMax) && args.Header().Txn == nil {
			r.maybeGossipConfigs(func(configPrefix proto.Key) bool {
				return bytes.HasPrefix(key, configPrefix)

	return reply, rErr
Example #16
// executeCmd creates a proto.Call struct and sends it via our local sender.
//
// Before sending, it ensures the request header carries a command ID and
// starts a trace for the request. After sending, it notifies the feed of
// the completed call and records any error found in the reply header as a
// trace event. The function itself always returns nil in the code visible
// here; errors are only available through reply.
//
// NOTE(review): closing braces are missing from this excerpt.
func (n *nodeServer) executeCmd(args proto.Request, reply proto.Response) error {
	// TODO(tschottdorf) get a hold of the client's ID, add it to the
	// context before dispatching, and create an ID for tracing the request.
	header := args.Header()
	header.CmdID = header.GetOrCreateCmdID(n.ctx.Clock.PhysicalNow())
	trace := n.ctx.Tracer.NewTrace(header)
	// Deferred calls run in reverse order: the "node" epoch is closed first,
	// then the trace is finalized.
	defer trace.Finalize()
	defer trace.Epoch("node")()
	// The receiver is converted to *Node to obtain its context, to which the
	// trace is attached.
	ctx := tracer.ToCtx((*Node)(n).context(), trace)

	n.lSender.Send(ctx, proto.Call{Args: args, Reply: reply})
	n.feed.CallComplete(args, reply)
	if err := reply.Header().GoError(); err != nil {
		// Only the error's type is recorded in the trace.
		trace.Event(fmt.Sprintf("error: %T", err))
	return nil
Example #17
// addAdminCmd executes the command directly. There is no interaction
// with the command queue or the timestamp cache, as admin commands
// are not meant to consistently access or modify the underlying data.
// Admin commands must run on the leader replica.
//
// It returns the leader-lease error if the lease cannot be obtained, and
// otherwise the error recorded in the reply header by the admin command.
//
// NOTE(review): closing braces are missing from this excerpt, and the
// "unrecognized admin command" return presumably belongs to a `default:`
// case whose label is not present here.
func (r *Range) addAdminCmd(ctx context.Context, args proto.Request, reply proto.Response) error {
	// Admin commands always require the leader lease.
	if err := r.redirectOnOrAcquireLeaderLease(args.Header().Timestamp); err != nil {
		return err

	switch args.(type) {
	case *proto.AdminSplitRequest:
		r.AdminSplit(args.(*proto.AdminSplitRequest), reply.(*proto.AdminSplitResponse))
	case *proto.AdminMergeRequest:
		r.AdminMerge(args.(*proto.AdminMergeRequest), reply.(*proto.AdminMergeResponse))
		// NOTE(review): a `default:` label appears to be missing before the
		// following return.
		return util.Error("unrecognized admin command")
	return reply.Header().GoError()
Example #18
// ExecuteCmd fetches a range based on the header's replica, assembles
// method, args & reply into a Raft Cmd struct and executes the
// command using the fetched range.
//
// Before dispatching, it initializes a zero request timestamp from the
// store's clock (or otherwise forwards the clock with the request's
// timestamp), verifies that the range contains the request's key span and
// that the range is the leader. Read-only methods are routed to
// ReadOnlyCmd, all others to ReadWriteCmd.
//
// NOTE(review): closing braces are missing from this excerpt; nesting is
// inferred from indentation.
func (s *Store) ExecuteCmd(method string, args proto.Request, reply proto.Response) error {
	// If the request has a zero timestamp, initialize to this node's clock.
	header := args.Header()
	if header.Timestamp.WallTime == 0 && header.Timestamp.Logical == 0 {
		// Update both incoming and outgoing timestamps.
		now := s.clock.Now()
		args.Header().Timestamp = now
		reply.Header().Timestamp = now
	} else {
		// Otherwise, update our clock with the incoming request. This
		// advances the local node's clock to a high water mark from
		// amongst all nodes with which it has interacted. The update is
		// bounded by the max clock drift.
		_, err := s.clock.Update(header.Timestamp)
		if err != nil {
			return err

	// Verify specified range contains the command's implicated keys.
	rng, err := s.GetRange(header.Replica.RangeID)
	if err != nil {
		return err
	if !rng.ContainsKeyRange(header.Key, header.EndKey) {
		return proto.NewRangeKeyMismatchError(header.Key, header.EndKey, rng.Meta)
	// Leadership is required for both reads and writes at this point.
	if !rng.IsLeader() {
		// TODO(spencer): when we happen to know the leader, fill it in here via replica.
		return &proto.NotLeaderError{}

	// Differentiate between read-only and read-write.
	if IsReadOnly(method) {
		return rng.ReadOnlyCmd(method, args, reply)

	return rng.ReadWriteCmd(method, args, reply)
Example #19
// proposeRaftCommand prepares necessary pending command struct and
// initializes a client command ID if one hasn't been. It then
// proposes the command to Raft and returns the error channel and
// pending command struct for receiving.
//
// NOTE(review): closing braces (including those of the two composite
// literals) are missing from this excerpt. The map write to r.pendingCmds
// below is not guarded by any lock visible here — presumably the caller
// holds one; verify against callers.
func (r *Range) proposeRaftCommand(ctx context.Context, args proto.Request) (<-chan error, *pendingCmd) {
	pendingCmd := &pendingCmd{
		ctx:  ctx,
		// Buffered with capacity 1.
		done: make(chan responseWithErr, 1),
	raftCmd := proto.InternalRaftCommand{
		RaftID:       r.Desc().RaftID,
		OriginNodeID: r.rm.RaftNodeID(),
	cmdID := args.Header().GetOrCreateCmdID(r.rm.Clock().PhysicalNow())
	// SetValue reports false when args cannot be stored in the command,
	// which is treated as fatal.
	ok := raftCmd.Cmd.SetValue(args)
	if !ok {
		log.Fatalc(ctx, "unknown command type %T", args)
	// Register the pending command under its command-ID key before the
	// proposal is handed to Raft.
	idKey := makeCmdIDKey(cmdID)
	r.pendingCmds[idKey] = pendingCmd
	errChan := r.rm.ProposeRaftCommand(idKey, raftCmd)

	return errChan, pendingCmd
Example #20
// addAdminCmd executes the command directly. There is no interaction
// with the command queue or the timestamp cache, as admin commands
// are not meant to consistently access or modify the underlying data.
// Admin commands must run on the leader replica.
//
// It returns a nil response together with an error if the header check or
// the leader-lease acquisition fails; otherwise it returns a pointer to
// the response of the matching admin command along with that command's
// error.
//
// NOTE(review): closing braces are missing from this excerpt, and the
// "unrecognized admin command" return presumably belongs to a `default:`
// case whose label is not present here.
func (r *Range) addAdminCmd(ctx context.Context, args proto.Request) (proto.Response, error) {
	header := args.Header()

	if err := r.checkCmdHeader(header); err != nil {
		return nil, err

	// Admin commands always require the leader lease.
	if err := r.redirectOnOrAcquireLeaderLease(tracer.FromCtx(ctx), header.Timestamp); err != nil {
		return nil, err

	switch tArgs := args.(type) {
	case *proto.AdminSplitRequest:
		resp, err := r.AdminSplit(tArgs)
		// The response pointer is returned even when err is non-nil.
		return &resp, err
	case *proto.AdminMergeRequest:
		resp, err := r.AdminMerge(tArgs)
		// The response pointer is returned even when err is non-nil.
		return &resp, err
		// NOTE(review): a `default:` label appears to be missing before the
		// following return.
		return nil, util.Error("unrecognized admin command")
Example #21
// Call invokes the KV command synchronously and returns the response
// and error, if applicable. If preceding calls have been made to
// Prepare() without a call to Flush(), this call is prepared and
// then all prepared calls are flushed.
//
// Unset User and UserPriority fields on the request header are filled in
// from the KV client's defaults before the call is made.
//
// NOTE(review): closing braces are missing from this excerpt, and the
// statement that actually sends the assembled call is not visible — the
// error is read from the reply header right after the Call struct is
// built. Confirm against the full source.
func (kv *KV) Call(method string, args proto.Request, reply proto.Response) error {
	if len(kv.prepared) > 0 {
		// Earlier calls are pending: queue this one too and flush them all.
		kv.Prepare(method, args, reply)
		return kv.Flush()
	if args.Header().User == "" {
		args.Header().User = kv.User
	// Only apply the client's priority when the request has none and the
	// client's value is non-zero.
	if args.Header().UserPriority == nil && kv.UserPriority != 0 {
		args.Header().UserPriority = gogoproto.Int32(kv.UserPriority)
	call := &Call{
		Method: method,
		Args:   args,
		Reply:  reply,
	err := call.Reply.Header().GoError()
	if err != nil {
		log.Infof("failed %s: %s", call.Method, err)
	return err
Example #22
// ExecuteCmd verifies permissions and looks up the appropriate range
// based on the supplied key and sends the RPC according to the
// specified options. executeRPC sends asynchronously and returns a
// response value on the replyChan channel when the call is complete.
//
// Range lookup and RPC sending are wrapped in a retry loop with backoff
// that retries indefinitely for errors reporting themselves as retryable;
// any other error ends the loop and is reported via sendErrorReply.
//
// NOTE(review): closing braces are missing from this excerpt. No `return`
// is visible after the permission failure is reported, and the statement
// that evicts stale range metadata is not present either — confirm both
// against the full source.
func (kv *DistKV) ExecuteCmd(method string, args proto.Request, replyChan interface{}) {
	// Augment method with "Node." prefix.
	method = "Node." + method

	// Verify permissions.
	if err := kv.verifyPermissions(method, args.Header()); err != nil {
		sendErrorReply(err, replyChan)

	// Retry logic for lookup of range by key and RPCs to range replicas.
	retryOpts := util.RetryOptions{
		Tag:         fmt.Sprintf("routing %s rpc", method),
		Backoff:     retryBackoff,
		MaxBackoff:  maxRetryBackoff,
		Constant:    2,
		MaxAttempts: 0, // retry indefinitely
	err := util.RetryWithBackoff(retryOpts, func() (bool, error) {
		desc, err := kv.rangeCache.LookupRangeMetadata(args.Header().Key)
		if err == nil {
			err = kv.sendRPC(desc, method, args, replyChan)
		if err != nil {
			// Range metadata might be out of date - evict it.

			// If retryable, allow outer loop to retry.
			if retryErr, ok := err.(util.Retryable); ok && retryErr.CanRetry() {
				log.Warningf("failed to invoke %s: %v", method, err)
				// (false, nil) presumably tells RetryWithBackoff to try again — TODO confirm.
				return false, nil
		// Success or a non-retryable error: stop and hand err back.
		return true, err
	if err != nil {
		sendErrorReply(err, replyChan)
Example #23
// applyRaftCommand applies a raft command from the replicated log to the
// underlying state machine (i.e. the engine).
// When certain critical operations fail, a replicaCorruptionError may be
// returned and must be handled by the caller.
//
// index is the raft log index of the command and must be positive.
// originNode is compared against the current lease holder and against
// this replica's own raft node ID. args and reply are the request and
// the response object that executeCmd fills in. The result is named
// (rErr) so that the deferred functions below can replace it.
func (r *Range) applyRaftCommand(ctx context.Context, index uint64, originNode proto.RaftNodeID, args proto.Request, reply proto.Response) (rErr error) {
	if index <= 0 {
		log.Fatalc(ctx, "raft command index is <= 0")

	committed := false
	// The very last thing we do before returning is move the applied index
	// forward, unless that has already happened as part of a successfully
	// committed batch.
	defer func() {
		if !committed {
			// We didn't commit the batch, but advance the last applied index nonetheless.
			// This write goes straight to the engine rather than through a batch.
			if err := setAppliedIndex(r.rm.Engine(), r.Desc().RaftID, index); err != nil {
				rErr = newReplicaCorruptionError(
					util.Errorf("could not advance applied index"), err, rErr)
			atomic.StoreUint64(&r.appliedIndex, index)

	if lease := r.getLease(); args.Method() != proto.InternalLeaderLease &&
		(!lease.OwnedBy(originNode) || !lease.Covers(args.Header().Timestamp)) {
		// Verify the leader lease is held, unless this command is trying to
		// obtain it. Any other Raft command has had the leader lease held
		// by the replica at proposal time, but this may no longer be the case.
		// Corruption aside, the most likely reason is a leadership change (the
		// most recent leader assumes responsibility for all past timestamps as
		// well). In that case, it's not valid to go ahead with the execution:
		// Writes must be aware of the last time the mutated key was read, and
		// since reads are served locally by the lease holder without going
		// through Raft, a read which was not taken into account may have been
		// served. Hence, we must retry at the current leader.
		// It's crucial that we don't update the response cache for the error
		// returned below since the request is going to be retried with the
		// same ClientCmdID and would get the distributed sender stuck in an
		// infinite loop, retrieving a stale NotLeaderError over and over
		// again, even when proposing at the correct replica.
		return r.newNotLeaderError(lease)

	// Anything happening from now on needs to enter the response cache.
	defer func() {
		// TODO(tamird,tschottdorf): according to #1400 we intend to set the reply
		// header's error as late as possible and in a central location. Range
		// commands still write to the header directly, but once they don't this
		// could be the authoritative location that sets the reply error for any-
		// thing that makes it into Raft. Note that we must set this prior to
		// signaling cmd.done below, or the waiting RPC handler might proceed
		// before we've updated its reply.
		// It is important that the error is set before the reply is saved into
		// the response cache.

		if proto.IsWrite(args) {
			// No matter the result, add result to the response cache if this
			// is a write method. This must be done as part of the execution of
			// raft commands so that every replica maintains the same responses
			// to continue request idempotence, even if leadership changes.
			if err := r.respCache.PutResponse(args.Header().CmdID, reply); err != nil {
				rErr = newReplicaCorruptionError(
					util.Errorf("could not put to response cache"), err, rErr)

	header := args.Header()

	// Check the response cache to ensure idempotency.
	if proto.IsWrite(args) {
		if ok, err := r.respCache.GetResponse(header.CmdID, reply); ok && err == nil {
			if log.V(1) {
				log.Infoc(ctx, "found response cache entry for %+v", args.Header().CmdID)
			// err is nil in this branch, so a cache hit returns without
			// executing the command again.
			return err
		} else if ok && err != nil {
			return newReplicaCorruptionError(
				util.Errorf("could not read from response cache"), err)

	// Create a new batch for the command to ensure all or nothing semantics.
	batch := r.rm.Engine().NewBatch()
	defer batch.Close()

	// Create an engine.MVCCStats instance to collect the stats changes made
	// by this command.
	ms := engine.MVCCStats{}

	// Execute the command; the error will also be set in the reply header.
	// TODO(tschottdorf,tamird) For #1400, want to refactor executeCmd to not
	// touch the reply header's error field.
	intents, err := r.executeCmd(batch, &ms, args, reply)
	// If the execution of the command wasn't successful, stop here.
	if err != nil {
		return err

	// The command's index must be strictly greater than the index already
	// applied; anything else is reported as replica corruption.
	if oldIndex := atomic.LoadUint64(&r.appliedIndex); oldIndex >= index {
		return newReplicaCorruptionError(
			util.Errorf("applied index moved backwards: %d >= %d", oldIndex, index))

	// Advance the applied index atomically within the batch.
	if err := setAppliedIndex(batch, r.Desc().RaftID, index); err != nil {
		return newReplicaCorruptionError(
			util.Errorf("could not update applied index"), err)

	if proto.IsWrite(args) {
		// On success, flush the MVCC stats to the batch and commit.
		if err := r.stats.MergeMVCCStats(batch, &ms, header.Timestamp.WallTime); err != nil {
			return newReplicaCorruptionError(util.Errorf("could not merge MVCC stats"), err)
		if err := batch.Commit(); err != nil {
			return newReplicaCorruptionError(util.Errorf("could not commit batch"), err)
		// From here on the first deferred function must not write the
		// applied index again.
		committed = true
		// Publish update to event feed.
		r.rm.EventFeed().updateRange(r, args.Method(), &ms)
		// After successful commit, update cached stats and appliedIndex value.
		atomic.StoreUint64(&r.appliedIndex, index)
		// If the commit succeeded, potentially add range to split queue.
		// Maybe update gossip configs on a put.
		switch args.(type) {
		case *proto.PutRequest, *proto.DeleteRequest, *proto.DeleteRangeRequest:
			if header.Key.Less(keys.SystemMax) {
				// We hold the lock already.
				r.maybeGossipConfigsLocked(func(configPrefix proto.Key) bool {
					return bytes.HasPrefix(header.Key, configPrefix)
	// On success and only on the replica on which this command originated,
	// resolve skipped intents asynchronously.
	if originNode == r.rm.RaftNodeID() {
		r.handleSkippedIntents(args, intents)

	return nil
Example #24
// addWriteCmd first consults the response cache to determine whether
// this command has already been sent to the range. If a response is
// found, it's returned immediately and not submitted to raft. Next,
// the timestamp cache is checked to determine if any newer accesses to
// this command's affected keys have been made. If so, this command's
// timestamp is moved forward. Finally the keys affected by this
// command are added as pending writes to the read queue and the
// command is submitted to Raft. Upon completion, the write is removed
// from the read queue and the reply is added to the response cache.
// If wait is true, will block until the command is complete.
// If wait is false, completion is awaited in a goroutine, failures are
// only logged, and nil is returned.
// NOTE(review): no response cache lookup is visible in this body —
// confirm whether the first sentences above still describe this function.
func (r *Range) addWriteCmd(ctx context.Context, args proto.Request, reply proto.Response, wait bool) error {
	// Check the response cache in case this is a replay. This call
	// may block if the same command is already underway.
	// NOTE(review): the statement below only fetches the header; the
	// comment above looks stale — confirm.
	header := args.Header()

	// Add the write to the command queue to gate subsequent overlapping
	// Commands until this command completes. Note that this must be
	// done before getting the max timestamp for the key(s), as
	// timestamp cache is only updated after preceding commands have
	// been run to successful completion.
	cmdKey := r.beginCmd(header, false)

	// This replica must have leader lease to process a write.
	if err := r.redirectOnOrAcquireLeaderLease(header.Timestamp); err != nil {
		r.endCmd(cmdKey, args, err, false /* !readOnly */)
		return err

	// Two important invariants of Cockroach: 1) encountering a more
	// recently written value means transaction restart. 2) values must
	// be written with a greater timestamp than the most recent read to
	// the same key. Check the timestamp cache for reads/writes which
	// are at least as recent as the timestamp of this write. For
	// writes, send WriteTooOldError; for reads, update the write's
	// timestamp. When the write returns, the updated timestamp will
	// inform the final commit timestamp.
	if usesTimestampCache(args) {
		// rTS and wTS are the read and write timestamps returned by the
		// cache for the request's key span.
		rTS, wTS := r.tsCache.GetMax(header.Key, header.EndKey, header.Txn.GetID())

		// Always push the timestamp forward if there's been a read which
		// occurred after our txn timestamp.
		if !rTS.Less(header.Timestamp) {
			header.Timestamp = rTS.Next()
		// If there's a newer write timestamp...
		if !wTS.Less(header.Timestamp) {
			// If we're in a txn, set a write too old error in reply. We
			// still go ahead and try the write because we want to avoid
			// restarting the transaction in the event that there isn't an
			// intent or the intent can be pushed by us.
			if header.Txn != nil {
				// NOTE(review): err has no visible use after this line;
				// confirm it is actually stored in the reply as described.
				err := &proto.WriteTooOldError{Timestamp: header.Timestamp, ExistingTimestamp: wTS}
			} else {
				// Otherwise, make sure we advance the request's timestamp.
				header.Timestamp = wTS.Next()

	errChan, pendingCmd := r.proposeRaftCommand(ctx, args, reply)

	// Create a completion func for mandatory cleanups which we either
	// run synchronously if we're waiting or in a goroutine otherwise.
	completionFunc := func() error {
		// First wait for raft to commit or abort the command.
		var err error
		if err = <-errChan; err == nil {
			// Next if the command was committed, wait for the range to apply it.
			err = <-pendingCmd.done
		} else if err == multiraft.ErrGroupDeleted {
			// This error needs to be converted appropriately so that
			// clients will retry.
			err = proto.NewRangeNotFoundError(r.Desc().RaftID)
		// As for reads, update timestamp cache with the timestamp
		// of this write on success. This ensures a strictly higher
		// timestamp for successive writes to the same key or key range.
		r.endCmd(cmdKey, args, err, false /* !readOnly */)
		return err

	if wait {
		return completionFunc()
	go func() {
		// If the original client didn't wait (e.g. resolve write intent),
		// log execution errors so they're surfaced somewhere.
		if err := completionFunc(); err != nil {
			// TODO(tschottdorf): possible security risk to log args.
			log.Warningc(ctx, "async execution of %v failed: %s", args, err)
	return nil
Example #25
// resetClientCmdID sets the client command ID if the call is for a
// read-write method. The client command ID provides idempotency
// protection in conjunction with the server.
// The new ID combines the current wall time in nanoseconds with a
// random 63-bit integer and overwrites any ID already in the header.
// NOTE(review): the visible body assigns the ID unconditionally; the
// "read-write method" restriction is presumably enforced by callers —
// confirm.
func resetClientCmdID(args proto.Request) {
	args.Header().CmdID = proto.ClientCmdID{
		WallTime: time.Now().UnixNano(),
		Random:   rand.Int63(),
Example #26
// applyRaftCommandInBatch executes the command in a batch engine and
// returns the batch containing the results. The caller is responsible
// for committing the batch, even on error.
//
// A batch is returned on every path. The reply is nil when the leader
// lease check fails, comes from the response cache when a cached entry
// exists for a write, and otherwise comes from executeCmd. ms receives
// the MVCC stats changes, which are merged into the batch only for
// successful writes.
func (r *Range) applyRaftCommandInBatch(ctx context.Context, index uint64, originNode proto.RaftNodeID,
	args proto.Request, ms *engine.MVCCStats) (engine.Engine, proto.Response, error) {
	// Create a new batch for the command to ensure all or nothing semantics.
	batch := r.rm.Engine().NewBatch()

	if lease := r.getLease(); args.Method() != proto.InternalLeaderLease &&
		(!lease.OwnedBy(originNode) || !lease.Covers(args.Header().Timestamp)) {
		// Verify the leader lease is held, unless this command is trying to
		// obtain it. Any other Raft command has had the leader lease held
		// by the replica at proposal time, but this may no longer be the case.
		// Corruption aside, the most likely reason is a leadership change (the
		// most recent leader assumes responsibility for all past timestamps as
		// well). In that case, it's not valid to go ahead with the execution:
		// Writes must be aware of the last time the mutated key was read, and
		// since reads are served locally by the lease holder without going
		// through Raft, a read which was not taken into account may have been
		// served. Hence, we must retry at the current leader.
		// It's crucial that we don't update the response cache for the error
		// returned below since the request is going to be retried with the
		// same ClientCmdID and would get the distributed sender stuck in an
		// infinite loop, retrieving a stale NotLeaderError over and over
		// again, even when proposing at the correct replica.
		return batch, nil, r.newNotLeaderError(lease, originNode)

	// Check the response cache to ensure idempotency.
	if proto.IsWrite(args) {
		if reply, err := r.respCache.GetResponse(batch, args.Header().CmdID); err != nil {
			// Any error encountered while fetching the response cache entry means corruption.
			return batch, reply, newReplicaCorruptionError(util.Errorf("could not read from response cache"), err)
		} else if reply != nil {
			if log.V(1) {
				log.Infoc(ctx, "found response cache entry for %+v", args.Header().CmdID)
			// TODO(tamird): move this into the response cache itself
			// The deferred reset runs after the return value below has been
			// evaluated, so the cached error is returned as a Go error and
			// then cleared from the reply header.
			defer func() { reply.Header().Error = nil }()
			// We successfully read from the response cache, so return whatever error
			// was present in the cached entry (if any).
			return batch, reply, reply.Header().GoError()

	// Execute the command.
	reply, intents, rErr := r.executeCmd(batch, ms, args)
	// Regardless of error, add result to the response cache if this is
	// a write method. This must be done as part of the execution of
	// raft commands so that every replica maintains the same responses
	// to continue request idempotence, even if leadership changes.
	if proto.IsWrite(args) {
		if rErr == nil {
			// If command was successful, flush the MVCC stats to the batch.
			if err := r.stats.MergeMVCCStats(batch, ms, args.Header().Timestamp.WallTime); err != nil {
				log.Fatalc(ctx, "setting mvcc stats in a batch should never fail: %s", err)
		} else {
			// Otherwise, reset the batch to clear out partial execution and
			// prepare for the failed response cache entry.
			batch = r.rm.Engine().NewBatch()
		// TODO(tamird): move this into the response cache itself
		if reply == nil {
			reply = args.CreateReply()
		// Invariant check: the reply header must not already carry an
		// error at this point.
		if reply.Header().Error != nil {
			panic("the world is on fire")
		if err := r.respCache.PutResponse(batch, args.Header().CmdID, reply); err != nil {
			log.Fatalc(ctx, "putting a response cache entry in a batch should never fail: %s", err)
		reply.Header().Error = nil

	// If the execution of the command wasn't successful, stop here.
	if rErr != nil {
		return batch, reply, rErr

	// On success and only on the replica on which this command originated,
	// resolve skipped intents asynchronously.
	if originNode == r.rm.RaftNodeID() {
		r.handleSkippedIntents(args, intents)

	return batch, reply, nil
Example #27
// addWriteCmd first adds the keys affected by this command as pending writes
// to the command queue. Next, the timestamp cache is checked to determine if
// any newer accesses to this command's affected keys have been made. If so,
// the command's timestamp is moved forward. Finally, the command is submitted
// to Raft. Upon completion, the write is removed from the read queue and any
// error returned. If a WaitGroup is supplied, it is signaled when the command
// enters Raft or the function returns with a preprocessing error, whichever
// happens earlier.
//
// The returned reply is nil on every early (preprocessing) error and when
// the Raft proposal itself fails; otherwise it is the reply delivered on
// pendingCmd.done together with its error.
func (r *Range) addWriteCmd(ctx context.Context, args proto.Request, wg *sync.WaitGroup) (proto.Response, error) {
	// signal clears wg after its first use, so later invocations (including
	// the deferred one) do nothing.
	// NOTE(review): no wg.Done() call is visible in this closure — confirm
	// that the WaitGroup is actually signaled before it is cleared.
	signal := func() {
		if wg != nil {
			wg = nil

	// This happens more eagerly below, but it's important to guarantee that
	// early returns do not skip this.
	defer signal()

	header := args.Header()

	if err := r.checkCmdHeader(args.Header()); err != nil {
		return nil, err

	trace := tracer.FromCtx(ctx)

	// Add the write to the command queue to gate subsequent overlapping
	// Commands until this command completes. Note that this must be
	// done before getting the max timestamp for the key(s), as
	// timestamp cache is only updated after preceding commands have
	// been run to successful completion.
	// NOTE(review): qDone has no visible call site in this excerpt —
	// confirm the "command queue" epoch is closed after beginCmd.
	qDone := trace.Epoch("command queue")
	cmdKey := r.beginCmd(header, false)

	// This replica must have leader lease to process a write.
	if err := r.redirectOnOrAcquireLeaderLease(trace, header.Timestamp); err != nil {
		r.endCmd(cmdKey, args, err, false /* !readOnly */)
		return nil, err

	// Two important invariants of Cockroach: 1) encountering a more
	// recently written value means transaction restart. 2) values must
	// be written with a greater timestamp than the most recent read to
	// the same key. Check the timestamp cache for reads/writes which
	// are at least as recent as the timestamp of this write. For
	// writes, send WriteTooOldError; for reads, update the write's
	// timestamp. When the write returns, the updated timestamp will
	// inform the final commit timestamp.
	if usesTimestampCache(args) {
		rTS, wTS := r.tsCache.GetMax(header.Key, header.EndKey, header.Txn.GetID())

		// Always push the timestamp forward if there's been a read which
		// occurred after our txn timestamp.
		if !rTS.Less(header.Timestamp) {
			header.Timestamp = rTS.Next()
		// If there's a newer write timestamp...
		if !wTS.Less(header.Timestamp) {
			// If we're in a txn, we still go ahead and try the write since
			// we want to avoid restarting the transaction in the event that
			// there isn't an intent or the intent can be pushed by us.
			// If we're not in a txn, it's trivial to just advance our timestamp.
			if header.Txn == nil {
				header.Timestamp = wTS.Next()

	// The "raft" epoch is closed when this function returns.
	defer trace.Epoch("raft")()

	errChan, pendingCmd := r.proposeRaftCommand(ctx, args)


	// First wait for raft to commit or abort the command.
	var err error
	var reply proto.Response
	if err = <-errChan; err == nil {
		// Next if the command was committed, wait for the range to apply it.
		respWithErr := <-pendingCmd.done
		reply, err = respWithErr.reply, respWithErr.err
	} else if err == multiraft.ErrGroupDeleted {
		// This error needs to be converted appropriately so that
		// clients will retry.
		err = proto.NewRangeNotFoundError(r.Desc().RaftID)
	// As for reads, update timestamp cache with the timestamp
	// of this write on success. This ensures a strictly higher
	// timestamp for successive writes to the same key or key range.
	r.endCmd(cmdKey, args, err, false /* !readOnly */)
	return reply, err
Example #28
// verifyPermissions verifies that the requesting user (header.User)
// has permission to read/write (capabilities depend on method
// name). In the event that multiple permission configs apply to the
// key range implicated by the command, the lowest common denominator
// for permission applies. For example, if a scan crosses two permission
// configs, both configs must allow read permissions or the entire
// scan will fail.
//
// It returns nil when the request may proceed and an error describing
// the refusal otherwise; an error is also returned when the permission
// configs cannot be obtained from gossip.
func (ds *DistSender) verifyPermissions(args proto.Request) error {
	// The root user can always proceed.
	header := args.Header()
	if header.User == storage.UserRoot {
		return nil
	// Check for admin methods.
	if proto.IsAdmin(args) {
		// NOTE(review): the root user has already returned above, so this
		// condition presumably always holds here — confirm.
		if header.User != storage.UserRoot {
			return util.Errorf("user %q cannot invoke admin command %s", header.User, args.Method())
		return nil
	// Get permissions map from gossip.
	configMap, err := ds.gossip.GetInfo(gossip.KeyConfigPermission)
	if err != nil {
		// The underlying gossip error is not included in the message.
		return util.Errorf("permissions not available via gossip")
	if configMap == nil {
		return util.Errorf("perm configs not available; cannot execute %s", args.Method())
	permMap := configMap.(storage.PrefixConfigMap)
	// A request without an end key is treated as covering just its start key.
	headerEnd := header.EndKey
	if len(headerEnd) == 0 {
		headerEnd = header.Key
	// Visit PermConfig(s) which apply to the method's key range.
	//   - For each perm config which the range covers, verify read or writes
	//     are allowed as method requires.
	//   - Verify the permissions hierarchically; that is, if permissions aren't
	//     granted at the longest prefix, try next longest, then next, etc., up
	//     to and including the default prefix.
	// TODO(spencer): it might make sense to visit prefixes from the
	//   shortest to longest instead for performance. Keep an eye on profiling
	//   for this code path as permission sets grow large.
	return permMap.VisitPrefixes(header.Key, headerEnd,
		func(start, end proto.Key, config interface{}) (bool, error) {
			// hasPerm records whether any config in the hierarchy for this
			// prefix granted the required access.
			hasPerm := false
			if err := permMap.VisitPrefixesHierarchically(start, func(start, end proto.Key, config interface{}) (bool, error) {
				perm := config.(*proto.PermConfig)
				// Returning (false, nil) leaves hasPerm unset and lets the
				// hierarchical visit move on to the next config.
				if proto.IsRead(args) && !perm.CanRead(header.User) {
					return false, nil
				if proto.IsWrite(args) && !perm.CanWrite(header.User) {
					return false, nil
				// Return done = true, as permissions have been granted by this config.
				hasPerm = true
				return true, nil
			}); err != nil {
				return false, err
			if !hasPerm {
				// Report a single key for point requests and the key span
				// otherwise.
				if len(header.EndKey) == 0 {
					return false, util.Errorf("user %q cannot invoke %s at %q", header.User, args.Method(), start)
				return false, util.Errorf("user %q cannot invoke %s at %q-%q", header.User, args.Method(), start, end)
			return false, nil
Example #29
// sendRPC sends one or more RPCs to replicas from the supplied proto.Replica
// slice. First, replicas which have gossiped addresses are corralled (and
// rearranged depending on proximity and whether the request needs to go to a
// leader) and then sent via rpc.Send, with requirement that one RPC to a
// server must succeed. Returns an RPC error if the request could not be sent.
// Note that the reply may contain a higher level error and must be checked in
// addition to the RPC error.
//
// raftID is written into the request header before sending, order is the
// replica ordering policy passed through in the RPC options, and reply is
// the caller's response object.
func (ds *DistSender) sendRPC(raftID proto.RaftID, replicas replicaSlice, order rpc.OrderingPolicy,
	args proto.Request, reply proto.Response) error {
	if len(replicas) == 0 {
		return util.Errorf("%s: replicas set is empty", args.Method())

	// Build a slice of replica addresses (if gossiped).
	// replicaMap maps each address string back to its replica so getArgs can
	// fill in the header for whichever address is being contacted.
	var addrs []net.Addr
	replicaMap := map[string]*proto.Replica{}
	for i := range replicas {
		nd := &replicas[i].NodeDesc
		addr := util.MakeUnresolvedAddr(nd.Address.Network, nd.Address.Address)
		addrs = append(addrs, addr)
		replicaMap[addr.String()] = &replicas[i].Replica
	if len(addrs) == 0 {
		return noNodeAddrsAvailError{}

	// TODO(pmattis): This needs to be tested. If it isn't set we'll
	// still route the request appropriately by key, but won't receive
	// RangeNotFoundErrors.
	args.Header().RaftID = raftID

	// Set RPC opts with stipulation that one of N RPCs must succeed.
	rpcOpts := rpc.Options{
		N:               1,
		Ordering:        order,
		SendNextTimeout: defaultSendNextTimeout,
		Timeout:         defaultRPCTimeout,
	// getArgs clones the arguments on demand for all but the first replica.
	firstArgs := true
	getArgs := func(addr net.Addr) interface{} {
		var a proto.Request
		// Use the supplied args proto if this is our first address.
		if firstArgs {
			firstArgs = false
			a = args
		} else {
			// Otherwise, copy the args value and set the replica in the header.
			a = gogoproto.Clone(args).(proto.Request)
		a.Header().Replica = *replicaMap[addr.String()]
		return a
	// RPCs are sent asynchronously and there is no synchronized access to
	// the reply object, so we don't pass the reply itself to rpcSend.
	// Otherwise there may be a race:
	// if the RPC call times out while using our original reply object,
	// we must not use it any more; the RPC call might still return
	// and write to it at any time.
	// args.CreateReply() should be cheaper than gogoproto.Clone, which uses
	// reflection.
	getReply := func() interface{} {
		return args.CreateReply()

	replies, err := ds.rpcSend(rpcOpts, "Node."+args.Method().String(),
		addrs, getArgs, getReply, ds.gossip.RPCContext)
	if err == nil {
		// Set content of replies[0] back to reply
		// NOTE(review): only the destination value is computed in the lines
		// visible here; confirm that replies[0] is actually copied into dst.
		dst := reflect.ValueOf(reply).Elem()

	return err
Example #30
// resolveIntents resolves the given intents. For those which are local to the
// range, we submit directly to the range-local Raft instance; the call returns
// as soon as all resolve commands have been **proposed** (not executed). This
// ensures that if a waiting client retries immediately after conflict
// resolution, it will not hit the same intents again. All non-local intents
// are resolved asynchronously in a batch.
// TODO(tschottdorf): once Txn records have a list of possibly open intents,
// resolveIntents should send an RPC to update the transaction(s) as well (for
// those intents with non-pending Txns).
func (r *Replica) resolveIntents(ctx context.Context, intents []proto.Intent) {
	trace := tracer.FromCtx(ctx)
	tracer.ToCtx(ctx, nil) // we're doing async stuff below; those need new traces
	trace.Event("resolving intents [async]")
	var wg sync.WaitGroup

	bArgs := &proto.BatchRequest{}
	bArgs.User = security.RootUser
	for i := range intents {
		intent := intents[i] // avoids a race in `i, intent := range ...`
		var resolveArgs proto.Request
		var local bool // whether this intent lives on this Range
			header := proto.RequestHeader{
				// Use the pushee's timestamp, which might be lower than the
				// pusher's request timestamp. No need to push the intent higher
				// than the pushee's txn!
				Timestamp: intent.Txn.Timestamp,
				Key:       intent.Key,
				EndKey:    intent.EndKey,
				User:      security.RootUser,
				Txn:       &intent.Txn,

			if len(intent.EndKey) == 0 {
				resolveArgs = &proto.ResolveIntentRequest{RequestHeader: header}
				local = r.ContainsKey(intent.Key)
			} else {
				resolveArgs = &proto.ResolveIntentRangeRequest{RequestHeader: header}
				local = r.ContainsKeyRange(intent.Key, intent.EndKey)

		// If the intent isn't (completely) local, we'll need to send an external request.
		// We'll batch them all up and send at the end.
		if !local {

		// If it is local, it goes directly into Raft.
		// TODO(tschottdorf): this may be premature optimization. Consider just
		// treating everything as an external request. This means having to
		// wait for complete execution of the command (whereas now we just wait
		// for proposition) and some more overhead sending things around.
		action := func() {
			// Trace this under the ID of the intent owner.
			ctx := tracer.ToCtx(ctx, r.rm.Tracer().NewTrace(resolveArgs.Header().Txn))
			if _, err := r.addWriteCmd(ctx, resolveArgs, &wg); err != nil && log.V(1) {
				log.Warningc(ctx, "resolve for key %s failed: %s", intent.Key, err)
		if !r.rm.Stopper().RunAsyncTask(action) {
			// Still run the task. Our caller already has a task and going async
			// here again is merely for performance, but some intents need to
			// be resolved because they might block other tasks. See #1684.
			// Note that handleSkippedIntents has a TODO in case #1684 comes
			// back.
	// Resolve all of the intents which aren't local to the Range. This is a
	// no-op if all are local.
	b := &client.Batch{}
	b.InternalAddCall(proto.Call{Args: bArgs, Reply: &proto.BatchResponse{}})
	action := func() {
		// TODO(tschottdorf): no tracing here yet. Probably useful at some point,
		// but needs a) the corresponding interface and b) facilities for tracing
		// multiple tracees at the same time (batch full of possibly individual
		// txns).
		if err := r.rm.DB().Run(b); err != nil {
			if log.V(1) {
				log.Infoc(ctx, "%s", err)
	if !r.rm.Stopper().RunAsyncTask(action) {
		// As with local intents, try async to not keep the caller waiting, but
		// when draining just go ahead and do it synchronously. See #1684.

	// Wait until all the local `ResolveIntent`s have been submitted to raft.
	// No-op if all were external.