// usesTimestampCache returns true if the request affects or is
// affected by the timestamp cache.
func usesTimestampCache(r proto.Request) bool {
	m := r.Method()
	if m < 0 || m >= proto.Method(len(tsCacheMethods)) {
		return false
	}
	return tsCacheMethods[m]
}
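// A minimal standalone sketch of the bounds-checked lookup-table pattern
// used above. The Method type, the method constants, and the tsCacheMethods
// table here are simplified stand-ins for illustration, not the actual
// storage package definitions.
package main

import "fmt"

type Method int

const (
	Get Method = iota
	Put
	InternalGC
	numMethods
)

// tsCacheMethods is indexed by Method; a true entry means the method
// interacts with the timestamp cache.
var tsCacheMethods = [numMethods]bool{
	Get: true,
	Put: true,
	// InternalGC deliberately left false in this sketch.
}

func usesTimestampCache(m Method) bool {
	// Guard against methods outside the table (e.g. a method this
	// binary doesn't know about).
	if m < 0 || m >= Method(len(tsCacheMethods)) {
		return false
	}
	return tsCacheMethods[m]
}

func main() {
	fmt.Println(usesTimestampCache(Put))        // true
	fmt.Println(usesTimestampCache(InternalGC)) // false
	fmt.Println(usesTimestampCache(Method(99))) // false: out of range
}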
// applyRaftCommand applies a raft command from the replicated log to the
// underlying state machine (i.e. the engine).
// When certain critical operations fail, a replicaCorruptionError may be
// returned and must be handled by the caller.
func (r *Range) applyRaftCommand(ctx context.Context, index uint64, originNode proto.RaftNodeID, args proto.Request, reply proto.Response) error {
	if index <= 0 {
		log.Fatalc(ctx, "raft command index is <= 0")
	}

	// If we have an out of order index, there's corruption. No sense in trying
	// to update anything or run the command. Simply return a corruption error.
	if oldIndex := atomic.LoadUint64(&r.appliedIndex); oldIndex >= index {
		return newReplicaCorruptionError(util.Errorf("applied index moved backwards: %d >= %d", oldIndex, index))
	}

	// Call the helper, which returns a batch containing data written
	// during command execution and any associated error.
	ms := engine.MVCCStats{}
	batch, rErr := r.applyRaftCommandInBatch(ctx, index, originNode, args, reply, &ms)
	// ALWAYS set the reply header error to the error returned by the
	// helper. This is the definitive result of the execution. The
	// error must be set before saving to the response cache.
	// TODO(tschottdorf,tamird) For #1400, want to refactor executeCmd to not
	// touch the reply header's error field.
	reply.Header().SetGoError(rErr)
	defer batch.Close()

	// Advance the last applied index and commit the batch.
	if err := setAppliedIndex(batch, r.Desc().RaftID, index); err != nil {
		log.Fatalc(ctx, "setting applied index in a batch should never fail: %s", err)
	}
	if err := batch.Commit(); err != nil {
		rErr = newReplicaCorruptionError(util.Errorf("could not commit batch"), err, rErr)
	} else {
		// Update cached appliedIndex if we were able to set the applied index on disk.
		atomic.StoreUint64(&r.appliedIndex, index)
	}

	// On successful write commands, flush to event feed, and handle other
	// write-related triggers including splitting and config gossip updates.
	if rErr == nil && proto.IsWrite(args) {
		// Publish update to event feed.
		r.rm.EventFeed().updateRange(r, args.Method(), &ms)
		// If the commit succeeded, potentially add range to split queue.
		r.maybeAddToSplitQueue()
		// Maybe update gossip configs on a put.
		switch args.(type) {
		case *proto.PutRequest, *proto.DeleteRequest, *proto.DeleteRangeRequest:
			if key := args.Header().Key; key.Less(keys.SystemMax) {
				// We hold the lock already.
				r.maybeGossipConfigsLocked(func(configPrefix proto.Key) bool {
					return bytes.HasPrefix(key, configPrefix)
				})
			}
		}
	}
	return rErr
}
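// newReplicaCorruptionError above is called with a variable number of
// errors, some of which may be nil (note the trailing rErr passed
// unconditionally). A minimal sketch of how such a variadic error-chaining
// constructor could look; this is an illustrative stand-in, not the actual
// storage package implementation.
package main

import (
	"errors"
	"fmt"
	"strings"
)

type replicaCorruptionError struct {
	errs []error
}

// newReplicaCorruptionError chains the given errors, skipping nil entries,
// so callers can pass a possibly-nil prior error without checking it.
func newReplicaCorruptionError(errs ...error) *replicaCorruptionError {
	e := &replicaCorruptionError{}
	for _, err := range errs {
		if err != nil {
			e.errs = append(e.errs, err)
		}
	}
	return e
}

func (e *replicaCorruptionError) Error() string {
	parts := make([]string, len(e.errs))
	for i, err := range e.errs {
		parts[i] = err.Error()
	}
	return "replica corruption: " + strings.Join(parts, "; ")
}

func main() {
	var rErr error // a nil prior error is fine
	err := newReplicaCorruptionError(errors.New("could not commit batch"), errors.New("disk full"), rErr)
	fmt.Println(err) // replica corruption: could not commit batch; disk full
}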
// CallComplete is called by a node whenever it completes a request. This will
// publish an appropriate event to the feed based on the results of the call.
func (nef NodeEventFeed) CallComplete(args proto.Request, reply proto.Response) {
	if err := reply.Header().Error; err != nil &&
		err.CanRestartTransaction() == proto.TransactionRestart_ABORT {
		nef.f.Publish(&CallErrorEvent{
			NodeID: nef.id,
			Method: args.Method(),
		})
	} else {
		nef.f.Publish(&CallSuccessEvent{
			NodeID: nef.id,
			Method: args.Method(),
		})
	}
}
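// The branch above classifies a call as an error only when the response
// carries an error whose transaction-restart disposition is ABORT;
// retryable errors still count as successes for feed purposes. A minimal
// standalone sketch of that classification, with simplified stand-in types.
package main

import "fmt"

type TransactionRestart int

const (
	TransactionRestart_ABORT TransactionRestart = iota
	TransactionRestart_BACKOFF
	TransactionRestart_IMMEDIATE
)

type callError struct {
	restart TransactionRestart
}

// isCallError mirrors the feed's test: only a non-nil error with an ABORT
// disposition is published as a CallErrorEvent.
func isCallError(err *callError) bool {
	return err != nil && err.restart == TransactionRestart_ABORT
}

func main() {
	fmt.Println(isCallError(nil))                                    // false: success
	fmt.Println(isCallError(&callError{TransactionRestart_BACKOFF})) // false: retryable
	fmt.Println(isCallError(&callError{TransactionRestart_ABORT}))   // true: reported as error
}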
// applyRaftCommand applies a raft command from the replicated log to the
// underlying state machine (i.e. the engine).
// When certain critical operations fail, a replicaCorruptionError may be
// returned and must be handled by the caller.
func (r *Range) applyRaftCommand(ctx context.Context, index uint64, originNode proto.RaftNodeID, args proto.Request) (proto.Response, error) {
	if index <= 0 {
		log.Fatalc(ctx, "raft command index is <= 0")
	}

	// If we have an out of order index, there's corruption. No sense in trying
	// to update anything or run the command. Simply return a corruption error.
	if oldIndex := atomic.LoadUint64(&r.appliedIndex); oldIndex >= index {
		return nil, newReplicaCorruptionError(util.Errorf("applied index moved backwards: %d >= %d", oldIndex, index))
	}

	// Call the helper, which returns a batch containing data written
	// during command execution and any associated error.
	ms := engine.MVCCStats{}
	batch, reply, rErr := r.applyRaftCommandInBatch(ctx, index, originNode, args, &ms)
	defer batch.Close()

	// Advance the last applied index and commit the batch.
	if err := setAppliedIndex(batch, r.Desc().RaftID, index); err != nil {
		log.Fatalc(ctx, "setting applied index in a batch should never fail: %s", err)
	}
	if err := batch.Commit(); err != nil {
		rErr = newReplicaCorruptionError(util.Errorf("could not commit batch"), err, rErr)
	} else {
		// Update cached appliedIndex if we were able to set the applied index on disk.
		atomic.StoreUint64(&r.appliedIndex, index)
	}

	// On successful write commands, flush to event feed, and handle other
	// write-related triggers including splitting and config gossip updates.
	if rErr == nil && proto.IsWrite(args) {
		// Publish update to event feed.
		r.rm.EventFeed().updateRange(r, args.Method(), &ms)
		// If the commit succeeded, potentially add range to split queue.
		r.maybeAddToSplitQueue()
		// Maybe update gossip configs if the command is not part of a transaction.
		// If the command is part of an uncommitted transaction, we rely on the
		// periodic configGossipInterval loop since we will not see the update
		// until the transaction is committed.
		if key := args.Header().Key; key.Less(keys.SystemMax) && args.Header().Txn == nil {
			r.maybeGossipConfigs(func(configPrefix proto.Key) bool {
				return bytes.HasPrefix(key, configPrefix)
			})
		}
	}
	return reply, rErr
}
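// The applied-index discipline above (atomic load to reject regressions,
// atomic store only after a durable commit) can be exercised in isolation.
// A minimal sketch, assuming a bare uint64 in place of the Range state.
package main

import (
	"fmt"
	"sync/atomic"
)

type appliedIndexGuard struct {
	appliedIndex uint64
}

// apply rejects any index at or below the last applied one; on success the
// caller "commits" and only then publishes the new index.
func (g *appliedIndexGuard) apply(index uint64) error {
	if old := atomic.LoadUint64(&g.appliedIndex); old >= index {
		return fmt.Errorf("applied index moved backwards: %d >= %d", old, index)
	}
	// ... write the batch and commit durably here ...
	atomic.StoreUint64(&g.appliedIndex, index)
	return nil
}

func main() {
	var g appliedIndexGuard
	fmt.Println(g.apply(1)) // <nil>
	fmt.Println(g.apply(2)) // <nil>
	fmt.Println(g.apply(2)) // applied index moved backwards: 2 >= 2
}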
// CallComplete is called by a node whenever it completes a request. This will
// publish an appropriate event to the feed based on the results of the call.
// TODO(tschottdorf): move to batch, account for multiple methods per batch.
// In particular, on error want an error position to identify the failed
// request.
func (nef NodeEventFeed) CallComplete(args proto.Request, reply proto.Response) {
	method := args.Method()
	if ba, ok := args.(*proto.BatchRequest); ok && len(ba.Requests) > 0 {
		method = ba.Requests[0].GetInner().Method()
	}
	if err := reply.Header().Error; err != nil &&
		err.TransactionRestart == proto.TransactionRestart_ABORT {
		nef.f.Publish(&CallErrorEvent{
			NodeID: nef.id,
			Method: method,
		})
	} else {
		nef.f.Publish(&CallSuccessEvent{
			NodeID: nef.id,
			Method: method,
		})
	}
}
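// The TODO above notes that a batch carries multiple methods while the feed
// reports only one; the interim code reports the first inner request's
// method. A minimal sketch of that unwrapping with simplified stand-in
// types, not the actual proto definitions.
package main

import "fmt"

type Method string

type Request interface{ Method() Method }

type getRequest struct{}

func (getRequest) Method() Method { return "Get" }

type batchRequest struct{ Requests []Request }

func (batchRequest) Method() Method { return "Batch" }

// reportedMethod mirrors CallComplete: for a non-empty batch, report the
// first inner request's method rather than the generic "Batch".
func reportedMethod(args Request) Method {
	if ba, ok := args.(batchRequest); ok && len(ba.Requests) > 0 {
		return ba.Requests[0].Method()
	}
	return args.Method()
}

func main() {
	fmt.Println(reportedMethod(getRequest{}))                          // Get
	fmt.Println(reportedMethod(batchRequest{[]Request{getRequest{}}})) // Get
	fmt.Println(reportedMethod(batchRequest{}))                        // Batch
}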
// sendRPC sends one or more RPCs to replicas from the supplied proto.Replica
// slice. First, replicas which have gossiped addresses are corralled (and
// rearranged depending on proximity and whether the request needs to go to a
// leader) and then sent via rpc.Send, with the requirement that one RPC to a
// server must succeed. Returns an RPC error if the request could not be sent.
// Note that the reply may contain a higher-level error and must be checked in
// addition to the RPC error.
func (ds *DistSender) sendRPC(raftID proto.RaftID, replicas replicaSlice, order rpc.OrderingPolicy, args proto.Request, reply proto.Response) error {
	if len(replicas) == 0 {
		return util.Errorf("%s: replicas set is empty", args.Method())
	}

	// Build a slice of replica addresses (if gossiped).
	var addrs []net.Addr
	replicaMap := map[string]*proto.Replica{}
	for i := range replicas {
		nd := &replicas[i].NodeDesc
		addr := util.MakeUnresolvedAddr(nd.Address.Network, nd.Address.Address)
		addrs = append(addrs, addr)
		replicaMap[addr.String()] = &replicas[i].Replica
	}
	if len(addrs) == 0 {
		return noNodeAddrsAvailError{}
	}

	// TODO(pmattis): This needs to be tested. If it isn't set we'll
	// still route the request appropriately by key, but won't receive
	// RangeNotFoundErrors.
	args.Header().RaftID = raftID

	// Set RPC opts with the stipulation that one of N RPCs must succeed.
	rpcOpts := rpc.Options{
		N:               1,
		Ordering:        order,
		SendNextTimeout: defaultSendNextTimeout,
		Timeout:         defaultRPCTimeout,
	}
	// getArgs clones the arguments on demand for all but the first replica.
	firstArgs := true
	getArgs := func(addr net.Addr) interface{} {
		var a proto.Request
		// Use the supplied args proto if this is our first address.
		if firstArgs {
			firstArgs = false
			a = args
		} else {
			// Otherwise, copy the args value and set the replica in the header.
			a = gogoproto.Clone(args).(proto.Request)
		}
		a.Header().Replica = *replicaMap[addr.String()]
		return a
	}
	// RPCs are sent asynchronously and there is no synchronized access to
	// the reply object, so we don't pass the reply itself to rpcSend.
	// Otherwise there may be a race: if the RPC call times out using our
	// original reply object, we must not use it any more; the RPC call
	// might still return and write to it at any time.
	// args.CreateReply() should be cheaper than gogoproto.Clone, which
	// uses reflection.
	getReply := func() interface{} {
		return args.CreateReply()
	}
	replies, err := ds.rpcSend(rpcOpts, "Node."+args.Method().String(), addrs, getArgs, getReply, ds.gossip.RPCContext)
	if err == nil {
		// Copy the contents of replies[0] back into reply.
		dst := reflect.ValueOf(reply).Elem()
		dst.Set(reflect.ValueOf(replies[0]).Elem())
	}
	return err
}
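// getArgs above hands the original args to exactly one replica and clones
// for the rest, saving a copy in the common single-RPC case. A minimal
// standalone sketch of that clone-on-demand closure, using a plain struct
// copy in place of gogoproto.Clone.
package main

import "fmt"

type request struct {
	replica string
}

func main() {
	args := &request{}
	firstArgs := true
	getArgs := func(replica string) *request {
		var a *request
		if firstArgs {
			// The first caller gets the original.
			firstArgs = false
			a = args
		} else {
			// Later callers get an independent copy, so concurrent sends
			// can't stomp on each other's headers.
			cp := *args
			a = &cp
		}
		a.replica = replica
		return a
	}
	a1, a2 := getArgs("n1"), getArgs("n2")
	fmt.Println(a1 == args, a2 == args) // true false
	fmt.Println(a1.replica, a2.replica) // n1 n2
}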
// verifyPermissions verifies that the requesting user (header.User)
// has permission to read/write (capabilities depend on method
// name). In the event that multiple permission configs apply to the
// key range implicated by the command, the lowest common denominator
// for permissions is used. For example, if a scan crosses two permission
// configs, both configs must allow read permissions or the entire
// scan will fail.
func (ds *DistSender) verifyPermissions(args proto.Request) error {
	// The root user can always proceed.
	header := args.Header()
	if header.User == storage.UserRoot {
		return nil
	}
	// Check for admin methods.
	if proto.IsAdmin(args) {
		if header.User != storage.UserRoot {
			return util.Errorf("user %q cannot invoke admin command %s", header.User, args.Method())
		}
		return nil
	}
	// Get permissions map from gossip.
	configMap, err := ds.gossip.GetInfo(gossip.KeyConfigPermission)
	if err != nil {
		return util.Errorf("permissions not available via gossip")
	}
	if configMap == nil {
		return util.Errorf("perm configs not available; cannot execute %s", args.Method())
	}
	permMap := configMap.(storage.PrefixConfigMap)
	headerEnd := header.EndKey
	if len(headerEnd) == 0 {
		headerEnd = header.Key
	}
	// Visit PermConfig(s) which apply to the method's key range.
	//   - For each perm config which the range covers, verify read or writes
	//     are allowed as method requires.
	//   - Verify the permissions hierarchically; that is, if permissions aren't
	//     granted at the longest prefix, try next longest, then next, etc., up
	//     to and including the default prefix.
	//
	// TODO(spencer): it might make sense to visit prefixes from the
	// shortest to longest instead for performance. Keep an eye on profiling
	// for this code path as permission sets grow large.
	return permMap.VisitPrefixes(header.Key, headerEnd,
		func(start, end proto.Key, config interface{}) (bool, error) {
			hasPerm := false
			if err := permMap.VisitPrefixesHierarchically(start, func(start, end proto.Key, config interface{}) (bool, error) {
				perm := config.(*proto.PermConfig)
				if proto.IsRead(args) && !perm.CanRead(header.User) {
					return false, nil
				}
				if proto.IsWrite(args) && !perm.CanWrite(header.User) {
					return false, nil
				}
				// Return done = true, as permissions have been granted by this config.
				hasPerm = true
				return true, nil
			}); err != nil {
				return false, err
			}
			if !hasPerm {
				if len(header.EndKey) == 0 {
					return false, util.Errorf("user %q cannot invoke %s at %q", header.User, args.Method(), start)
				}
				return false, util.Errorf("user %q cannot invoke %s at %q-%q", header.User, args.Method(), start, end)
			}
			return false, nil
		})
}
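// The hierarchical check above falls back from the longest matching key
// prefix toward the default (empty) prefix until some config grants the
// needed permission. A minimal sketch of that walk over a plain map with
// hypothetical config data; the real PrefixConfigMap is considerably more
// involved.
package main

import "fmt"

// canRead maps a key prefix to the users allowed to read under it.
var canRead = map[string]map[string]bool{
	"":        {"root": true},
	"db1":     {"root": true, "alice": true},
	"db1/tbl": {"root": true}, // alice not granted here...
}

// readAllowed walks prefixes of key from longest to shortest, including the
// default "" prefix, and succeeds on the first grant found.
func readAllowed(user, key string) bool {
	for l := len(key); l >= 0; l-- {
		if users, ok := canRead[key[:l]]; ok && users[user] {
			return true
		}
	}
	return false
}

func main() {
	fmt.Println(readAllowed("alice", "db1/tbl/k")) // true: ...but granted at "db1"
	fmt.Println(readAllowed("bob", "db1/tbl/k"))   // false: no grant at any prefix
}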
// applyRaftCommand applies a raft command from the replicated log to the
// underlying state machine (i.e. the engine).
// When certain critical operations fail, a replicaCorruptionError may be
// returned and must be handled by the caller.
func (r *Range) applyRaftCommand(ctx context.Context, index uint64, originNode proto.RaftNodeID, args proto.Request, reply proto.Response) (rErr error) {
	if index <= 0 {
		log.Fatalc(ctx, "raft command index is <= 0")
	}

	committed := false
	// The very last thing we do before returning is move the applied index
	// forward, unless that has already happened as part of a successfully
	// committed batch.
	defer func() {
		if !committed {
			// We didn't commit the batch, but advance the last applied index nonetheless.
			if err := setAppliedIndex(r.rm.Engine(), r.Desc().RaftID, index); err != nil {
				rErr = newReplicaCorruptionError(
					util.Errorf("could not advance applied index"), err, rErr)
				return
			}
			atomic.StoreUint64(&r.appliedIndex, index)
		}
	}()

	if lease := r.getLease(); args.Method() != proto.InternalLeaderLease &&
		(!lease.OwnedBy(originNode) || !lease.Covers(args.Header().Timestamp)) {
		// Verify the leader lease is held, unless this command is trying to
		// obtain it. Any other Raft command has had the leader lease held
		// by the replica at proposal time, but this may no longer be the case.
		// Corruption aside, the most likely reason is a leadership change (the
		// most recent leader assumes responsibility for all past timestamps as
		// well). In that case, it's not valid to go ahead with the execution:
		// Writes must be aware of the last time the mutated key was read, and
		// since reads are served locally by the lease holder without going
		// through Raft, a read which was not taken into account may have been
		// served. Hence, we must retry at the current leader.
		//
		// It's crucial that we don't update the response cache for the error
		// returned below since the request is going to be retried with the
		// same ClientCmdID and would get the distributed sender stuck in an
		// infinite loop, retrieving a stale NotLeaderError over and over
		// again, even when proposing at the correct replica.
		return r.newNotLeaderError(lease)
	}

	// Anything happening from now on needs to enter the response cache.
	defer func() {
		// TODO(tamird,tschottdorf): according to #1400 we intend to set the reply
		// header's error as late as possible and in a central location. Range
		// commands still write to the header directly, but once they don't this
		// could be the authoritative location that sets the reply error for any-
		// thing that makes it into Raft. Note that we must set this prior to
		// signaling cmd.done below, or the waiting RPC handler might proceed
		// before we've updated its reply.
		//
		// It is important that the error is set before the reply is saved into
		// the response cache.
		reply.Header().SetGoError(rErr)

		if proto.IsWrite(args) {
			// No matter the result, add result to the response cache if this
			// is a write method. This must be done as part of the execution of
			// raft commands so that every replica maintains the same responses
			// to continue request idempotence, even if leadership changes.
			if err := r.respCache.PutResponse(args.Header().CmdID, reply); err != nil {
				rErr = newReplicaCorruptionError(
					util.Errorf("could not put to response cache"), err, rErr)
				return
			}
		}
	}()

	header := args.Header()

	// Check the response cache to ensure idempotency.
	if proto.IsWrite(args) {
		if ok, err := r.respCache.GetResponse(header.CmdID, reply); ok && err == nil {
			if log.V(1) {
				log.Infoc(ctx, "found response cache entry for %+v", args.Header().CmdID)
			}
			return err
		} else if ok && err != nil {
			return newReplicaCorruptionError(
				util.Errorf("could not read from response cache"), err)
		}
	}

	// Create a new batch for the command to ensure all or nothing semantics.
	batch := r.rm.Engine().NewBatch()
	defer batch.Close()

	// Create an engine.MVCCStats instance.
	ms := engine.MVCCStats{}

	// Execute the command; the error will also be set in the reply header.
	// TODO(tschottdorf,tamird) For #1400, want to refactor executeCmd to not
	// touch the reply header's error field.
	intents, err := r.executeCmd(batch, &ms, args, reply)
	// If the execution of the command wasn't successful, stop here.
	if err != nil {
		return err
	}

	if oldIndex := atomic.LoadUint64(&r.appliedIndex); oldIndex >= index {
		return newReplicaCorruptionError(
			util.Errorf("applied index moved backwards: %d >= %d", oldIndex, index))
	}

	// Advance the applied index atomically within the batch.
	if err := setAppliedIndex(batch, r.Desc().RaftID, index); err != nil {
		return newReplicaCorruptionError(
			util.Errorf("could not update applied index"), err)
	}

	if proto.IsWrite(args) {
		// On success, flush the MVCC stats to the batch and commit.
		if err := r.stats.MergeMVCCStats(batch, &ms, header.Timestamp.WallTime); err != nil {
			return newReplicaCorruptionError(util.Errorf("could not merge MVCC stats"), err)
		}
		if err := batch.Commit(); err != nil {
			return newReplicaCorruptionError(util.Errorf("could not commit batch"), err)
		}
		committed = true

		// Publish update to event feed.
		r.rm.EventFeed().updateRange(r, args.Method(), &ms)

		// After successful commit, update cached stats and appliedIndex value.
		atomic.StoreUint64(&r.appliedIndex, index)

		// If the commit succeeded, potentially add range to split queue.
		r.maybeAddToSplitQueue()

		// Maybe update gossip configs on a put.
		switch args.(type) {
		case *proto.PutRequest, *proto.DeleteRequest, *proto.DeleteRangeRequest:
			if header.Key.Less(keys.SystemMax) {
				// We hold the lock already.
				r.maybeGossipConfigsLocked(func(configPrefix proto.Key) bool {
					return bytes.HasPrefix(header.Key, configPrefix)
				})
			}
		}
	}

	// On success and only on the replica on which this command originated,
	// resolve skipped intents asynchronously.
	if originNode == r.rm.RaftNodeID() {
		r.handleSkippedIntents(args, intents)
	}

	return nil
}
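// The response cache consulted above makes replayed commands idempotent: a
// command that already executed returns its recorded reply instead of
// running again. A minimal in-memory sketch, assuming a simple string
// command ID; the real cache is persisted alongside the range data.
package main

import "fmt"

type cachedReply struct {
	value string
}

type responseCache struct {
	m map[string]cachedReply
}

func (rc *responseCache) GetResponse(cmdID string) (cachedReply, bool) {
	r, ok := rc.m[cmdID]
	return r, ok
}

func (rc *responseCache) PutResponse(cmdID string, r cachedReply) {
	rc.m[cmdID] = r
}

func main() {
	rc := &responseCache{m: map[string]cachedReply{}}
	execute := func(cmdID, value string) string {
		// Check the cache first so retries of the same ClientCmdID observe
		// the original outcome rather than executing twice.
		if r, ok := rc.GetResponse(cmdID); ok {
			return r.value + " (cached)"
		}
		rc.PutResponse(cmdID, cachedReply{value: value})
		return value
	}
	fmt.Println(execute("cmd-1", "written"))  // written
	fmt.Println(execute("cmd-1", "written!")) // written (cached)
}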
// applyRaftCommandInBatch executes the command in a batch engine and
// returns the batch containing the results. The caller is responsible
// for committing the batch, even on error.
func (r *Range) applyRaftCommandInBatch(ctx context.Context, index uint64, originNode proto.RaftNodeID, args proto.Request, ms *engine.MVCCStats) (engine.Engine, proto.Response, error) {
	// Create a new batch for the command to ensure all or nothing semantics.
	batch := r.rm.Engine().NewBatch()

	if lease := r.getLease(); args.Method() != proto.InternalLeaderLease &&
		(!lease.OwnedBy(originNode) || !lease.Covers(args.Header().Timestamp)) {
		// Verify the leader lease is held, unless this command is trying to
		// obtain it. Any other Raft command has had the leader lease held
		// by the replica at proposal time, but this may no longer be the case.
		// Corruption aside, the most likely reason is a leadership change (the
		// most recent leader assumes responsibility for all past timestamps as
		// well). In that case, it's not valid to go ahead with the execution:
		// Writes must be aware of the last time the mutated key was read, and
		// since reads are served locally by the lease holder without going
		// through Raft, a read which was not taken into account may have been
		// served. Hence, we must retry at the current leader.
		//
		// It's crucial that we don't update the response cache for the error
		// returned below since the request is going to be retried with the
		// same ClientCmdID and would get the distributed sender stuck in an
		// infinite loop, retrieving a stale NotLeaderError over and over
		// again, even when proposing at the correct replica.
		return batch, nil, r.newNotLeaderError(lease, originNode)
	}

	// Check the response cache to ensure idempotency.
	if proto.IsWrite(args) {
		if reply, err := r.respCache.GetResponse(batch, args.Header().CmdID); err != nil {
			// Any error encountered while fetching the response cache entry means corruption.
			return batch, reply, newReplicaCorruptionError(util.Errorf("could not read from response cache"), err)
		} else if reply != nil {
			if log.V(1) {
				log.Infoc(ctx, "found response cache entry for %+v", args.Header().CmdID)
			}
			// TODO(tamird): move this into the response cache itself.
			defer func() { reply.Header().Error = nil }()
			// We successfully read from the response cache, so return whatever error
			// was present in the cached entry (if any).
			return batch, reply, reply.Header().GoError()
		}
	}

	// Execute the command.
	reply, intents, rErr := r.executeCmd(batch, ms, args)

	// Regardless of error, add result to the response cache if this is
	// a write method. This must be done as part of the execution of
	// raft commands so that every replica maintains the same responses
	// to continue request idempotence, even if leadership changes.
	if proto.IsWrite(args) {
		if rErr == nil {
			// If the command was successful, flush the MVCC stats to the batch.
			if err := r.stats.MergeMVCCStats(batch, ms, args.Header().Timestamp.WallTime); err != nil {
				log.Fatalc(ctx, "setting mvcc stats in a batch should never fail: %s", err)
			}
		} else {
			// Otherwise, reset the batch to clear out partial execution and
			// prepare for the failed response cache entry.
			batch.Close()
			batch = r.rm.Engine().NewBatch()
		}
		// TODO(tamird): move this into the response cache itself.
		if reply == nil {
			reply = args.CreateReply()
		}
		if reply.Header().Error != nil {
			panic("the world is on fire")
		}
		reply.Header().SetGoError(rErr)
		if err := r.respCache.PutResponse(batch, args.Header().CmdID, reply); err != nil {
			log.Fatalc(ctx, "putting a response cache entry in a batch should never fail: %s", err)
		}
		reply.Header().Error = nil
	}

	// If the execution of the command wasn't successful, stop here.
	if rErr != nil {
		return batch, reply, rErr
	}

	// On success and only on the replica on which this command originated,
	// resolve skipped intents asynchronously.
	if originNode == r.rm.RaftNodeID() {
		r.handleSkippedIntents(args, intents)
	}

	return batch, reply, nil
}
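// On a failed execution the code above discards the batch and opens a fresh
// one, so the response cache entry for the error becomes the batch's only
// write. A minimal sketch of that discard-and-restart pattern, assuming a
// toy batch that buffers writes until Commit.
package main

import "fmt"

type batch struct{ writes []string }

func (b *batch) Put(kv string)    { b.writes = append(b.writes, kv) }
func (b *batch) Close()           { b.writes = nil }
func (b *batch) Commit() []string { return b.writes }

func main() {
	b := &batch{}
	b.Put("a=1") // partial execution...
	execErr := fmt.Errorf("conditional put failed")
	if execErr != nil {
		// Discard partial writes; start over with only the cache entry.
		b.Close()
		b = &batch{}
	}
	b.Put("respCache[cmd-1]=error") // record the failure idempotently
	fmt.Println(b.Commit())         // [respCache[cmd-1]=error]
}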
// sendRPC sends one or more RPCs to replicas from the supplied proto.Replica
// slice. First, replicas which have gossiped addresses are corralled (and
// rearranged depending on proximity and whether the request needs to go to a
// leader) and then sent via rpc.Send, with the requirement that one RPC to a
// server must succeed. Returns an RPC error if the request could not be sent.
// Note that the reply may contain a higher-level error and must be checked in
// addition to the RPC error.
func (ds *DistSender) sendRPC(trace *tracer.Trace, rangeID proto.RangeID, replicas replicaSlice, order rpc.OrderingPolicy, args proto.Request) (proto.Response, error) {
	if len(replicas) == 0 {
		// TODO(tschottdorf): this gets in the way of some tests. Consider
		// refactoring so that gossip is mocked out more easily. Provisional
		// code. return nil, util.Errorf("%s: replicas set is empty",
		// args.Method())
	}

	// Build a slice of replica addresses (if gossiped).
	var addrs []net.Addr
	replicaMap := map[string]*proto.Replica{}
	for i := range replicas {
		addr := replicas[i].NodeDesc.Address
		addrs = append(addrs, addr)
		replicaMap[addr.String()] = &replicas[i].Replica
	}
	if len(addrs) == 0 {
		// TODO(tschottdorf): see len(replicas) above.
		// return nil, noNodeAddrsAvailError{}
	}

	// TODO(pmattis): This needs to be tested. If it isn't set we'll
	// still route the request appropriately by key, but won't receive
	// RangeNotFoundErrors.
	args.Header().RangeID = rangeID

	// Set RPC opts with the stipulation that one of N RPCs must succeed.
	rpcOpts := rpc.Options{
		N:               1,
		Ordering:        order,
		SendNextTimeout: defaultSendNextTimeout,
		Timeout:         defaultRPCTimeout,
		Trace:           trace,
	}
	// getArgs clones the arguments on demand for all but the first replica.
	firstArgs := true
	getArgs := func(addr net.Addr) gogoproto.Message {
		var a proto.Request
		// Use the supplied args proto if this is our first address.
		if firstArgs {
			firstArgs = false
			a = args
		} else {
			// Otherwise, copy the args value and set the replica in the header.
			a = gogoproto.Clone(args).(proto.Request)
		}
		if addr != nil {
			// TODO(tschottdorf): see len(replicas) above.
			a.Header().Replica = *replicaMap[addr.String()]
		}
		return a
	}
	// RPCs are sent asynchronously and there is no synchronized access to
	// the reply object, so we don't pass the reply itself to RPCSend.
	// Otherwise there may be a race: if the RPC call times out using our
	// original reply object, we must not use it any more; the RPC call
	// might still return and write to it at any time.
	// args.CreateReply() should be cheaper than gogoproto.Clone, which
	// uses reflection.
	getReply := func() gogoproto.Message {
		return args.CreateReply()
	}
	replies, err := ds.rpcSend(rpcOpts, "Node."+args.Method().String(), addrs, getArgs, getReply, ds.gossip.RPCContext)
	if err != nil {
		return nil, err
	}
	return replies[0].(proto.Response), nil
}
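// rpcOpts above asks for one success out of N replicas. A minimal sketch of
// the first-success fan-out over goroutines and a channel; the real
// rpc.Send additionally handles ordering, SendNextTimeout, and per-address
// error accounting.
package main

import "fmt"

func send(addrs []string, call func(string) (string, error)) (string, error) {
	type result struct {
		reply string
		err   error
	}
	ch := make(chan result, len(addrs))
	for _, addr := range addrs {
		go func(addr string) {
			reply, err := call(addr)
			ch <- result{reply, err}
		}(addr)
	}
	var lastErr error
	for range addrs {
		// The first successful reply wins; each failure is noted and the
		// next outstanding reply is awaited.
		r := <-ch
		if r.err == nil {
			return r.reply, nil
		}
		lastErr = r.err
	}
	return "", fmt.Errorf("all replicas failed: %v", lastErr)
}

func main() {
	reply, err := send([]string{"n1", "n2"}, func(addr string) (string, error) {
		if addr == "n1" {
			return "", fmt.Errorf("n1 unreachable")
		}
		return "ok from " + addr, nil
	})
	fmt.Println(reply, err) // ok from n2 <nil>
}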
// executeCmd switches over the method and multiplexes to execute the
// appropriate storage API command. It returns an error and, for some calls
// such as inconsistent reads, the intents they skipped.
func (r *Range) executeCmd(batch engine.Engine, ms *engine.MVCCStats, args proto.Request, reply proto.Response) ([]proto.Intent, error) {
	// Verify key is contained within range here to catch any range split
	// or merge activity.
	header := args.Header()
	if err := r.checkCmdHeader(header); err != nil {
		reply.Header().SetGoError(err)
		return nil, err
	}

	// If a unittest filter was installed, check for an injected error; otherwise, continue.
	if TestingCommandFilter != nil && TestingCommandFilter(args, reply) {
		return nil, reply.Header().GoError()
	}

	var intents []proto.Intent
	switch tArgs := args.(type) {
	case *proto.GetRequest:
		intents = r.Get(batch, tArgs, reply.(*proto.GetResponse))
	case *proto.PutRequest:
		r.Put(batch, ms, tArgs, reply.(*proto.PutResponse))
	case *proto.ConditionalPutRequest:
		r.ConditionalPut(batch, ms, tArgs, reply.(*proto.ConditionalPutResponse))
	case *proto.IncrementRequest:
		r.Increment(batch, ms, tArgs, reply.(*proto.IncrementResponse))
	case *proto.DeleteRequest:
		r.Delete(batch, ms, tArgs, reply.(*proto.DeleteResponse))
	case *proto.DeleteRangeRequest:
		r.DeleteRange(batch, ms, tArgs, reply.(*proto.DeleteRangeResponse))
	case *proto.ScanRequest:
		intents = r.Scan(batch, tArgs, reply.(*proto.ScanResponse))
	case *proto.EndTransactionRequest:
		r.EndTransaction(batch, ms, tArgs, reply.(*proto.EndTransactionResponse))
	case *proto.InternalRangeLookupRequest:
		intents = r.InternalRangeLookup(batch, tArgs, reply.(*proto.InternalRangeLookupResponse))
	case *proto.InternalHeartbeatTxnRequest:
		r.InternalHeartbeatTxn(batch, ms, tArgs, reply.(*proto.InternalHeartbeatTxnResponse))
	case *proto.InternalGCRequest:
		r.InternalGC(batch, ms, tArgs, reply.(*proto.InternalGCResponse))
	case *proto.InternalPushTxnRequest:
		r.InternalPushTxn(batch, ms, tArgs, reply.(*proto.InternalPushTxnResponse))
	case *proto.InternalResolveIntentRequest:
		r.InternalResolveIntent(batch, ms, tArgs, reply.(*proto.InternalResolveIntentResponse))
	case *proto.InternalResolveIntentRangeRequest:
		r.InternalResolveIntentRange(batch, ms, tArgs, reply.(*proto.InternalResolveIntentRangeResponse))
	case *proto.InternalMergeRequest:
		r.InternalMerge(batch, ms, tArgs, reply.(*proto.InternalMergeResponse))
	case *proto.InternalTruncateLogRequest:
		r.InternalTruncateLog(batch, ms, tArgs, reply.(*proto.InternalTruncateLogResponse))
	case *proto.InternalLeaderLeaseRequest:
		r.InternalLeaderLease(batch, ms, tArgs, reply.(*proto.InternalLeaderLeaseResponse))
	default:
		return nil, util.Errorf("unrecognized command %s", args.Method())
	}

	if log.V(2) {
		log.Infof("executed %s command %+v: %+v", args.Method(), args, reply)
	}

	// Update the node clock with the serviced request. This maintains a
	// high water mark for all ops serviced, so that received ops
	// without a timestamp specified are guaranteed one higher than any
	// op already executed for overlapping keys.
	r.rm.Clock().Update(header.Timestamp)

	// Propagate the request timestamp (which may have changed).
	reply.Header().Timestamp = header.Timestamp

	err := reply.Header().GoError()

	// A ReadWithinUncertaintyIntervalError contains the timestamp of the value
	// that provoked the conflict. However, we forward the timestamp to the
	// node's time here. The reason is that the caller (which is always
	// transactional when this error occurs) in our implementation wants to
	// use this information to extract a timestamp after which reads from
	// the nodes are causally consistent with the transaction. This allows
	// the node to be classified as without further uncertain reads for the
	// remainder of the transaction.
	// See the comment on proto.Transaction.CertainNodes.
	if tErr, ok := reply.Header().GoError().(*proto.ReadWithinUncertaintyIntervalError); ok && tErr != nil {
		// Note that we can use this node's clock (which may be different from
		// other replicas') because this error attaches the existing timestamp
		// to the node itself when retrying.
		tErr.ExistingTimestamp.Forward(r.rm.Clock().Now())
	}

	// Return the error (if any) set in the reply.
	return intents, err
}
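// Timestamp.Forward above acts as a monotonic ratchet: it moves a timestamp
// up to the given value but never back. A minimal sketch, assuming a
// wall-time-only timestamp; the real proto.Timestamp also carries a logical
// component.
package main

import "fmt"

type Timestamp struct {
	WallTime int64
}

// Forward advances t to s if s is larger, leaving t untouched otherwise.
func (t *Timestamp) Forward(s Timestamp) {
	if t.WallTime < s.WallTime {
		t.WallTime = s.WallTime
	}
}

func main() {
	existing := Timestamp{WallTime: 100}
	existing.Forward(Timestamp{WallTime: 250}) // node clock is ahead: move up
	fmt.Println(existing.WallTime)             // 250
	existing.Forward(Timestamp{WallTime: 200}) // never moves backwards
	fmt.Println(existing.WallTime)             // 250
}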