// ReadOnlyCmd updates the read timestamp cache and waits for any
// overlapping writes currently processing through Raft ahead of us to
// clear via the read queue.
func (r *Range) ReadOnlyCmd(method string, args proto.Request, reply proto.Response) error {
    header := args.Header()
    r.Lock()
    r.tsCache.Add(header.Key, header.EndKey, header.Timestamp)
    var wg sync.WaitGroup
    r.readQ.AddRead(header.Key, header.EndKey, &wg)
    r.Unlock()
    wg.Wait()

    // It's possible that arbitrary delays (e.g. major GC, VM
    // de-prioritization, etc.) could cause the execution of this read
    // command to occur AFTER the range replica has lost leadership.
    //
    // There is a chance that we waited on writes, and although they
    // were committed to the log, they weren't successfully applied to
    // this replica's state machine. We re-verify leadership before
    // reading to make sure that all pending writes are persisted.
    //
    // There are some elaborate cases where we might have lost
    // leadership and then regained it during the delay, but this is ok
    // because any writes during that period necessarily had higher
    // timestamps. This is because the read-timestamp-cache prevents it
    // for the active leader and leadership changes force the
    // read-timestamp-cache to reset its high water mark.
    if !r.IsLeader() {
        // TODO(spencer): when we happen to know the leader, fill it in here via replica.
        return &proto.NotLeaderError{}
    }
    return r.executeCmd(method, args, reply)
}
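The comment above leans on the read timestamp cache: a map from key spans to the latest read timestamp, with a low water mark that a new leader resets so it assumes responsibility for all earlier reads. Here is a hypothetical sketch of that idea; Timestamp, span and tsCache are toy types, not the CockroachDB implementation, and a real cache uses an interval tree rather than exact-span matching.

package tscache

type Timestamp struct {
    WallTime int64
    Logical  int32
}

func (t Timestamp) Less(o Timestamp) bool {
    return t.WallTime < o.WallTime || (t.WallTime == o.WallTime && t.Logical < o.Logical)
}

type span struct{ key, endKey string }

type tsCache struct {
    lowWater Timestamp          // floor for spans never read
    reads    map[span]Timestamp // latest read timestamp per span
}

// Add records a read of [key, endKey) at ts.
func (c *tsCache) Add(key, endKey string, ts Timestamp) {
    s := span{key, endKey}
    if c.reads[s].Less(ts) {
        c.reads[s] = ts
    }
}

// GetMax returns the latest read timestamp touching [key, endKey),
// never below the low water mark. Resetting lowWater to the current
// clock on a leadership change is what lets a new leader assume
// responsibility for all earlier reads.
func (c *tsCache) GetMax(key, endKey string) Timestamp {
    max := c.lowWater
    if ts, ok := c.reads[span{key, endKey}]; ok && max.Less(ts) {
        max = ts
    }
    return max
}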
// ExecuteCmd synchronously runs Store.ExecuteCmd. The store is looked
// up from the store map if specified by header.Replica; otherwise,
// the command is being executed locally, and the replica is
// determined via lookup of header.Key in the ranges slice.
func (kv *LocalKV) ExecuteCmd(method string, args proto.Request, replyChan interface{}) {
    // If the replica isn't specified in the header, look it up.
    var err error
    var store *storage.Store
    // If we aren't given a Replica, we have to bend over backwards a
    // little here. We need to find the Store, but all we have is the
    // Key. So find its Range locally, and pull out its Replica which we
    // use to find the Store. This lets us use the same codepath below
    // (store.ExecuteCmd) for both locally and remotely originated
    // commands.
    header := args.Header()
    if header.Replica.NodeID == 0 {
        if repl := kv.lookupReplica(header.Key); repl != nil {
            header.Replica = *repl
        } else {
            err = util.Errorf("unable to lookup range replica for key %q", string(header.Key))
        }
    }
    if err == nil {
        store, err = kv.GetStore(&header.Replica)
    }
    reply := reflect.New(reflect.TypeOf(replyChan).Elem().Elem()).Interface().(proto.Response)
    if err != nil {
        reply.Header().SetGoError(err)
    } else {
        store.ExecuteCmd(method, args, reply)
    }
    reflect.ValueOf(replyChan).Send(reflect.ValueOf(reply))
}
// ReadWriteCmd first consults the response cache to determine whether
// this command has already been sent to the range. If a response is
// found, it's returned immediately and not submitted to raft. Next,
// the timestamp cache is checked to determine if any newer accesses to
// this command's affected keys have been made. If so, this command's
// timestamp is moved forward. Finally the keys affected by this
// command are added as pending writes to the read queue and the
// command is submitted to Raft. Upon completion, the write is removed
// from the read queue and the reply is added to the response cache.
func (r *Range) ReadWriteCmd(method string, args proto.Request, reply proto.Response) error {
    // Check the response cache in case this is a replay. This call
    // may block if the same command is already underway.
    header := args.Header()
    if ok, err := r.respCache.GetResponse(header.CmdID, reply); ok || err != nil {
        if ok { // this is a replay! extract error for return
            return reply.Header().GoError()
        }
        // In this case there was an error reading from the response
        // cache. Instead of failing the request just because we can't
        // decode the reply in the response cache, we proceed as though
        // idempotence has expired.
        log.Errorf("unable to read result for %+v from the response cache: %v", args, err)
    }

    // One of the prime invariants of Cockroach is that a mutating command
    // cannot write a key with an earlier timestamp than the most recent
    // read of the same key. So first order of business here is to check
    // the timestamp cache for reads/writes which are more recent than the
    // timestamp of this write. If more recent, we simply update the
    // write's timestamp before enqueuing it for execution. When the write
    // returns, the updated timestamp will inform the final commit
    // timestamp.
    r.Lock() // Protect access to timestamp cache and read queue.
    if ts := r.tsCache.GetMax(header.Key, header.EndKey); header.Timestamp.Less(ts) {
        if glog.V(1) {
            glog.Infof("Overriding existing timestamp %s with %s", header.Timestamp, ts)
        }
        ts.Logical++ // increment logical component by one to differentiate.
        // Update the request timestamp.
        header.Timestamp = ts
    }
    // Just as for reads, we update the timestamp cache with the
    // timestamp of this write. This ensures a strictly higher timestamp
    // for successive writes to the same key or key range.
    r.tsCache.Add(header.Key, header.EndKey, header.Timestamp)

    // The next step is to add the write to the read queue to inform
    // subsequent reads that there is a pending write. Reads which
    // overlap pending writes must wait for those writes to complete.
    wKey := r.readQ.AddWrite(header.Key, header.EndKey)
    r.Unlock()

    // Create command and enqueue for Raft.
    cmd := &Cmd{
        Method: method,
        Args:   args,
        Reply:  reply,
        done:   make(chan error, 1),
    }
    // This waits for the command to complete.
    err := r.EnqueueCmd(cmd)

    // Now that the command has completed, remove the pending write.
    r.Lock()
    r.readQ.RemoveWrite(wKey)
    r.Unlock()

    return err
}
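ReadWriteCmd's AddWrite/RemoveWrite calls are the counterpart to ReadOnlyCmd's AddRead above. A minimal, hypothetical sketch of a read queue implementing both sides follows; overlap detection is reduced to exact span equality to keep it short, where the real structure tests interval overlap.

package readqueue

import "sync"

type wSpan struct{ key, endKey string }

// readQueue is a toy stand-in for the readQ used above.
type readQueue struct {
    mu      sync.Mutex
    nextKey int
    pending map[int]wSpan             // write key -> span
    waiters map[int][]*sync.WaitGroup // write key -> blocked readers
}

func newReadQueue() *readQueue {
    return &readQueue{
        pending: map[int]wSpan{},
        waiters: map[int][]*sync.WaitGroup{},
    }
}

// AddWrite registers a pending write over [key, endKey) and returns a
// key for RemoveWrite.
func (q *readQueue) AddWrite(key, endKey string) int {
    q.mu.Lock()
    defer q.mu.Unlock()
    q.nextKey++
    q.pending[q.nextKey] = wSpan{key, endKey}
    return q.nextKey
}

// AddRead bumps wg once per overlapping pending write; the caller then
// calls wg.Wait outside the range lock, as ReadOnlyCmd does above.
func (q *readQueue) AddRead(key, endKey string, wg *sync.WaitGroup) {
    q.mu.Lock()
    defer q.mu.Unlock()
    for k, s := range q.pending {
        if s == (wSpan{key, endKey}) { // stand-in for a real overlap test
            wg.Add(1)
            q.waiters[k] = append(q.waiters[k], wg)
        }
    }
}

// RemoveWrite retires the write and releases every blocked reader.
func (q *readQueue) RemoveWrite(k int) {
    q.mu.Lock()
    defer q.mu.Unlock()
    for _, wg := range q.waiters[k] {
        wg.Done()
    }
    delete(q.waiters, k)
    delete(q.pending, k)
}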
// usesTimestampCache returns true if the request affects or is
// affected by the timestamp cache.
func usesTimestampCache(r proto.Request) bool {
    m := r.Method()
    if m < 0 || m >= proto.Method(len(tsCacheMethods)) {
        return false
    }
    return tsCacheMethods[m]
}
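The tsCacheMethods table itself is not shown here. One plausible shape (an assumption, not the actual table) is a dense boolean array indexed by the method enum, which makes usesTimestampCache a bounds check plus a single load; the method constants below are stand-ins.

package dispatch

type Method int

const (
    Get Method = iota
    Put
    Scan
    InternalHeartbeatTxn
    numMethods
)

// tsCacheMethods is populated once; only reads and writes of user keys
// consult the timestamp cache, internal bookkeeping methods do not.
var tsCacheMethods = func() [numMethods]bool {
    var t [numMethods]bool
    for _, m := range []Method{Get, Put, Scan} {
        t[m] = true
    }
    return t
}()

// usesTimestampCache mirrors the function above: a bounds check plus
// one table load.
func usesTimestampCache(m Method) bool {
    return m >= 0 && m < numMethods && tsCacheMethods[m]
}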
// maybeWrap wraps the given argument in a batch, unless it is already one.
func maybeWrap(args proto.Request) (*proto.BatchRequest, func(*proto.BatchResponse) proto.Response) {
    if ba, ok := args.(*proto.BatchRequest); ok {
        return ba, func(br *proto.BatchResponse) proto.Response { return br }
    }
    ba := &proto.BatchRequest{}
    ba.RequestHeader = *(gogoproto.Clone(args.Header()).(*proto.RequestHeader))
    ba.Add(args)
    return ba, func(br *proto.BatchResponse) proto.Response {
        var unwrappedReply proto.Response
        if len(br.Responses) == 0 {
            unwrappedReply = args.CreateReply()
        } else {
            unwrappedReply = br.Responses[0].GetInner()
        }
        // The ReplyTxn is propagated from one response to the next request,
        // and whenever the Txn changes, it needs to be set in the reply,
        // for example to ratchet up the transaction timestamp on writes
        // when necessary.
        // This is internally necessary to sequentially execute the batch,
        // so it makes some sense to take the burden of updating the Txn
        // from TxnCoordSender - it will only need to act on retries/aborts
        // in the future.
        unwrappedReply.Header().Txn = br.Txn
        if unwrappedReply.Header().Error == nil {
            unwrappedReply.Header().Error = br.Error
        }
        return unwrappedReply
    }
}
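The interesting part of maybeWrap is its second return value: a closure that later projects the batch response back onto the shape the caller originally asked for, so callers never observe the batching. A toy version of the same wrap/unwrap round trip, with assumed types rather than the proto package:

package batch

type Request interface{ isRequest() }
type Response interface{ isResponse() }

type GetRequest struct{ Key string }
type GetResponse struct{ Value string }

type BatchRequest struct{ Requests []Request }
type BatchResponse struct{ Responses []Response }

func (*GetRequest) isRequest()     {}
func (*BatchRequest) isRequest()   {}
func (*GetResponse) isResponse()   {}
func (*BatchResponse) isResponse() {}

// maybeWrap boxes a single request into a batch and hands back a
// closure that undoes the boxing on the response side.
func maybeWrap(args Request) (*BatchRequest, func(*BatchResponse) Response) {
    if ba, ok := args.(*BatchRequest); ok {
        // Already a batch: unwrapping is the identity.
        return ba, func(br *BatchResponse) Response { return br }
    }
    ba := &BatchRequest{Requests: []Request{args}}
    return ba, func(br *BatchResponse) Response {
        if len(br.Responses) == 0 {
            return &GetResponse{} // stand-in for args.CreateReply()
        }
        return br.Responses[0]
    }
}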
// updateForBatch updates the first argument (the header of a request contained
// in a batch) from the second one (the batch header), returning an error when
// inconsistencies are found.
// It is checked that the individual call does not have a User, UserPriority
// or Txn set that differs from the batch's.
func updateForBatch(args proto.Request, bHeader proto.RequestHeader) error {
    // Disallow transaction, user and priority on individual calls, unless
    // equal.
    aHeader := args.Header()
    if aHeader.User != "" && aHeader.User != bHeader.User {
        return util.Error("conflicting user on call in batch")
    }
    if aPrio := aHeader.GetUserPriority(); aPrio != proto.Default_RequestHeader_UserPriority && aPrio != bHeader.GetUserPriority() {
        return util.Error("conflicting user priority on call in batch")
    }
    aHeader.User = bHeader.User
    aHeader.UserPriority = bHeader.UserPriority
    // Only allow individual transactions on the requests of a batch if:
    // - the batch is non-transactional,
    // - the individual transaction does not write intents, and
    // - the individual transaction is initialized.
    // The main usage of this is to allow mass-resolution of intents, which
    // entails sending a non-txn batch of transactional InternalResolveIntent.
    if aHeader.Txn != nil && !aHeader.Txn.Equal(bHeader.Txn) {
        if len(aHeader.Txn.ID) == 0 || proto.IsTransactionWrite(args) || bHeader.Txn != nil {
            return util.Error("conflicting transaction in transactional batch")
        }
    } else {
        aHeader.Txn = bHeader.Txn
    }
    return nil
}
// ExecuteCmd synchronously runs Store.ExecuteCmd. The store is looked
// up from the store map if specified by header.Replica; otherwise,
// the command is being executed locally, and the replica is
// determined via lookup through each of the stores.
func (kv *LocalKV) ExecuteCmd(method string, args proto.Request, replyChan interface{}) {
    // If the replica isn't specified in the header, look it up.
    var err error
    var store *storage.Store
    // If we aren't given a Replica, we have to bend over backwards a
    // little here. We need to find the Store, but all we have is the
    // Key. So find its Range locally. This lets us use the same
    // codepath below (store.ExecuteCmd) for both locally and remotely
    // originated commands.
    header := args.Header()
    if header.Replica.StoreID == 0 {
        var repl *proto.Replica
        repl, err = kv.lookupReplica(header.Key, header.EndKey)
        if err == nil {
            header.Replica = *repl
        }
    }
    if err == nil {
        store, err = kv.GetStore(header.Replica.StoreID)
    }
    reply := reflect.New(reflect.TypeOf(replyChan).Elem().Elem()).Interface().(proto.Response)
    if err != nil {
        reply.Header().SetGoError(err)
    } else {
        store.ExecuteCmd(method, args, reply)
        if err := reply.Verify(args); err != nil {
            reply.Header().SetGoError(err)
        }
    }
    reflect.ValueOf(replyChan).Send(reflect.ValueOf(reply))
}
// executeCmd switches over the method and multiplexes to execute the
// appropriate storage API command.
func (r *Range) executeCmd(method string, args proto.Request, reply proto.Response) error {
    switch method {
    case Contains:
        r.Contains(args.(*proto.ContainsRequest), reply.(*proto.ContainsResponse))
    case Get:
        r.Get(args.(*proto.GetRequest), reply.(*proto.GetResponse))
    case Put:
        r.Put(args.(*proto.PutRequest), reply.(*proto.PutResponse))
    case ConditionalPut:
        r.ConditionalPut(args.(*proto.ConditionalPutRequest), reply.(*proto.ConditionalPutResponse))
    case Increment:
        r.Increment(args.(*proto.IncrementRequest), reply.(*proto.IncrementResponse))
    case Delete:
        r.Delete(args.(*proto.DeleteRequest), reply.(*proto.DeleteResponse))
    case DeleteRange:
        r.DeleteRange(args.(*proto.DeleteRangeRequest), reply.(*proto.DeleteRangeResponse))
    case Scan:
        r.Scan(args.(*proto.ScanRequest), reply.(*proto.ScanResponse))
    case EndTransaction:
        r.EndTransaction(args.(*proto.EndTransactionRequest), reply.(*proto.EndTransactionResponse))
    case AccumulateTS:
        r.AccumulateTS(args.(*proto.AccumulateTSRequest), reply.(*proto.AccumulateTSResponse))
    case ReapQueue:
        r.ReapQueue(args.(*proto.ReapQueueRequest), reply.(*proto.ReapQueueResponse))
    case EnqueueUpdate:
        r.EnqueueUpdate(args.(*proto.EnqueueUpdateRequest), reply.(*proto.EnqueueUpdateResponse))
    case EnqueueMessage:
        r.EnqueueMessage(args.(*proto.EnqueueMessageRequest), reply.(*proto.EnqueueMessageResponse))
    case InternalRangeLookup:
        r.InternalRangeLookup(args.(*proto.InternalRangeLookupRequest), reply.(*proto.InternalRangeLookupResponse))
    case InternalHeartbeatTxn:
        r.InternalHeartbeatTxn(args.(*proto.InternalHeartbeatTxnRequest), reply.(*proto.InternalHeartbeatTxnResponse))
    case InternalPushTxn:
        r.InternalPushTxn(args.(*proto.InternalPushTxnRequest), reply.(*proto.InternalPushTxnResponse))
    case InternalResolveIntent:
        r.InternalResolveIntent(args.(*proto.InternalResolveIntentRequest), reply.(*proto.InternalResolveIntentResponse))
    case InternalSnapshotCopy:
        r.InternalSnapshotCopy(args.(*proto.InternalSnapshotCopyRequest), reply.(*proto.InternalSnapshotCopyResponse))
    default:
        return util.Errorf("unrecognized command type: %s", method)
    }

    // Propagate the request timestamp (which may have changed).
    reply.Header().Timestamp = args.Header().Timestamp

    // Add this command's result to the response cache if this is a
    // read/write method. This must be done as part of the execution of
    // raft commands so that every replica maintains the same responses
    // to continue request idempotence when leadership changes.
    if !IsReadOnly(method) {
        if putErr := r.respCache.PutResponse(args.Header().CmdID, reply); putErr != nil {
            log.Errorf("unable to write result of %+v: %+v to the response cache: %v", args, reply, putErr)
        }
    }

    // Return the error (if any) set in the reply.
    return reply.Header().GoError()
}
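The type switch above is exhaustive but grows with every new method. As a hedged aside (this is not how the original code is wired), the same dispatch can be expressed as a registration table; the tradeoff is that the per-method type assertions move into each handler. All names below are stand-ins.

package mux

import "fmt"

// Request and Response stand in for proto.Request/proto.Response.
type Request interface{}
type Response interface{}

type handler func(Request, Response) error

var handlers = map[string]handler{}

// register wires a method name to its handler; each handler performs
// the type assertions the switch used to do inline.
func register(method string, h handler) { handlers[method] = h }

func executeCmd(method string, args Request, reply Response) error {
    h, ok := handlers[method]
    if !ok {
        return fmt.Errorf("unrecognized command type: %s", method)
    }
    return h(args, reply)
}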
// executeCmd looks up the store specified by header.Replica, and runs
// Store.ExecuteCmd.
func (n *Node) executeCmd(method string, args proto.Request, reply proto.Response) error {
    store, err := n.localKV.GetStore(&args.Header().Replica)
    if err != nil {
        return err
    }
    store.ExecuteCmd(method, args, reply)
    return nil
}
// endCmd removes a pending command from the command queue.
func (r *Range) endCmd(cmdKey interface{}, args proto.Request, err error, readOnly bool) {
    r.Lock()
    if err == nil && usesTimestampCache(args) {
        header := args.Header()
        r.tsCache.Add(header.Key, header.EndKey, header.Timestamp, header.Txn.GetID(), readOnly)
    }
    r.cmdQ.Remove(cmdKey)
    r.Unlock()
}
func (db *testDB) executeCmd(method string, args proto.Request, replyChan interface{}) {
    reply := reflect.New(reflect.TypeOf(replyChan).Elem().Elem()).Interface().(proto.Response)
    if rng := db.store.LookupRange(args.Header().Key, args.Header().EndKey); rng != nil {
        args.Header().Replica = *rng.Meta.GetReplica()
        db.store.ExecuteCmd(method, args, reply)
    } else {
        reply.Header().SetGoError(proto.NewRangeKeyMismatchError(args.Header().Key, args.Header().EndKey, nil))
    }
    reflect.ValueOf(replyChan).Send(reflect.ValueOf(reply))
}
// applyRaftCommand applies a raft command from the replicated log to the
// underlying state machine (i.e. the engine).
// When certain critical operations fail, a replicaCorruptionError may be
// returned and must be handled by the caller.
func (r *Range) applyRaftCommand(ctx context.Context, index uint64, originNode proto.RaftNodeID, args proto.Request, reply proto.Response) error {
    if index <= 0 {
        log.Fatalc(ctx, "raft command index is <= 0")
    }

    // If we have an out of order index, there's corruption. No sense in trying
    // to update anything or run the command. Simply return a corruption error.
    if oldIndex := atomic.LoadUint64(&r.appliedIndex); oldIndex >= index {
        return newReplicaCorruptionError(util.Errorf("applied index moved backwards: %d >= %d", oldIndex, index))
    }

    // Call the helper, which returns a batch containing data written
    // during command execution and any associated error.
    ms := engine.MVCCStats{}
    batch, rErr := r.applyRaftCommandInBatch(ctx, index, originNode, args, reply, &ms)
    // ALWAYS set the reply header error to the error returned by the
    // helper. This is the definitive result of the execution. The
    // error must be set before saving to the response cache.
    // TODO(tschottdorf,tamird) For #1400, want to refactor executeCmd to not
    // touch the reply header's error field.
    reply.Header().SetGoError(rErr)
    defer batch.Close()

    // Advance the last applied index and commit the batch.
    if err := setAppliedIndex(batch, r.Desc().RaftID, index); err != nil {
        log.Fatalc(ctx, "setting applied index in a batch should never fail: %s", err)
    }
    if err := batch.Commit(); err != nil {
        rErr = newReplicaCorruptionError(util.Errorf("could not commit batch"), err, rErr)
    } else {
        // Update cached appliedIndex if we were able to set the applied index on disk.
        atomic.StoreUint64(&r.appliedIndex, index)
    }

    // On successful write commands, flush to event feed, and handle other
    // write-related triggers including splitting and config gossip updates.
    if rErr == nil && proto.IsWrite(args) {
        // Publish update to event feed.
        r.rm.EventFeed().updateRange(r, args.Method(), &ms)
        // If the commit succeeded, potentially add range to split queue.
        r.maybeAddToSplitQueue()
        // Maybe update gossip configs on a put.
        switch args.(type) {
        case *proto.PutRequest, *proto.DeleteRequest, *proto.DeleteRangeRequest:
            if key := args.Header().Key; key.Less(keys.SystemMax) {
                // We hold the lock already.
                r.maybeGossipConfigsLocked(func(configPrefix proto.Key) bool {
                    return bytes.HasPrefix(key, configPrefix)
                })
            }
        }
    }

    return rErr
}
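The out-of-order index check above is the safety story in miniature: apply is legal only for strictly increasing indexes. A self-contained sketch of that discipline, with a hypothetical replica type:

package apply

import (
    "fmt"
    "sync/atomic"
)

// replica is hypothetical; only the applied-index field matters here.
type replica struct {
    appliedIndex uint64
}

// apply enforces the monotonic applied-index discipline: a command at
// an index at or below the current one signals divergence between log
// and state machine and is treated as corruption, never re-applied.
func (r *replica) apply(index uint64) error {
    if old := atomic.LoadUint64(&r.appliedIndex); old >= index {
        return fmt.Errorf("applied index moved backwards: %d >= %d", old, index)
    }
    // ... execute the command and persist the index in one batch ...
    atomic.StoreUint64(&r.appliedIndex, index) // publish only after commit
    return nil
}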
// updateForBatch updates the first argument (the header of a request contained
// in a batch) from the second one (the batch header), returning an error when
// inconsistencies are found.
// It is checked that the individual call does not have a UserPriority
// or Txn set that differs from the batch's.
// TODO(tschottdorf): will go with #2143.
func updateForBatch(args proto.Request, bHeader proto.RequestHeader) error {
    // Disallow transaction, user and priority on individual calls, unless
    // equal.
    aHeader := args.Header()
    if aPrio := aHeader.GetUserPriority(); aPrio != proto.Default_RequestHeader_UserPriority && aPrio != bHeader.GetUserPriority() {
        return util.Errorf("conflicting user priority on call in batch")
    }
    aHeader.UserPriority = bHeader.UserPriority
    aHeader.Txn = bHeader.Txn // reqs always take Txn from batch
    return nil
}
// addReadOnlyCmd updates the read timestamp cache and waits for any
// overlapping writes currently processing through Raft ahead of us to
// clear via the read queue.
func (r *Range) addReadOnlyCmd(ctx context.Context, args proto.Request, reply proto.Response) error {
    header := args.Header()

    if err := r.checkCmdHeader(header); err != nil {
        reply.Header().SetGoError(err)
        return err
    }

    // If read-consistency is set to INCONSISTENT, run directly.
    if header.ReadConsistency == proto.INCONSISTENT {
        // But disallow any inconsistent reads within txns.
        if header.Txn != nil {
            reply.Header().SetGoError(util.Error("cannot allow inconsistent reads within a transaction"))
            return reply.Header().GoError()
        }
        if header.Timestamp.Equal(proto.ZeroTimestamp) {
            header.Timestamp = r.rm.Clock().Now()
        }
        intents, err := r.executeCmd(r.rm.Engine(), nil, args, reply)
        if err == nil {
            r.handleSkippedIntents(args, intents)
        }
        return err
    } else if header.ReadConsistency == proto.CONSENSUS {
        reply.Header().SetGoError(util.Error("consensus reads not implemented"))
        return reply.Header().GoError()
    }

    // Add the read to the command queue to gate subsequent
    // overlapping commands until this command completes.
    cmdKey := r.beginCmd(header, true)

    // This replica must have leader lease to process a consistent read.
    if err := r.redirectOnOrAcquireLeaderLease(tracer.FromCtx(ctx), header.Timestamp); err != nil {
        r.endCmd(cmdKey, args, err, true /* readOnly */)
        reply.Header().SetGoError(err)
        return err
    }

    // Execute read-only command.
    intents, err := r.executeCmd(r.rm.Engine(), nil, args, reply)

    // Only update the timestamp cache if the command succeeded.
    r.endCmd(cmdKey, args, err, true /* readOnly */)

    if err == nil {
        r.handleSkippedIntents(args, intents)
    }
    return err
}
// sendAttempt is invoked by Send. It temporarily truncates the arguments to
// match the descriptor's EndKey (if necessary) and gathers and rearranges the
// replicas before making a single attempt at sending the request. It returns
// the result of sending the RPC; a potential error contained in the reply has
// to be handled separately by the caller.
func (ds *DistSender) sendAttempt(trace *tracer.Trace, args proto.Request, reply proto.Response, desc *proto.RangeDescriptor) error {
    defer trace.Epoch("sending RPC")()

    // Truncate the request to our current range, making sure not to
    // touch it unless we have to (it is illegal to send EndKey on
    // commands which do not operate on ranges).
    if endKey := args.Header().EndKey; endKey != nil && !endKey.Less(desc.EndKey) {
        defer func(k proto.Key) { args.Header().EndKey = k }(endKey)
        args.Header().EndKey = desc.EndKey
    }
    leader := ds.leaderCache.Lookup(proto.RaftID(desc.RaftID))

    // Try to send the call.
    replicas := newReplicaSlice(ds.gossip, desc)

    // Rearrange the replicas so that those replicas with long common
    // prefix of attributes end up first. If there's no prefix, this is a
    // no-op.
    order := ds.optimizeReplicaOrder(replicas)

    // If this request needs to go to a leader and we know who that is, move
    // it to the front.
    if !(proto.IsRead(args) && args.Header().ReadConsistency == proto.INCONSISTENT) && leader.StoreID > 0 {
        if i := replicas.FindReplica(leader.StoreID); i >= 0 {
            replicas.MoveToFront(i)
            order = rpc.OrderStable
        }
    }
    return ds.sendRPC(trace, desc.RaftID, replicas, order, args, reply)
}
// CallComplete is called by a node whenever it completes a request. This will
// publish an appropriate event to the feed based on the results of the call.
func (nef NodeEventFeed) CallComplete(args proto.Request, reply proto.Response) {
    if err := reply.Header().Error; err != nil && err.CanRestartTransaction() == proto.TransactionRestart_ABORT {
        nef.f.Publish(&CallErrorEvent{
            NodeID: nef.id,
            Method: args.Method(),
        })
    } else {
        nef.f.Publish(&CallSuccessEvent{
            NodeID: nef.id,
            Method: args.Method(),
        })
    }
}
// addAdminCmd executes the command directly. There is no interaction
// with the command queue or the timestamp cache, as admin commands
// are not meant to consistently access or modify the underlying data.
// Admin commands must run on the leader replica.
func (r *Range) addAdminCmd(ctx context.Context, args proto.Request, reply proto.Response) error {
    // Admin commands always require the leader lease.
    if err := r.redirectOnOrAcquireLeaderLease(args.Header().Timestamp); err != nil {
        reply.Header().SetGoError(err)
        return err
    }

    switch args.(type) {
    case *proto.AdminSplitRequest:
        r.AdminSplit(args.(*proto.AdminSplitRequest), reply.(*proto.AdminSplitResponse))
    case *proto.AdminMergeRequest:
        r.AdminMerge(args.(*proto.AdminMergeRequest), reply.(*proto.AdminMergeResponse))
    default:
        return util.Error("unrecognized admin command")
    }
    return reply.Header().GoError()
}
// executeCmd creates a proto.Call struct and sends it via our local sender.
func (n *nodeServer) executeCmd(args proto.Request, reply proto.Response) error {
    // TODO(tschottdorf) get a hold of the client's ID, add it to the
    // context before dispatching, and create an ID for tracing the request.
    header := args.Header()
    header.CmdID = header.GetOrCreateCmdID(n.ctx.Clock.PhysicalNow())
    trace := n.ctx.Tracer.NewTrace(header)
    defer trace.Finalize()
    defer trace.Epoch("node")()
    ctx := tracer.ToCtx((*Node)(n).context(), trace)

    n.lSender.Send(ctx, proto.Call{Args: args, Reply: reply})
    n.feed.CallComplete(args, reply)
    if err := reply.Header().GoError(); err != nil {
        trace.Event(fmt.Sprintf("error: %T", err))
    }
    return nil
}
// CallComplete is called by a node whenever it completes a request. This will
// publish an appropriate event to the feed based on the results of the call.
// TODO(tschottdorf): move to batch, account for multiple methods per batch.
// In particular, on error want an error position to identify the failed
// request.
func (nef NodeEventFeed) CallComplete(args proto.Request, reply proto.Response) {
    method := args.Method()
    if ba, ok := args.(*proto.BatchRequest); ok && len(ba.Requests) > 0 {
        method = ba.Requests[0].GetInner().Method()
    }
    if err := reply.Header().Error; err != nil && err.TransactionRestart == proto.TransactionRestart_ABORT {
        nef.f.Publish(&CallErrorEvent{
            NodeID: nef.id,
            Method: method,
        })
    } else {
        nef.f.Publish(&CallSuccessEvent{
            NodeID: nef.id,
            Method: method,
        })
    }
}
// ExecuteCmd fetches a range based on the header's replica, assembles
// method, args & reply into a Raft Cmd struct and executes the
// command using the fetched range.
func (s *Store) ExecuteCmd(method string, args proto.Request, reply proto.Response) error {
    // If the request has a zero timestamp, initialize to this node's clock.
    header := args.Header()
    if header.Timestamp.WallTime == 0 && header.Timestamp.Logical == 0 {
        // Update both incoming and outgoing timestamps.
        now := s.clock.Now()
        args.Header().Timestamp = now
        reply.Header().Timestamp = now
    } else {
        // Otherwise, update our clock with the incoming request. This
        // advances the local node's clock to a high water mark from
        // amongst all nodes with which it has interacted. The update is
        // bounded by the max clock drift.
        _, err := s.clock.Update(header.Timestamp)
        if err != nil {
            return err
        }
    }

    // Verify specified range contains the command's implicated keys.
    rng, err := s.GetRange(header.Replica.RangeID)
    if err != nil {
        return err
    }
    if !rng.ContainsKeyRange(header.Key, header.EndKey) {
        return proto.NewRangeKeyMismatchError(header.Key, header.EndKey, rng.Meta)
    }
    if !rng.IsLeader() {
        // TODO(spencer): when we happen to know the leader, fill it in here via replica.
        return &proto.NotLeaderError{}
    }

    // Differentiate between read-only and read-write.
    if IsReadOnly(method) {
        return rng.ReadOnlyCmd(method, args, reply)
    }
    return rng.ReadWriteCmd(method, args, reply)
}
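The clock handling above either seeds a zero timestamp from the local clock or ratchets the local clock forward on an incoming timestamp, bounded by the maximum drift. A toy version of that ratchet follows; the types are assumptions, not the cockroach hlc package, which also carries a logical component.

package clock

import (
    "fmt"
    "time"
)

// Clock is a toy high-water-mark clock.
type Clock struct {
    maxDrift  time.Duration
    highWater int64 // latest wall time observed, in nanoseconds
}

// Update ratchets the high water mark forward on an incoming timestamp,
// rejecting remote times implausibly far ahead of local physical time.
func (c *Clock) Update(remoteWall int64) (int64, error) {
    now := time.Now().UnixNano()
    if remoteWall > now+int64(c.maxDrift) {
        return c.highWater, fmt.Errorf("remote wall time %d exceeds local %d by more than max drift", remoteWall, now)
    }
    if remoteWall > c.highWater {
        c.highWater = remoteWall
    }
    if now > c.highWater {
        c.highWater = now
    }
    return c.highWater, nil
}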
// proposeRaftCommand prepares the necessary pending command struct and
// initializes a client command ID if one hasn't been set yet. It then
// proposes the command to Raft and returns the error channel and
// pending command struct for receiving.
func (r *Range) proposeRaftCommand(ctx context.Context, args proto.Request) (<-chan error, *pendingCmd) {
    pendingCmd := &pendingCmd{
        ctx:  ctx,
        done: make(chan responseWithErr, 1),
    }
    raftCmd := proto.InternalRaftCommand{
        RaftID:       r.Desc().RaftID,
        OriginNodeID: r.rm.RaftNodeID(),
    }
    cmdID := args.Header().GetOrCreateCmdID(r.rm.Clock().PhysicalNow())
    ok := raftCmd.Cmd.SetValue(args)
    if !ok {
        log.Fatalc(ctx, "unknown command type %T", args)
    }
    idKey := makeCmdIDKey(cmdID)
    r.Lock()
    r.pendingCmds[idKey] = pendingCmd
    r.Unlock()
    errChan := r.rm.ProposeRaftCommand(idKey, raftCmd)
    return errChan, pendingCmd
}
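The pendingCmds map plus a buffered done channel is a classic proposer/applier handshake. A compact, hypothetical sketch of that bookkeeping, stripped of any Raft machinery:

package pending

import "sync"

type result struct {
    reply interface{}
    err   error
}

// pendingCmd and registry are stand-ins for the bookkeeping above.
type pendingCmd struct {
    done chan result // buffered so the applier never blocks
}

type registry struct {
    mu   sync.Mutex
    cmds map[string]*pendingCmd
}

func newRegistry() *registry {
    return &registry{cmds: map[string]*pendingCmd{}}
}

// propose parks a command under its ID before handing it to consensus.
func (r *registry) propose(id string) *pendingCmd {
    cmd := &pendingCmd{done: make(chan result, 1)}
    r.mu.Lock()
    r.cmds[id] = cmd
    r.mu.Unlock()
    return cmd
}

// complete is called from the apply loop; it is a no-op for commands
// proposed by other replicas, which have no local waiter.
func (r *registry) complete(id string, reply interface{}, err error) {
    r.mu.Lock()
    cmd, ok := r.cmds[id]
    delete(r.cmds, id)
    r.mu.Unlock()
    if ok {
        cmd.done <- result{reply, err}
    }
}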
// addAdminCmd executes the command directly. There is no interaction
// with the command queue or the timestamp cache, as admin commands
// are not meant to consistently access or modify the underlying data.
// Admin commands must run on the leader replica.
func (r *Range) addAdminCmd(ctx context.Context, args proto.Request) (proto.Response, error) {
    header := args.Header()

    if err := r.checkCmdHeader(header); err != nil {
        return nil, err
    }

    // Admin commands always require the leader lease.
    if err := r.redirectOnOrAcquireLeaderLease(tracer.FromCtx(ctx), header.Timestamp); err != nil {
        return nil, err
    }

    switch tArgs := args.(type) {
    case *proto.AdminSplitRequest:
        resp, err := r.AdminSplit(tArgs)
        return &resp, err
    case *proto.AdminMergeRequest:
        resp, err := r.AdminMerge(tArgs)
        return &resp, err
    default:
        return nil, util.Error("unrecognized admin command")
    }
}
// applyRaftCommand applies a raft command from the replicated log to the
// underlying state machine (i.e. the engine).
// When certain critical operations fail, a replicaCorruptionError may be
// returned and must be handled by the caller.
func (r *Range) applyRaftCommand(ctx context.Context, index uint64, originNode proto.RaftNodeID, args proto.Request) (proto.Response, error) {
    if index <= 0 {
        log.Fatalc(ctx, "raft command index is <= 0")
    }

    // If we have an out of order index, there's corruption. No sense in trying
    // to update anything or run the command. Simply return a corruption error.
    if oldIndex := atomic.LoadUint64(&r.appliedIndex); oldIndex >= index {
        return nil, newReplicaCorruptionError(util.Errorf("applied index moved backwards: %d >= %d", oldIndex, index))
    }

    // Call the helper, which returns a batch containing data written
    // during command execution and any associated error.
    ms := engine.MVCCStats{}
    batch, reply, rErr := r.applyRaftCommandInBatch(ctx, index, originNode, args, &ms)
    defer batch.Close()

    // Advance the last applied index and commit the batch.
    if err := setAppliedIndex(batch, r.Desc().RaftID, index); err != nil {
        log.Fatalc(ctx, "setting applied index in a batch should never fail: %s", err)
    }
    if err := batch.Commit(); err != nil {
        rErr = newReplicaCorruptionError(util.Errorf("could not commit batch"), err, rErr)
    } else {
        // Update cached appliedIndex if we were able to set the applied index on disk.
        atomic.StoreUint64(&r.appliedIndex, index)
    }

    // On successful write commands, flush to event feed, and handle other
    // write-related triggers including splitting and config gossip updates.
    if rErr == nil && proto.IsWrite(args) {
        // Publish update to event feed.
        r.rm.EventFeed().updateRange(r, args.Method(), &ms)
        // If the commit succeeded, potentially add range to split queue.
        r.maybeAddToSplitQueue()
        // Maybe update gossip configs if the command is not part of a transaction.
        // If the command is part of an uncommitted transaction, we rely on the
        // periodic configGossipInterval loop since we will not see the update
        // until the transaction is committed.
        if key := args.Header().Key; key.Less(keys.SystemMax) && args.Header().Txn == nil {
            r.maybeGossipConfigs(func(configPrefix proto.Key) bool {
                return bytes.HasPrefix(key, configPrefix)
            })
        }
    }

    return reply, rErr
}
// Call invokes the KV command synchronously and returns the response
// and error, if applicable. If preceding calls have been made to
// Prepare() without a call to Flush(), this call is prepared and
// then all prepared calls are flushed.
func (kv *KV) Call(method string, args proto.Request, reply proto.Response) error {
    if len(kv.prepared) > 0 {
        kv.Prepare(method, args, reply)
        return kv.Flush()
    }
    if args.Header().User == "" {
        args.Header().User = kv.User
    }
    if args.Header().UserPriority == nil && kv.UserPriority != 0 {
        args.Header().UserPriority = gogoproto.Int32(kv.UserPriority)
    }
    call := &Call{
        Method: method,
        Args:   args,
        Reply:  reply,
    }
    call.resetClientCmdID(kv.clock)
    kv.sender.Send(call)
    err := call.Reply.Header().GoError()
    if err != nil {
        log.Infof("failed %s: %s", call.Method, err)
    }
    return err
}
// ExecuteCmd verifies permissions and looks up the appropriate range
// based on the supplied key. It sends the RPC according to the
// specified options asynchronously and returns a response value on
// the replyChan channel when the call is complete.
func (kv *DistKV) ExecuteCmd(method string, args proto.Request, replyChan interface{}) {
    // Augment method with "Node." prefix.
    method = "Node." + method

    // Verify permissions.
    if err := kv.verifyPermissions(method, args.Header()); err != nil {
        sendErrorReply(err, replyChan)
        return
    }

    // Retry logic for lookup of range by key and RPCs to range replicas.
    retryOpts := util.RetryOptions{
        Tag:         fmt.Sprintf("routing %s rpc", method),
        Backoff:     retryBackoff,
        MaxBackoff:  maxRetryBackoff,
        Constant:    2,
        MaxAttempts: 0, // retry indefinitely
    }
    err := util.RetryWithBackoff(retryOpts, func() (bool, error) {
        desc, err := kv.rangeCache.LookupRangeMetadata(args.Header().Key)
        if err == nil {
            err = kv.sendRPC(desc, method, args, replyChan)
        }
        if err != nil {
            // Range metadata might be out of date - evict it.
            kv.rangeCache.EvictCachedRangeMetadata(args.Header().Key)
            // If retryable, allow outer loop to retry.
            if retryErr, ok := err.(util.Retryable); ok && retryErr.CanRetry() {
                log.Warningf("failed to invoke %s: %v", method, err)
                return false, nil
            }
        }
        return true, err
    })
    if err != nil {
        sendErrorReply(err, replyChan)
    }
}
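The closure passed to util.RetryWithBackoff above follows a (done, error) contract: returning (false, nil) asks for a backoff and another attempt after evicting stale range metadata. A self-contained sketch of a loop honoring that contract; the names are assumptions, not the actual util package.

package retry

import (
    "errors"
    "time"
)

// Options mirrors the shape of the retry options above.
type Options struct {
    Backoff     time.Duration
    MaxBackoff  time.Duration
    Constant    int
    MaxAttempts int // 0 means retry indefinitely
}

// WithBackoff calls fn until it reports done; (false, nil) means
// "back off and try again", exactly how the closure above retries.
func WithBackoff(opts Options, fn func() (bool, error)) error {
    backoff := opts.Backoff
    for attempt := 1; ; attempt++ {
        if done, err := fn(); done {
            return err
        }
        if opts.MaxAttempts > 0 && attempt >= opts.MaxAttempts {
            return errors.New("retry limit exceeded")
        }
        time.Sleep(backoff)
        backoff *= time.Duration(opts.Constant)
        if backoff > opts.MaxBackoff {
            backoff = opts.MaxBackoff
        }
    }
}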
// applyRaftCommand applies a raft command from the replicated log to the
// underlying state machine (i.e. the engine).
// When certain critical operations fail, a replicaCorruptionError may be
// returned and must be handled by the caller.
func (r *Range) applyRaftCommand(ctx context.Context, index uint64, originNode proto.RaftNodeID, args proto.Request, reply proto.Response) (rErr error) {
    if index <= 0 {
        log.Fatalc(ctx, "raft command index is <= 0")
    }

    committed := false
    // The very last thing we do before returning is move the applied index
    // forward, unless that has already happened as part of a successfully
    // committed batch.
    defer func() {
        if !committed {
            // We didn't commit the batch, but advance the last applied index nonetheless.
            if err := setAppliedIndex(r.rm.Engine(), r.Desc().RaftID, index); err != nil {
                rErr = newReplicaCorruptionError(
                    util.Errorf("could not advance applied index"), err, rErr)
                return
            }
            atomic.StoreUint64(&r.appliedIndex, index)
        }
    }()

    if lease := r.getLease(); args.Method() != proto.InternalLeaderLease &&
        (!lease.OwnedBy(originNode) || !lease.Covers(args.Header().Timestamp)) {
        // Verify the leader lease is held, unless this command is trying to
        // obtain it. Any other Raft command has had the leader lease held
        // by the replica at proposal time, but this may no longer be the
        // case. Corruption aside, the most likely reason is a leadership
        // change (the most recent leader assumes responsibility for all
        // past timestamps as well). In that case, it's not valid to go
        // ahead with the execution: Writes must be aware of the last time
        // the mutated key was read, and since reads are served locally by
        // the lease holder without going through Raft, a read which was
        // not taken into account may have been served. Hence, we must
        // retry at the current leader.
        //
        // It's crucial that we don't update the response cache for the error
        // returned below since the request is going to be retried with the
        // same ClientCmdID and would get the distributed sender stuck in an
        // infinite loop, retrieving a stale NotLeaderError over and over
        // again, even when proposing at the correct replica.
        return r.newNotLeaderError(lease)
    }

    // Anything happening from now on needs to enter the response cache.
    defer func() {
        // TODO(tamird,tschottdorf): according to #1400 we intend to set the reply
        // header's error as late as possible and in a central location. Range
        // commands still write to the header directly, but once they don't this
        // could be the authoritative location that sets the reply error for any-
        // thing that makes it into Raft. Note that we must set this prior to
        // signaling cmd.done below, or the waiting RPC handler might proceed
        // before we've updated its reply.
        //
        // It is important that the error is set before the reply is saved into
        // the response cache.
        reply.Header().SetGoError(rErr)

        if proto.IsWrite(args) {
            // No matter the result, add result to the response cache if this
            // is a write method. This must be done as part of the execution of
            // raft commands so that every replica maintains the same responses
            // to continue request idempotence, even if leadership changes.
            if err := r.respCache.PutResponse(args.Header().CmdID, reply); err != nil {
                rErr = newReplicaCorruptionError(
                    util.Errorf("could not put to response cache"), err, rErr)
                return
            }
        }
    }()

    header := args.Header()

    // Check the response cache to ensure idempotency.
    if proto.IsWrite(args) {
        if ok, err := r.respCache.GetResponse(header.CmdID, reply); ok && err == nil {
            if log.V(1) {
                log.Infoc(ctx, "found response cache entry for %+v", args.Header().CmdID)
            }
            return err
        } else if ok && err != nil {
            return newReplicaCorruptionError(
                util.Errorf("could not read from response cache"), err)
        }
    }

    // Create a new batch for the command to ensure all or nothing semantics.
    batch := r.rm.Engine().NewBatch()
    defer batch.Close()

    // Create an engine.MVCCStats instance.
    ms := engine.MVCCStats{}

    // Execute the command; the error will also be set in the reply header.
    // TODO(tschottdorf,tamird) For #1400, want to refactor executeCmd to not
    // touch the reply header's error field.
    intents, err := r.executeCmd(batch, &ms, args, reply)
    // If the execution of the command wasn't successful, stop here.
    if err != nil {
        return err
    }

    if oldIndex := atomic.LoadUint64(&r.appliedIndex); oldIndex >= index {
        return newReplicaCorruptionError(
            util.Errorf("applied index moved backwards: %d >= %d", oldIndex, index))
    }

    // Advance the applied index atomically within the batch.
    if err := setAppliedIndex(batch, r.Desc().RaftID, index); err != nil {
        return newReplicaCorruptionError(
            util.Errorf("could not update applied index"), err)
    }

    if proto.IsWrite(args) {
        // On success, flush the MVCC stats to the batch and commit.
        if err := r.stats.MergeMVCCStats(batch, &ms, header.Timestamp.WallTime); err != nil {
            return newReplicaCorruptionError(util.Errorf("could not merge MVCC stats"), err)
        }
        if err := batch.Commit(); err != nil {
            return newReplicaCorruptionError(util.Errorf("could not commit batch"), err)
        }
        committed = true

        // Publish update to event feed.
        r.rm.EventFeed().updateRange(r, args.Method(), &ms)

        // After successful commit, update cached stats and appliedIndex value.
        atomic.StoreUint64(&r.appliedIndex, index)

        // If the commit succeeded, potentially add range to split queue.
        r.maybeAddToSplitQueue()

        // Maybe update gossip configs on a put.
        switch args.(type) {
        case *proto.PutRequest, *proto.DeleteRequest, *proto.DeleteRangeRequest:
            if header.Key.Less(keys.SystemMax) {
                // We hold the lock already.
                r.maybeGossipConfigsLocked(func(configPrefix proto.Key) bool {
                    return bytes.HasPrefix(header.Key, configPrefix)
                })
            }
        }
    }

    // On success and only on the replica on which this command originated,
    // resolve skipped intents asynchronously.
    if originNode == r.rm.RaftNodeID() {
        r.handleSkippedIntents(args, intents)
    }

    return nil
}
// addWriteCmd first consults the response cache to determine whether
// this command has already been sent to the range. If a response is
// found, it's returned immediately and not submitted to raft. Next,
// the timestamp cache is checked to determine if any newer accesses to
// this command's affected keys have been made. If so, this command's
// timestamp is moved forward. Finally the keys affected by this
// command are added as pending writes to the read queue and the
// command is submitted to Raft. Upon completion, the write is removed
// from the read queue and the reply is added to the response cache.
// If wait is true, will block until the command is complete.
func (r *Range) addWriteCmd(ctx context.Context, args proto.Request, reply proto.Response, wait bool) error {
    // Check the response cache in case this is a replay. This call
    // may block if the same command is already underway.
    header := args.Header()

    // Add the write to the command queue to gate subsequent overlapping
    // commands until this command completes. Note that this must be
    // done before getting the max timestamp for the key(s), as
    // timestamp cache is only updated after preceding commands have
    // been run to successful completion.
    cmdKey := r.beginCmd(header, false)

    // This replica must have leader lease to process a write.
    if err := r.redirectOnOrAcquireLeaderLease(header.Timestamp); err != nil {
        r.endCmd(cmdKey, args, err, false /* !readOnly */)
        reply.Header().SetGoError(err)
        return err
    }

    // Two important invariants of Cockroach: 1) encountering a more
    // recently written value means transaction restart. 2) values must
    // be written with a greater timestamp than the most recent read to
    // the same key. Check the timestamp cache for reads/writes which
    // are at least as recent as the timestamp of this write. For
    // writes, send WriteTooOldError; for reads, update the write's
    // timestamp. When the write returns, the updated timestamp will
    // inform the final commit timestamp.
    if usesTimestampCache(args) {
        r.Lock()
        rTS, wTS := r.tsCache.GetMax(header.Key, header.EndKey, header.Txn.GetID())
        r.Unlock()

        // Always push the timestamp forward if there's been a read which
        // occurred after our txn timestamp.
        if !rTS.Less(header.Timestamp) {
            header.Timestamp = rTS.Next()
        }
        // If there's a newer write timestamp...
        if !wTS.Less(header.Timestamp) {
            // If we're in a txn, set a write too old error in reply. We
            // still go ahead and try the write because we want to avoid
            // restarting the transaction in the event that there isn't an
            // intent or the intent can be pushed by us.
            if header.Txn != nil {
                err := &proto.WriteTooOldError{Timestamp: header.Timestamp, ExistingTimestamp: wTS}
                reply.Header().SetGoError(err)
            } else {
                // Otherwise, make sure we advance the request's timestamp.
                header.Timestamp = wTS.Next()
            }
        }
    }

    errChan, pendingCmd := r.proposeRaftCommand(ctx, args, reply)

    // Create a completion func for mandatory cleanups which we either
    // run synchronously if we're waiting or in a goroutine otherwise.
    completionFunc := func() error {
        // First wait for raft to commit or abort the command.
        var err error
        if err = <-errChan; err == nil {
            // Next if the command was committed, wait for the range to apply it.
            err = <-pendingCmd.done
        } else if err == multiraft.ErrGroupDeleted {
            // This error needs to be converted appropriately so that
            // clients will retry.
            err = proto.NewRangeNotFoundError(r.Desc().RaftID)
        }
        // As for reads, update the timestamp cache with the timestamp
        // of this write on success. This ensures a strictly higher
        // timestamp for successive writes to the same key or key range.
        r.endCmd(cmdKey, args, err, false /* !readOnly */)
        return err
    }

    if wait {
        return completionFunc()
    }
    go func() {
        // If the original client didn't wait (e.g. resolve write intent),
        // log execution errors so they're surfaced somewhere.
        if err := completionFunc(); err != nil {
            // TODO(tschottdorf): possible security risk to log args.
            log.Warningc(ctx, "async execution of %v failed: %s", args, err)
        }
    }()
    return nil
}
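The rTS/wTS handling above compresses into a small decision procedure. A distilled sketch with toy types (not the actual tsCache API): reads always push the write's timestamp past them, while a newer write either pushes a non-transactional write or flags WriteTooOld so a transactional write can still be attempted.

package tscheck

type Timestamp struct {
    Wall    int64
    Logical int32
}

func (t Timestamp) Less(o Timestamp) bool {
    return t.Wall < o.Wall || (t.Wall == o.Wall && t.Logical < o.Logical)
}

// Next returns the timestamp immediately following t.
func (t Timestamp) Next() Timestamp { return Timestamp{t.Wall, t.Logical + 1} }

// adjust applies the timestamp cache verdict to a proposed write
// timestamp; writeTooOld is only reported for transactional writes.
func adjust(proposed, rTS, wTS Timestamp, inTxn bool) (ts Timestamp, writeTooOld bool) {
    ts = proposed
    if !rTS.Less(ts) { // a read at or above our timestamp: write above it
        ts = rTS.Next()
    }
    if !wTS.Less(ts) { // a write at or above our timestamp
        if inTxn {
            writeTooOld = true // try anyway; the intent may be pushable
        } else {
            ts = wTS.Next()
        }
    }
    return ts, writeTooOld
}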
// verifyPermissions verifies that the requesting user (header.User)
// has permission to read/write (capabilities depend on method
// name). In the event that multiple permission configs apply to the
// key range implicated by the command, the lowest common denominator
// for permission is used. For example, if a scan crosses two
// permission configs, both configs must allow read permissions or the
// entire scan will fail.
func (ds *DistSender) verifyPermissions(args proto.Request) error {
    // The root user can always proceed.
    header := args.Header()
    if header.User == storage.UserRoot {
        return nil
    }
    // Check for admin methods.
    if proto.IsAdmin(args) {
        if header.User != storage.UserRoot {
            return util.Errorf("user %q cannot invoke admin command %s", header.User, args.Method())
        }
        return nil
    }
    // Get permissions map from gossip.
    configMap, err := ds.gossip.GetInfo(gossip.KeyConfigPermission)
    if err != nil {
        return util.Errorf("permissions not available via gossip")
    }
    if configMap == nil {
        return util.Errorf("perm configs not available; cannot execute %s", args.Method())
    }
    permMap := configMap.(storage.PrefixConfigMap)
    headerEnd := header.EndKey
    if len(headerEnd) == 0 {
        headerEnd = header.Key
    }
    // Visit PermConfig(s) which apply to the method's key range.
    //   - For each perm config which the range covers, verify read or writes
    //     are allowed as method requires.
    //   - Verify the permissions hierarchically; that is, if permissions aren't
    //     granted at the longest prefix, try next longest, then next, etc., up
    //     to and including the default prefix.
    //
    // TODO(spencer): it might make sense to visit prefixes from the
    // shortest to longest instead for performance. Keep an eye on profiling
    // for this code path as permission sets grow large.
    return permMap.VisitPrefixes(header.Key, headerEnd,
        func(start, end proto.Key, config interface{}) (bool, error) {
            hasPerm := false
            if err := permMap.VisitPrefixesHierarchically(start, func(start, end proto.Key, config interface{}) (bool, error) {
                perm := config.(*proto.PermConfig)
                if proto.IsRead(args) && !perm.CanRead(header.User) {
                    return false, nil
                }
                if proto.IsWrite(args) && !perm.CanWrite(header.User) {
                    return false, nil
                }
                // Return done = true, as permissions have been granted by this config.
                hasPerm = true
                return true, nil
            }); err != nil {
                return false, err
            }
            if !hasPerm {
                if len(header.EndKey) == 0 {
                    return false, util.Errorf("user %q cannot invoke %s at %q", header.User, args.Method(), start)
                }
                return false, util.Errorf("user %q cannot invoke %s at %q-%q", header.User, args.Method(), start, end)
            }
            return false, nil
        })
}
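The hierarchical visit above grants permission if any config, from the longest matching prefix up to the default, allows the operation. A minimal sketch of that walk under an assumed config shape (hypothetical; the real code visits a storage.PrefixConfigMap):

package perm

// Config is an assumed stand-in for proto.PermConfig.
type Config struct {
    Readers map[string]bool
}

// canRead walks from the longest matching key prefix toward the empty
// (default) prefix and grants as soon as any config allows the user,
// mirroring VisitPrefixesHierarchically above.
func canRead(configs map[string]Config, key, user string) bool {
    for p := key; ; p = p[:len(p)-1] {
        if cfg, ok := configs[p]; ok && cfg.Readers[user] {
            return true
        }
        if p == "" {
            return false
        }
    }
}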
// sendRPC sends one or more RPCs to replicas from the supplied proto.Replica
// slice. First, replicas which have gossiped addresses are corralled (and
// rearranged depending on proximity and whether the request needs to go to a
// leader) and then sent via rpc.Send, with the requirement that one RPC to a
// server must succeed. Returns an RPC error if the request could not be sent.
// Note that the reply may contain a higher level error and must be checked in
// addition to the RPC error.
func (ds *DistSender) sendRPC(raftID proto.RaftID, replicas replicaSlice, order rpc.OrderingPolicy, args proto.Request, reply proto.Response) error {
    if len(replicas) == 0 {
        return util.Errorf("%s: replicas set is empty", args.Method())
    }
    // Build a slice of replica addresses (if gossiped).
    var addrs []net.Addr
    replicaMap := map[string]*proto.Replica{}
    for i := range replicas {
        nd := &replicas[i].NodeDesc
        addr := util.MakeUnresolvedAddr(nd.Address.Network, nd.Address.Address)
        addrs = append(addrs, addr)
        replicaMap[addr.String()] = &replicas[i].Replica
    }
    if len(addrs) == 0 {
        return noNodeAddrsAvailError{}
    }

    // TODO(pmattis): This needs to be tested. If it isn't set we'll
    // still route the request appropriately by key, but won't receive
    // RangeNotFoundErrors.
    args.Header().RaftID = raftID

    // Set RPC opts with stipulation that one of N RPCs must succeed.
    rpcOpts := rpc.Options{
        N:               1,
        Ordering:        order,
        SendNextTimeout: defaultSendNextTimeout,
        Timeout:         defaultRPCTimeout,
    }
    // getArgs clones the arguments on demand for all but the first replica.
    firstArgs := true
    getArgs := func(addr net.Addr) interface{} {
        var a proto.Request
        // Use the supplied args proto if this is our first address.
        if firstArgs {
            firstArgs = false
            a = args
        } else {
            // Otherwise, copy the args value and set the replica in the header.
            a = gogoproto.Clone(args).(proto.Request)
        }
        a.Header().Replica = *replicaMap[addr.String()]
        return a
    }
    // RPCs are sent asynchronously and there is no synchronized access to
    // the reply object, so we don't pass the reply itself to rpcSend.
    // Otherwise there may be a race: if the RPC call times out using our
    // original reply object, we must not use it any more; the rpc call
    // might still return and write to it at any time.
    // args.CreateReply() should be cheaper than gogoproto.Clone, which
    // uses reflection.
    getReply := func() interface{} {
        return args.CreateReply()
    }
    replies, err := ds.rpcSend(rpcOpts, "Node."+args.Method().String(), addrs, getArgs, getReply, ds.gossip.RPCContext)
    if err == nil {
        // Set the content of replies[0] back to reply.
        dst := reflect.ValueOf(reply).Elem()
        dst.Set(reflect.ValueOf(replies[0]).Elem())
    }
    return err
}
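getArgs above clones the request for every replica after the first because the header is mutated per destination while RPCs may run concurrently. A standalone sketch of that rule with toy types (the real code deep-copies via gogoproto.Clone):

package fanout

import "sync"

// request and header are toys; only the mutable per-replica field matters.
type header struct{ Replica string }

type request struct{ Header header }

func (r *request) clone() *request { c := *r; return &c }

// send gives the first destination the original request and every other
// destination its own copy, so concurrent RPCs never share a mutable header.
func send(addrs []string, args *request, rpc func(string, *request)) {
    var wg sync.WaitGroup
    for i, addr := range addrs {
        a := args
        if i > 0 {
            a = args.clone()
        }
        a.Header.Replica = addr
        wg.Add(1)
        go func(addr string, a *request) {
            defer wg.Done()
            rpc(addr, a)
        }(addr, a)
    }
    wg.Wait()
}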
// resetClientCmdID sets the client command ID if the call is for a
// read-write method. The client command ID provides idempotency
// protection in conjunction with the server.
func resetClientCmdID(args proto.Request) {
    args.Header().CmdID = proto.ClientCmdID{
        WallTime: time.Now().UnixNano(),
        Random:   rand.Int63(),
    }
}
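What the client command ID buys is replay protection on the server side: a response cache keyed by (WallTime, Random) can return the stored reply instead of re-executing a write. A sketch with stand-in types for the server half of that contract:

package idempotency

import (
    "math/rand"
    "time"
)

// ClientCmdID mirrors the shape above; responseCache is a stand-in for
// the server-side cache it pairs with.
type ClientCmdID struct {
    WallTime int64
    Random   int64
}

func newClientCmdID() ClientCmdID {
    return ClientCmdID{WallTime: time.Now().UnixNano(), Random: rand.Int63()}
}

type responseCache struct {
    replies map[ClientCmdID]string
}

func newResponseCache() *responseCache {
    return &responseCache{replies: map[ClientCmdID]string{}}
}

// execute runs fn at most once per command ID and replays the stored
// reply for any retry carrying the same ID.
func (c *responseCache) execute(id ClientCmdID, fn func() string) string {
    if reply, ok := c.replies[id]; ok {
        return reply // replay: the write is not applied twice
    }
    reply := fn()
    c.replies[id] = reply
    return reply
}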