// applyRaftCommand applies a raft command from the replicated log to the
// underlying state machine (i.e. the engine).
// When certain critical operations fail, a replicaCorruptionError may be
// returned and must be handled by the caller.
func (r *Range) applyRaftCommand(ctx context.Context, index uint64, originNode proto.RaftNodeID, args proto.Request, reply proto.Response) error {
	if index <= 0 {
		log.Fatalc(ctx, "raft command index is <= 0")
	}

	// If we have an out of order index, there's corruption. No sense in trying
	// to update anything or run the command. Simply return a corruption error.
	if oldIndex := atomic.LoadUint64(&r.appliedIndex); oldIndex >= index {
		return newReplicaCorruptionError(util.Errorf("applied index moved backwards: %d >= %d", oldIndex, index))
	}

	// Call the helper, which returns a batch containing data written
	// during command execution and any associated error.
	ms := engine.MVCCStats{}
	batch, rErr := r.applyRaftCommandInBatch(ctx, index, originNode, args, reply, &ms)
	// ALWAYS set the reply header error to the error returned by the
	// helper. This is the definitive result of the execution. The
	// error must be set before saving to the response cache.
	// TODO(tschottdorf,tamird) For #1400, want to refactor executeCmd to not
	// touch the reply header's error field.
	reply.Header().SetGoError(rErr)
	defer batch.Close()

	// Advance the last applied index and commit the batch.
	if err := setAppliedIndex(batch, r.Desc().RaftID, index); err != nil {
		log.Fatalc(ctx, "setting applied index in a batch should never fail: %s", err)
	}
	if err := batch.Commit(); err != nil {
		rErr = newReplicaCorruptionError(util.Errorf("could not commit batch"), err, rErr)
	} else {
		// Update cached appliedIndex if we were able to set the applied index on disk.
		atomic.StoreUint64(&r.appliedIndex, index)
	}

	// On successful write commands, flush to event feed, and handle other
	// write-related triggers including splitting and config gossip updates.
	if rErr == nil && proto.IsWrite(args) {
		// Publish update to event feed.
		r.rm.EventFeed().updateRange(r, args.Method(), &ms)
		// If the commit succeeded, potentially add range to split queue.
		r.maybeAddToSplitQueue()
		// Maybe update gossip configs on a put.
		switch args.(type) {
		case *proto.PutRequest, *proto.DeleteRequest, *proto.DeleteRangeRequest:
			if key := args.Header().Key; key.Less(keys.SystemMax) {
				// We hold the lock already.
				r.maybeGossipConfigsLocked(func(configPrefix proto.Key) bool {
					return bytes.HasPrefix(key, configPrefix)
				})
			}
		}
	}
	return rErr
}
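// applyRaftCommand and the helpers below build corruption errors from a
// variadic list of causes, some of which may be nil, e.g.
// newReplicaCorruptionError(util.Errorf(...), err, rErr). A minimal sketch of
// a type satisfying those call sites; this is inferred from usage, not the
// package's actual definition:
type replicaCorruptionError struct {
	errorMsg string
}

// Error implements the error interface.
func (e *replicaCorruptionError) Error() string {
	return e.errorMsg
}

// newReplicaCorruptionError joins the non-nil causes, preserving the order
// in which they were passed.
func newReplicaCorruptionError(errs ...error) error {
	var msg string
	for _, err := range errs {
		if err == nil {
			continue
		}
		if msg != "" {
			msg += "; "
		}
		msg += err.Error()
	}
	return &replicaCorruptionError{errorMsg: msg}
}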
// applyRaftCommand applies a raft command from the replicated log to the
// underlying state machine (i.e. the engine).
// When certain critical operations fail, a replicaCorruptionError may be
// returned and must be handled by the caller.
func (r *Range) applyRaftCommand(ctx context.Context, index uint64, originNode proto.RaftNodeID, args proto.Request) (proto.Response, error) {
	if index <= 0 {
		log.Fatalc(ctx, "raft command index is <= 0")
	}

	// If we have an out of order index, there's corruption. No sense in trying
	// to update anything or run the command. Simply return a corruption error.
	if oldIndex := atomic.LoadUint64(&r.appliedIndex); oldIndex >= index {
		return nil, newReplicaCorruptionError(util.Errorf("applied index moved backwards: %d >= %d", oldIndex, index))
	}

	// Call the helper, which returns a batch containing data written
	// during command execution and any associated error.
	ms := engine.MVCCStats{}
	batch, reply, rErr := r.applyRaftCommandInBatch(ctx, index, originNode, args, &ms)
	defer batch.Close()

	// Advance the last applied index and commit the batch.
	if err := setAppliedIndex(batch, r.Desc().RaftID, index); err != nil {
		log.Fatalc(ctx, "setting applied index in a batch should never fail: %s", err)
	}
	if err := batch.Commit(); err != nil {
		rErr = newReplicaCorruptionError(util.Errorf("could not commit batch"), err, rErr)
	} else {
		// Update cached appliedIndex if we were able to set the applied index on disk.
		atomic.StoreUint64(&r.appliedIndex, index)
	}

	// On successful write commands, flush to event feed, and handle other
	// write-related triggers including splitting and config gossip updates.
	if rErr == nil && proto.IsWrite(args) {
		// Publish update to event feed.
		r.rm.EventFeed().updateRange(r, args.Method(), &ms)
		// If the commit succeeded, potentially add range to split queue.
		r.maybeAddToSplitQueue()
		// Maybe update gossip configs if the command is not part of a transaction.
		// If the command is part of an uncommitted transaction, we rely on the
		// periodic configGossipInterval loop since we will not see the update
		// until the transaction is committed.
		if key := args.Header().Key; key.Less(keys.SystemMax) && args.Header().Txn == nil {
			r.maybeGossipConfigs(func(configPrefix proto.Key) bool {
				return bytes.HasPrefix(key, configPrefix)
			})
		}
	}
	return reply, rErr
}
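// Each variant above persists the applied index via setAppliedIndex, either
// into the command's batch or directly into the engine. A sketch of one way
// it could be written; keys.RaftAppliedIndexKey, proto.ZeroTimestamp and
// encoding.EncodeUint64 are assumptions based on conventions elsewhere in
// the package, not confirmed by these snippets:
func setAppliedIndex(eng engine.Engine, raftID int64, appliedIndex uint64) error {
	// The applied index lives under a range-local key and is written at the
	// zero timestamp, i.e. as unversioned metadata rather than MVCC data.
	return engine.MVCCPut(eng, nil /* stats */, keys.RaftAppliedIndexKey(raftID),
		proto.ZeroTimestamp,
		proto.Value{Bytes: encoding.EncodeUint64(nil, appliedIndex)},
		nil /* txn */)
}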
// maybeGossipConfigsLocked gossips the config map for each config descriptor
// whose key prefix matches, skipping any whose contents are unchanged since
// the last gossiped version (as determined by a content hash). The caller
// must hold the range lock.
func (r *Range) maybeGossipConfigsLocked(match func(configPrefix proto.Key) bool) {
	if r.rm.Gossip() == nil || !r.isInitialized() {
		return
	}
	ctx := r.context()
	for i, cd := range configDescriptors {
		if match(cd.keyPrefix) {
			// Check for a bad range split. This should never happen as ranges
			// cannot be split mid-config.
			if !r.ContainsKey(cd.keyPrefix.PrefixEnd()) {
				// If we ever implement configs that span multiple ranges,
				// we must update store.startGossip accordingly. For the
				// time being, it will only fire the first range.
				log.Fatalc(ctx, "range splits configuration values for %s", cd.keyPrefix)
			}
			configMap, hash, err := loadConfigMap(r.rm.Engine(), cd.keyPrefix, cd.configI)
			if err != nil {
				log.Errorc(ctx, "failed loading %s config map: %s", cd.gossipKey, err)
				continue
			}
			if r.configHashes == nil {
				r.configHashes = map[int][]byte{}
			}
			if prevHash, ok := r.configHashes[i]; !ok || !bytes.Equal(prevHash, hash) {
				r.configHashes[i] = hash
				log.Infoc(ctx, "gossiping %s config from store %d, range %d",
					cd.gossipKey, r.rm.StoreID(), r.Desc().RaftID)
				if err := r.rm.Gossip().AddInfo(cd.gossipKey, configMap, 0*time.Second); err != nil {
					log.Errorc(ctx, "failed to gossip %s configMap: %s", cd.gossipKey, err)
					continue
				}
			}
		}
	}
}
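// The loop above drives off configDescriptors, pairing each config's key
// prefix with the gossip key it is published under and a prototype value for
// decoding. A sketch of the shape those field accesses imply; the exact type
// of configI is an assumption:
type configDescriptor struct {
	keyPrefix proto.Key         // start of the key space holding this config
	gossipKey string            // gossip key the loaded config map is published under
	configI   gogoproto.Message // prototype used by loadConfigMap to decode entries
}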
// processRaftCommand processes a raft command by unpacking the command
// struct to get args and reply and then applying the command to the
// state machine via applyRaftCommand(). The error result is sent on
// the command's done channel, if available.
func (r *Range) processRaftCommand(idKey cmdIDKey, index uint64, raftCmd proto.InternalRaftCommand) error {
	if index == 0 {
		log.Fatalc(r.context(), "processRaftCommand requires a non-zero index")
	}

	r.Lock()
	cmd := r.pendingCmds[idKey]
	delete(r.pendingCmds, idKey)
	r.Unlock()

	args := raftCmd.Cmd.GetValue().(proto.Request)
	var reply proto.Response
	var ctx context.Context
	if cmd != nil {
		// We initiated this command, so use the caller-supplied reply.
		reply = cmd.Reply
		ctx = cmd.ctx
	} else {
		// This command originated elsewhere so we must create a new reply buffer.
		reply = args.CreateReply()
		// TODO(tschottdorf): consider the Trace situation here.
		ctx = r.context()
	}

	execDone := tracer.FromCtx(ctx).Epoch(fmt.Sprintf("applying %s", args.Method()))
	// applyRaftCommand will return "expected" errors, but may also indicate
	// replica corruption (as of now, signaled by a replicaCorruptionError).
	// We feed its return through maybeSetCorrupt to act when that happens.
	err := r.maybeSetCorrupt(
		r.applyRaftCommand(ctx, index, proto.RaftNodeID(raftCmd.OriginNodeID), args, reply),
	)
	execDone()

	if cmd != nil {
		cmd.done <- err
	} else if err != nil && log.V(1) {
		log.Errorc(r.context(), "error executing raft command %s: %s", args.Method(), err)
	}
	return err
}
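// processRaftCommand looks up the proposer's pendingCmd by idKey, which must
// therefore be a comparable value derived deterministically from the client
// command ID on both the propose and apply paths. A sketch of such a key;
// the WallTime and Random fields of the command ID are assumptions here:
type cmdIDKey string

func makeCmdIDKey(cmdID proto.ClientCmdID) cmdIDKey {
	// Encode both components into a fixed-size buffer so equal command IDs
	// always map to the same key.
	buf := make([]byte, 0, 16)
	buf = encoding.EncodeUint64(buf, uint64(cmdID.WallTime))
	buf = encoding.EncodeUint64(buf, uint64(cmdID.Random))
	return cmdIDKey(buf)
}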
// proposeRaftCommand prepares the necessary pending command struct and
// initializes a client command ID if one hasn't been set. It then
// proposes the command to Raft and returns the error channel and
// pending command struct for receiving.
func (r *Range) proposeRaftCommand(ctx context.Context, args proto.Request) (<-chan error, *pendingCmd) {
	pendingCmd := &pendingCmd{
		ctx:  ctx,
		done: make(chan responseWithErr, 1),
	}
	raftCmd := proto.InternalRaftCommand{
		RaftID:       r.Desc().RaftID,
		OriginNodeID: r.rm.RaftNodeID(),
	}
	cmdID := args.Header().GetOrCreateCmdID(r.rm.Clock().PhysicalNow())
	ok := raftCmd.Cmd.SetValue(args)
	if !ok {
		log.Fatalc(ctx, "unknown command type %T", args)
	}
	idKey := makeCmdIDKey(cmdID)
	r.Lock()
	r.pendingCmds[idKey] = pendingCmd
	r.Unlock()
	errChan := r.rm.ProposeRaftCommand(idKey, raftCmd)
	return errChan, pendingCmd
}
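// A caller-side sketch of how the two return values fit together: the error
// channel reports whether the proposal was accepted by Raft, and the pending
// command's done channel later delivers the applied result. The name
// addWriteCmdSketch and the responseWithErr field names (reply, err) are
// hypothetical:
func (r *Range) addWriteCmdSketch(ctx context.Context, args proto.Request) (proto.Response, error) {
	errChan, pendingCmd := r.proposeRaftCommand(ctx, args)
	// A proposal error means the command never entered the Raft log.
	if err := <-errChan; err != nil {
		return nil, err
	}
	// done is buffered (size 1), so processRaftCommand never blocks on the
	// send even if this caller has gone away.
	result := <-pendingCmd.done
	return result.reply, result.err
}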
// applyRaftCommand applies a raft command from the replicated log to the
// underlying state machine (i.e. the engine).
// When certain critical operations fail, a replicaCorruptionError may be
// returned and must be handled by the caller.
func (r *Range) applyRaftCommand(ctx context.Context, index uint64, originNode proto.RaftNodeID, args proto.Request, reply proto.Response) (rErr error) {
	if index <= 0 {
		log.Fatalc(ctx, "raft command index is <= 0")
	}

	committed := false
	// The very last thing we do before returning is move the applied index
	// forward, unless that has already happened as part of a successfully
	// committed batch.
	defer func() {
		if !committed {
			// We didn't commit the batch, but advance the last applied index nonetheless.
			if err := setAppliedIndex(r.rm.Engine(), r.Desc().RaftID, index); err != nil {
				rErr = newReplicaCorruptionError(
					util.Errorf("could not advance applied index"), err, rErr)
				return
			}
			atomic.StoreUint64(&r.appliedIndex, index)
		}
	}()

	if lease := r.getLease(); args.Method() != proto.InternalLeaderLease &&
		(!lease.OwnedBy(originNode) || !lease.Covers(args.Header().Timestamp)) {
		// Verify the leader lease is held, unless this command is trying to
		// obtain it. Any other Raft command has had the leader lease held
		// by the replica at proposal time, but this may no longer be the case.
		// Corruption aside, the most likely reason is a leadership change (the
		// most recent leader assumes responsibility for all past timestamps as
		// well). In that case, it's not valid to go ahead with the execution:
		// Writes must be aware of the last time the mutated key was read, and
		// since reads are served locally by the lease holder without going
		// through Raft, a read which was not taken into account may have been
		// served. Hence, we must retry at the current leader.
		//
		// It's crucial that we don't update the response cache for the error
		// returned below since the request is going to be retried with the
		// same ClientCmdID and would get the distributed sender stuck in an
		// infinite loop, retrieving a stale NotLeaderError over and over
		// again, even when proposing at the correct replica.
		return r.newNotLeaderError(lease)
	}

	// Anything happening from now on needs to enter the response cache.
	defer func() {
		// TODO(tamird,tschottdorf): according to #1400 we intend to set the reply
		// header's error as late as possible and in a central location. Range
		// commands still write to the header directly, but once they don't this
		// could be the authoritative location that sets the reply error for any-
		// thing that makes it into Raft. Note that we must set this prior to
		// signaling cmd.done below, or the waiting RPC handler might proceed
		// before we've updated its reply.
		//
		// It is important that the error is set before the reply is saved into
		// the response cache.
		reply.Header().SetGoError(rErr)

		if proto.IsWrite(args) {
			// No matter the result, add result to the response cache if this
			// is a write method. This must be done as part of the execution of
			// raft commands so that every replica maintains the same responses
			// to continue request idempotence, even if leadership changes.
			if err := r.respCache.PutResponse(args.Header().CmdID, reply); err != nil {
				rErr = newReplicaCorruptionError(
					util.Errorf("could not put to response cache"), err, rErr)
				return
			}
		}
	}()

	header := args.Header()

	// Check the response cache to ensure idempotency.
	if proto.IsWrite(args) {
		if ok, err := r.respCache.GetResponse(header.CmdID, reply); ok && err == nil {
			if log.V(1) {
				log.Infoc(ctx, "found response cache entry for %+v", args.Header().CmdID)
			}
			return err
		} else if ok && err != nil {
			return newReplicaCorruptionError(
				util.Errorf("could not read from response cache"), err)
		}
	}

	// Create a new batch for the command to ensure all or nothing semantics.
	batch := r.rm.Engine().NewBatch()
	defer batch.Close()

	// Create an engine.MVCCStats instance.
	ms := engine.MVCCStats{}

	// Execute the command; the error will also be set in the reply header.
	// TODO(tschottdorf,tamird) For #1400, want to refactor executeCmd to not
	// touch the reply header's error field.
	intents, err := r.executeCmd(batch, &ms, args, reply)
	// If the execution of the command wasn't successful, stop here.
	if err != nil {
		return err
	}

	if oldIndex := atomic.LoadUint64(&r.appliedIndex); oldIndex >= index {
		return newReplicaCorruptionError(
			util.Errorf("applied index moved backwards: %d >= %d", oldIndex, index))
	}

	// Advance the applied index atomically within the batch.
	if err := setAppliedIndex(batch, r.Desc().RaftID, index); err != nil {
		return newReplicaCorruptionError(
			util.Errorf("could not update applied index"), err)
	}

	if proto.IsWrite(args) {
		// On success, flush the MVCC stats to the batch and commit.
		if err := r.stats.MergeMVCCStats(batch, &ms, header.Timestamp.WallTime); err != nil {
			return newReplicaCorruptionError(util.Errorf("could not merge MVCC stats"), err)
		}
		if err := batch.Commit(); err != nil {
			return newReplicaCorruptionError(util.Errorf("could not commit batch"), err)
		}
		committed = true
		// Publish update to event feed.
		r.rm.EventFeed().updateRange(r, args.Method(), &ms)
		// After successful commit, update cached stats and appliedIndex value.
		atomic.StoreUint64(&r.appliedIndex, index)
		// If the commit succeeded, potentially add range to split queue.
		r.maybeAddToSplitQueue()
		// Maybe update gossip configs on a put.
		switch args.(type) {
		case *proto.PutRequest, *proto.DeleteRequest, *proto.DeleteRangeRequest:
			if header.Key.Less(keys.SystemMax) {
				// We hold the lock already.
				r.maybeGossipConfigsLocked(func(configPrefix proto.Key) bool {
					return bytes.HasPrefix(header.Key, configPrefix)
				})
			}
		}
	}

	// On success and only on the replica on which this command originated,
	// resolve skipped intents asynchronously.
	if originNode == r.rm.RaftNodeID() {
		r.handleSkippedIntents(args, intents)
	}
	return nil
}
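// The calls above imply this shape for the response cache in this version:
// GetResponse copies any cached reply for the command ID into reply and
// reports whether one was found, and PutResponse stores the reply (error
// included) as the canonical answer for replays. A sketch of the implied
// interface, inferred from the call sites rather than quoted from the source:
type responseCache interface {
	// GetResponse returns true if a response was cached for cmdID; the
	// cached reply, including any error, is copied into reply.
	GetResponse(cmdID proto.ClientCmdID, reply proto.Response) (bool, error)
	// PutResponse persists reply for cmdID so that every replica answers a
	// replayed command identically, even across leadership changes.
	PutResponse(cmdID proto.ClientCmdID, reply proto.Response) error
}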
// applyRaftCommandInBatch executes the command in a batch engine and
// returns the batch containing the results. The caller is responsible
// for committing the batch, even on error.
func (r *Range) applyRaftCommandInBatch(ctx context.Context, index uint64, originNode proto.RaftNodeID, args proto.Request, ms *engine.MVCCStats) (engine.Engine, proto.Response, error) {
	// Create a new batch for the command to ensure all or nothing semantics.
	batch := r.rm.Engine().NewBatch()

	if lease := r.getLease(); args.Method() != proto.InternalLeaderLease &&
		(!lease.OwnedBy(originNode) || !lease.Covers(args.Header().Timestamp)) {
		// Verify the leader lease is held, unless this command is trying to
		// obtain it. Any other Raft command has had the leader lease held
		// by the replica at proposal time, but this may no longer be the case.
		// Corruption aside, the most likely reason is a leadership change (the
		// most recent leader assumes responsibility for all past timestamps as
		// well). In that case, it's not valid to go ahead with the execution:
		// Writes must be aware of the last time the mutated key was read, and
		// since reads are served locally by the lease holder without going
		// through Raft, a read which was not taken into account may have been
		// served. Hence, we must retry at the current leader.
		//
		// It's crucial that we don't update the response cache for the error
		// returned below since the request is going to be retried with the
		// same ClientCmdID and would get the distributed sender stuck in an
		// infinite loop, retrieving a stale NotLeaderError over and over
		// again, even when proposing at the correct replica.
		return batch, nil, r.newNotLeaderError(lease, originNode)
	}

	// Check the response cache to ensure idempotency.
	if proto.IsWrite(args) {
		if reply, err := r.respCache.GetResponse(batch, args.Header().CmdID); err != nil {
			// Any error encountered while fetching the response cache entry means corruption.
			return batch, reply, newReplicaCorruptionError(util.Errorf("could not read from response cache"), err)
		} else if reply != nil {
			if log.V(1) {
				log.Infoc(ctx, "found response cache entry for %+v", args.Header().CmdID)
			}
			// TODO(tamird): move this into the response cache itself
			defer func() { reply.Header().Error = nil }()
			// We successfully read from the response cache, so return whatever error
			// was present in the cached entry (if any).
			return batch, reply, reply.Header().GoError()
		}
	}

	// Execute the command.
	reply, intents, rErr := r.executeCmd(batch, ms, args)

	// Regardless of error, add result to the response cache if this is
	// a write method. This must be done as part of the execution of
	// raft commands so that every replica maintains the same responses
	// to continue request idempotence, even if leadership changes.
	if proto.IsWrite(args) {
		if rErr == nil {
			// If command was successful, flush the MVCC stats to the batch.
			if err := r.stats.MergeMVCCStats(batch, ms, args.Header().Timestamp.WallTime); err != nil {
				log.Fatalc(ctx, "setting mvcc stats in a batch should never fail: %s", err)
			}
		} else {
			// Otherwise, reset the batch to clear out partial execution and
			// prepare for the failed response cache entry.
			batch.Close()
			batch = r.rm.Engine().NewBatch()
		}
		// TODO(tamird): move this into the response cache itself
		if reply == nil {
			reply = args.CreateReply()
		}
		if reply.Header().Error != nil {
			panic("the world is on fire")
		}
		reply.Header().SetGoError(rErr)
		if err := r.respCache.PutResponse(batch, args.Header().CmdID, reply); err != nil {
			log.Fatalc(ctx, "putting a response cache entry in a batch should never fail: %s", err)
		}
		reply.Header().Error = nil
	}

	// If the execution of the command wasn't successful, stop here.
	if rErr != nil {
		return batch, reply, rErr
	}

	// On success and only on the replica on which this command originated,
	// resolve skipped intents asynchronously.
	if originNode == r.rm.RaftNodeID() {
		r.handleSkippedIntents(args, intents)
	}
	return batch, reply, nil
}
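// The reset-on-error step above leans on the batch's all-or-nothing contract:
// staged writes are invisible to the engine until Commit, and closing an
// uncommitted batch discards them. That is what lets the function throw away
// partial execution and hand the caller a fresh batch holding only the failed
// response cache entry (the caller then adds the applied index and commits,
// even on error). A standalone sketch of that contract; the Put signature and
// the proto.EncodedKey conversion are assumptions:
func batchIsAllOrNothingSketch(eng engine.Engine) error {
	batch := eng.NewBatch()
	defer batch.Close()
	// This write is staged in the batch only; eng cannot see it yet.
	if err := batch.Put(proto.EncodedKey("k"), []byte("staged")); err != nil {
		return err
	}
	// Commit makes the staged writes durable atomically; Close without
	// Commit, as on the error path above, discards them.
	return batch.Commit()
}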