Exemplo n.º 1
0
// applyRaftCommand applies a command taken from the replicated raft log to
// the underlying state machine (i.e. the engine) and records index as the
// last applied index.
//
// The command itself runs inside a batch produced by applyRaftCommandInBatch;
// this method adds the applied index to that batch and commits it. The error
// returned by the helper is written to the reply header before the commit.
// A failed commit is reported as a replicaCorruptionError, which must be
// handled by the caller.
func (r *Range) applyRaftCommand(ctx context.Context, index uint64, originNode proto.RaftNodeID, args proto.Request, reply proto.Response) error {
	// index is unsigned, so zero is the only value this check can catch.
	if index == 0 {
		log.Fatalc(ctx, "raft command index is <= 0")
	}

	// An index at or below the one already applied means corruption. There
	// is no point in running the command or updating anything.
	applied := atomic.LoadUint64(&r.appliedIndex)
	if applied >= index {
		return newReplicaCorruptionError(util.Errorf("applied index moved backwards: %d >= %d", applied, index))
	}

	// Execute the command. The helper hands back the batch holding whatever
	// was written during execution, along with the execution error.
	var stats engine.MVCCStats
	batch, execErr := r.applyRaftCommandInBatch(ctx, index, originNode, args, reply, &stats)
	// The helper's error is the definitive result of the execution and is
	// ALWAYS copied into the reply header. It must be set before the reply
	// is saved to the response cache.
	// TODO(tschottdorf,tamird) For #1400, want to refactor executeCmd to not
	// touch the reply header's error field.
	reply.Header().SetGoError(execErr)
	defer batch.Close()

	// Record the new applied index in the batch, then commit.
	if err := setAppliedIndex(batch, r.Desc().RaftID, index); err != nil {
		log.Fatalc(ctx, "setting applied index in a batch should never fail: %s", err)
	}
	commitErr := batch.Commit()
	if commitErr == nil {
		// The applied index reached disk, so the cached copy may follow.
		atomic.StoreUint64(&r.appliedIndex, index)
	} else {
		execErr = newReplicaCorruptionError(util.Errorf("could not commit batch"), commitErr, execErr)
	}

	// Everything below is post-commit work for successful writes only.
	if execErr != nil || !proto.IsWrite(args) {
		return execErr
	}

	// Publish update to event feed.
	r.rm.EventFeed().updateRange(r, args.Method(), &stats)
	// The range may have grown; potentially add it to the split queue.
	r.maybeAddToSplitQueue()
	// Puts and deletes below keys.SystemMax may have changed a config, in
	// which case it is re-gossiped.
	switch args.(type) {
	case *proto.PutRequest, *proto.DeleteRequest, *proto.DeleteRangeRequest:
		key := args.Header().Key
		if !key.Less(keys.SystemMax) {
			break
		}
		// We hold the lock already.
		r.maybeGossipConfigsLocked(func(prefix proto.Key) bool {
			return bytes.HasPrefix(key, prefix)
		})
	}

	return execErr
}
Exemplo n.º 2
0
// applyRaftCommand applies a command taken from the replicated raft log to
// the underlying state machine (i.e. the engine) and returns the reply
// produced by the command together with its error.
//
// The command itself runs inside a batch produced by applyRaftCommandInBatch;
// this method adds the applied index to that batch and commits it. A failed
// commit is reported as a replicaCorruptionError, which must be handled by
// the caller.
func (r *Range) applyRaftCommand(ctx context.Context, index uint64, originNode proto.RaftNodeID, args proto.Request) (proto.Response, error) {
	// index is unsigned, so zero is the only value this check can catch.
	if index == 0 {
		log.Fatalc(ctx, "raft command index is <= 0")
	}

	// An index at or below the one already applied means corruption. There
	// is no point in running the command or updating anything.
	applied := atomic.LoadUint64(&r.appliedIndex)
	if applied >= index {
		return nil, newReplicaCorruptionError(util.Errorf("applied index moved backwards: %d >= %d", applied, index))
	}

	// Execute the command. The helper hands back the batch holding whatever
	// was written during execution, the reply, and the execution error.
	var stats engine.MVCCStats
	batch, reply, execErr := r.applyRaftCommandInBatch(ctx, index, originNode, args, &stats)
	defer batch.Close()

	// Record the new applied index in the batch, then commit.
	if err := setAppliedIndex(batch, r.Desc().RaftID, index); err != nil {
		log.Fatalc(ctx, "setting applied index in a batch should never fail: %s", err)
	}
	commitErr := batch.Commit()
	if commitErr == nil {
		// The applied index reached disk, so the cached copy may follow.
		atomic.StoreUint64(&r.appliedIndex, index)
	} else {
		execErr = newReplicaCorruptionError(util.Errorf("could not commit batch"), commitErr, execErr)
	}

	// Everything below is post-commit work for successful writes only.
	if execErr != nil || !proto.IsWrite(args) {
		return reply, execErr
	}

	// Publish update to event feed.
	r.rm.EventFeed().updateRange(r, args.Method(), &stats)
	// The range may have grown; potentially add it to the split queue.
	r.maybeAddToSplitQueue()
	// Re-gossip configs only for commands outside of a transaction. A write
	// belonging to an uncommitted transaction is not visible until the
	// transaction commits, so for those we rely on the periodic
	// configGossipInterval loop instead.
	header := args.Header()
	key := header.Key
	if key.Less(keys.SystemMax) && header.Txn == nil {
		r.maybeGossipConfigs(func(prefix proto.Key) bool {
			return bytes.HasPrefix(key, prefix)
		})
	}

	return reply, execErr
}
Exemplo n.º 3
0
// maybeGossipConfigsLocked walks configDescriptors and, for every descriptor
// whose key prefix satisfies match, loads the config map from the engine and
// gossips it if its hash differs from the one remembered in r.configHashes
// (or if no hash has been remembered yet).
//
// It is a no-op when no gossip instance is available or the range is not
// initialized. Load and gossip failures are logged and do not stop the
// remaining descriptors from being processed. The hash is remembered before
// the gossip attempt, so a failed attempt is not retried until the config
// changes again.
//
// NOTE(review): the "Locked" suffix suggests the caller must already hold the
// range lock -- confirm against the call sites.
func (r *Range) maybeGossipConfigsLocked(match func(configPrefix proto.Key) bool) {
	if r.rm.Gossip() == nil || !r.isInitialized() {
		return
	}
	ctx := r.context()
	for idx, desc := range configDescriptors {
		if !match(desc.keyPrefix) {
			continue
		}
		// Check for a bad range split. This should never happen as ranges
		// cannot be split mid-config.
		if !r.ContainsKey(desc.keyPrefix.PrefixEnd()) {
			// If we ever implement configs that span multiple ranges,
			// we must update store.startGossip accordingly. For the
			// time being, it will only fire the first range.
			log.Fatalc(ctx, "range splits configuration values for %s", desc.keyPrefix)
		}
		configMap, hash, err := loadConfigMap(r.rm.Engine(), desc.keyPrefix, desc.configI)
		if err != nil {
			log.Errorc(ctx, "failed loading %s config map: %s", desc.gossipKey, err)
			continue
		}
		// The hash table is created lazily on first use.
		if r.configHashes == nil {
			r.configHashes = map[int][]byte{}
		}
		// Nothing to do when the config is unchanged since the last gossip.
		if lastHash, seen := r.configHashes[idx]; seen && bytes.Equal(lastHash, hash) {
			continue
		}
		r.configHashes[idx] = hash
		log.Infoc(ctx, "gossiping %s config from store %d, range %d", desc.gossipKey, r.rm.StoreID(), r.Desc().RaftID)
		if err := r.rm.Gossip().AddInfo(desc.gossipKey, configMap, 0*time.Second); err != nil {
			log.Errorc(ctx, "failed to gossip %s configMap: %s", desc.gossipKey, err)
		}
	}
}
Exemplo n.º 4
0
// processRaftCommand handles a single committed raft command: it unpacks the
// request from raftCmd, picks the reply buffer and context to use, and applies
// the request to the state machine via applyRaftCommand(). The resulting
// error is sent on the pending command's done channel when this replica has a
// pending command registered under idKey, and is returned in either case.
func (r *Range) processRaftCommand(idKey cmdIDKey, index uint64, raftCmd proto.InternalRaftCommand) error {
	if index == 0 {
		log.Fatalc(r.context(), "processRaftCommand requires a non-zero index")
	}

	// Take the pending command (if any) out of the map; a nil result means
	// nothing was registered under idKey on this replica.
	r.Lock()
	pending := r.pendingCmds[idKey]
	delete(r.pendingCmds, idKey)
	r.Unlock()

	args := raftCmd.Cmd.GetValue().(proto.Request)
	var (
		reply proto.Response
		ctx   context.Context
	)
	if pending == nil {
		// This command originated elsewhere so we must create a new reply buffer.
		reply = args.CreateReply()
		// TODO(tschottdorf): consider the Trace situation here.
		ctx = r.context()
	} else {
		// We initiated this command, so use the caller-supplied reply.
		reply = pending.Reply
		ctx = pending.ctx
	}

	finishEpoch := tracer.FromCtx(ctx).Epoch(fmt.Sprintf("applying %s", args.Method()))
	// applyRaftCommand will return "expected" errors, but may also indicate
	// replica corruption (as of now, signaled by a replicaCorruptionError).
	// We feed its return through maybeSetCorrupt to act when that happens.
	applyErr := r.applyRaftCommand(ctx, index, proto.RaftNodeID(raftCmd.OriginNodeID), args, reply)
	err := r.maybeSetCorrupt(applyErr)
	finishEpoch()

	if pending != nil {
		// Hand the outcome to whoever is waiting on the pending command.
		pending.done <- err
		return err
	}
	// Nobody is waiting locally; only log the failure at higher verbosity.
	if err != nil && log.V(1) {
		log.Errorc(r.context(), "error executing raft command %s: %s", args.Method(), err)
	}

	return err
}
Exemplo n.º 5
0
// proposeRaftCommand wraps args in an InternalRaftCommand, making sure the
// request carries a client command ID (one is created if it has none yet),
// registers a pending command under the key derived from that ID, and proposes
// the command to Raft. It returns the error channel handed back by the
// proposal together with the registered pending command.
func (r *Range) proposeRaftCommand(ctx context.Context, args proto.Request) (<-chan error, *pendingCmd) {
	raftCmd := proto.InternalRaftCommand{
		RaftID:       r.Desc().RaftID,
		OriginNodeID: r.rm.RaftNodeID(),
	}
	// The command ID is assigned on the request header before the request is
	// stored in the raft command.
	cmdID := args.Header().GetOrCreateCmdID(r.rm.Clock().PhysicalNow())
	if !raftCmd.Cmd.SetValue(args) {
		log.Fatalc(ctx, "unknown command type %T", args)
	}
	idKey := makeCmdIDKey(cmdID)

	// Register the pending command before proposing, so that it can be found
	// under idKey once the proposal has been handed to Raft.
	pending := &pendingCmd{
		ctx:  ctx,
		done: make(chan responseWithErr, 1),
	}
	r.Lock()
	r.pendingCmds[idKey] = pending
	r.Unlock()

	return r.rm.ProposeRaftCommand(idKey, raftCmd), pending
}
Exemplo n.º 6
0
// applyRaftCommand applies a raft command from the replicated log to the
// underlying state machine (i.e. the engine).
// When certain critical operations fail, a replicaCorruptionError may be
// returned and must be handled by the caller.
//
// The result is a named return value (rErr) because the deferred functions
// below may replace it after the main body has returned. Deferred functions
// run in reverse order of registration: the batch is closed first (if one was
// created), then the reply error is set and the response cache is updated (if
// the lease check passed), and finally the applied index is advanced unless a
// committed batch already did so.
func (r *Range) applyRaftCommand(ctx context.Context, index uint64, originNode proto.RaftNodeID, args proto.Request, reply proto.Response) (rErr error) {
	// index is unsigned, so this check can only ever fire for zero.
	if index <= 0 {
		log.Fatalc(ctx, "raft command index is <= 0")
	}

	// committed records whether the batch (which carries the applied index)
	// was successfully committed; it is only ever set for write commands.
	committed := false
	// The very last thing we do before returning is move the applied index
	// forward, unless that has already happened as part of a successfully
	// committed batch.
	defer func() {
		if !committed {
			// We didn't commit the batch, but advance the last applied index nonetheless.
			// This writes directly to the engine rather than through a batch.
			if err := setAppliedIndex(r.rm.Engine(), r.Desc().RaftID, index); err != nil {
				rErr = newReplicaCorruptionError(
					util.Errorf("could not advance applied index"), err, rErr)
				return
			}
			atomic.StoreUint64(&r.appliedIndex, index)
		}
	}()

	if lease := r.getLease(); args.Method() != proto.InternalLeaderLease &&
		(!lease.OwnedBy(originNode) || !lease.Covers(args.Header().Timestamp)) {
		// Verify the leader lease is held, unless this command is trying to
		// obtain it. Any other Raft command has had the leader lease held
		// by the replica at proposal time, but this may no longer be the case.
		// Corruption aside, the most likely reason is a leadership change (the
		// most recent leader assumes responsibility for all past timestamps as
		// well). In that case, it's not valid to go ahead with the execution:
		// Writes must be aware of the last time the mutated key was read, and
		// since reads are served locally by the lease holder without going
		// through Raft, a read which was not taken into account may have been
		// served. Hence, we must retry at the current leader.
		//
		// It's crucial that we don't update the response cache for the error
		// returned below since the request is going to be retried with the
		// same ClientCmdID and would get the distributed sender stuck in an
		// infinite loop, retrieving a stale NotLeaderError over and over
		// again, even when proposing at the correct replica.
		//
		// Returning here, before the response cache defer is registered, is
		// what keeps this error out of the response cache.
		return r.newNotLeaderError(lease)
	}

	// Anything happening from now on needs to enter the response cache.
	defer func() {
		// TODO(tamird,tschottdorf): according to #1400 we intend to set the reply
		// header's error as late as possible and in a central location. Range
		// commands still write to the header directly, but once they don't this
		// could be the authoritative location that sets the reply error for any-
		// thing that makes it into Raft. Note that we must set this prior to
		// signaling cmd.done below, or the waiting RPC handler might proceed
		// before we've updated its reply.
		//
		// It is important that the error is set before the reply is saved into
		// the response cache.
		reply.Header().SetGoError(rErr)

		if proto.IsWrite(args) {
			// No matter the result, add result to the response cache if this
			// is a write method. This must be done as part of the execution of
			// raft commands so that every replica maintains the same responses
			// to continue request idempotence, even if leadership changes.
			//
			// A failure here replaces the returned error, but the reply header
			// keeps the error that was set above.
			if err := r.respCache.PutResponse(args.Header().CmdID, reply); err != nil {
				rErr = newReplicaCorruptionError(
					util.Errorf("could not put to response cache"), err, rErr)
				return
			}
		}
	}()

	header := args.Header()

	// Check the response cache to ensure idempotency.
	if proto.IsWrite(args) {
		if ok, err := r.respCache.GetResponse(header.CmdID, reply); ok && err == nil {
			if log.V(1) {
				log.Infoc(ctx, "found response cache entry for %+v", args.Header().CmdID)
			}
			// err is necessarily nil in this branch, so a cache hit returns nil.
			return err
		} else if ok && err != nil {
			return newReplicaCorruptionError(
				util.Errorf("could not read from response cache"), err)
		}
		// NOTE(review): when ok is false the error from GetResponse is not
		// inspected and execution continues -- confirm this is intended.
	}

	// Create a new batch for the command to ensure all or nothing semantics.
	batch := r.rm.Engine().NewBatch()
	defer batch.Close()

	// Create an engine.MVCCStats instance.
	ms := engine.MVCCStats{}

	// Execute the command; the error will also be set in the reply header.
	// TODO(tschottdorf,tamird) For #1400, want to refactor executeCmd to not
	// touch the reply header's error field.
	intents, err := r.executeCmd(batch, &ms, args, reply)
	// If the execution of the command wasn't successful, stop here.
	if err != nil {
		return err
	}

	// An index at or below the one already applied indicates corruption.
	if oldIndex := atomic.LoadUint64(&r.appliedIndex); oldIndex >= index {
		return newReplicaCorruptionError(
			util.Errorf("applied index moved backwards: %d >= %d", oldIndex, index))
	}

	// Advance the applied index atomically within the batch.
	if err := setAppliedIndex(batch, r.Desc().RaftID, index); err != nil {
		return newReplicaCorruptionError(
			util.Errorf("could not update applied index"), err)
	}

	// Only write commands commit the batch. For any other command the batch is
	// closed without a commit and the deferred function at the top advances
	// the applied index instead.
	if proto.IsWrite(args) {
		// On success, flush the MVCC stats to the batch and commit.
		if err := r.stats.MergeMVCCStats(batch, &ms, header.Timestamp.WallTime); err != nil {
			return newReplicaCorruptionError(util.Errorf("could not merge MVCC stats"), err)
		}
		if err := batch.Commit(); err != nil {
			return newReplicaCorruptionError(util.Errorf("could not commit batch"), err)
		}
		committed = true
		// Publish update to event feed.
		r.rm.EventFeed().updateRange(r, args.Method(), &ms)
		// After successful commit, update cached stats and appliedIndex value.
		atomic.StoreUint64(&r.appliedIndex, index)
		// If the commit succeeded, potentially add range to split queue.
		r.maybeAddToSplitQueue()
		// Maybe update gossip configs on a put.
		switch args.(type) {
		case *proto.PutRequest, *proto.DeleteRequest, *proto.DeleteRangeRequest:
			if header.Key.Less(keys.SystemMax) {
				// We hold the lock already.
				r.maybeGossipConfigsLocked(func(configPrefix proto.Key) bool {
					return bytes.HasPrefix(header.Key, configPrefix)
				})
			}
		}
	}
	// On success and only on the replica on which this command originated,
	// resolve skipped intents asynchronously.
	if originNode == r.rm.RaftNodeID() {
		r.handleSkippedIntents(args, intents)
	}

	return nil
}
Exemplo n.º 7
0
// applyRaftCommandInBatch executes the command against a fresh engine batch
// and returns that batch together with the command's reply and error. The
// batch is returned on every path and the caller is responsible for
// committing it, even on error.
//
// For write commands the response cache is consulted first (a hit is returned
// without executing anything) and, after execution, the outcome is recorded
// in the response cache through the batch. When a write fails, the partially
// written batch is discarded and replaced by a new one that only carries the
// response cache entry.
func (r *Range) applyRaftCommandInBatch(ctx context.Context, index uint64, originNode proto.RaftNodeID,
	args proto.Request, ms *engine.MVCCStats) (engine.Engine, proto.Response, error) {
	// Create a new batch for the command to ensure all or nothing semantics.
	batch := r.rm.Engine().NewBatch()

	lease := r.getLease()
	requestsLease := args.Method() == proto.InternalLeaderLease
	if !requestsLease && !(lease.OwnedBy(originNode) && lease.Covers(args.Header().Timestamp)) {
		// Verify the leader lease is held, unless this command is trying to
		// obtain it. Any other Raft command has had the leader lease held
		// by the replica at proposal time, but this may no longer be the case.
		// Corruption aside, the most likely reason is a leadership change (the
		// most recent leader assumes responsibility for all past timestamps as
		// well). In that case, it's not valid to go ahead with the execution:
		// Writes must be aware of the last time the mutated key was read, and
		// since reads are served locally by the lease holder without going
		// through Raft, a read which was not taken into account may have been
		// served. Hence, we must retry at the current leader.
		//
		// It's crucial that we don't update the response cache for the error
		// returned below since the request is going to be retried with the
		// same ClientCmdID and would get the distributed sender stuck in an
		// infinite loop, retrieving a stale NotLeaderError over and over
		// again, even when proposing at the correct replica.
		return batch, nil, r.newNotLeaderError(lease, originNode)
	}

	// Check the response cache to ensure idempotency.
	if proto.IsWrite(args) {
		cached, err := r.respCache.GetResponse(batch, args.Header().CmdID)
		if err != nil {
			// Any error encountered while fetching the response cache entry means corruption.
			return batch, cached, newReplicaCorruptionError(util.Errorf("could not read from response cache"), err)
		}
		if cached != nil {
			if log.V(1) {
				log.Infoc(ctx, "found response cache entry for %+v", args.Header().CmdID)
			}
			// We successfully read from the response cache, so return whatever
			// error was present in the cached entry (if any). The error is
			// read out first and then cleared from the reply header.
			// TODO(tamird): move this into the response cache itself
			cachedErr := cached.Header().GoError()
			cached.Header().Error = nil
			return batch, cached, cachedErr
		}
	}

	// Execute the command.
	reply, intents, rErr := r.executeCmd(batch, ms, args)
	// Regardless of error, add result to the response cache if this is
	// a write method. This must be done as part of the execution of
	// raft commands so that every replica maintains the same responses
	// to continue request idempotence, even if leadership changes.
	if proto.IsWrite(args) {
		if rErr != nil {
			// The command failed: reset the batch to clear out partial
			// execution and prepare for the failed response cache entry.
			batch.Close()
			batch = r.rm.Engine().NewBatch()
		} else if err := r.stats.MergeMVCCStats(batch, ms, args.Header().Timestamp.WallTime); err != nil {
			// The command succeeded, so its MVCC stats are flushed to the batch.
			log.Fatalc(ctx, "setting mvcc stats in a batch should never fail: %s", err)
		}
		// TODO(tamird): move this into the response cache itself
		if reply == nil {
			reply = args.CreateReply()
		}
		// The header error must be unset at this point; it is only set
		// temporarily while the reply is written to the response cache.
		if reply.Header().Error != nil {
			panic("the world is on fire")
		}
		reply.Header().SetGoError(rErr)
		if err := r.respCache.PutResponse(batch, args.Header().CmdID, reply); err != nil {
			log.Fatalc(ctx, "putting a response cache entry in a batch should never fail: %s", err)
		}
		reply.Header().Error = nil
	}

	// If the execution of the command wasn't successful, stop here.
	if rErr != nil {
		return batch, reply, rErr
	}

	// On success and only on the replica on which this command originated,
	// resolve skipped intents asynchronously.
	if r.rm.RaftNodeID() == originNode {
		r.handleSkippedIntents(args, intents)
	}

	return batch, reply, nil
}