Пример #1
0
// usesTimestampCache reports whether the request's method is flagged in
// tsCacheMethods, i.e. whether the request affects or is affected by the
// timestamp cache. A method that falls outside the bounds of tsCacheMethods
// is treated as not using the cache.
func usesTimestampCache(r proto.Request) bool {
	method := r.Method()
	known := method >= 0 && method < proto.Method(len(tsCacheMethods))
	return known && tsCacheMethods[method]
}
Пример #2
0
// applyRaftCommand executes a command taken from the replicated raft log
// against the underlying state machine (i.e. the engine) and commits the
// resulting batch together with the new applied index.
// A replicaCorruptionError may be returned when a critical operation fails;
// the caller is responsible for handling it.
func (r *Range) applyRaftCommand(ctx context.Context, index uint64, originNode proto.RaftNodeID, args proto.Request, reply proto.Response) error {
	// index is unsigned, so zero is the only value that can be "<= 0".
	if index == 0 {
		log.Fatalc(ctx, "raft command index is <= 0")
	}

	// An index that does not move past the cached applied index indicates
	// corruption. Nothing is executed or updated in that case; a corruption
	// error is returned right away.
	if applied := atomic.LoadUint64(&r.appliedIndex); applied >= index {
		return newReplicaCorruptionError(util.Errorf("applied index moved backwards: %d >= %d", applied, index))
	}

	// The helper runs the command and hands back the batch holding whatever
	// was written during execution, along with the execution error (if any).
	var stats engine.MVCCStats
	batch, rErr := r.applyRaftCommandInBatch(ctx, index, originNode, args, reply, &stats)
	// The reply header error is ALWAYS overwritten with the helper's error,
	// which is the definitive outcome of the execution. It has to be in place
	// before the reply is saved to the response cache.
	// TODO(tschottdorf,tamird) For #1400, want to refactor executeCmd to not
	// touch the reply header's error field.
	reply.Header().SetGoError(rErr)
	defer batch.Close()

	// Record the new applied index in the batch, then commit it.
	if err := setAppliedIndex(batch, r.Desc().RaftID, index); err != nil {
		log.Fatalc(ctx, "setting applied index in a batch should never fail: %s", err)
	}
	commitErr := batch.Commit()
	if commitErr == nil {
		// The applied index reached disk, so the cached copy may follow.
		atomic.StoreUint64(&r.appliedIndex, index)
	} else {
		rErr = newReplicaCorruptionError(util.Errorf("could not commit batch"), commitErr, rErr)
	}

	// The remaining triggers only apply to write commands that succeeded.
	if rErr != nil || !proto.IsWrite(args) {
		return rErr
	}

	// Publish update to event feed.
	r.rm.EventFeed().updateRange(r, args.Method(), &stats)
	// The commit succeeded, so the range may now qualify for a split.
	r.maybeAddToSplitQueue()
	// Puts and deletes below keys.SystemMax may have changed a config that
	// needs to be gossiped.
	switch args.(type) {
	case *proto.PutRequest, *proto.DeleteRequest, *proto.DeleteRangeRequest:
		key := args.Header().Key
		if key.Less(keys.SystemMax) {
			// We hold the lock already.
			r.maybeGossipConfigsLocked(func(prefix proto.Key) bool {
				return bytes.HasPrefix(key, prefix)
			})
		}
	}

	return rErr
}
Пример #3
0
// CallComplete is invoked by a node once it has finished a request. It
// publishes a CallErrorEvent when the reply carries an error whose
// transaction restart behavior is ABORT, and a CallSuccessEvent in every
// other case (including replies with non-aborting errors).
func (nef NodeEventFeed) CallComplete(args proto.Request, reply proto.Response) {
	replyErr := reply.Header().Error
	aborted := replyErr != nil &&
		replyErr.CanRestartTransaction() == proto.TransactionRestart_ABORT
	if !aborted {
		nef.f.Publish(&CallSuccessEvent{
			NodeID: nef.id,
			Method: args.Method(),
		})
		return
	}
	nef.f.Publish(&CallErrorEvent{
		NodeID: nef.id,
		Method: args.Method(),
	})
}
Пример #4
0
// applyRaftCommand executes a command taken from the replicated raft log
// against the underlying state machine (i.e. the engine), commits the
// resulting batch together with the new applied index, and returns the
// command's reply.
// A replicaCorruptionError may be returned when a critical operation fails;
// the caller is responsible for handling it.
func (r *Range) applyRaftCommand(ctx context.Context, index uint64, originNode proto.RaftNodeID, args proto.Request) (proto.Response, error) {
	// index is unsigned, so zero is the only value that can be "<= 0".
	if index == 0 {
		log.Fatalc(ctx, "raft command index is <= 0")
	}

	// An index that does not move past the cached applied index indicates
	// corruption. Nothing is executed or updated in that case; a corruption
	// error is returned right away.
	if applied := atomic.LoadUint64(&r.appliedIndex); applied >= index {
		return nil, newReplicaCorruptionError(util.Errorf("applied index moved backwards: %d >= %d", applied, index))
	}

	// The helper runs the command and hands back the batch holding whatever
	// was written during execution, the reply, and the execution error.
	var stats engine.MVCCStats
	batch, reply, rErr := r.applyRaftCommandInBatch(ctx, index, originNode, args, &stats)
	defer batch.Close()

	// Record the new applied index in the batch, then commit it.
	if err := setAppliedIndex(batch, r.Desc().RaftID, index); err != nil {
		log.Fatalc(ctx, "setting applied index in a batch should never fail: %s", err)
	}
	commitErr := batch.Commit()
	if commitErr == nil {
		// The applied index reached disk, so the cached copy may follow.
		atomic.StoreUint64(&r.appliedIndex, index)
	} else {
		rErr = newReplicaCorruptionError(util.Errorf("could not commit batch"), commitErr, rErr)
	}

	// The remaining triggers only apply to write commands that succeeded.
	if rErr != nil || !proto.IsWrite(args) {
		return reply, rErr
	}

	// Publish update to event feed.
	r.rm.EventFeed().updateRange(r, args.Method(), &stats)
	// The commit succeeded, so the range may now qualify for a split.
	r.maybeAddToSplitQueue()
	// Configs are only gossiped here for non-transactional commands. A write
	// that belongs to an uncommitted transaction is not visible until the
	// transaction commits, so that case is left to the periodic
	// configGossipInterval loop.
	header := args.Header()
	key := header.Key
	if key.Less(keys.SystemMax) && header.Txn == nil {
		r.maybeGossipConfigs(func(prefix proto.Key) bool {
			return bytes.HasPrefix(key, prefix)
		})
	}

	return reply, rErr
}
Пример #5
0
// CallComplete is invoked by a node once it has finished a request. It
// publishes a CallErrorEvent when the reply carries an error whose
// TransactionRestart field is ABORT, and a CallSuccessEvent otherwise.
// For a non-empty BatchRequest the event is attributed to the method of the
// first request contained in the batch.
// TODO(tschottdorf): move to batch, account for multiple methods per batch.
// In particular, on error want an error position to identify the failed
// request.
func (nef NodeEventFeed) CallComplete(args proto.Request, reply proto.Response) {
	reported := args.Method()
	if batchReq, isBatch := args.(*proto.BatchRequest); isBatch && len(batchReq.Requests) > 0 {
		reported = batchReq.Requests[0].GetInner().Method()
	}

	replyErr := reply.Header().Error
	if replyErr != nil && replyErr.TransactionRestart == proto.TransactionRestart_ABORT {
		nef.f.Publish(&CallErrorEvent{
			NodeID: nef.id,
			Method: reported,
		})
		return
	}
	nef.f.Publish(&CallSuccessEvent{
		NodeID: nef.id,
		Method: reported,
	})
}
Пример #6
0
// sendRPC sends one or more RPCs to the replicas in the supplied slice. An
// address is built for every replica and the request is handed to
// ds.rpcSend with the requirement that one RPC to a server must succeed.
// On success the first reply is copied into the caller's reply object.
// Returns an RPC error if the request could not be sent. Note that the reply
// may contain a higher level error and must be checked in addition to the
// RPC error.
func (ds *DistSender) sendRPC(raftID proto.RaftID, replicas replicaSlice, order rpc.OrderingPolicy,
	args proto.Request, reply proto.Response) error {
	if len(replicas) == 0 {
		return util.Errorf("%s: replicas set is empty", args.Method())
	}

	// Collect one address per replica and remember which replica each
	// address belongs to.
	addrs := make([]net.Addr, 0, len(replicas))
	replicaByAddr := make(map[string]*proto.Replica, len(replicas))
	for i := range replicas {
		nodeAddr := replicas[i].NodeDesc.Address
		addr := util.MakeUnresolvedAddr(nodeAddr.Network, nodeAddr.Address)
		addrs = append(addrs, addr)
		replicaByAddr[addr.String()] = &replicas[i].Replica
	}
	if len(addrs) == 0 {
		return noNodeAddrsAvailError{}
	}

	// TODO(pmattis): This needs to be tested. If it isn't set we'll
	// still route the request appropriately by key, but won't receive
	// RangeNotFoundErrors.
	args.Header().RaftID = raftID

	// One of the N RPCs has to succeed.
	rpcOpts := rpc.Options{
		N:               1,
		Ordering:        order,
		SendNextTimeout: defaultSendNextTimeout,
		Timeout:         defaultRPCTimeout,
	}

	// getArgs hands out the caller's args for the first address and a clone
	// for every later one; in both cases the header is pointed at the
	// replica that owns the address.
	handedOutOriginal := false
	getArgs := func(addr net.Addr) interface{} {
		req := args
		if handedOutOriginal {
			req = gogoproto.Clone(args).(proto.Request)
		}
		handedOutOriginal = true
		req.Header().Replica = *replicaByAddr[addr.String()]
		return req
	}

	// RPCs are sent asynchronously and nothing synchronizes access to the
	// reply object, so the caller's reply is never passed to rpcSend: if an
	// RPC timed out while holding it, the call could still return later and
	// write to it at any time. Each RPC gets a fresh reply instead;
	// args.CreateReply() should be cheaper than gogoproto.Clone, which uses
	// reflection.
	getReply := func() interface{} {
		return args.CreateReply()
	}

	replies, err := ds.rpcSend(rpcOpts, "Node."+args.Method().String(),
		addrs, getArgs, getReply, ds.gossip.RPCContext)
	if err == nil {
		// Copy the content of the first reply into the caller's reply.
		target := reflect.ValueOf(reply).Elem()
		source := reflect.ValueOf(replies[0]).Elem()
		target.Set(source)
	}

	return err
}
Пример #7
0
// verifyPermissions verifies that the requesting user (header.User)
// has permission to read/write (capabilities depend on method
// name). In the event that multiple permission configs apply to the
// key range implicated by the command, the lowest common denominator
// of those permissions is used. For example, if a scan crosses two
// permission configs, both configs must allow read permissions or the
// entire scan will fail.
//
// It returns nil when the request is allowed and an error when:
//   - a non-root user invokes an admin command,
//   - the permission configs cannot be fetched from gossip, are missing, or
//     do not have the expected type,
//   - some part of the request's key range grants no suitable permission.
func (ds *DistSender) verifyPermissions(args proto.Request) error {
	header := args.Header()
	// The root user can always proceed.
	if header.User == storage.UserRoot {
		return nil
	}
	// Admin methods are reserved for the root user. Root has already
	// returned above, so every user that reaches this point is rejected.
	if proto.IsAdmin(args) {
		return util.Errorf("user %q cannot invoke admin command %s", header.User, args.Method())
	}
	// Get permissions map from gossip.
	configMap, err := ds.gossip.GetInfo(gossip.KeyConfigPermission)
	if err != nil {
		return util.Errorf("permissions not available via gossip")
	}
	if configMap == nil {
		return util.Errorf("perm configs not available; cannot execute %s", args.Method())
	}
	// Use the two-value assertion so that unexpected gossip content is
	// reported as an error instead of panicking.
	permMap, ok := configMap.(storage.PrefixConfigMap)
	if !ok {
		return util.Errorf("perm configs have unexpected type %T; cannot execute %s", configMap, args.Method())
	}
	// A request without an end key addresses the single key header.Key.
	headerEnd := header.EndKey
	if len(headerEnd) == 0 {
		headerEnd = header.Key
	}
	// Visit PermConfig(s) which apply to the method's key range.
	//   - For each perm config which the range covers, verify read or writes
	//     are allowed as method requires.
	//   - Verify the permissions hierarchically; that is, if permissions aren't
	//     granted at the longest prefix, try next longest, then next, etc., up
	//     to and including the default prefix.
	//
	// TODO(spencer): it might make sense to visit prefixes from the
	//   shortest to longest instead for performance. Keep an eye on profiling
	//   for this code path as permission sets grow large.
	return permMap.VisitPrefixes(header.Key, headerEnd,
		func(start, end proto.Key, _ interface{}) (bool, error) {
			hasPerm := false
			if err := permMap.VisitPrefixesHierarchically(start, func(_, _ proto.Key, config interface{}) (bool, error) {
				perm, ok := config.(*proto.PermConfig)
				if !ok {
					return false, util.Errorf("unexpected permission config type %T", config)
				}
				// Not granted at this level: return done = false so the
				// next config in the hierarchy is consulted.
				if proto.IsRead(args) && !perm.CanRead(header.User) {
					return false, nil
				}
				if proto.IsWrite(args) && !perm.CanWrite(header.User) {
					return false, nil
				}
				// Return done = true, as permissions have been granted by this config.
				hasPerm = true
				return true, nil
			}); err != nil {
				return false, err
			}
			if !hasPerm {
				// Single-key requests report just the key; ranged requests
				// report the offending span.
				if len(header.EndKey) == 0 {
					return false, util.Errorf("user %q cannot invoke %s at %q", header.User, args.Method(), start)
				}
				return false, util.Errorf("user %q cannot invoke %s at %q-%q", header.User, args.Method(), start, end)
			}
			// This span is permitted; continue with the next one.
			return false, nil
		})
}
Пример #8
0
// applyRaftCommand applies a raft command from the replicated log to the
// underlying state machine (i.e. the engine).
// When certain critical operations fail, a replicaCorruptionError may be
// returned and must be handled by the caller.
//
// The function relies on the named result rErr and on two deferred closures,
// which run in reverse order of registration:
//   - The closure registered second runs first. It copies rErr into the
//     reply header and, for write commands, stores the reply in the response
//     cache.
//   - The closure registered first runs last. It advances the applied index
//     directly on the engine whenever the batch was not committed.
//
// The leader lease check returns before the response cache closure is
// registered, so the NotLeaderError it produces is never written to the
// response cache.
func (r *Range) applyRaftCommand(ctx context.Context, index uint64, originNode proto.RaftNodeID, args proto.Request, reply proto.Response) (rErr error) {
	if index <= 0 {
		log.Fatalc(ctx, "raft command index is <= 0")
	}

	// committed is only set after a successful batch.Commit() for a write
	// command; in every other case the deferred closure below advances the
	// applied index.
	committed := false
	// The very last thing we do before returning is move the applied index
	// forward, unless that has already happened as part of a successfully
	// committed batch.
	defer func() {
		if !committed {
			// We didn't commit the batch, but advance the last applied index nonetheless.
			if err := setAppliedIndex(r.rm.Engine(), r.Desc().RaftID, index); err != nil {
				rErr = newReplicaCorruptionError(
					util.Errorf("could not advance applied index"), err, rErr)
				return
			}
			atomic.StoreUint64(&r.appliedIndex, index)
		}
	}()

	if lease := r.getLease(); args.Method() != proto.InternalLeaderLease &&
		(!lease.OwnedBy(originNode) || !lease.Covers(args.Header().Timestamp)) {
		// Verify the leader lease is held, unless this command is trying to
		// obtain it. Any other Raft command has had the leader lease held
		// by the replica at proposal time, but this may no longer be the case.
		// Corruption aside, the most likely reason is a leadership change (the
		// most recent leader assumes responsibility for all past timestamps as
		// well). In that case, it's not valid to go ahead with the execution:
		// Writes must be aware of the last time the mutated key was read, and
		// since reads are served locally by the lease holder without going
		// through Raft, a read which was not taken into account may have been
		// served. Hence, we must retry at the current leader.
		//
		// It's crucial that we don't update the response cache for the error
		// returned below since the request is going to be retried with the
		// same ClientCmdID and would get the distributed sender stuck in an
		// infinite loop, retrieving a stale NotLeaderError over and over
		// again, even when proposing at the correct replica.
		return r.newNotLeaderError(lease)
	}

	// Anything happening from now on needs to enter the response cache.
	defer func() {
		// TODO(tamird,tschottdorf): according to #1400 we intend to set the reply
		// header's error as late as possible and in a central location. Range
		// commands still write to the header directly, but once they don't this
		// could be the authoritative location that sets the reply error for any-
		// thing that makes it into Raft. Note that we must set this prior to
		// signaling cmd.done below, or the waiting RPC handler might proceed
		// before we've updated its reply.
		//
		// It is important that the error is set before the reply is saved into
		// the response cache.
		reply.Header().SetGoError(rErr)

		if proto.IsWrite(args) {
			// No matter the result, add result to the response cache if this
			// is a write method. This must be done as part of the execution of
			// raft commands so that every replica maintains the same responses
			// to continue request idempotence, even if leadership changes.
			if err := r.respCache.PutResponse(args.Header().CmdID, reply); err != nil {
				rErr = newReplicaCorruptionError(
					util.Errorf("could not put to response cache"), err, rErr)
				return
			}
		}
	}()

	header := args.Header()

	// Check the response cache to ensure idempotency.
	if proto.IsWrite(args) {
		if ok, err := r.respCache.GetResponse(header.CmdID, reply); ok && err == nil {
			if log.V(1) {
				log.Infoc(ctx, "found response cache entry for %+v", args.Header().CmdID)
			}
			// err is nil on this path, so the command is not executed again.
			// NOTE(review): presumably GetResponse has filled reply from the
			// cached entry — confirm against the response cache.
			return err
		} else if ok && err != nil {
			return newReplicaCorruptionError(
				util.Errorf("could not read from response cache"), err)
		}
	}

	// Create a new batch for the command to ensure all or nothing semantics.
	batch := r.rm.Engine().NewBatch()
	defer batch.Close()

	// Create an engine.MVCCStats instance.
	ms := engine.MVCCStats{}

	// Execute the command; the error will also be set in the reply header.
	// TODO(tschottdorf,tamird) For #1400, want to refactor executeCmd to not
	// touch the reply header's error field.
	intents, err := r.executeCmd(batch, &ms, args, reply)
	// If the execution of the command wasn't successful, stop here. The batch
	// is not committed on this path.
	if err != nil {
		return err
	}

	// An index that does not move past the cached applied index is treated as
	// corruption.
	if oldIndex := atomic.LoadUint64(&r.appliedIndex); oldIndex >= index {
		return newReplicaCorruptionError(
			util.Errorf("applied index moved backwards: %d >= %d", oldIndex, index))
	}

	// Advance the applied index atomically within the batch.
	if err := setAppliedIndex(batch, r.Desc().RaftID, index); err != nil {
		return newReplicaCorruptionError(
			util.Errorf("could not update applied index"), err)
	}

	// Only write commands commit the batch. For any other command the batch
	// is left uncommitted and the applied index is advanced by the deferred
	// closure registered at the top of the function.
	if proto.IsWrite(args) {
		// On success, flush the MVCC stats to the batch and commit.
		if err := r.stats.MergeMVCCStats(batch, &ms, header.Timestamp.WallTime); err != nil {
			return newReplicaCorruptionError(util.Errorf("could not merge MVCC stats"), err)
		}
		if err := batch.Commit(); err != nil {
			return newReplicaCorruptionError(util.Errorf("could not commit batch"), err)
		}
		committed = true
		// Publish update to event feed.
		r.rm.EventFeed().updateRange(r, args.Method(), &ms)
		// After successful commit, update cached stats and appliedIndex value.
		atomic.StoreUint64(&r.appliedIndex, index)
		// If the commit succeeded, potentially add range to split queue.
		r.maybeAddToSplitQueue()
		// Maybe update gossip configs on a put.
		switch args.(type) {
		case *proto.PutRequest, *proto.DeleteRequest, *proto.DeleteRangeRequest:
			if header.Key.Less(keys.SystemMax) {
				// We hold the lock already.
				r.maybeGossipConfigsLocked(func(configPrefix proto.Key) bool {
					return bytes.HasPrefix(header.Key, configPrefix)
				})
			}
		}
	}
	// On success and only on the replica on which this command originated,
	// resolve skipped intents asynchronously.
	if originNode == r.rm.RaftNodeID() {
		r.handleSkippedIntents(args, intents)
	}

	return nil
}
Пример #9
0
// applyRaftCommandInBatch executes the command in a batch engine and
// returns the batch containing the results. The caller is responsible
// for committing the batch, even on error.
//
// A batch is returned on every path. Besides the batch, the function returns
// the reply (nil when the leader lease check fails) and the error produced by
// the lease check, the response cache lookup, or the command execution.
// For write commands the reply, including its error, is written to the
// response cache inside the returned batch; the reply header's Error field is
// reset to nil before the reply is handed back to the caller.
func (r *Range) applyRaftCommandInBatch(ctx context.Context, index uint64, originNode proto.RaftNodeID,
	args proto.Request, ms *engine.MVCCStats) (engine.Engine, proto.Response, error) {
	// Create a new batch for the command to ensure all or nothing semantics.
	batch := r.rm.Engine().NewBatch()

	if lease := r.getLease(); args.Method() != proto.InternalLeaderLease &&
		(!lease.OwnedBy(originNode) || !lease.Covers(args.Header().Timestamp)) {
		// Verify the leader lease is held, unless this command is trying to
		// obtain it. Any other Raft command has had the leader lease held
		// by the replica at proposal time, but this may no longer be the case.
		// Corruption aside, the most likely reason is a leadership change (the
		// most recent leader assumes responsibility for all past timestamps as
		// well). In that case, it's not valid to go ahead with the execution:
		// Writes must be aware of the last time the mutated key was read, and
		// since reads are served locally by the lease holder without going
		// through Raft, a read which was not taken into account may have been
		// served. Hence, we must retry at the current leader.
		//
		// It's crucial that we don't update the response cache for the error
		// returned below since the request is going to be retried with the
		// same ClientCmdID and would get the distributed sender stuck in an
		// infinite loop, retrieving a stale NotLeaderError over and over
		// again, even when proposing at the correct replica.
		return batch, nil, r.newNotLeaderError(lease, originNode)
	}

	// Check the response cache to ensure idempotency.
	if proto.IsWrite(args) {
		if reply, err := r.respCache.GetResponse(batch, args.Header().CmdID); err != nil {
			// Any error encountered while fetching the response cache entry means corruption.
			return batch, reply, newReplicaCorruptionError(util.Errorf("could not read from response cache"), err)
		} else if reply != nil {
			if log.V(1) {
				log.Infoc(ctx, "found response cache entry for %+v", args.Header().CmdID)
			}
			// TODO(tamird): move this into the response cache itself
			// The operands of the return statement below are evaluated before
			// this deferred closure runs, so the cached error is still
			// returned; the closure only clears it from the reply header
			// afterwards.
			defer func() { reply.Header().Error = nil }()
			// We successfully read from the response cache, so return whatever error
			// was present in the cached entry (if any).
			return batch, reply, reply.Header().GoError()
		}
	}

	// Execute the command.
	reply, intents, rErr := r.executeCmd(batch, ms, args)
	// Regardless of error, add result to the response cache if this is
	// a write method. This must be done as part of the execution of
	// raft commands so that every replica maintains the same responses
	// to continue request idempotence, even if leadership changes.
	if proto.IsWrite(args) {
		if rErr == nil {
			// If command was successful, flush the MVCC stats to the batch.
			if err := r.stats.MergeMVCCStats(batch, ms, args.Header().Timestamp.WallTime); err != nil {
				log.Fatalc(ctx, "setting mvcc stats in a batch should never fail: %s", err)
			}
		} else {
			// Otherwise, reset the batch to clear out partial execution and
			// prepare for the failed response cache entry.
			batch.Close()
			batch = r.rm.Engine().NewBatch()
		}
		// TODO(tamird): move this into the response cache itself
		// Make sure there is a reply to carry the error into the response
		// cache even when executeCmd returned none.
		if reply == nil {
			reply = args.CreateReply()
		}
		// The reply header's error is expected to be unset at this point; it
		// is set only for the duration of the response cache write below.
		if reply.Header().Error != nil {
			panic("the world is on fire")
		}
		reply.Header().SetGoError(rErr)
		if err := r.respCache.PutResponse(batch, args.Header().CmdID, reply); err != nil {
			log.Fatalc(ctx, "putting a response cache entry in a batch should never fail: %s", err)
		}
		// Clear the error from the header again; the caller receives it
		// through the returned error value instead.
		reply.Header().Error = nil
	}

	// If the execution of the command wasn't successful, stop here.
	if rErr != nil {
		return batch, reply, rErr
	}

	// On success and only on the replica on which this command originated,
	// resolve skipped intents asynchronously.
	if originNode == r.rm.RaftNodeID() {
		r.handleSkippedIntents(args, intents)
	}

	return batch, reply, nil
}
Пример #10
0
// sendRPC sends one or more RPCs to the replicas in the supplied slice. The
// address of every replica is collected and the request is handed to
// ds.rpcSend with the requirement that one RPC to a server must succeed.
// Returns the first reply, or an RPC error if the request could not be sent.
// Note that the reply may contain a higher level error and must be checked
// in addition to the RPC error.
func (ds *DistSender) sendRPC(trace *tracer.Trace, rangeID proto.RangeID, replicas replicaSlice, order rpc.OrderingPolicy,
	args proto.Request) (proto.Response, error) {
	// TODO(tschottdorf): an empty replica set is currently NOT rejected,
	// because the check gets in the way of some tests. Consider refactoring
	// so that gossip is mocked out more easily. Provisional code; the
	// disabled check is:
	//   if len(replicas) == 0 {
	//       return nil, util.Errorf("%s: replicas set is empty", args.Method())
	//   }

	// Collect the address of every replica and remember which replica each
	// address belongs to.
	var addrs []net.Addr
	replicaByAddr := map[string]*proto.Replica{}
	for i := range replicas {
		nodeAddr := replicas[i].NodeDesc.Address
		addrs = append(addrs, nodeAddr)
		replicaByAddr[nodeAddr.String()] = &replicas[i].Replica
	}
	// TODO(tschottdorf): see the replica set TODO above; the disabled check is:
	//   if len(addrs) == 0 {
	//       return nil, noNodeAddrsAvailError{}
	//   }

	// TODO(pmattis): This needs to be tested. If it isn't set we'll
	// still route the request appropriately by key, but won't receive
	// RangeNotFoundErrors.
	args.Header().RangeID = rangeID

	// One of the N RPCs has to succeed.
	rpcOpts := rpc.Options{
		N:               1,
		Ordering:        order,
		SendNextTimeout: defaultSendNextTimeout,
		Timeout:         defaultRPCTimeout,
		Trace:           trace,
	}

	// getArgs hands out the caller's args for the first address and a clone
	// for every later one. When an address is given, the header is pointed
	// at the replica that owns it.
	handedOutOriginal := false
	getArgs := func(addr net.Addr) gogoproto.Message {
		req := args
		if handedOutOriginal {
			req = gogoproto.Clone(args).(proto.Request)
		}
		handedOutOriginal = true
		if addr != nil {
			// TODO(tschottdorf): see the replica set TODO above.
			req.Header().Replica = *replicaByAddr[addr.String()]
		}
		return req
	}

	// RPCs are sent asynchronously and nothing synchronizes access to a
	// reply object, so every RPC gets a fresh reply: if an RPC timed out
	// while holding a shared reply, the call could still return later and
	// write to it at any time. args.CreateReply() should be cheaper than
	// gogoproto.Clone, which uses reflection.
	getReply := func() gogoproto.Message {
		return args.CreateReply()
	}

	replies, sendErr := ds.rpcSend(rpcOpts, "Node."+args.Method().String(),
		addrs, getArgs, getReply, ds.gossip.RPCContext)
	if sendErr != nil {
		return nil, sendErr
	}
	first := replies[0].(proto.Response)
	return first, nil
}
Пример #11
0
// executeCmd dispatches on the concrete request type and runs the matching
// storage API command against the batch, writing the outcome into reply.
// It returns the error found in the reply header once the command has run
// and, for commands such as inconsistent reads, the intents they skipped.
// A request type without a matching command yields an "unrecognized command"
// error.
func (r *Range) executeCmd(batch engine.Engine, ms *engine.MVCCStats, args proto.Request, reply proto.Response) ([]proto.Intent, error) {
	hdr := args.Header()

	// Checking the header here makes sure the key is contained within the
	// range, which catches any range split or merge activity.
	headerErr := r.checkCmdHeader(hdr)
	if headerErr != nil {
		reply.Header().SetGoError(headerErr)
		return nil, headerErr
	}

	// A unittest filter, when installed, may inject an error; if it reports
	// that it handled the call, return whatever it left in the reply header.
	if TestingCommandFilter != nil && TestingCommandFilter(args, reply) {
		return nil, reply.Header().GoError()
	}

	var skipped []proto.Intent
	switch req := args.(type) {
	case *proto.GetRequest:
		skipped = r.Get(batch, req, reply.(*proto.GetResponse))
	case *proto.PutRequest:
		r.Put(batch, ms, req, reply.(*proto.PutResponse))
	case *proto.ConditionalPutRequest:
		r.ConditionalPut(batch, ms, req, reply.(*proto.ConditionalPutResponse))
	case *proto.IncrementRequest:
		r.Increment(batch, ms, req, reply.(*proto.IncrementResponse))
	case *proto.DeleteRequest:
		r.Delete(batch, ms, req, reply.(*proto.DeleteResponse))
	case *proto.DeleteRangeRequest:
		r.DeleteRange(batch, ms, req, reply.(*proto.DeleteRangeResponse))
	case *proto.ScanRequest:
		skipped = r.Scan(batch, req, reply.(*proto.ScanResponse))
	case *proto.EndTransactionRequest:
		r.EndTransaction(batch, ms, req, reply.(*proto.EndTransactionResponse))
	case *proto.InternalRangeLookupRequest:
		skipped = r.InternalRangeLookup(batch, req, reply.(*proto.InternalRangeLookupResponse))
	case *proto.InternalHeartbeatTxnRequest:
		r.InternalHeartbeatTxn(batch, ms, req, reply.(*proto.InternalHeartbeatTxnResponse))
	case *proto.InternalGCRequest:
		r.InternalGC(batch, ms, req, reply.(*proto.InternalGCResponse))
	case *proto.InternalPushTxnRequest:
		r.InternalPushTxn(batch, ms, req, reply.(*proto.InternalPushTxnResponse))
	case *proto.InternalResolveIntentRequest:
		r.InternalResolveIntent(batch, ms, req, reply.(*proto.InternalResolveIntentResponse))
	case *proto.InternalResolveIntentRangeRequest:
		r.InternalResolveIntentRange(batch, ms, req, reply.(*proto.InternalResolveIntentRangeResponse))
	case *proto.InternalMergeRequest:
		r.InternalMerge(batch, ms, req, reply.(*proto.InternalMergeResponse))
	case *proto.InternalTruncateLogRequest:
		r.InternalTruncateLog(batch, ms, req, reply.(*proto.InternalTruncateLogResponse))
	case *proto.InternalLeaderLeaseRequest:
		r.InternalLeaderLease(batch, ms, req, reply.(*proto.InternalLeaderLeaseResponse))
	default:
		return nil, util.Errorf("unrecognized command %s", args.Method())
	}

	if log.V(2) {
		log.Infof("executed %s command %+v: %+v", args.Method(), args, reply)
	}

	// Feed the serviced request's timestamp into the node clock. This keeps
	// a high water mark for all ops serviced, so that received ops without a
	// timestamp specified are guaranteed one higher than any op already
	// executed for overlapping keys.
	r.rm.Clock().Update(hdr.Timestamp)

	// The request timestamp may have changed; hand it back in the reply.
	reply.Header().Timestamp = hdr.Timestamp

	// Capture the reply's error before adjusting an uncertainty error below.
	resultErr := reply.Header().GoError()

	// A ReadWithinUncertaintyIntervalError contains the timestamp of the value
	// that provoked the conflict. However, we forward the timestamp to the
	// node's time here. The reason is that the caller (which is always
	// transactional when this error occurs) in our implementation wants to
	// use this information to extract a timestamp after which reads from
	// the nodes are causally consistent with the transaction. This allows
	// the node to be classified as without further uncertain reads for the
	// remainder of the transaction.
	// See the comment on proto.Transaction.CertainNodes.
	if uncertainErr, ok := reply.Header().GoError().(*proto.ReadWithinUncertaintyIntervalError); ok && uncertainErr != nil {
		// Using this node's clock (which may be different from other
		// replicas') is fine because this error attaches the existing
		// timestamp to the node itself when retrying.
		uncertainErr.ExistingTimestamp.Forward(r.rm.Clock().Now())
	}

	// Hand back the skipped intents and the error (if any) set in the reply.
	return skipped, resultErr
}