Example 1
func (db *testSender) sendOne(call proto.Call) {
	switch call.Args.(type) {
	case *proto.EndTransactionRequest:
		safeSetGoError(call.Reply, util.Errorf("%s method not supported", call.Method()))
		return
	}
	// Lookup range and direct request.
	header := call.Args.Header()
	if rng := db.store.LookupRange(header.Key, header.EndKey); rng != nil {
		header.RangeID = rng.Desc().RangeID
		replica := rng.GetReplica()
		if replica == nil {
			safeSetGoError(call.Reply, util.Errorf("own replica missing in range"))
			return
		}
		header.Replica = *replica
		reply, err := db.store.ExecuteCmd(context.Background(), call.Args)
		if reply != nil {
			gogoproto.Merge(call.Reply, reply)
		}
		if call.Reply.Header().Error != nil {
			panic(proto.ErrorUnexpectedlySet)
		}
		if err != nil {
			call.Reply.Header().SetGoError(err)
		}
	} else {
		safeSetGoError(call.Reply, proto.NewRangeKeyMismatchError(header.Key, header.EndKey, nil))
	}
}
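The safeSetGoError helper used above is not shown in this excerpt. Below is a minimal, self-contained sketch of the guard it suggests, using toy types in place of the real proto package (toyHeader, toyReply, and the panic message are assumptions): the error is only set when the reply does not already carry one, mirroring the ErrorUnexpectedlySet panic later in sendOne.

package main

import (
	"errors"
	"fmt"
)

// toyHeader and toyReply stand in for proto.ResponseHeader and proto.Response
// in this sketch; they are not the real types.
type toyHeader struct {
	Err error
}

type toyReply struct {
	header toyHeader
}

func (r *toyReply) Header() *toyHeader { return &r.header }

// safeSetGoError sets err on the reply unless an error is already present,
// in which case it panics, mirroring the "error unexpectedly set" check above.
func safeSetGoError(r *toyReply, err error) {
	if r.Header().Err != nil {
		panic("error unexpectedly set on reply")
	}
	r.Header().Err = err
}

func main() {
	var reply toyReply
	safeSetGoError(&reply, errors.New("EndTransaction method not supported"))
	fmt.Println(reply.Header().Err)
}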
Example 2
// lookupReplica looks up a replica by key [range]. Lookups are done
// by consulting each store in turn via Store.LookupReplica(key).
// Returns the RangeID and replica on success; a RangeKeyMismatch error
// if not found.
// This is intended for testing use only; performance is not a concern.
func (ls *LocalSender) lookupReplica(start, end proto.Key) (rangeID proto.RangeID, replica *proto.Replica, err error) {
	ls.mu.RLock()
	defer ls.mu.RUnlock()
	var rng *storage.Replica
	for _, store := range ls.storeMap {
		rng = store.LookupReplica(start, end)
		if rng == nil {
			if tmpRng := store.LookupReplica(start, nil); tmpRng != nil {
				log.Warningf("range not contained in one range: [%s,%s), but have [%s,%s)", start, end, tmpRng.Desc().StartKey, tmpRng.Desc().EndKey)
			}
			continue
		}
		if replica == nil {
			rangeID = rng.Desc().RangeID
			replica = rng.GetReplica()
			continue
		}
		// Should never happen outside of tests.
		return 0, nil, util.Errorf(
			"range %+v exists on additional store: %+v", rng, store)
	}
	if replica == nil {
		err = proto.NewRangeKeyMismatchError(start, end, nil)
	}
	return rangeID, replica, err
}
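The loop above accepts the replica being found in exactly one store and treats a second hit as a condition that should never happen outside of tests. A standalone sketch of that exactly-one-owner scan, using toy stores instead of the real storeMap (toyStore and findOwner are made-up names):

package main

import (
	"errors"
	"fmt"
)

// toyStore maps a store ID to the set of keys it claims to own.
type toyStore struct {
	id   int
	keys map[string]bool
}

// findOwner scans every store and returns the single store owning key.
// Finding the key in more than one store is reported as an error, mirroring
// the "exists on additional store" check in lookupReplica above.
func findOwner(stores []toyStore, key string) (int, error) {
	owner := -1
	for _, s := range stores {
		if !s.keys[key] {
			continue
		}
		if owner == -1 {
			owner = s.id
			continue
		}
		return 0, fmt.Errorf("key %q exists on additional store %d", key, s.id)
	}
	if owner == -1 {
		return 0, errors.New("key not found in any store")
	}
	return owner, nil
}

func main() {
	stores := []toyStore{
		{id: 1, keys: map[string]bool{"a": true}},
		{id: 2, keys: map[string]bool{"b": true}},
	}
	fmt.Println(findOwner(stores, "b")) // 2 <nil>
}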
Example 3
func (db *testDB) executeCmd(method string, args proto.Request, replyChan interface{}) {
	reply := reflect.New(reflect.TypeOf(replyChan).Elem().Elem()).Interface().(proto.Response)
	if rng := db.store.LookupRange(args.Header().Key, args.Header().EndKey); rng != nil {
		args.Header().Replica = *rng.Meta.GetReplica()
		db.store.ExecuteCmd(method, args, reply)
	} else {
		reply.Header().SetGoError(proto.NewRangeKeyMismatchError(args.Header().Key, args.Header().EndKey, nil))
	}
	reflect.ValueOf(replyChan).Send(reflect.ValueOf(reply))
}
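The expression reflect.New(reflect.TypeOf(replyChan).Elem().Elem()) is dense: it takes the reply channel (a channel of response pointers), digs down to the underlying struct type, and allocates a fresh response to send back. A standalone demonstration of the same mechanics with a toy response type (toyResponse is an assumption, not a real proto type):

package main

import (
	"fmt"
	"reflect"
)

// toyResponse stands in for a concrete proto response type.
type toyResponse struct {
	Msg string
}

func main() {
	// A caller hands us the reply channel as an interface{}, as in executeCmd.
	var replyChan interface{} = make(chan *toyResponse, 1)

	// TypeOf(replyChan) is chan *toyResponse; the first .Elem() is *toyResponse;
	// the second .Elem() is toyResponse. reflect.New allocates a *toyResponse.
	reply := reflect.New(reflect.TypeOf(replyChan).Elem().Elem()).Interface().(*toyResponse)
	reply.Msg = "hello"

	// Send the freshly built reply back on the channel via reflection.
	reflect.ValueOf(replyChan).Send(reflect.ValueOf(reply))

	fmt.Println((<-replyChan.(chan *toyResponse)).Msg)
}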
Example 4
// lookupReplica looks up replica by key [range]. Lookups are done
// by consulting each store in turn via Store.LookupRange(key).
func (kv *LocalKV) lookupReplica(start, end engine.Key) (*proto.Replica, error) {
	kv.mu.RLock()
	defer kv.mu.RUnlock()
	for _, store := range kv.storeMap {
		if rng := store.LookupRange(start, end); rng != nil {
			return rng.Meta.GetReplica(), nil
		}
	}
	return nil, proto.NewRangeKeyMismatchError(start, end, nil)
}
Example 5
// InternalRangeLookup is used to look up RangeDescriptors - a RangeDescriptor
// is a metadata structure which describes the key range and replica locations
// of a distinct range in the cluster.
//
// RangeDescriptors are stored as values in the cockroach cluster's key-value
// store. However, they are always stored using special "Range Metadata keys",
// which are "ordinary" keys with a special prefix prepended. The Range Metadata
// Key for an ordinary key can be generated with the `engine.RangeMetaKey(key)`
// function. The RangeDescriptor for the range which contains a given key can be
// retrieved by generating its Range Metadata Key and dispatching it to
// InternalRangeLookup.
//
// Note that the Range Metadata Key sent to InternalRangeLookup is NOT the key
// at which the desired RangeDescriptor is stored. Instead, this method returns
// the RangeDescriptor stored at the _lowest_ existing key which is _greater_
// than the given key. The returned RangeDescriptor will thus contain the
// ordinary key which was originally used to generate the Range Metadata Key
// sent to InternalRangeLookup.
//
// This method has an important optimization: instead of just returning the
// requested RangeDescriptor, it also returns a slice of additional range
// descriptors immediately consecutive to the desired RangeDescriptor. This is
// intended to serve as a sort of caching pre-fetch, so that the requesting
// nodes can aggressively cache RangeDescriptors which are likely to be desired
// by their current workload.
func (r *Range) InternalRangeLookup(args *proto.InternalRangeLookupRequest, reply *proto.InternalRangeLookupResponse) {
	if err := engine.ValidateRangeMetaKey(args.Key); err != nil {
		reply.SetGoError(err)
		return
	}

	rangeCount := int64(args.MaxRanges)
	if rangeCount < 1 {
		reply.SetGoError(util.Errorf(
			"Range lookup specified invalid maximum range count %d: must be > 0", rangeCount))
		return
	}

	// We want to search for the metadata key just greater than args.Key.  Scan
	// for both the requested key and the keys immediately afterwards, up to
	// MaxRanges.
	metaPrefix := args.Key[:len(engine.KeyMeta1Prefix)]
	nextKey := engine.NextKey(args.Key)
	kvs, err := r.engine.Scan(nextKey, engine.PrefixEndKey(metaPrefix), rangeCount)
	if err != nil {
		reply.SetGoError(err)
		return
	}

	// The initial key must have the same metadata level prefix as we queried.
	if len(kvs) == 0 {
		// At this point the range has been verified to contain the requested
		// key, but no matching results were returned from the scan. This could
		// indicate a very bad system error, but for now we will just treat it
		// as a retryable Key Mismatch error.
		err := proto.NewRangeKeyMismatchError(args.Key, args.Key, r.Meta)
		reply.SetGoError(err)
		log.Errorf("InternalRangeLookup dispatched to correct range, but no matching RangeDescriptor was found. %s", err)
		return
	}

	// Decode all scanned range descriptors, stopping if a range is encountered
	// which does not have the same metadata prefix as the queried key.
	rds := make([]proto.RangeDescriptor, len(kvs))
	for i := range kvs {
		if err = gogoproto.Unmarshal(kvs[i].Value, &rds[i]); err != nil {
			reply.SetGoError(err)
			return
		}
	}

	reply.Ranges = rds
	return
}
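As the doc comment explains, the descriptor for the range containing a key is stored at the lowest metadata key greater than that key, because metadata entries are keyed by each range's end key. A toy illustration of that indexing scheme over an in-memory sorted slice (toyDescriptor and lookupDescriptor are made-up names; this is not the actual scan path):

package main

import (
	"fmt"
	"sort"
)

// toyDescriptor is a stand-in for proto.RangeDescriptor: a half-open key span.
type toyDescriptor struct {
	StartKey, EndKey string
}

// Range metadata is indexed by each range's EndKey, so the descriptor covering
// a key is found at the lowest end key strictly greater than that key.
func lookupDescriptor(sortedByEndKey []toyDescriptor, key string) (toyDescriptor, bool) {
	i := sort.Search(len(sortedByEndKey), func(i int) bool {
		return sortedByEndKey[i].EndKey > key
	})
	if i == len(sortedByEndKey) {
		return toyDescriptor{}, false
	}
	return sortedByEndKey[i], true
}

func main() {
	ranges := []toyDescriptor{
		{StartKey: "a", EndKey: "f"},
		{StartKey: "f", EndKey: "m"},
		{StartKey: "m", EndKey: "z"},
	}
	d, _ := lookupDescriptor(ranges, "g")
	fmt.Printf("key %q is in range [%s,%s)\n", "g", d.StartKey, d.EndKey) // [f,m)
}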
Example 6
// AddCmd adds a command for execution on this range. The command's
// affected keys are verified to be contained within the range and the
// range's leadership is confirmed. The command is then dispatched
// either along the read-only execution path or the read-write Raft
// command queue. If wait is false, read-write commands are added to
// Raft without waiting for their completion.
func (r *Range) AddCmd(ctx context.Context, call proto.Call, wait bool) error {
	args, reply := call.Args, call.Reply
	header := args.Header()
	if !r.ContainsKeyRange(header.Key, header.EndKey) {
		err := proto.NewRangeKeyMismatchError(header.Key, header.EndKey, r.Desc())
		reply.Header().SetGoError(err)
		return err
	}

	// Differentiate between admin, read-only and read-write.
	if proto.IsAdmin(args) {
		return r.addAdminCmd(ctx, args, reply)
	} else if proto.IsReadOnly(args) {
		return r.addReadOnlyCmd(ctx, args, reply)
	}
	return r.addWriteCmd(ctx, args, reply, wait)
}
Example 7
// ExecuteCmd fetches a range based on the header's replica, assembles
// method, args & reply into a Raft Cmd struct and executes the
// command using the fetched range.
func (s *Store) ExecuteCmd(method string, args proto.Request, reply proto.Response) error {
	// If the request has a zero timestamp, initialize to this node's clock.
	header := args.Header()
	if header.Timestamp.WallTime == 0 && header.Timestamp.Logical == 0 {
		// Update both incoming and outgoing timestamps.
		now := s.clock.Now()
		args.Header().Timestamp = now
		reply.Header().Timestamp = now
	} else {
		// Otherwise, update our clock with the incoming request. This
		// advances the local node's clock to a high water mark from
		// amongst all nodes with which it has interacted. The update is
		// bounded by the max clock drift.
		_, err := s.clock.Update(header.Timestamp)
		if err != nil {
			return err
		}
	}

	// Verify specified range contains the command's implicated keys.
	rng, err := s.GetRange(header.Replica.RangeID)
	if err != nil {
		return err
	}
	if !rng.ContainsKeyRange(header.Key, header.EndKey) {
		return proto.NewRangeKeyMismatchError(header.Key, header.EndKey, rng.Meta)
	}
	if !rng.IsLeader() {
		// TODO(spencer): when we happen to know the leader, fill it in here via replica.
		return &proto.NotLeaderError{}
	}

	// Differentiate between read-only and read-write.
	if IsReadOnly(method) {
		return rng.ReadOnlyCmd(method, args, reply)
	}

	return rng.ReadWriteCmd(method, args, reply)
}
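The else branch above advances the local clock to a high water mark of all timestamps it has observed, bounded by the maximum allowed clock drift. A self-contained toy of that idea, using wall time only and a made-up toyClock type rather than the real hybrid logical clock:

package main

import (
	"fmt"
	"time"
)

// toyClock keeps a monotonic high water mark of observed wall times.
type toyClock struct {
	maxDrift  time.Duration
	highWater time.Time
	now       func() time.Time // injectable for tests
}

// Update forwards the clock to ts if ts is ahead of the current high water
// mark, but rejects timestamps further than maxDrift in the future of the
// local wall clock, echoing the drift-bounded update described above.
func (c *toyClock) Update(ts time.Time) (time.Time, error) {
	if ts.After(c.now().Add(c.maxDrift)) {
		return c.highWater, fmt.Errorf("timestamp %s exceeds max allowed clock drift", ts)
	}
	if ts.After(c.highWater) {
		c.highWater = ts
	}
	return c.highWater, nil
}

func main() {
	c := &toyClock{maxDrift: 250 * time.Millisecond, now: time.Now}
	if _, err := c.Update(time.Now().Add(time.Hour)); err != nil {
		fmt.Println("rejected:", err)
	}
	hw, _ := c.Update(time.Now())
	fmt.Println("high water mark:", hw)
}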
Example 8
// lookupReplica looks up replica by key [range]. Lookups are done
// by consulting each store in turn via Store.LookupRange(key).
// Returns RaftID and replica on success; RangeKeyMismatch error
// if not found.
// TODO(tschottdorf) with a very large number of stores, the LocalSender
// may want to avoid scanning the whole map of stores on each invocation.
func (ls *LocalSender) lookupReplica(start, end proto.Key) (raftID proto.RaftID, replica *proto.Replica, err error) {
	ls.mu.RLock()
	defer ls.mu.RUnlock()
	var rng *storage.Range
	for _, store := range ls.storeMap {
		rng = store.LookupRange(start, end)
		if rng == nil {
			continue
		}
		if replica == nil {
			raftID = rng.Desc().RaftID
			replica = rng.GetReplica()
			continue
		}
		// Should never happen outside of tests.
		return 0, nil, util.Errorf(
			"range %+v exists on additional store: %+v", rng, store)
	}
	if replica == nil {
		err = proto.NewRangeKeyMismatchError(start, end, nil)
	}
	return raftID, replica, err
}
Example 9
func (r *Range) checkCmdHeader(header *proto.RequestHeader) error {
	if !r.ContainsKeyRange(header.Key, header.EndKey) {
		return proto.NewRangeKeyMismatchError(header.Key, header.EndKey, r.Desc())
	}
	return nil
}
Example 10
// AdminSplit divides the range into two ranges, using either
// args.SplitKey (if provided) or an internally computed key that aims to
// roughly equipartition the range by size. The split is done inside of
// a distributed txn which writes updated and new range descriptors, and
// updates the range addressing metadata. The handover of responsibility for
// the reassigned key range is carried out seamlessly through a split trigger
// executed as part of the commit of that transaction.
func (r *Range) AdminSplit(args *proto.AdminSplitRequest, reply *proto.AdminSplitResponse) {
	// Only allow a single split per range at a time.
	r.metaLock.Lock()
	defer r.metaLock.Unlock()

	// Determine split key if not provided with args. This scan is
	// allowed to be relatively slow because admin commands don't block
	// other commands.
	desc := r.Desc()
	splitKey := proto.Key(args.SplitKey)
	if len(splitKey) == 0 {
		snap := r.rm.NewSnapshot()
		defer snap.Close()
		var err error
		if splitKey, err = engine.MVCCFindSplitKey(snap, desc.RaftID, desc.StartKey, desc.EndKey); err != nil {
			reply.SetGoError(util.Errorf("unable to determine split key: %s", err))
			return
		}
	}
	// Verify this condition first so that we do not return a
	// proto.NewRangeKeyMismatchError when splitKey equals desc.EndKey,
	// which would otherwise cause an infinite retry loop.
	if splitKey.Equal(desc.StartKey) || splitKey.Equal(desc.EndKey) {
		reply.SetGoError(util.Errorf("range is already split at key %s", splitKey))
		return
	}
	// Verify some properties of split key.
	if !r.ContainsKey(splitKey) {
		reply.SetGoError(proto.NewRangeKeyMismatchError(splitKey, splitKey, desc))
		return
	}
	if !engine.IsValidSplitKey(splitKey) {
		reply.SetGoError(util.Errorf("cannot split range at key %s", splitKey))
		return
	}

	// Create new range descriptor with newly-allocated replica IDs and Raft IDs.
	newDesc, err := r.rm.NewRangeDescriptor(splitKey, desc.EndKey, desc.Replicas)
	if err != nil {
		reply.SetGoError(util.Errorf("unable to allocate new range descriptor: %s", err))
		return
	}

	// Init updated version of existing range descriptor.
	updatedDesc := *desc
	updatedDesc.EndKey = splitKey

	log.Infof("initiating a split of %s at key %s", r, splitKey)

	if err = r.rm.DB().Txn(func(txn *client.Txn) error {
		// Create range descriptor for second half of split.
		// Note that this put must go first in order to locate the
		// transaction record on the correct range.
		b := &client.Batch{}
		desc1Key := keys.RangeDescriptorKey(newDesc.StartKey)
		if err := updateRangeDescriptor(b, desc1Key, nil, newDesc); err != nil {
			return err
		}
		// Update existing range descriptor for first half of split.
		desc2Key := keys.RangeDescriptorKey(updatedDesc.StartKey)
		if err := updateRangeDescriptor(b, desc2Key, desc, &updatedDesc); err != nil {
			return err
		}
		// Update range descriptor addressing record(s).
		if err := splitRangeAddressing(b, newDesc, &updatedDesc); err != nil {
			return err
		}
		if err := txn.Run(b); err != nil {
			return err
		}
		// Update the RangeTree.
		b = &client.Batch{}
		if err := InsertRange(txn, b, newDesc.StartKey); err != nil {
			return err
		}
		// End the transaction manually, instead of letting RunTransaction
		// loop do it, in order to provide a split trigger.
		b.InternalAddCall(proto.Call{
			Args: &proto.EndTransactionRequest{
				RequestHeader: proto.RequestHeader{Key: args.Key},
				Commit:        true,
				InternalCommitTrigger: &proto.InternalCommitTrigger{
					SplitTrigger: &proto.SplitTrigger{
						UpdatedDesc: updatedDesc,
						NewDesc:     *newDesc,
					},
					Intents: []proto.Key{desc1Key, desc2Key},
				},
			},
			Reply: &proto.EndTransactionResponse{},
		})
		return txn.Run(b)
	}); err != nil {
		reply.SetGoError(util.Errorf("split at key %s failed: %s", splitKey, err))
	}
}
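MVCCFindSplitKey is asked for a key that roughly equipartitions the range by size. A toy version of that idea over an in-memory, sorted list of key sizes; toyKV and findSplitKey are illustrative names and this is unrelated to the real MVCC implementation:

package main

import (
	"errors"
	"fmt"
)

type toyKV struct {
	Key  string
	Size int64 // approximate on-disk size of the key/value pair
}

// findSplitKey walks the sorted key/value pairs and returns the first key at
// which the cumulative size reaches half of the total, which roughly splits
// the range into two halves of equal size.
func findSplitKey(sorted []toyKV) (string, error) {
	var total int64
	for _, kv := range sorted {
		total += kv.Size
	}
	var running int64
	for _, kv := range sorted {
		running += kv.Size
		if running*2 >= total {
			return kv.Key, nil
		}
	}
	return "", errors.New("unable to determine split key")
}

func main() {
	kvs := []toyKV{
		{"a", 100}, {"b", 300}, {"c", 200}, {"d", 250}, {"e", 150},
	}
	splitKey, err := findSplitKey(kvs)
	if err != nil {
		panic(err)
	}
	fmt.Println("split at", splitKey) // "c": cumulative 600 of 1000 bytes reached
}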
Example 11
// InternalRangeLookup is used to look up RangeDescriptors - a RangeDescriptor
// is a metadata structure which describes the key range and replica locations
// of a distinct range in the cluster.
//
// RangeDescriptors are stored as values in the cockroach cluster's key-value
// store. However, they are always stored using special "Range Metadata keys",
// which are "ordinary" keys with a special prefix prepended. The Range Metadata
// Key for an ordinary key can be generated with the `keys.RangeMetaKey(key)`
// function. The RangeDescriptor for the range which contains a given key can be
// retrieved by generating its Range Metadata Key and dispatching it to
// InternalRangeLookup.
//
// Note that the Range Metadata Key sent to InternalRangeLookup is NOT the key
// at which the desired RangeDescriptor is stored. Instead, this method returns
// the RangeDescriptor stored at the _lowest_ existing key which is _greater_
// than the given key. The returned RangeDescriptor will thus contain the
// ordinary key which was originally used to generate the Range Metadata Key
// sent to InternalRangeLookup.
//
// The "Range Metadata Key" for a range is built by appending the end key of
// the range to the meta[12] prefix because the RocksDB iterator only supports
// a Seek() interface which acts as a Ceil(). Using the start key of the range
// would cause Seek() to find the key after the meta indexing record we're
// looking for, which would result in having to back the iterator up, an option
// which is both less efficient and not available in all cases.
//
// This method has an important optimization: instead of just returning the
// requested RangeDescriptor, it also returns a slice of additional range
// descriptors immediately consecutive to the desired RangeDescriptor. This is
// intended to serve as a sort of caching pre-fetch, so that the requesting
// nodes can aggressively cache RangeDescriptors which are likely to be desired
// by their current workload.
func (r *Range) InternalRangeLookup(batch engine.Engine, args *proto.InternalRangeLookupRequest, reply *proto.InternalRangeLookupResponse) []proto.Intent {
	if err := keys.ValidateRangeMetaKey(args.Key); err != nil {
		reply.SetGoError(err)
		return nil
	}

	rangeCount := int64(args.MaxRanges)
	if rangeCount < 1 {
		reply.SetGoError(util.Errorf(
			"Range lookup specified invalid maximum range count %d: must be > 0", rangeCount))
		return nil
	}
	if args.IgnoreIntents {
		rangeCount = 1 // simplify lookup because we may have to retry to read new
	}

	// We want to search for the metadata key just greater than args.Key. Scan
	// for both the requested key and the keys immediately afterwards, up to
	// MaxRanges.
	startKey, endKey := keys.MetaScanBounds(args.Key)
	// Scan inconsistently. Any intents encountered are bundled up, but other-
	// wise ignored.
	kvs, intents, err := engine.MVCCScan(batch, startKey, endKey, rangeCount,
		args.Timestamp, false /* !consistent */, args.Txn)
	if err != nil {
		// An error here would likely amount to something seriously going
		// wrong.
		reply.SetGoError(err)
		return nil
	}
	if args.IgnoreIntents && len(intents) > 0 {
		// NOTE (subtle): in general, we want to try to clean up dangling
		// intents on meta records. However, if we're in the process of
		// cleaning up a dangling intent on a meta record by pushing the
		// transaction, we don't want to create an infinite loop:
		//
		// intent! -> push-txn -> range-lookup -> intent! -> etc...
		//
		// Instead we want:
		//
		// intent! -> push-txn -> range-lookup -> ignore intent, return old/new ranges
		//
		// On the range-lookup from a push transaction, we therefore
		// want to suppress WriteIntentErrors and return a value
		// anyway. But which value? We don't know whether the range
		// update succeeded or failed, but if we don't return the
		// correct range descriptor we may not be able to find the
		// transaction to push. Since we cannot know the correct answer,
		// we choose randomly between the pre- and post- transaction
		// values. If we guess wrong, the client will try again and get
		// the other value (within a few tries).
		if rand.Intn(2) == 0 {
			key, txn := intents[0].Key, &intents[0].Txn
			val, _, err := engine.MVCCGet(batch, key, txn.Timestamp, true, txn)
			if err != nil {
				reply.SetGoError(err)
				return nil
			}
			kvs = []proto.KeyValue{{Key: key, Value: *val}}
		}
	}

	if len(kvs) == 0 {
		// No matching results were returned from the scan. This could
		// indicate a very bad system error, but for now we will just
		// treat it as a retryable Key Mismatch error.
		err := proto.NewRangeKeyMismatchError(args.Key, args.EndKey, r.Desc())
		reply.SetGoError(err)
		log.Errorf("InternalRangeLookup dispatched to correct range, but no matching RangeDescriptor was found. %s", err)
		return nil
	}

	// Decode all scanned range descriptors, stopping if a range is encountered
	// which does not have the same metadata prefix as the queried key.
	rds := make([]proto.RangeDescriptor, len(kvs))
	for i := range kvs {
		// TODO(tschottdorf) Candidate for a ReplicaCorruptionError, once we
		// introduce that.
		if err = gogoproto.Unmarshal(kvs[i].Value.Bytes, &rds[i]); err != nil {
			reply.SetGoError(err)
			return nil
		}
	}

	reply.Ranges = rds
	return intents
}
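The doc comment above describes a range's metadata key as the range's end key appended to a meta prefix, and the scan starting just past the queried key so that the lowest strictly greater metadata key is found. A toy sketch of that key construction and the corresponding scan bounds, using plain strings and a made-up prefix rather than the real keys package (the real code uses two metadata levels and a proper prefix-end computation):

package main

import "fmt"

// meta2Prefix is a toy stand-in for the real meta2 key prefix; the real system
// has two metadata levels (meta1 and meta2) and does not use plain strings.
const meta2Prefix = "\x00meta2"

// rangeMetaKey builds the metadata key for a range by appending the range's
// end key to the meta prefix, as described in the doc comment above.
func rangeMetaKey(endKey string) string {
	return meta2Prefix + endKey
}

// metaScanBounds returns toy bounds for finding the lowest metadata key
// strictly greater than metaKey: start just past the key itself, end at a
// rough upper bound of the meta2 keyspace.
func metaScanBounds(metaKey string) (start, end string) {
	return metaKey + "\x00", meta2Prefix + "\xff"
}

func main() {
	mk := rangeMetaKey("f") // metadata key for the range ending at "f"
	start, end := metaScanBounds(rangeMetaKey("c"))
	fmt.Printf("meta key: %q\nscan bounds: [%q, %q)\n", mk, start, end)
}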
Example 12
// executeCmd switches over the method and multiplexes to execute the
// appropriate storage API command. It returns an error and, for some calls
// such as inconsistent reads, the intents they skipped.
func (r *Range) executeCmd(batch engine.Engine, ms *engine.MVCCStats, args proto.Request, reply proto.Response) ([]proto.Intent, error) {
	// Verify key is contained within range here to catch any range split
	// or merge activity.
	header := args.Header()
	if !r.ContainsKeyRange(header.Key, header.EndKey) {
		err := proto.NewRangeKeyMismatchError(header.Key, header.EndKey, r.Desc())
		reply.Header().SetGoError(err)
		return nil, err
	}

	// If a unittest filter was installed, check for an injected error; otherwise, continue.
	if TestingCommandFilter != nil && TestingCommandFilter(args, reply) {
		return nil, reply.Header().GoError()
	}

	var intents []proto.Intent
	switch tArgs := args.(type) {
	case *proto.GetRequest:
		intents = r.Get(batch, tArgs, reply.(*proto.GetResponse))
	case *proto.PutRequest:
		r.Put(batch, ms, tArgs, reply.(*proto.PutResponse))
	case *proto.ConditionalPutRequest:
		r.ConditionalPut(batch, ms, tArgs, reply.(*proto.ConditionalPutResponse))
	case *proto.IncrementRequest:
		r.Increment(batch, ms, tArgs, reply.(*proto.IncrementResponse))
	case *proto.DeleteRequest:
		r.Delete(batch, ms, tArgs, reply.(*proto.DeleteResponse))
	case *proto.DeleteRangeRequest:
		r.DeleteRange(batch, ms, tArgs, reply.(*proto.DeleteRangeResponse))
	case *proto.ScanRequest:
		intents = r.Scan(batch, tArgs, reply.(*proto.ScanResponse))
	case *proto.EndTransactionRequest:
		r.EndTransaction(batch, ms, tArgs, reply.(*proto.EndTransactionResponse))
	case *proto.InternalRangeLookupRequest:
		intents = r.InternalRangeLookup(batch, tArgs, reply.(*proto.InternalRangeLookupResponse))
	case *proto.InternalHeartbeatTxnRequest:
		r.InternalHeartbeatTxn(batch, ms, tArgs, reply.(*proto.InternalHeartbeatTxnResponse))
	case *proto.InternalGCRequest:
		r.InternalGC(batch, ms, tArgs, reply.(*proto.InternalGCResponse))
	case *proto.InternalPushTxnRequest:
		r.InternalPushTxn(batch, ms, tArgs, reply.(*proto.InternalPushTxnResponse))
	case *proto.InternalResolveIntentRequest:
		r.InternalResolveIntent(batch, ms, tArgs, reply.(*proto.InternalResolveIntentResponse))
	case *proto.InternalMergeRequest:
		r.InternalMerge(batch, ms, tArgs, reply.(*proto.InternalMergeResponse))
	case *proto.InternalTruncateLogRequest:
		r.InternalTruncateLog(batch, ms, tArgs, reply.(*proto.InternalTruncateLogResponse))
	case *proto.InternalLeaderLeaseRequest:
		r.InternalLeaderLease(batch, ms, tArgs, reply.(*proto.InternalLeaderLeaseResponse))
	default:
		return nil, util.Errorf("unrecognized command %s", args.Method())
	}

	if log.V(2) {
		log.Infof("executed %s command %+v: %+v", args.Method(), args, reply)
	}

	// Update the node clock with the serviced request. This maintains a
	// high water mark for all ops serviced, so that received ops
	// without a timestamp specified are guaranteed one higher than any
	// op already executed for overlapping keys.
	r.rm.Clock().Update(header.Timestamp)

	// Propagate the request timestamp (which may have changed).
	reply.Header().Timestamp = header.Timestamp

	err := reply.Header().GoError()

	// A ReadWithinUncertaintyIntervalError contains the timestamp of the value
	// that provoked the conflict. However, we forward the timestamp to the
	// node's time here. The reason is that the caller (which is always
	// transactional when this error occurs) in our implementation wants to
	// use this information to extract a timestamp after which reads from
	// the nodes are causally consistent with the transaction. This allows
	// the node to be classified as without further uncertain reads for the
	// remainder of the transaction.
	// See the comment on proto.Transaction.CertainNodes.
	if tErr, ok := reply.Header().GoError().(*proto.ReadWithinUncertaintyIntervalError); ok && tErr != nil {
		// Note that we can use this node's clock (which may be different from
		// other replicas') because this error attaches the existing timestamp
		// to the node itself when retrying.
		tErr.ExistingTimestamp.Forward(r.rm.Clock().Now())
	}

	// Return the error (if any) set in the reply.
	return intents, err
}
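executeCmd consults TestingCommandFilter so that unit tests can inject an error before a command runs. A self-contained sketch of that package-level test hook pattern, with toy request and response types (toyRequest, toyResponse, and testingCommandFilter are illustrative names, not the real API):

package main

import (
	"errors"
	"fmt"
)

type toyRequest struct{ Key string }

type toyResponse struct{ Err error }

// testingCommandFilter, when non-nil, may set an error on the response and
// return true to short-circuit execution, mirroring the TestingCommandFilter
// hook consulted in executeCmd above.
var testingCommandFilter func(*toyRequest, *toyResponse) bool

func executeCmd(req *toyRequest, resp *toyResponse) error {
	if testingCommandFilter != nil && testingCommandFilter(req, resp) {
		return resp.Err
	}
	// The normal execution path would go here.
	return nil
}

func main() {
	// A test installs a filter that fails every command touching key "a".
	testingCommandFilter = func(req *toyRequest, resp *toyResponse) bool {
		if req.Key == "a" {
			resp.Err = errors.New("injected error")
			return true
		}
		return false
	}
	fmt.Println(executeCmd(&toyRequest{Key: "a"}, &toyResponse{})) // injected error
	fmt.Println(executeCmd(&toyRequest{Key: "b"}, &toyResponse{})) // <nil>
}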