func (db *testSender) sendOne(call proto.Call) {
	switch call.Args.(type) {
	case *proto.EndTransactionRequest:
		safeSetGoError(call.Reply, util.Errorf("%s method not supported", call.Method()))
		return
	}
	// Lookup range and direct request.
	header := call.Args.Header()
	if rng := db.store.LookupRange(header.Key, header.EndKey); rng != nil {
		header.RangeID = rng.Desc().RangeID
		replica := rng.GetReplica()
		if replica == nil {
			safeSetGoError(call.Reply, util.Errorf("own replica missing in range"))
			return
		}
		header.Replica = *replica
		reply, err := db.store.ExecuteCmd(context.Background(), call.Args)
		if reply != nil {
			gogoproto.Merge(call.Reply, reply)
		}
		if call.Reply.Header().Error != nil {
			panic(proto.ErrorUnexpectedlySet)
		}
		if err != nil {
			call.Reply.Header().SetGoError(err)
		}
	} else {
		safeSetGoError(call.Reply, proto.NewRangeKeyMismatchError(header.Key, header.EndKey, nil))
	}
}
// lookupReplica looks up replica by key [range]. Lookups are done
// by consulting each store in turn via Store.LookupReplica(key).
// Returns RangeID and replica on success; RangeKeyMismatch error
// if not found.
// This is only for testing usage; performance doesn't matter.
func (ls *LocalSender) lookupReplica(start, end proto.Key) (rangeID proto.RangeID, replica *proto.Replica, err error) {
	ls.mu.RLock()
	defer ls.mu.RUnlock()
	var rng *storage.Replica
	for _, store := range ls.storeMap {
		rng = store.LookupReplica(start, end)
		if rng == nil {
			if tmpRng := store.LookupReplica(start, nil); tmpRng != nil {
				log.Warningf("range not contained in one range: [%s,%s), but have [%s,%s)",
					start, end, tmpRng.Desc().StartKey, tmpRng.Desc().EndKey)
			}
			continue
		}
		if replica == nil {
			rangeID = rng.Desc().RangeID
			replica = rng.GetReplica()
			continue
		}
		// Should never happen outside of tests.
		return 0, nil, util.Errorf(
			"range %+v exists on additional store: %+v", rng, store)
	}
	if replica == nil {
		err = proto.NewRangeKeyMismatchError(start, end, nil)
	}
	return rangeID, replica, err
}
func (db *testDB) executeCmd(method string, args proto.Request, replyChan interface{}) {
	reply := reflect.New(reflect.TypeOf(replyChan).Elem().Elem()).Interface().(proto.Response)
	if rng := db.store.LookupRange(args.Header().Key, args.Header().EndKey); rng != nil {
		args.Header().Replica = *rng.Meta.GetReplica()
		db.store.ExecuteCmd(method, args, reply)
	} else {
		reply.Header().SetGoError(proto.NewRangeKeyMismatchError(args.Header().Key, args.Header().EndKey, nil))
	}
	reflect.ValueOf(replyChan).Send(reflect.ValueOf(reply))
}
// lookupReplica looks up replica by key [range]. Lookups are done
// by consulting each store in turn via Store.LookupRange(key).
func (kv *LocalKV) lookupReplica(start, end engine.Key) (*proto.Replica, error) {
	kv.mu.RLock()
	defer kv.mu.RUnlock()
	for _, store := range kv.storeMap {
		if rng := store.LookupRange(start, end); rng != nil {
			return rng.Meta.GetReplica(), nil
		}
	}
	return nil, proto.NewRangeKeyMismatchError(start, end, nil)
}
// InternalRangeLookup is used to look up RangeDescriptors - a RangeDescriptor
// is a metadata structure which describes the key range and replica locations
// of a distinct range in the cluster.
//
// RangeDescriptors are stored as values in the cockroach cluster's key-value
// store. However, they are always stored using special "Range Metadata keys",
// which are "ordinary" keys with a special prefix prepended. The Range Metadata
// Key for an ordinary key can be generated with the `engine.RangeMetaKey(key)`
// function. The RangeDescriptor for the range which contains a given key can be
// retrieved by generating its Range Metadata Key and dispatching it to
// InternalRangeLookup.
//
// Note that the Range Metadata Key sent to InternalRangeLookup is NOT the key
// at which the desired RangeDescriptor is stored. Instead, this method returns
// the RangeDescriptor stored at the _lowest_ existing key which is _greater_
// than the given key. The returned RangeDescriptor will thus contain the
// ordinary key which was originally used to generate the Range Metadata Key
// sent to InternalRangeLookup.
//
// This method has an important optimization: instead of just returning the
// requested RangeDescriptor, it also returns a slice of additional range
// descriptors immediately consecutive to the desired RangeDescriptor. This is
// intended to serve as a sort of caching pre-fetch, so that the requesting
// nodes can aggressively cache RangeDescriptors which are likely to be desired
// by their current workload.
func (r *Range) InternalRangeLookup(args *proto.InternalRangeLookupRequest, reply *proto.InternalRangeLookupResponse) {
	if err := engine.ValidateRangeMetaKey(args.Key); err != nil {
		reply.SetGoError(err)
		return
	}

	rangeCount := int64(args.MaxRanges)
	if rangeCount < 1 {
		reply.SetGoError(util.Errorf(
			"Range lookup specified invalid maximum range count %d: must be > 0", rangeCount))
		return
	}

	// We want to search for the metadata key just greater than args.Key. Scan
	// for both the requested key and the keys immediately afterwards, up to
	// MaxRanges.
	metaPrefix := args.Key[:len(engine.KeyMeta1Prefix)]
	nextKey := engine.NextKey(args.Key)
	kvs, err := r.engine.Scan(nextKey, engine.PrefixEndKey(metaPrefix), rangeCount)
	if err != nil {
		reply.SetGoError(err)
		return
	}

	// The initial key must have the same metadata level prefix as we queried.
	if len(kvs) == 0 {
		// At this point the range has been verified to contain the requested
		// key, but no matching results were returned from the scan. This could
		// indicate a very bad system error, but for now we will just treat it
		// as a retryable Key Mismatch error.
		err := proto.NewRangeKeyMismatchError(args.Key, args.Key, r.Meta)
		reply.SetGoError(err)
		log.Errorf("InternalRangeLookup dispatched to correct range, but no matching RangeDescriptor was found. %s", err)
		return
	}

	// Decode all scanned range descriptors, stopping if a range is encountered
	// which does not have the same metadata prefix as the queried key.
	rds := make([]proto.RangeDescriptor, len(kvs))
	for i := range kvs {
		if err = gogoproto.Unmarshal(kvs[i].Value, &rds[i]); err != nil {
			reply.SetGoError(err)
			return
		}
	}

	reply.Ranges = rds
	return
}
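// A minimal, self-contained sketch (not CockroachDB code) of the lookup rule the
// comment above describes: descriptors are indexed under their range's end key,
// so finding the descriptor for an ordinary key means finding the lowest indexed
// key strictly greater than that key's Range Metadata Key. The types and helper
// below are hypothetical and exist only to illustrate the ceiling-style search;
// ranges are assumed contiguous and sorted by end key.
package main

import (
	"fmt"
	"sort"
)

type rangeDesc struct {
	startKey, endKey string // covers [startKey, endKey)
}

// descriptorFor mirrors "lowest existing key greater than the given key": it
// returns the descriptor whose (exclusive) end key is the first one above key.
func descriptorFor(sorted []rangeDesc, key string) (rangeDesc, bool) {
	i := sort.Search(len(sorted), func(i int) bool { return sorted[i].endKey > key })
	if i == len(sorted) {
		return rangeDesc{}, false
	}
	return sorted[i], true
}

func main() {
	ranges := []rangeDesc{{"a", "f"}, {"f", "m"}, {"m", "\xff"}}
	// End keys are exclusive, so "f" belongs to the second range, not the first.
	fmt.Println(descriptorFor(ranges, "f"))
	fmt.Println(descriptorFor(ranges, "kitchen"))
}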
// AddCmd adds a command for execution on this range. The command's
// affected keys are verified to be contained within the range and the
// range's leadership is confirmed. The command is then dispatched
// either along the read-only execution path or the read-write Raft
// command queue. If wait is false, read-write commands are added to
// Raft without waiting for their completion.
func (r *Range) AddCmd(ctx context.Context, call proto.Call, wait bool) error {
	args, reply := call.Args, call.Reply
	header := args.Header()
	if !r.ContainsKeyRange(header.Key, header.EndKey) {
		err := proto.NewRangeKeyMismatchError(header.Key, header.EndKey, r.Desc())
		reply.Header().SetGoError(err)
		return err
	}

	// Differentiate between admin, read-only and read-write.
	if proto.IsAdmin(args) {
		return r.addAdminCmd(ctx, args, reply)
	} else if proto.IsReadOnly(args) {
		return r.addReadOnlyCmd(ctx, args, reply)
	}
	return r.addWriteCmd(ctx, args, reply, wait)
}
// ExecuteCmd fetches a range based on the header's replica, assembles
// method, args & reply into a Raft Cmd struct and executes the
// command using the fetched range.
func (s *Store) ExecuteCmd(method string, args proto.Request, reply proto.Response) error {
	// If the request has a zero timestamp, initialize to this node's clock.
	header := args.Header()
	if header.Timestamp.WallTime == 0 && header.Timestamp.Logical == 0 {
		// Update both incoming and outgoing timestamps.
		now := s.clock.Now()
		args.Header().Timestamp = now
		reply.Header().Timestamp = now
	} else {
		// Otherwise, update our clock with the incoming request. This
		// advances the local node's clock to a high water mark from
		// amongst all nodes with which it has interacted. The update is
		// bounded by the max clock drift.
		_, err := s.clock.Update(header.Timestamp)
		if err != nil {
			return err
		}
	}

	// Verify specified range contains the command's implicated keys.
	rng, err := s.GetRange(header.Replica.RangeID)
	if err != nil {
		return err
	}
	if !rng.ContainsKeyRange(header.Key, header.EndKey) {
		return proto.NewRangeKeyMismatchError(header.Key, header.EndKey, rng.Meta)
	}
	if !rng.IsLeader() {
		// TODO(spencer): when we happen to know the leader, fill it in here via replica.
		return &proto.NotLeaderError{}
	}

	// Differentiate between read-only and read-write.
	if IsReadOnly(method) {
		return rng.ReadOnlyCmd(method, args, reply)
	}
	return rng.ReadWriteCmd(method, args, reply)
}
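// A minimal sketch (not the real hlc package) of the clock behavior the comment
// above relies on: an update advances a node-local hybrid timestamp to a
// high-water mark of everything it has seen, but rejects remote wall times that
// lead the local physical clock by more than the configured maximum drift.
// All names and the maxDrift value here are hypothetical.
package main

import (
	"errors"
	"fmt"
	"time"
)

type timestamp struct {
	WallTime int64 // nanoseconds since epoch
	Logical  int32
}

type clock struct {
	maxDrift time.Duration
	last     timestamp
}

// update returns a timestamp at least as large as both the last one issued and
// the remote timestamp, erroring out if the remote time is implausibly far ahead.
func (c *clock) update(remote timestamp) (timestamp, error) {
	phys := time.Now().UnixNano()
	if remote.WallTime > phys+c.maxDrift.Nanoseconds() {
		return c.last, errors.New("remote wall time exceeds maximum clock drift")
	}
	// Take the high-water mark of the last timestamp and the remote one.
	if remote.WallTime > c.last.WallTime ||
		(remote.WallTime == c.last.WallTime && remote.Logical > c.last.Logical) {
		c.last = remote
	}
	// Fold in the local physical clock; bump the logical counter when the
	// wall time cannot advance, so issued timestamps stay strictly increasing.
	if phys > c.last.WallTime {
		c.last = timestamp{WallTime: phys}
	} else {
		c.last.Logical++
	}
	return c.last, nil
}

func main() {
	c := &clock{maxDrift: 250 * time.Millisecond}
	fmt.Println(c.update(timestamp{WallTime: time.Now().UnixNano()}))
}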
// lookupReplica looks up replica by key [range]. Lookups are done
// by consulting each store in turn via Store.LookupRange(key).
// Returns RaftID and replica on success; RangeKeyMismatch error
// if not found.
// TODO(tschottdorf) with a very large number of stores, the LocalSender
// may want to avoid scanning the whole map of stores on each invocation.
func (ls *LocalSender) lookupReplica(start, end proto.Key) (raftID proto.RaftID, replica *proto.Replica, err error) {
	ls.mu.RLock()
	defer ls.mu.RUnlock()
	var rng *storage.Range
	for _, store := range ls.storeMap {
		rng = store.LookupRange(start, end)
		if rng == nil {
			continue
		}
		if replica == nil {
			raftID = rng.Desc().RaftID
			replica = rng.GetReplica()
			continue
		}
		// Should never happen outside of tests.
		return 0, nil, util.Errorf(
			"range %+v exists on additional store: %+v", rng, store)
	}
	if replica == nil {
		err = proto.NewRangeKeyMismatchError(start, end, nil)
	}
	return raftID, replica, err
}
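// One way to address the TODO above, sketched with hypothetical types: keep a
// secondary index of range start keys to owning store, sorted by start key, so
// that a lookup is a binary search instead of a scan over every store. This is
// an illustration of the idea only, not an API that exists in the package.
package main

import (
	"bytes"
	"fmt"
	"sort"
)

type storeID int32

type rangeIndexEntry struct {
	startKey, endKey []byte // covers [startKey, endKey)
	store            storeID
}

// rangeIndex is kept sorted by startKey; entries are non-overlapping.
type rangeIndex []rangeIndexEntry

// lookup returns the store holding the range that contains key, if any.
func (idx rangeIndex) lookup(key []byte) (storeID, bool) {
	// Find the first entry whose start key is strictly greater than key,
	// then step back one: that is the only candidate that could contain key.
	i := sort.Search(len(idx), func(i int) bool { return bytes.Compare(idx[i].startKey, key) > 0 })
	if i == 0 {
		return 0, false
	}
	if e := idx[i-1]; bytes.Compare(key, e.endKey) < 0 {
		return e.store, true
	}
	return 0, false
}

func main() {
	idx := rangeIndex{
		{startKey: []byte("a"), endKey: []byte("m"), store: 1},
		{startKey: []byte("m"), endKey: []byte("z"), store: 2},
	}
	fmt.Println(idx.lookup([]byte("q"))) // 2 true
}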
func (r *Range) checkCmdHeader(header *proto.RequestHeader) error {
	if !r.ContainsKeyRange(header.Key, header.EndKey) {
		return proto.NewRangeKeyMismatchError(header.Key, header.EndKey, r.Desc())
	}
	return nil
}
// AdminSplit divides the range into two ranges, using either
// args.SplitKey (if provided) or an internally computed key that aims to
// roughly equipartition the range by size. The split is done inside of
// a distributed txn which writes updated and new range descriptors, and
// updates the range addressing metadata. The handover of responsibility for
// the reassigned key range is carried out seamlessly through a split trigger
// carried out as part of the commit of that transaction.
func (r *Range) AdminSplit(args *proto.AdminSplitRequest, reply *proto.AdminSplitResponse) {
	// Only allow a single split per range at a time.
	r.metaLock.Lock()
	defer r.metaLock.Unlock()

	// Determine split key if not provided with args. This scan is
	// allowed to be relatively slow because admin commands don't block
	// other commands.
	desc := r.Desc()
	splitKey := proto.Key(args.SplitKey)
	if len(splitKey) == 0 {
		snap := r.rm.NewSnapshot()
		defer snap.Close()
		var err error
		if splitKey, err = engine.MVCCFindSplitKey(snap, desc.RaftID, desc.StartKey, desc.EndKey); err != nil {
			reply.SetGoError(util.Errorf("unable to determine split key: %s", err))
			return
		}
	}
	// Verify this condition first so that we do not return a
	// proto.RangeKeyMismatchError when splitKey equals desc.EndKey;
	// otherwise that error would cause an infinite retry loop.
	if splitKey.Equal(desc.StartKey) || splitKey.Equal(desc.EndKey) {
		reply.SetGoError(util.Errorf("range is already split at key %s", splitKey))
		return
	}
	// Verify some properties of split key.
	if !r.ContainsKey(splitKey) {
		reply.SetGoError(proto.NewRangeKeyMismatchError(splitKey, splitKey, desc))
		return
	}
	if !engine.IsValidSplitKey(splitKey) {
		reply.SetGoError(util.Errorf("cannot split range at key %s", splitKey))
		return
	}

	// Create new range descriptor with newly-allocated replica IDs and Raft IDs.
	newDesc, err := r.rm.NewRangeDescriptor(splitKey, desc.EndKey, desc.Replicas)
	if err != nil {
		reply.SetGoError(util.Errorf("unable to allocate new range descriptor: %s", err))
		return
	}

	// Init updated version of existing range descriptor.
	updatedDesc := *desc
	updatedDesc.EndKey = splitKey

	log.Infof("initiating a split of %s at key %s", r, splitKey)

	if err = r.rm.DB().Txn(func(txn *client.Txn) error {
		// Create range descriptor for second half of split.
		// Note that this put must go first in order to locate the
		// transaction record on the correct range.
		b := &client.Batch{}
		desc1Key := keys.RangeDescriptorKey(newDesc.StartKey)
		if err := updateRangeDescriptor(b, desc1Key, nil, newDesc); err != nil {
			return err
		}
		// Update existing range descriptor for first half of split.
		desc2Key := keys.RangeDescriptorKey(updatedDesc.StartKey)
		if err := updateRangeDescriptor(b, desc2Key, desc, &updatedDesc); err != nil {
			return err
		}
		// Update range descriptor addressing record(s).
		if err := splitRangeAddressing(b, newDesc, &updatedDesc); err != nil {
			return err
		}
		if err := txn.Run(b); err != nil {
			return err
		}
		// Update the RangeTree.
		b = &client.Batch{}
		if err := InsertRange(txn, b, newDesc.StartKey); err != nil {
			return err
		}
		// End the transaction manually, instead of letting RunTransaction
		// loop do it, in order to provide a split trigger.
		b.InternalAddCall(proto.Call{
			Args: &proto.EndTransactionRequest{
				RequestHeader: proto.RequestHeader{Key: args.Key},
				Commit:        true,
				InternalCommitTrigger: &proto.InternalCommitTrigger{
					SplitTrigger: &proto.SplitTrigger{
						UpdatedDesc: updatedDesc,
						NewDesc:     *newDesc,
					},
					Intents: []proto.Key{desc1Key, desc2Key},
				},
			},
			Reply: &proto.EndTransactionResponse{},
		})
		return txn.Run(b)
	}); err != nil {
		reply.SetGoError(util.Errorf("split at key %s failed: %s", splitKey, err))
	}
}
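// A toy illustration (not engine.MVCCFindSplitKey) of what "roughly
// equipartition the range by size" means in the doc comment above: walk keys in
// order, accumulate their sizes, and pick the first key at or past half of the
// total bytes. The types below are hypothetical and only make the idea concrete.
package main

import "fmt"

type sizedKey struct {
	key   string
	bytes int64 // approximate size of key plus value
}

// findSplitKey returns the key at which the cumulative size first reaches half
// of the total, splitting the data into two portions of roughly equal size.
func findSplitKey(keys []sizedKey) (string, bool) {
	var total int64
	for _, k := range keys {
		total += k.bytes
	}
	if total == 0 {
		return "", false
	}
	var sum int64
	for _, k := range keys {
		sum += k.bytes
		if sum >= total/2 {
			return k.key, true
		}
	}
	return "", false
}

func main() {
	keys := []sizedKey{{"a", 100}, {"b", 300}, {"c", 150}, {"d", 250}}
	fmt.Println(findSplitKey(keys)) // "b": first key at or past 400 of 800 total bytes
}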
// InternalRangeLookup is used to look up RangeDescriptors - a RangeDescriptor
// is a metadata structure which describes the key range and replica locations
// of a distinct range in the cluster.
//
// RangeDescriptors are stored as values in the cockroach cluster's key-value
// store. However, they are always stored using special "Range Metadata keys",
// which are "ordinary" keys with a special prefix prepended. The Range Metadata
// Key for an ordinary key can be generated with the `keys.RangeMetaKey(key)`
// function. The RangeDescriptor for the range which contains a given key can be
// retrieved by generating its Range Metadata Key and dispatching it to
// InternalRangeLookup.
//
// Note that the Range Metadata Key sent to InternalRangeLookup is NOT the key
// at which the desired RangeDescriptor is stored. Instead, this method returns
// the RangeDescriptor stored at the _lowest_ existing key which is _greater_
// than the given key. The returned RangeDescriptor will thus contain the
// ordinary key which was originally used to generate the Range Metadata Key
// sent to InternalRangeLookup.
//
// The "Range Metadata Key" for a range is built by appending the end key of
// the range to the meta[12] prefix because the RocksDB iterator only supports
// a Seek() interface which acts as a Ceil(). Using the start key of the range
// would cause Seek() to find the key after the meta indexing record we're
// looking for, which would result in having to back the iterator up, an option
// which is both less efficient and not available in all cases.
//
// This method has an important optimization: instead of just returning the
// requested RangeDescriptor, it also returns a slice of additional range
// descriptors immediately consecutive to the desired RangeDescriptor. This is
// intended to serve as a sort of caching pre-fetch, so that the requesting
// nodes can aggressively cache RangeDescriptors which are likely to be desired
// by their current workload.
func (r *Range) InternalRangeLookup(batch engine.Engine, args *proto.InternalRangeLookupRequest, reply *proto.InternalRangeLookupResponse) []proto.Intent {
	if err := keys.ValidateRangeMetaKey(args.Key); err != nil {
		reply.SetGoError(err)
		return nil
	}

	rangeCount := int64(args.MaxRanges)
	if rangeCount < 1 {
		reply.SetGoError(util.Errorf(
			"Range lookup specified invalid maximum range count %d: must be > 0", rangeCount))
		return nil
	}
	if args.IgnoreIntents {
		rangeCount = 1 // simplify lookup because we may have to retry to read new
	}

	// We want to search for the metadata key just greater than args.Key. Scan
	// for both the requested key and the keys immediately afterwards, up to
	// MaxRanges.
	startKey, endKey := keys.MetaScanBounds(args.Key)
	// Scan inconsistently. Any intents encountered are bundled up, but other-
	// wise ignored.
	kvs, intents, err := engine.MVCCScan(batch, startKey, endKey, rangeCount,
		args.Timestamp, false /* !consistent */, args.Txn)
	if err != nil {
		// An error here would likely amount to something seriously going
		// wrong.
		reply.SetGoError(err)
		return nil
	}

	if args.IgnoreIntents && len(intents) > 0 {
		// NOTE (subtle): in general, we want to try to clean up dangling
		// intents on meta records. However, if we're in the process of
		// cleaning up a dangling intent on a meta record by pushing the
		// transaction, we don't want to create an infinite loop:
		//
		// intent! -> push-txn -> range-lookup -> intent! -> etc...
		//
		// Instead we want:
		//
		// intent! -> push-txn -> range-lookup -> ignore intent, return old/new ranges
		//
		// On the range-lookup from a push transaction, we therefore
		// want to suppress WriteIntentErrors and return a value
		// anyway. But which value? We don't know whether the range
		// update succeeded or failed, but if we don't return the
		// correct range descriptor we may not be able to find the
		// transaction to push. Since we cannot know the correct answer,
		// we choose randomly between the pre- and post- transaction
		// values. If we guess wrong, the client will try again and get
		// the other value (within a few tries).
		if rand.Intn(2) == 0 {
			key, txn := intents[0].Key, &intents[0].Txn
			val, _, err := engine.MVCCGet(batch, key, txn.Timestamp, true, txn)
			if err != nil {
				reply.SetGoError(err)
				return nil
			}
			kvs = []proto.KeyValue{{Key: key, Value: *val}}
		}
	}

	if len(kvs) == 0 {
		// No matching results were returned from the scan. This could
		// indicate a very bad system error, but for now we will just
		// treat it as a retryable Key Mismatch error.
		err := proto.NewRangeKeyMismatchError(args.Key, args.EndKey, r.Desc())
		reply.SetGoError(err)
		log.Errorf("InternalRangeLookup dispatched to correct range, but no matching RangeDescriptor was found. %s", err)
		return nil
	}

	// Decode all scanned range descriptors, stopping if a range is encountered
	// which does not have the same metadata prefix as the queried key.
	rds := make([]proto.RangeDescriptor, len(kvs))
	for i := range kvs {
		// TODO(tschottdorf) Candidate for a ReplicaCorruptionError, once we
		// introduce that.
		if err = gogoproto.Unmarshal(kvs[i].Value.Bytes, &rds[i]); err != nil {
			reply.SetGoError(err)
			return nil
		}
	}

	reply.Ranges = rds
	return intents
}
// executeCmd switches over the method and multiplexes to execute the
// appropriate storage API command. It returns an error and, for some calls
// such as inconsistent reads, the intents they skipped.
func (r *Range) executeCmd(batch engine.Engine, ms *engine.MVCCStats, args proto.Request, reply proto.Response) ([]proto.Intent, error) {
	// Verify key is contained within range here to catch any range split
	// or merge activity.
	header := args.Header()
	if !r.ContainsKeyRange(header.Key, header.EndKey) {
		err := proto.NewRangeKeyMismatchError(header.Key, header.EndKey, r.Desc())
		reply.Header().SetGoError(err)
		return nil, err
	}

	// If a unittest filter was installed, check for an injected error; otherwise, continue.
	if TestingCommandFilter != nil && TestingCommandFilter(args, reply) {
		return nil, reply.Header().GoError()
	}

	var intents []proto.Intent
	switch tArgs := args.(type) {
	case *proto.GetRequest:
		intents = r.Get(batch, tArgs, reply.(*proto.GetResponse))
	case *proto.PutRequest:
		r.Put(batch, ms, tArgs, reply.(*proto.PutResponse))
	case *proto.ConditionalPutRequest:
		r.ConditionalPut(batch, ms, tArgs, reply.(*proto.ConditionalPutResponse))
	case *proto.IncrementRequest:
		r.Increment(batch, ms, tArgs, reply.(*proto.IncrementResponse))
	case *proto.DeleteRequest:
		r.Delete(batch, ms, tArgs, reply.(*proto.DeleteResponse))
	case *proto.DeleteRangeRequest:
		r.DeleteRange(batch, ms, tArgs, reply.(*proto.DeleteRangeResponse))
	case *proto.ScanRequest:
		intents = r.Scan(batch, tArgs, reply.(*proto.ScanResponse))
	case *proto.EndTransactionRequest:
		r.EndTransaction(batch, ms, tArgs, reply.(*proto.EndTransactionResponse))
	case *proto.InternalRangeLookupRequest:
		intents = r.InternalRangeLookup(batch, tArgs, reply.(*proto.InternalRangeLookupResponse))
	case *proto.InternalHeartbeatTxnRequest:
		r.InternalHeartbeatTxn(batch, ms, tArgs, reply.(*proto.InternalHeartbeatTxnResponse))
	case *proto.InternalGCRequest:
		r.InternalGC(batch, ms, tArgs, reply.(*proto.InternalGCResponse))
	case *proto.InternalPushTxnRequest:
		r.InternalPushTxn(batch, ms, tArgs, reply.(*proto.InternalPushTxnResponse))
	case *proto.InternalResolveIntentRequest:
		r.InternalResolveIntent(batch, ms, tArgs, reply.(*proto.InternalResolveIntentResponse))
	case *proto.InternalMergeRequest:
		r.InternalMerge(batch, ms, tArgs, reply.(*proto.InternalMergeResponse))
	case *proto.InternalTruncateLogRequest:
		r.InternalTruncateLog(batch, ms, tArgs, reply.(*proto.InternalTruncateLogResponse))
	case *proto.InternalLeaderLeaseRequest:
		r.InternalLeaderLease(batch, ms, tArgs, reply.(*proto.InternalLeaderLeaseResponse))
	default:
		return nil, util.Errorf("unrecognized command %s", args.Method())
	}

	if log.V(2) {
		log.Infof("executed %s command %+v: %+v", args.Method(), args, reply)
	}

	// Update the node clock with the serviced request. This maintains a
	// high water mark for all ops serviced, so that received ops
	// without a timestamp specified are guaranteed one higher than any
	// op already executed for overlapping keys.
	r.rm.Clock().Update(header.Timestamp)

	// Propagate the request timestamp (which may have changed).
	reply.Header().Timestamp = header.Timestamp

	err := reply.Header().GoError()

	// A ReadWithinUncertaintyIntervalError contains the timestamp of the value
	// that provoked the conflict. However, we forward the timestamp to the
	// node's time here. The reason is that the caller (which is always
	// transactional when this error occurs) in our implementation wants to
	// use this information to extract a timestamp after which reads from
	// the nodes are causally consistent with the transaction. This allows
	// the node to be classified as without further uncertain reads for the
	// remainder of the transaction.
	// See the comment on proto.Transaction.CertainNodes.
	if tErr, ok := reply.Header().GoError().(*proto.ReadWithinUncertaintyIntervalError); ok && tErr != nil {
		// Note that we can use this node's clock (which may be different from
		// other replicas') because this error attaches the existing timestamp
		// to the node itself when retrying.
		tErr.ExistingTimestamp.Forward(r.rm.Clock().Now())
	}

	// Return the error (if any) set in the reply.
	return intents, err
}
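// For readers unfamiliar with the Forward call above: forwarding a timestamp
// means ratcheting it up to the later of its current value and the argument,
// never moving it backwards. A minimal sketch of that semantic, using a
// hypothetical timestamp type rather than the proto package's:
package main

import "fmt"

type ts struct {
	WallTime int64
	Logical  int32
}

func (t ts) less(o ts) bool {
	return t.WallTime < o.WallTime || (t.WallTime == o.WallTime && t.Logical < o.Logical)
}

// forward advances t to o if o is later; otherwise t is left unchanged.
func (t *ts) forward(o ts) {
	if t.less(o) {
		*t = o
	}
}

func main() {
	existing := ts{WallTime: 100, Logical: 2}
	existing.forward(ts{WallTime: 150})
	fmt.Println(existing) // {150 0}: advanced to the node's (later) clock reading
	existing.forward(ts{WallTime: 120})
	fmt.Println(existing) // {150 0}: never moves backwards
}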