func (db *testMetadataDB) getMetadata(key engine.Key) []proto.RangeDescriptor { response := make([]proto.RangeDescriptor, 0, 3) for i := 0; i < 3; i++ { v := db.data.Ceil(testMetadataNode{ &proto.RangeDescriptor{ EndKey: engine.NextKey(key), }, }) if v == nil { break } response = append(response, *(v.(testMetadataNode).RangeDescriptor)) key = engine.NextKey(response[i].EndKey) } return response }
// AddWrite adds a pending write which affects the specified key range. // If end is nil, it is set to start, meaning the write affects a single // key. The returned interface is the key for the write and must be // re-supplied on subsequent invocation of RemoveWrite(). // // AddWrite is invoked as a mutating command is added to the queue of // Raft proposals. As multiple commands may be in the proposal state, // writes may overlap. func (rq *ReadQueue) AddWrite(start, end engine.Key) interface{} { if end == nil { end = engine.NextKey(start) } key := rq.cache.NewKey(rangeKey(start), rangeKey(end)) rq.cache.Add(key, &write{}) return key }
// AddRead adds a read to the queue for the specified key range. If // end is nil, end is set to start, meaning the read affects a single // key. The supplied WaitGroup is incremented according to the number // of pending writes with key(s) overlapping the key (start==end) or // key range [start, end). The caller should call wg.Wait() to wait // for confirmation that all pending writes have completed or failed. func (rq *ReadQueue) AddRead(start, end engine.Key, wg *sync.WaitGroup) { if end == nil { end = engine.NextKey(start) } for _, w := range rq.cache.GetOverlaps(rangeKey(start), rangeKey(end)) { w := w.(*write) w.pending = append(w.pending, wg) wg.Add(1) } }
// GetMax returns the maximum read timestamp covering any part of the // interval spanning from start to end keys. If no part of the // specified range is overlapped by read timestamps in the cache, the // high water timestamp is returned. func (rtc *ReadTimestampCache) GetMax(start, end engine.Key) hlc.HLTimestamp { if end == nil { end = engine.NextKey(start) } max := rtc.highWater for _, v := range rtc.cache.GetOverlaps(rangeKey(start), rangeKey(end)) { ts := v.(hlc.HLTimestamp) if max.Less(ts) { max = ts } } return max }
// Add the specified timestamp to the cache as covering the range of // keys from start to end. If end is nil, the range covers the start // key only. func (tc *TimestampCache) Add(start, end engine.Key, timestamp proto.Timestamp) { if end == nil { end = engine.NextKey(start) } if tc.latest.Less(timestamp) { tc.latest = timestamp } // Only add to the cache if the timestamp is more recent than the // high water mark. if tc.highWater.Less(timestamp) { tc.cache.Add(tc.cache.NewKey(rangeKey(start), rangeKey(end)), timestamp) } }
// InternalRangeLookup is used to look up RangeDescriptors - a RangeDescriptor // is a metadata structure which describes the key range and replica locations // of a distinct range in the cluster. // // RangeDescriptors are stored as values in the cockroach cluster's key-value // store. However, they are always stored using special "Range Metadata keys", // which are "ordinary" keys with a special prefix appended. The Range Metadata // Key for an ordinary key can be generated with the `engine.RangeMetaKey(key)` // function. The RangeDescriptor for the range which contains a given key can be // retrieved by generating its Range Metadata Key and dispatching it to // InternalRangeLookup. // // Note that the Range Metadata Key sent to InternalRangeLookup is NOT the key // at which the desired RangeDescriptor is stored. Instead, this method returns // the RangeDescriptor stored at the _lowest_ existing key which is _greater_ // than the given key. The returned RangeDescriptor will thus contain the // ordinary key which was originally used to generate the Range Metadata Key // sent to InternalRangeLookup. // // This method has an important optimization: instead of just returning the // request RangeDescriptor, it also returns a slice of additional range // descriptors immediately consecutive to the desired RangeDescriptor. This is // intended to serve as a sort of caching pre-fetch, so that the requesting // nodes can aggressively cache RangeDescriptors which are likely to be desired // by their current workload. func (r *Range) InternalRangeLookup(args *InternalRangeLookupRequest, reply *InternalRangeLookupResponse) { if err := engine.ValidateRangeMetaKey(args.Key); err != nil { reply.Error = err return } rangeCount := int64(args.MaxRanges) if rangeCount < 1 { reply.Error = util.Errorf( "Range lookup specified invalid maximum range count %d: must be > 0", rangeCount) return } // We want to search for the metadata key just greater than args.Key. Scan // for both the requested key and the keys immediately afterwards, up to // MaxRanges. metaPrefix := args.Key[:len(engine.KeyMeta1Prefix)] nextKey := engine.NextKey(args.Key) kvs, err := r.engine.Scan(nextKey, engine.PrefixEndKey(metaPrefix), rangeCount) if err != nil { reply.Error = err return } // The initial key must have the same metadata level prefix as we queried. if len(kvs) == 0 { // At this point the range has been verified to contain the requested // key, but no matching results were returned from the scan. This could // indicate a very bad system error, but for now we will just treat it // as a retryable Key Mismatch error. reply.Error = NewRangeKeyMismatchError(args.Key, args.Key, r.Meta) log.Errorf("InternalRangeLookup dispatched to correct range, but no matching RangeDescriptor was found. %s", reply.Error.Error()) return } // Decode all scanned range descriptors, stopping if a range is encountered // which does not have the same metadata prefix as the queried key. rds := make([]*RangeDescriptor, 0, len(kvs)) for i := range kvs { rds = append(rds, &RangeDescriptor{}) if err = gob.NewDecoder(bytes.NewBuffer(kvs[i].Value)).Decode(rds[i]); err != nil { reply.Error = err return } } reply.Ranges = rds return }
// InternalRangeLookup looks up the metadata info for the given args.Key. // args.Key should be a metadata key, which are of the form "\0\0meta[12]<encoded_key>". func (r *Range) InternalRangeLookup(args *InternalRangeLookupRequest, reply *InternalRangeLookupResponse) { if !bytes.HasPrefix(args.Key, engine.KeyMetaPrefix) { reply.Error = util.Errorf("invalid metadata key: %q", args.Key) return } // Validate that key is not outside the range. A range ends just // before its Meta.EndKey. if !args.Key.Less(r.Meta.EndKey) { reply.Error = util.Errorf("key outside the range %v with end key %q", r.Meta.RangeID, r.Meta.EndKey) return } // We want to search for the metadata key just greater than args.Key. nextKey := engine.NextKey(args.Key) kvs, err := r.engine.Scan(nextKey, engine.KeyMax, 1) if err != nil { reply.Error = err return } // We should have gotten the key with the same metadata level prefix as we queried. metaPrefix := args.Key[0:len(engine.KeyMeta1Prefix)] if len(kvs) != 1 || !bytes.HasPrefix(kvs[0].Key, metaPrefix) { reply.Error = util.Errorf("key not found in range %v", r.Meta.RangeID) return } if err = gob.NewDecoder(bytes.NewBuffer(kvs[0].Value)).Decode(&reply.Range); err != nil { reply.Error = err return } if args.Key.Less(reply.Range.StartKey) { // args.Key doesn't belong to this range. We are perhaps searching the wrong node? reply.Error = util.Errorf("no range found for key %q in range: %+v", args.Key, r.Meta) return } reply.EndKey = kvs[0].Key }
// Add the specified read timestamp to the cache as covering the range of // keys from start to end. If end is nil, the range covers the start // key only. func (rtc *ReadTimestampCache) Add(start, end engine.Key, timestamp hlc.HLTimestamp) { if end == nil { end = engine.NextKey(start) } rtc.cache.Add(rtc.cache.NewKey(rangeKey(start), rangeKey(end)), timestamp) }
// Init starts the engine, sets the GC and reads the StoreIdent. func (s *Store) Init() error { // Close store for idempotency. s.Close() // Start engine and set garbage collector. if err := s.engine.Start(); err != nil { return err } // Create ID allocators. s.raftIDAlloc = NewIDAllocator(engine.KeyRaftIDGenerator, s.db, 2, raftIDAllocCount) s.rangeIDAlloc = NewIDAllocator(engine.KeyRangeIDGenerator, s.db, 2, rangeIDAllocCount) // GCTimeouts method is called each time an engine compaction is // underway. It sets minimum timeouts for transaction records and // response cache entries. s.engine.SetGCTimeouts(func() (minTxnTS, minRCacheTS int64) { now := s.clock.Now() minTxnTS = 0 // disable GC of transactions until we know minimum write intent age minRCacheTS = now.WallTime - GCResponseCacheExpiration.Nanoseconds() return }) // Read store ident and return a not-bootstrapped error if necessary. ok, err := engine.GetProto(s.engine, engine.KeyLocalIdent, &s.Ident) if err != nil { return err } else if !ok { return &NotBootstrappedError{} } s.mu.Lock() defer s.mu.Unlock() start := engine.KeyLocalRangeMetadataPrefix end := engine.PrefixEndKey(start) const rows = 64 for { kvs, err := s.engine.Scan(start, end, rows) if err != nil { return err } for _, kv := range kvs { var meta proto.RangeMetadata if err := gogoproto.Unmarshal(kv.Value, &meta); err != nil { return err } rng := NewRange(&meta, s.clock, s.engine, s.allocator, s.gossip, s) rng.Start() s.ranges[meta.RangeID] = rng s.rangesByKey = append(s.rangesByKey, rng) } if len(kvs) < rows { break } start = engine.NextKey(kvs[rows-1].Key) } // Ensure that ranges are sorted. sort.Sort(s.rangesByKey) return nil }