// Add the specified timestamp to the cache as covering the range of
// keys from start to end. If end is nil, the range covers the start
// key only. txnID is nil for no transaction. readOnly specifies
// whether the command adding this timestamp was read-only or not.
func (tc *TimestampCache) Add(start, end roachpb.Key, timestamp roachpb.Timestamp, txnID []byte, readOnly bool) {
	// This gives us a memory-efficient end key if end is empty.
	if len(end) == 0 {
		end = start.Next()
		start = end[:len(start)]
	}
	if tc.latest.Less(timestamp) {
		tc.latest = timestamp
	}
	// Only add to the cache if the timestamp is more recent than the
	// low water mark.
	if tc.lowWater.Less(timestamp) {
		// Check existing, overlapping entries. Remove superseded
		// entries or return without adding this entry if necessary.
		key := tc.cache.NewKey(start, end)
		for _, o := range tc.cache.GetOverlaps(start, end) {
			ce := o.Value.(cacheEntry)
			if ce.readOnly != readOnly {
				continue
			}
			if o.Key.Contains(key) && !ce.timestamp.Less(timestamp) {
				return // don't add this key; there's already a cache entry with >= timestamp.
			} else if key.Contains(o.Key) && !timestamp.Less(ce.timestamp) {
				tc.cache.Del(o.Key) // delete existing key; this cache entry supersedes.
			}
		}
		ce := cacheEntry{timestamp: timestamp, txnID: txnID, readOnly: readOnly}
		tc.cache.Add(key, ce)
	}
}
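
// The start/end aliasing in Add relies on Key.Next returning a fresh buffer
// that is the original key plus a trailing zero byte. The sketch below is an
// illustration only (a plain []byte stands in for roachpb.Key; the function
// name is hypothetical): it shows that the single-key case produces a
// non-empty interval [start, end) whose two keys share one backing array, so
// only the single allocation made by "Next" is needed.
func sharedEndKeySketch(startIn []byte) (start, end []byte, shared bool) {
	start, end = startIn, nil // end is empty, as in the single-key case
	if len(end) == 0 {
		// Equivalent of start.Next(): a copy of start with a trailing zero
		// byte, the smallest key strictly greater than start.
		end = append(append(make([]byte, 0, len(start)+1), start...), 0)
		// Reslice so that start aliases end's backing array; no second
		// allocation is made for the start key.
		start = end[:len(start)]
	}
	shared = len(start) > 0 && &start[0] == &end[0]
	return start, end, shared
}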
// getCachedRangeDescriptorLocked is a helper function to retrieve the
// descriptor of the range which contains the given key, if present in the
// cache. It is assumed that the caller holds a read lock on rdc.rangeCacheMu.
func (rdc *rangeDescriptorCache) getCachedRangeDescriptorLocked(key roachpb.Key, inclusive bool) (
	rangeCacheKey, *roachpb.RangeDescriptor) {
	// The cache is indexed using the end-key of the range, but the
	// end-key is non-inclusive by default.
	var metaKey roachpb.Key
	if !inclusive {
		metaKey = keys.RangeMetaKey(key.Next())
	} else {
		metaKey = keys.RangeMetaKey(key)
	}

	k, v, ok := rdc.rangeCache.Ceil(rangeCacheKey(metaKey))
	if !ok {
		return nil, nil
	}
	metaEndKey := k.(rangeCacheKey)
	rd := v.(*roachpb.RangeDescriptor)

	// Check that key actually belongs to the range.
	if !rd.ContainsKey(key) {
		// The key is the EndKey and we're inclusive, so just return the range descriptor.
		if inclusive && key.Equal(rd.EndKey) {
			return metaEndKey, rd
		}
		return nil, nil
	}

	// The key is the StartKey, but we're inclusive and thus need to return the
	// previous range descriptor, but it is not in the cache yet.
	if inclusive && key.Equal(rd.StartKey) {
		return nil, nil
	}
	return metaEndKey, rd
}
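
// The lookup above hinges on indexing cached ranges by their end key: for the
// common (non-inclusive) case, the candidate descriptor for key k is the one
// with the smallest end key strictly greater than k, which is then checked
// for actual containment. A dependency-free sketch of that Ceil-style lookup
// over a sorted slice follows; the types and names here are hypothetical
// stand-ins for the real cache, and only the standard "sort" package is
// assumed.
type rangeDescSketch struct {
	startKey, endKey string // half-open range [startKey, endKey)
}

// descriptorFor returns the sketch descriptor containing key, or nil.
// byEndKey must be sorted by endKey in ascending order.
func descriptorFor(byEndKey []rangeDescSketch, key string) *rangeDescSketch {
	// Ceil: first entry whose (exclusive) end key is strictly greater than key.
	i := sort.Search(len(byEndKey), func(i int) bool {
		return byEndKey[i].endKey > key
	})
	if i == len(byEndKey) {
		return nil
	}
	if d := &byEndKey[i]; d.startKey <= key && key < d.endKey {
		return d
	}
	// The candidate does not actually contain key: a gap in the cache.
	return nil
}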
// addKeyRange adds the specified key range to the interval cache,
// taking care not to add this range if existing entries already
// completely cover the range.
func (tm *txnMetadata) addKeyRange(start, end roachpb.Key) {
	// This gives us a memory-efficient end key if end is empty.
	// The most common case for keys in the intents interval map
	// is for single keys. However, the interval cache requires
	// a non-empty interval, so we create two key slices which
	// share the same underlying byte array.
	if len(end) == 0 {
		end = start.Next()
		start = end[:len(start)]
	}
	key := tm.keys.MakeKey(start, end)
	for _, o := range tm.keys.GetOverlaps(key.Start, key.End) {
		if o.Key.Contains(key) {
			return
		} else if key.Contains(*o.Key) {
			tm.keys.Del(o.Key)
		}
	}

	// Since no existing key range fully covered this range, add it now. The
	// strange assignment to alloc.entry.Key makes sure we delay the heap
	// allocation until we know it is necessary.
	alloc := struct {
		key   cache.IntervalKey
		entry cache.Entry
	}{key: key}
	alloc.entry.Key = &alloc.key
	tm.keys.AddEntry(&alloc.entry)
}
// addKeyRange adds the specified key range to the range group,
// taking care not to add this range if existing entries already
// completely cover the range.
func addKeyRange(keys interval.RangeGroup, start, end roachpb.Key) {
	// This gives us a memory-efficient end key if end is empty.
	// The most common case for keys in the intents interval map
	// is for single keys. However, the range group requires
	// a non-empty interval, so we create two key slices which
	// share the same underlying byte array.
	if len(end) == 0 {
		end = start.Next()
		start = end[:len(start)]
	}
	keyR := interval.Range{
		Start: interval.Comparable(start),
		End:   interval.Comparable(end),
	}
	keys.Add(keyR)
}
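
// Both addKeyRange variants implement the same policy: a new range is not
// added when an existing entry already covers it, and entries that the new
// range swallows are dropped (the interval-cache version does this
// explicitly, while the range-group version delegates the coalescing to
// interval.RangeGroup). The helper below is a dependency-free sketch of that
// policy over byte-string intervals; the span type and addCovering are
// hypothetical names, not part of the real interval package.
type span struct{ start, end string } // half-open range [start, end)

func addCovering(spans []span, s span) []span {
	// If any existing span fully covers s, there is nothing to do.
	for _, o := range spans {
		if o.start <= s.start && s.end <= o.end {
			return spans
		}
	}
	// Otherwise drop spans that s fully covers, then append s.
	out := make([]span, 0, len(spans)+1)
	for _, o := range spans {
		if s.start <= o.start && o.end <= s.end {
			continue // superseded by the new span
		}
		out = append(out, o)
	}
	return append(out, s)
}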
// GetMax returns the maximum read and write timestamps which overlap
// the interval spanning from start to end. Cached timestamps matching
// the specified txnID are not considered. If no part of the specified
// range is overlapped by timestamps in the cache, the low water
// timestamp is returned for both read and write timestamps.
//
// The txn ID prevents restarts with a pattern like: read("a"),
// write("a"). The read adds a timestamp for "a". Then the write (for
// the same transaction) would get that as the max timestamp and be
// forced to increment it. This allows timestamps from the same txn
// to be ignored.
func (tc *TimestampCache) GetMax(start, end roachpb.Key, txnID []byte) (roachpb.Timestamp, roachpb.Timestamp) {
	if len(end) == 0 {
		end = start.Next()
	}
	maxR := tc.lowWater
	maxW := tc.lowWater
	for _, o := range tc.cache.GetOverlaps(start, end) {
		ce := o.Value.(*cacheValue)
		if ce.txnID == nil || txnID == nil || !roachpb.TxnIDEqual(txnID, ce.txnID) {
			if ce.readOnly && maxR.Less(ce.timestamp) {
				maxR = ce.timestamp
			} else if !ce.readOnly && maxW.Less(ce.timestamp) {
				maxW = ce.timestamp
			}
		}
	}
	return maxR, maxW
}
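
// The txnID condition in GetMax is a little dense: an entry is considered
// unless both the entry and the request carry a transaction ID and the two
// IDs are equal. The helper below is a hedged restatement of that predicate
// (a sketch with plain byte-slice IDs, not part of the real TimestampCache
// API). It makes the read("a")-then-write("a") example from the comment
// concrete: the write's own ID matches the entry left by its earlier read,
// so that entry is skipped and the write is not pushed by itself.
func considerEntry(entryTxnID, reqTxnID []byte) bool {
	if entryTxnID == nil || reqTxnID == nil {
		// Non-transactional entries or requests are always considered.
		return true
	}
	// Entries written by the same transaction are ignored.
	return string(entryTxnID) != string(reqTxnID)
}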
func (tc *TimestampCache) getMax(start, end roachpb.Key, txnID *uuid.UUID, readOnly bool) roachpb.Timestamp {
	if len(end) == 0 {
		end = start.Next()
	}
	max := tc.lowWater
	cache := tc.wCache
	if readOnly {
		cache = tc.rCache
	}
	for _, o := range cache.GetOverlaps(start, end) {
		ce := o.Value.(*cacheValue)
		if ce.txnID == nil || txnID == nil || !roachpb.TxnIDEqual(txnID, ce.txnID) {
			if max.Less(ce.timestamp) {
				max = ce.timestamp
			}
		}
	}
	return max
}
// MetaScanBounds returns the range [start,end) within which the desired meta
// record can be found by means of an engine scan. The given key must be a
// valid RangeMetaKey as defined by validateRangeMetaKey.
func MetaScanBounds(key roachpb.Key) (roachpb.Key, roachpb.Key, error) {
	if err := validateRangeMetaKey(key); err != nil {
		return nil, nil, err
	}

	if key.Equal(Meta2KeyMax) {
		return nil, nil, NewInvalidRangeMetaKeyError("Meta2KeyMax can't be used as the key of scan", key)
	}

	if key.Equal(roachpb.KeyMin) {
		// Special case KeyMin: find the first entry in meta1.
		return Meta1Prefix, Meta1Prefix.PrefixEnd(), nil
	}
	if key.Equal(Meta1KeyMax) {
		// Special case Meta1KeyMax: this is the last key in Meta1, we don't want
		// to start at Next().
		return key, Meta1Prefix.PrefixEnd(), nil
	}
	// Otherwise find the first entry greater than the given key in the same meta prefix.
	return key.Next(), roachpb.Key(key[:len(Meta1Prefix)]).PrefixEnd(), nil
}
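
// MetaScanBounds leans on two pieces of key arithmetic: Next, which yields
// the smallest key strictly greater than its argument, and PrefixEnd, which
// yields a key sorting after every key that has the argument as a prefix.
// The helpers below are simplified sketches of those two ideas on plain byte
// slices, not the exact roachpb implementation (which handles additional
// edge cases); they are included only to make the bounds computed above
// easier to follow.
func keyNext(k []byte) []byte {
	// A copy of k with a trailing zero byte appended.
	return append(append(make([]byte, 0, len(k)+1), k...), 0)
}

func keyPrefixEnd(k []byte) []byte {
	end := append([]byte(nil), k...)
	// Increment the last byte that is not 0xff and truncate everything after
	// it; the result is the first key that no longer has k as a prefix.
	for i := len(end) - 1; i >= 0; i-- {
		if end[i] != 0xff {
			end[i]++
			return end[:i+1]
		}
	}
	// Every byte is 0xff: no proper prefix end exists; fall back to k itself.
	return k
}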
func (tc *timestampCache) getMax(start, end roachpb.Key, txnID *uuid.UUID, readTSCache bool) (hlc.Timestamp, bool) {
	if len(end) == 0 {
		end = start.Next()
	}
	var ok bool
	max := tc.lowWater
	cache := tc.wCache
	if readTSCache {
		cache = tc.rCache
	}
	for _, o := range cache.GetOverlaps(start, end) {
		ce := o.Value.(*cacheValue)
		if ce.txnID == nil || txnID == nil || !roachpb.TxnIDEqual(txnID, ce.txnID) {
			if max.Less(ce.timestamp) {
				ok = true
				max = ce.timestamp
			}
		}
	}
	return max, ok
}
// addKeyRange adds the specified key range to the interval cache,
// taking care not to add this range if existing entries already
// completely cover the range.
func (tm *txnMetadata) addKeyRange(start, end roachpb.Key) {
	// This gives us a memory-efficient end key if end is empty.
	// The most common case for keys in the intents interval map
	// is for single keys. However, the interval cache requires
	// a non-empty interval, so we create two key slices which
	// share the same underlying byte array.
	if len(end) == 0 {
		end = start.Next()
		start = end[:len(start)]
	}
	key := tm.keys.NewKey(start, end)
	for _, o := range tm.keys.GetOverlaps(start, end) {
		if o.Key.Contains(key) {
			return
		} else if key.Contains(o.Key) {
			tm.keys.Del(o.Key)
		}
	}

	// Since no existing key range fully covered this range, add it now.
	tm.keys.Add(key, nil)
}
// MetaReverseScanBounds returns the range [start,end) within which the desired
// meta record can be found by means of a reverse engine scan. The given key
// must be a valid RangeMetaKey as defined by validateRangeMetaKey.
func MetaReverseScanBounds(key roachpb.Key) (roachpb.Key, roachpb.Key, error) {
	if err := validateRangeMetaKey(key); err != nil {
		return nil, nil, err
	}

	if key.Equal(roachpb.KeyMin) || key.Equal(Meta1Prefix) {
		return nil, nil, NewInvalidRangeMetaKeyError("KeyMin and Meta1Prefix can't be used as the key of reverse scan", key)
	}
	if key.Equal(Meta2Prefix) {
		// Special case Meta2Prefix: this is the first key in Meta2, and the scan
		// interval covers all of Meta1.
		return Meta1Prefix, key.Next(), nil
	}

	// Otherwise find the first entry greater than the given key and find the last entry
	// in the same prefix. For MVCCReverseScan the endKey is exclusive; if we want to find
	// the range descriptor that the given key specifies, we need to set key.Next() as the
	// MVCCReverseScan's endKey. For example:
	// If we have ranges [a,f) and [f,z), then we'll have corresponding meta records
	// at f and z. If you're looking for the meta record for key f, then you want the
	// second record (exclusive in MVCCReverseScan), hence key.Next() below.
	return key[:len(Meta1Prefix)], key.Next(), nil
}
// Add the specified timestamp to the cache as covering the range of
// keys from start to end. If end is nil, the range covers the start
// key only. txnID is nil for no transaction. readTSCache specifies
// whether the command adding this timestamp should update the read
// timestamp cache; false to update the write timestamp cache.
func (tc *TimestampCache) Add(start, end roachpb.Key, timestamp roachpb.Timestamp, txnID *uuid.UUID, readTSCache bool) {
	// This gives us a memory-efficient end key if end is empty.
	if len(end) == 0 {
		end = start.Next()
		start = end[:len(start)]
	}
	if tc.latest.Less(timestamp) {
		tc.latest = timestamp
	}
	// Only add to the cache if the timestamp is more recent than the
	// low water mark.
	if tc.lowWater.Less(timestamp) {
		cache := tc.wCache
		if readTSCache {
			cache = tc.rCache
		}

		addRange := func(r interval.Range) {
			value := cacheValue{timestamp: timestamp, txnID: txnID}
			key := cache.MakeKey(r.Start, r.End)
			entry := makeCacheEntry(key, value)
			cache.AddEntry(entry)
		}
		r := interval.Range{
			Start: interval.Comparable(start),
			End:   interval.Comparable(end),
		}

		// Check existing, overlapping entries and truncate/split/remove if
		// superseded and in the past. If existing entries are in the future,
		// subtract from the range/ranges that need to be added to cache.
		for _, o := range cache.GetOverlaps(r.Start, r.End) {
			cv := o.Value.(*cacheValue)
			sCmp := r.Start.Compare(o.Key.Start)
			eCmp := r.End.Compare(o.Key.End)
			if !timestamp.Less(cv.timestamp) {
				// The existing interval has a timestamp less than or equal to the new interval.
				// Compare interval ranges to determine how to modify existing interval.
				switch {
				case sCmp == 0 && eCmp == 0:
					// New and old are equal; replace old with new and avoid the need to insert new.
					//
					// New: ------------
					// Old: ------------
					//
					// New: ------------
					*cv = cacheValue{timestamp: timestamp, txnID: txnID}
					cache.MoveToEnd(o.Entry)
					return
				case sCmp <= 0 && eCmp >= 0:
					// New contains or is equal to old; delete old.
					//
					// New: ------------      ------------      ------------
					// Old:   --------    or    ----------  or  ----------
					//
					// Old:
					cache.DelEntry(o.Entry)
				case sCmp > 0 && eCmp < 0:
					// Old contains new; split up old into two.
					//
					// New:     ----
					// Old: ------------
					//
					// Old: ----    ----
					oldEnd := o.Key.End
					o.Key.End = r.Start

					key := cache.MakeKey(r.End, oldEnd)
					entry := makeCacheEntry(key, *cv)
					cache.AddEntryAfter(entry, o.Entry)
				case eCmp >= 0:
					// Left partial overlap; truncate old end.
					//
					// New:     --------          --------
					// Old: --------      or  ------------
					//
					// Old: ----              ----
					o.Key.End = r.Start
				case sCmp <= 0:
					// Right partial overlap; truncate old start.
					//
					// New: --------          --------
					// Old:     --------  or  ------------
					//
					// Old:         ----              ----
					o.Key.Start = r.End
				default:
					panic(fmt.Sprintf("no overlap between %v and %v", o.Key.Range, r))
				}
			} else {
				// The existing interval has a timestamp greater than the new interval.
				// Compare interval ranges to determine how to modify new interval before
				// adding it to the timestamp cache.
				switch {
				case sCmp >= 0 && eCmp <= 0:
					// Old contains or is equal to new; no need to add.
					//
					// Old: -----------      -----------      -----------      -----------
					// New:    -----     or  -----------  or  --------     or     --------
					//
					// New:
					return
				case sCmp < 0 && eCmp > 0:
					// New contains old; split up new into two. We can add the left piece
					// immediately because it is guaranteed to be before the rest of the
					// overlaps.
					//
					// Old:    ------
					// New: ------------
					//
					// New: ---      ---
					lr := interval.Range{Start: r.Start, End: o.Key.Start}
					addRange(lr)

					r.Start = o.Key.End
				case eCmp > 0:
					// Left partial overlap; truncate new start.
					//
					// Old: --------          --------
					// New:     --------  or  ------------
					//
					// New:         ----              ----
					r.Start = o.Key.End
				case sCmp < 0:
					// Right partial overlap; truncate new end.
					//
					// Old:     --------          --------
					// New: --------      or  ------------
					//
					// New: ----              ----
					r.End = o.Key.Start
				default:
					panic(fmt.Sprintf("no overlap between %v and %v", o.Key.Range, r))
				}
			}
		}
		addRange(r)
	}
}
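
// The two switch statements in Add above are driven entirely by the sCmp and
// eCmp comparisons. The helper below reproduces that classification on
// half-open integer intervals; it is an illustration only (the real code
// works on interval.Range keys) and assumes, as GetOverlaps guarantees, that
// the two intervals actually overlap. It shows why the case order matters:
// equality and full containment are tested first, so the later cases only
// ever see genuine partial overlaps.
func classifyOverlap(newStart, newEnd, oldStart, oldEnd int) string {
	sCmp := compareInts(newStart, oldStart)
	eCmp := compareInts(newEnd, oldEnd)
	switch {
	case sCmp == 0 && eCmp == 0:
		return "equal"
	case sCmp <= 0 && eCmp >= 0:
		return "new contains old"
	case sCmp > 0 && eCmp < 0:
		return "old contains new"
	case eCmp >= 0:
		return "left partial overlap" // new extends to or past old's end
	case sCmp <= 0:
		return "right partial overlap" // new starts at or before old's start
	default:
		return "no overlap" // unreachable for overlapping inputs, mirroring the panic above
	}
}

func compareInts(a, b int) int {
	switch {
	case a < b:
		return -1
	case a > b:
		return 1
	default:
		return 0
	}
}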
// fetch retrieves spans from the kv
func (f *kvFetcher) fetch() error {
	batchSize := f.getBatchSize()

	b := &client.Batch{}
	b.Header.MaxScanResults = batchSize

	var resumeKey roachpb.Key
	if len(f.kvs) > 0 {
		resumeKey = f.kvs[len(f.kvs)-1].Key
		// To resume forward scans we will set the (inclusive) scan start to the Next of the last
		// received key. To resume reverse scans we will set the (exclusive) scan end to the last
		// received key.
		if !f.reverse {
			resumeKey = resumeKey.Next()
		}
	}

	atEnd := true
	if !f.reverse {
		for i := 0; i < len(f.spans); i++ {
			start := f.spans[i].Start
			if resumeKey != nil {
				if resumeKey.Compare(f.spans[i].End) >= 0 {
					// We are resuming from a key after this span.
					continue
				}
				if resumeKey.Compare(start) > 0 {
					// We are resuming from a key inside this span.
					// In this case we should technically reduce the max count for the span; but
					// since this count is only an optimization it's not incorrect to retrieve more
					// keys for the span.
					start = resumeKey
				}
			}
			atEnd = false
			b.Scan(start, f.spans[i].End, f.spans[i].Count)
		}
	} else {
		for i := len(f.spans) - 1; i >= 0; i-- {
			end := f.spans[i].End
			if resumeKey != nil {
				if resumeKey.Compare(f.spans[i].Start) <= 0 {
					// We are resuming from a key before this span.
					continue
				}
				if resumeKey.Compare(end) < 0 {
					// We are resuming from a key inside this span.
					// In this case we should technically reduce the max count for the span; but
					// since this count is only an optimization it's not incorrect to retrieve more
					// keys for the span.
					end = resumeKey
				}
			}
			atEnd = false
			b.ReverseScan(f.spans[i].Start, end, f.spans[i].Count)
		}
	}

	if atEnd {
		// The last scan happened to finish just at the end of the last span.
		f.kvs = nil
		f.fetchEnd = true
		return nil
	}

	if err := f.txn.Run(b); err != nil {
		return err
	}

	if f.kvs == nil {
		numResults := 0
		for _, result := range b.Results {
			numResults += len(result.Rows)
		}
		f.kvs = make([]client.KeyValue, 0, numResults)
	} else {
		f.kvs = f.kvs[:0]
	}
	for _, result := range b.Results {
		f.kvs = append(f.kvs, result.Rows...)
	}

	f.batchIdx++
	f.totalFetched += int64(len(f.kvs))
	f.kvIndex = 0

	if int64(len(f.kvs)) < batchSize {
		f.fetchEnd = true
	}

	// TODO(radu): We should fetch the next chunk in the background instead of waiting for the next
	// call to fetch(). We can use a pool of workers to issue the KV ops which will also limit the
	// total number of fetches that happen in parallel (and thus the amount of resources we use).
	return nil
}
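
// For the forward-scan direction, the resume logic in fetch boils down to:
// skip spans that end at or before the resume key and clamp the start of the
// span the resume key falls into. The sketch below restates just that
// adjustment over string spans; the scanSpan type and remainingSpans name are
// hypothetical, not part of the real kvFetcher, and spans are assumed to be
// sorted and non-overlapping as the fetcher requires.
type scanSpan struct{ start, end string } // half-open range [start, end)

// remainingSpans returns the portions of spans still to be fetched by a
// forward scan after the last received key lastKey.
func remainingSpans(spans []scanSpan, lastKey string) []scanSpan {
	resume := lastKey + "\x00" // Next(): the first key after the last one received
	var out []scanSpan
	for _, sp := range spans {
		if resume >= sp.end {
			continue // this span was fully consumed by earlier batches
		}
		if resume > sp.start {
			sp.start = resume // resume mid-span
		}
		out = append(out, sp)
	}
	return out
}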