// getDescriptors looks up the range descriptor to use for a query over the
// key range [from,to), with the given lookupOptions. The range descriptor
// which contains the range in which the request should start its query is
// returned first; the returned bool is true in case the given range reaches
// outside the first descriptor.
// In case the returned descriptor is discovered stale, the returned closure
// should be called; it evicts the cache appropriately.
// Note that `from` and `to` are not necessarily Key and EndKey from a
// RequestHeader; it's assumed that they've been translated to key addresses
// already (via KeyAddress).
func (ds *DistSender) getDescriptors(from, to roachpb.Key, options lookupOptions) (*roachpb.RangeDescriptor, bool, func(), *roachpb.Error) {
	var desc *roachpb.RangeDescriptor
	var err error
	var descKey roachpb.Key
	if !options.useReverseScan {
		descKey = from
	} else {
		descKey = to
	}
	desc, err = ds.rangeCache.LookupRangeDescriptor(descKey, options)
	if err != nil {
		return nil, false, nil, roachpb.NewError(err)
	}

	// needAnother returns true if the query span extends past the returned
	// descriptor, in which case the next range descriptor must be fetched
	// as well.
	needAnother := func(desc *roachpb.RangeDescriptor, isReverse bool) bool {
		if isReverse {
			return from.Less(desc.StartKey)
		}
		return desc.EndKey.Less(to)
	}

	evict := func() {
		ds.rangeCache.EvictCachedRangeDescriptor(descKey, desc, options.useReverseScan)
	}

	return desc, needAnother(desc, options.useReverseScan), evict, nil
}
// Add the specified timestamp to the cache as covering the range of
// keys from start to end. If end is nil, the range covers the start
// key only. txnID is nil for no transaction. readOnly specifies
// whether the command adding this timestamp was read-only or not.
func (tc *TimestampCache) Add(start, end roachpb.Key, timestamp roachpb.Timestamp, txnID []byte, readOnly bool) {
	// This gives us a memory-efficient end key if end is empty.
	if len(end) == 0 {
		end = start.Next()
		start = end[:len(start)]
	}
	if tc.latest.Less(timestamp) {
		tc.latest = timestamp
	}
	// Only add to the cache if the timestamp is more recent than the
	// low water mark.
	if tc.lowWater.Less(timestamp) {
		// Check existing, overlapping entries. Remove superseded
		// entries or return without adding this entry if necessary.
		key := tc.cache.NewKey(start, end)
		for _, o := range tc.cache.GetOverlaps(start, end) {
			ce := o.Value.(cacheEntry)
			if ce.readOnly != readOnly {
				continue
			}
			if o.Key.Contains(key) && !ce.timestamp.Less(timestamp) {
				return // don't add this key; there's already a cache entry with >= timestamp.
			} else if key.Contains(o.Key) && !timestamp.Less(ce.timestamp) {
				tc.cache.Del(o.Key) // delete existing key; this cache entry supersedes.
			}
		}
		ce := cacheEntry{timestamp: timestamp, txnID: txnID, readOnly: readOnly}
		tc.cache.Add(key, ce)
	}
}
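// Illustrative sketch (not from the original source): the "memory-efficient
// end key" trick above derives a non-empty interval [start, end) for a point
// key while keeping both slices on a single backing array. The helper name
// `next` is a stand-in for roachpb.Key.Next, which appends a zero byte.
package main

import "fmt"

func next(k []byte) []byte {
	return append(append([]byte(nil), k...), 0)
}

func main() {
	start := []byte("a")
	var end []byte
	if len(end) == 0 {
		end = next(start)        // "a\x00"
		start = end[:len(start)] // re-slice: shares end's backing array
	}
	// Both slices point at the same first byte; only one allocation was made.
	fmt.Printf("start=%q end=%q shared=%t\n", start, end, &start[0] == &end[0])
}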
// clearOverlappingCachedRangeDescriptors looks up and clears any
// cache entries which overlap the specified key or descriptor.
func (rdc *rangeDescriptorCache) clearOverlappingCachedRangeDescriptors(key, metaKey roachpb.Key, desc *roachpb.RangeDescriptor) {
	if desc.StartKey.Equal(desc.EndKey) { // True for some unittests.
		return
	}
	// Clear out any descriptors which subsume the key which we're going
	// to cache. For example, an existing KeyMin->KeyMax descriptor should
	// be cleared out in favor of a KeyMin->"m" descriptor.
	k, v, ok := rdc.rangeCache.Ceil(rangeCacheKey(metaKey))
	if ok {
		descriptor := v.(*roachpb.RangeDescriptor)
		if !key.Less(descriptor.StartKey) && !descriptor.EndKey.Less(key) {
			if log.V(1) {
				log.Infof("clearing overlapping descriptor: key=%s desc=%s", k, descriptor)
			}
			rdc.rangeCache.Del(k.(rangeCacheKey))
		}
	}
	// Also clear any descriptors which are subsumed by the one we're
	// going to cache. This could happen on a merge (and also happens
	// when there's a lot of concurrency). Iterate from the range meta key
	// after RangeMetaKey(desc.StartKey) to the range meta key for desc.EndKey.
	rdc.rangeCache.DoRange(func(k, v interface{}) {
		if log.V(1) {
			log.Infof("clearing subsumed descriptor: key=%s desc=%s", k, v.(*roachpb.RangeDescriptor))
		}
		rdc.rangeCache.Del(k.(rangeCacheKey))
	}, rangeCacheKey(keys.RangeMetaKey(desc.StartKey).Next()),
		rangeCacheKey(keys.RangeMetaKey(desc.EndKey)))
}
// prev gives the right boundary of the union of all requests which don't
// affect keys larger than the given key.
// TODO(tschottdorf): again, better on BatchRequest itself, but can't pull
// 'keys' into 'proto'.
func prev(ba roachpb.BatchRequest, k roachpb.Key) roachpb.Key {
	candidate := roachpb.KeyMin
	for _, union := range ba.Requests {
		h := union.GetInner().Header()
		addr := keys.KeyAddress(h.Key)
		eAddr := keys.KeyAddress(h.EndKey)
		if len(eAddr) == 0 {
			// Can probably avoid having to compute Next() here if
			// we're in the mood for some more complexity.
			eAddr = addr.Next()
		}
		if !eAddr.Less(k) {
			if !k.Less(addr) {
				// Range contains k, so won't be able to go lower.
				return k
			}
			// Range is disjoint from [KeyMin,k).
			continue
		}
		// We want the largest surviving candidate; a range ending below
		// k contributes its end address as the right boundary.
		if candidate.Less(eAddr) {
			candidate = eAddr
		}
	}
	return candidate
}
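// Illustrative sketch (not from the original source): prev's candidate logic
// applied to plain byte-slice spans instead of a BatchRequest. A span ending
// below k contributes its end address as a candidate right boundary; a span
// containing k pins the result to k itself.
package main

import (
	"bytes"
	"fmt"
)

type span struct{ key, endKey []byte }

func prevBoundary(spans []span, k []byte) []byte {
	candidate := []byte(nil) // KeyMin
	for _, s := range spans {
		addr, eAddr := s.key, s.endKey
		if len(eAddr) == 0 {
			eAddr = append(append([]byte(nil), addr...), 0) // addr.Next()
		}
		if bytes.Compare(eAddr, k) >= 0 {
			if bytes.Compare(k, addr) >= 0 {
				return k // span contains k; can't go lower
			}
			continue // span lies entirely at or above k
		}
		if bytes.Compare(candidate, eAddr) < 0 {
			candidate = eAddr
		}
	}
	return candidate
}

func main() {
	spans := []span{
		{[]byte("a"), []byte("c")}, // ends below "m": boundary candidate "c"
		{[]byte("x"), nil},         // point key above "m": ignored
	}
	fmt.Printf("%q\n", prevBoundary(spans, []byte("m"))) // "c"
}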
// prettyPrintInternal matches the key against the prefixes in keyDict. If
// the key doesn't match any prefix, it returns the quoted byte value and
// false; otherwise it returns the key's human-readable representation and
// true.
func prettyPrintInternal(key roachpb.Key) (string, bool) {
	var buf bytes.Buffer
	for _, k := range keyDict {
		if key.Compare(k.start) >= 0 && (k.end == nil || key.Compare(k.end) <= 0) {
			buf.WriteString(k.name)
			if k.end != nil && k.end.Compare(key) == 0 {
				buf.WriteString("/Max")
				return buf.String(), true
			}

			hasPrefix := false
			for _, e := range k.entries {
				if bytes.HasPrefix(key, e.prefix) {
					hasPrefix = true
					key = key[len(e.prefix):]
					fmt.Fprintf(&buf, "%s%s", e.name, e.ppFunc(key))
					break
				}
			}
			if !hasPrefix {
				key = key[len(k.start):]
				fmt.Fprintf(&buf, "/%q", []byte(key))
			}

			return buf.String(), true
		}
	}

	return fmt.Sprintf("%q", []byte(key)), false
}
// addKeyRange adds the specified key range to the interval cache,
// taking care not to add this range if existing entries already
// completely cover the range.
func (tm *txnMetadata) addKeyRange(start, end roachpb.Key) {
	// This gives us a memory-efficient end key if end is empty.
	// The most common case for keys in the intents interval map
	// is for single keys. However, the interval cache requires
	// a non-empty interval, so we create two key slices which
	// share the same underlying byte array.
	if len(end) == 0 {
		end = start.Next()
		start = end[:len(start)]
	}
	key := tm.keys.MakeKey(start, end)
	for _, o := range tm.keys.GetOverlaps(key.Start, key.End) {
		if o.Key.Contains(key) {
			return
		} else if key.Contains(*o.Key) {
			tm.keys.Del(o.Key)
		}
	}

	// Since no existing key range fully covered this range, add it now. The
	// strange assignment to alloc makes sure we delay the heap allocation
	// until we know it is necessary.
	alloc := struct {
		key   cache.IntervalKey
		entry cache.Entry
	}{key: key}
	alloc.entry.Key = &alloc.key
	tm.keys.AddEntry(&alloc.entry)
}
// Encodes datum at the end of key, using direction `dir` for the encoding.
// The key is a span end key, which is exclusive, but `val` needs to be
// inclusive. So if datum is the last end constraint, we transform it
// accordingly.
func encodeInclusiveEndValue(
	key roachpb.Key, datum parser.Datum, dir encoding.Direction,
	isLastEndConstraint bool) roachpb.Key {
	// Since the end of a span is exclusive, if the last constraint is an
	// inclusive one, we might need to make the key exclusive by applying a
	// PrefixEnd(). We normally avoid doing this by transforming "a = x" to
	// "a = x±1" for the last end constraint, depending on the encoding
	// direction (since this keeps the key nice and pretty-printable).
	// However, we might not be able to do the ±1.
	needExclusiveKey := false
	if isLastEndConstraint {
		if dir == encoding.Ascending {
			if datum.IsMax() {
				needExclusiveKey = true
			} else {
				datum = datum.Next()
			}
		} else {
			if datum.IsMin() || !datum.HasPrev() {
				needExclusiveKey = true
			} else {
				datum = datum.Prev()
			}
		}
	}
	key, pErr := encodeTableKey(key, datum, dir)
	if pErr != nil {
		panic(pErr)
	}
	if needExclusiveKey {
		key = key.PrefixEnd()
	}
	return key
}
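// Illustrative sketch (not from the original source): the inclusive-to-
// exclusive end transform above, modeled on int64 instead of a Datum. For an
// ascending constraint "x <= v" the exclusive span end is v+1, unless v is
// already the maximum value, in which case the caller must fall back to a
// PrefixEnd-style exclusive key.
package main

import (
	"fmt"
	"math"
)

// inclusiveEnd returns the exclusive end for an inclusive ascending
// constraint, plus whether a PrefixEnd-style fallback is needed instead.
func inclusiveEnd(v int64) (int64, bool) {
	if v == math.MaxInt64 { // analogue of datum.IsMax()
		return v, true
	}
	return v + 1, false // analogue of datum.Next()
}

func main() {
	fmt.Println(inclusiveEnd(41))            // 42 false
	fmt.Println(inclusiveEnd(math.MaxInt64)) // 9223372036854775807 true
}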
// GetIndex searches the kv list for 'key' and returns its index if found.
func (s SystemConfig) GetIndex(key roachpb.Key) (int, bool) {
	l := len(s.Values)
	index := sort.Search(l, func(i int) bool {
		return bytes.Compare(s.Values[i].Key, key) >= 0
	})
	if index == l || !key.Equal(s.Values[index].Key) {
		return 0, false
	}
	return index, true
}
// prettyKey pretty-prints the specified key, skipping over the first `skip`
// fields. The pretty printed key looks like:
//
//   /Table/<tableID>/<indexID>/...
//
// We always strip off the /Table prefix and then `skip` more fields. Note that
// this assumes that the fields themselves do not contain '/', but that is
// currently true for the fields we care about stripping (the table and index
// ID).
func prettyKey(key roachpb.Key, skip int) string {
	p := key.String()
	for i := 0; i <= skip; i++ {
		n := strings.IndexByte(p[1:], '/')
		if n == -1 {
			return ""
		}
		p = p[n+1:]
	}
	return p
}
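// Illustrative sketch (not from the original source): the field-stripping
// loop above, applied to a pre-rendered key string (the real function first
// calls key.String()).
package main

import (
	"fmt"
	"strings"
)

func stripFields(p string, skip int) string {
	// Drop the "/Table" prefix plus `skip` further fields.
	for i := 0; i <= skip; i++ {
		n := strings.IndexByte(p[1:], '/')
		if n == -1 {
			return ""
		}
		p = p[n+1:]
	}
	return p
}

func main() {
	// skip=1 removes "/Table" and the table ID, leaving "/<indexID>/...".
	fmt.Println(stripFields("/Table/51/1/42", 1)) // "/1/42"
}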
// ComputeSplitKeys takes a start and end key and returns an array of keys
// at which to split the span [start, end).
// The only required splits are at each user table prefix.
func (s *SystemConfig) ComputeSplitKeys(startKey, endKey roachpb.Key) []roachpb.Key {
	if TestingDisableTableSplits {
		return nil
	}

	tableStart := roachpb.Key(keys.UserTableDataMin)
	if !tableStart.Less(endKey) {
		// This range is before the user tables span: no required splits.
		return nil
	}

	startID, ok := ObjectIDForKey(startKey)
	if !ok || startID <= keys.MaxReservedDescID {
		// The start key is either:
		// - not part of the structured data span
		// - part of the system span
		// In either case, start looking for splits at the first ID usable
		// by the user data span.
		startID = keys.MaxReservedDescID + 1
	} else {
		// The start key is either already a split key, or after the split
		// key for its ID. We can skip straight to the next one.
		startID++
	}

	// Find the largest object ID.
	// We can't keep splitting until we reach endKey as it could be
	// roachpb.KeyMax.
	endID, err := s.GetLargestObjectID()
	if err != nil {
		log.Errorf("unable to determine largest object ID from system config: %s", err)
		return nil
	}

	// Build key prefixes for sequential table IDs until we reach endKey.
	var splitKeys roachpb.KeySlice
	var key roachpb.Key
	// endID could be smaller than startID if we don't have user tables.
	for id := startID; id <= endID; id++ {
		key = keys.MakeTablePrefix(id)
		// Skip if the range starts on a split key.
		if !startKey.Less(key) {
			continue
		}
		// Handle the case where EndKey is already a table prefix.
		if !key.Less(endKey) {
			break
		}
		splitKeys = append(splitKeys, key)
	}

	return splitKeys
}
// GetIndex searches the kv list for 'key' and returns its index if found.
func (s *SystemConfig) GetIndex(key roachpb.Key) (int, bool) {
	if s == nil {
		return 0, false
	}
	l := len(s.Values)
	index := sort.Search(l, func(i int) bool {
		return !s.Values[i].Key.Less(key)
	})
	if index == l || !key.Equal(s.Values[index].Key) {
		return 0, false
	}
	return index, true
}
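// Illustrative sketch (not from the original source): the sort.Search
// pattern both GetIndex variants rely on, over a plain sorted key/value
// slice. `keyValue` is a stand-in type.
package main

import (
	"bytes"
	"fmt"
	"sort"
)

type keyValue struct{ key, value []byte }

func getIndex(values []keyValue, key []byte) (int, bool) {
	// values must be sorted by key; Search returns the first index whose
	// key is >= the target (or len(values) if none is).
	i := sort.Search(len(values), func(i int) bool {
		return bytes.Compare(values[i].key, key) >= 0
	})
	if i == len(values) || !bytes.Equal(values[i].key, key) {
		return 0, false
	}
	return i, true
}

func main() {
	vals := []keyValue{{key: []byte("a")}, {key: []byte("c")}, {key: []byte("e")}}
	fmt.Println(getIndex(vals, []byte("c"))) // 1 true
	fmt.Println(getIndex(vals, []byte("d"))) // 0 false
}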
// verifyBinarySearchTree checks to ensure that all keys to the left of the
// root node are less than it, and all nodes to the right of the root node are
// greater than it. It recursively walks the tree to perform this same check.
func verifyBinarySearchTree(t *testing.T, nodes map[string]roachpb.RangeTreeNode, testName string, node *roachpb.RangeTreeNode, keyMin, keyMax roachpb.Key) {
	if node == nil {
		return
	}
	if !node.Key.Less(keyMax) {
		t.Errorf("%s: Failed Property BST - The key %s is not less than %s.", testName, node.Key, keyMax)
	}
	// We need the extra check since roachpb.KeyMin is actually a range start
	// key.
	if !keyMin.Less(node.Key) && !node.Key.Equal(roachpb.KeyMin) {
		t.Errorf("%s: Failed Property BST - The key %s is not greater than %s.", testName, node.Key, keyMin)
	}

	left, right := getLeftAndRight(t, nodes, testName, node)
	verifyBinarySearchTree(t, nodes, testName, left, keyMin, node.Key)
	verifyBinarySearchTree(t, nodes, testName, right, node.Key, keyMax)
}
// addKeyRange adds the specified key range to the range group,
// taking care not to add this range if existing entries already
// completely cover the range.
func addKeyRange(keys interval.RangeGroup, start, end roachpb.Key) {
	// This gives us a memory-efficient end key if end is empty.
	// The most common case for keys in the intents interval map
	// is for single keys. However, the range group requires
	// a non-empty interval, so we create two key slices which
	// share the same underlying byte array.
	if len(end) == 0 {
		end = start.Next()
		start = end[:len(start)]
	}
	keyR := interval.Range{
		Start: interval.Comparable(start),
		End:   interval.Comparable(end),
	}
	keys.Add(keyR)
}
// TODO(dt): Batch checks of many rows.
func (f baseFKHelper) check(values parser.DTuple) (parser.DTuple, error) {
	var key roachpb.Key
	if values != nil {
		keyBytes, _, err := sqlbase.EncodeIndexKey(f.searchIdx, f.ids, values, f.searchPrefix)
		if err != nil {
			return nil, err
		}
		key = roachpb.Key(keyBytes)
	} else {
		key = roachpb.Key(f.searchPrefix)
	}
	spans := sqlbase.Spans{sqlbase.Span{Start: key, End: key.PrefixEnd()}}
	if err := f.rf.StartScan(f.txn, spans, 1); err != nil {
		return nil, err
	}
	return f.rf.NextRow()
}
// GetMax returns the maximum read and write timestamps which overlap
// the interval spanning from start to end. Cached timestamps matching
// the specified txnID are not considered. If no part of the specified
// range is overlapped by timestamps in the cache, the low water
// timestamp is returned for both read and write timestamps.
//
// The txn ID prevents restarts with a pattern like: read("a"),
// write("a"). The read adds a timestamp for "a". Then the write (for
// the same transaction) would get that as the max timestamp and be
// forced to increment it. This allows timestamps from the same txn
// to be ignored.
func (tc *TimestampCache) GetMax(start, end roachpb.Key, txnID []byte) (roachpb.Timestamp, roachpb.Timestamp) {
	if len(end) == 0 {
		end = start.Next()
	}
	maxR := tc.lowWater
	maxW := tc.lowWater
	for _, o := range tc.cache.GetOverlaps(start, end) {
		ce := o.Value.(*cacheValue)
		if ce.txnID == nil || txnID == nil || !roachpb.TxnIDEqual(txnID, ce.txnID) {
			if ce.readOnly && maxR.Less(ce.timestamp) {
				maxR = ce.timestamp
			} else if !ce.readOnly && maxW.Less(ce.timestamp) {
				maxW = ce.timestamp
			}
		}
	}
	return maxR, maxW
}
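// Illustrative sketch (not from the original source): a toy model of the
// txn-ID filtering in GetMax. Entries created by the same transaction are
// skipped, so a txn's own read at "a" doesn't force its later write at "a"
// to a higher timestamp. Timestamps are modeled as int64, txn IDs as strings.
package main

import "fmt"

type entry struct {
	txnID     string // "" means no transaction
	timestamp int64
	readOnly  bool
}

func getMax(entries []entry, lowWater int64, txnID string) (maxR, maxW int64) {
	maxR, maxW = lowWater, lowWater
	for _, ce := range entries {
		if ce.txnID == "" || txnID == "" || ce.txnID != txnID {
			if ce.readOnly && ce.timestamp > maxR {
				maxR = ce.timestamp
			} else if !ce.readOnly && ce.timestamp > maxW {
				maxW = ce.timestamp
			}
		}
	}
	return maxR, maxW
}

func main() {
	entries := []entry{{txnID: "txn1", timestamp: 10, readOnly: true}}
	fmt.Println(getMax(entries, 5, "txn1")) // 5 5: own read is ignored
	fmt.Println(getMax(entries, 5, "txn2")) // 10 5: other txns see the read
}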
// ObjectIDForKey returns the object ID (table or database) for 'key',
// or (_, false) if not within the structured key space.
func ObjectIDForKey(key roachpb.Key) (uint32, bool) {
	if key.Equal(roachpb.KeyMax) {
		return 0, false
	}
	if key.Equal(keys.TableDataPrefix) {
		// TODO(marc): this should eventually return SystemDatabaseID.
		return 0, false
	}
	remaining := bytes.TrimPrefix(key, keys.TableDataPrefix)
	if len(remaining) == len(key) {
		// TrimPrefix returns the input untouched if the prefix doesn't match.
		return 0, false
	}

	// Consume first encoded int.
	_, id64, err := encoding.DecodeUvarint(remaining)
	return uint32(id64), err == nil
}
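// Illustrative sketch (not from the original source): the trim-prefix-then-
// decode flow of ObjectIDForKey, with the standard library varint standing
// in for CockroachDB's custom uvarint encoding.
package main

import (
	"bytes"
	"encoding/binary"
	"fmt"
)

func objectIDForKey(key, tablePrefix []byte) (uint32, bool) {
	remaining := bytes.TrimPrefix(key, tablePrefix)
	if len(remaining) == len(key) {
		// TrimPrefix returns the input untouched if the prefix doesn't match.
		return 0, false
	}
	id, n := binary.Uvarint(remaining) // consume the first encoded int
	return uint32(id), n > 0
}

func main() {
	prefix := []byte("/table/")
	key := append(append([]byte(nil), prefix...), binary.AppendUvarint(nil, 51)...)
	fmt.Println(objectIDForKey(key, prefix)) // 51 true
}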
func (tc *TimestampCache) getMax(start, end roachpb.Key, txnID *uuid.UUID, readTSCache bool) roachpb.Timestamp {
	if len(end) == 0 {
		end = start.ShallowNext()
	}
	max := tc.lowWater
	cache := tc.wCache
	if readTSCache {
		cache = tc.rCache
	}
	for _, o := range cache.GetOverlaps(start, end) {
		ce := o.Value.(*cacheValue)
		if ce.txnID == nil || txnID == nil || !roachpb.TxnIDEqual(txnID, ce.txnID) {
			if max.Less(ce.timestamp) {
				max = ce.timestamp
			}
		}
	}
	return max
}
// next gives the left boundary of the union of all requests which don't
// affect keys less than the given key.
// TODO(tschottdorf): again, better on BatchRequest itself, but can't pull
// 'keys' into 'proto'.
func next(ba roachpb.BatchRequest, k roachpb.Key) roachpb.Key {
	candidate := roachpb.KeyMax
	for _, union := range ba.Requests {
		h := union.GetInner().Header()
		addr := keys.KeyAddress(h.Key)
		if addr.Less(k) {
			if eAddr := keys.KeyAddress(h.EndKey); k.Less(eAddr) {
				// Starts below k, but continues beyond. Need to stay at k.
				return k
			}
			// Affects only [KeyMin,k).
			continue
		}
		// We want the smallest of the surviving candidates.
		if addr.Less(candidate) {
			candidate = addr
		}
	}
	return candidate
}
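// Illustrative sketch (not from the original source): next's candidate logic
// as the mirror image of the prev sketch above, again over plain byte-slice
// spans. A span straddling k pins the result to k; spans entirely below k
// are ignored; otherwise the smallest surviving start address wins.
package main

import (
	"bytes"
	"fmt"
)

type span struct{ key, endKey []byte }

func nextBoundary(spans []span, k []byte) []byte {
	candidate := []byte("\xff\xff") // KeyMax stand-in
	for _, s := range spans {
		addr := s.key
		if bytes.Compare(addr, k) < 0 {
			if eAddr := s.endKey; bytes.Compare(k, eAddr) < 0 {
				return k // starts below k but continues beyond; stay at k
			}
			continue // affects only [KeyMin, k)
		}
		if bytes.Compare(addr, candidate) < 0 {
			candidate = addr
		}
	}
	return candidate
}

func main() {
	spans := []span{
		{[]byte("a"), []byte("c")}, // entirely below "m": ignored
		{[]byte("p"), []byte("t")}, // starts above "m": candidate "p"
	}
	fmt.Printf("%q\n", nextBoundary(spans, []byte("m"))) // "p"
}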
// PrettyPrint prints the key in a human readable format:
//
// Key's Format                                   Key's Value
// /Local/...                                     "\x01"+...
// /Store/...                                     "\x01s"+...
// /RangeID/...                                   "\x01s"+[rangeid]
// /[rangeid]/AbortCache/[id]                     "\x01s"+[rangeid]+"abc-"+[id]
// /[rangeid]/RaftLeaderLease                     "\x01s"+[rangeid]+"rfll"
// /[rangeid]/RaftTombstone                       "\x01s"+[rangeid]+"rftb"
// /[rangeid]/RaftHardState                       "\x01s"+[rangeid]+"rfth"
// /[rangeid]/RaftAppliedIndex                    "\x01s"+[rangeid]+"rfta"
// /[rangeid]/RaftLog/logIndex:[logIndex]         "\x01s"+[rangeid]+"rftl"+[logIndex]
// /[rangeid]/RaftTruncatedState                  "\x01s"+[rangeid]+"rftt"
// /[rangeid]/RaftLastIndex                       "\x01s"+[rangeid]+"rfti"
// /[rangeid]/RangeLastReplicaGCTimestamp         "\x01s"+[rangeid]+"rlrt"
// /[rangeid]/RangeLastVerificationTimestamp      "\x01s"+[rangeid]+"rlvt"
// /[rangeid]/RangeStats                          "\x01s"+[rangeid]+"stat"
// /Range/...                                     "\x01k"+...
// /RangeDescriptor/[key]                         "\x01k"+[key]+"rdsc"
// /RangeTreeNode/[key]                           "\x01k"+[key]+"rtn-"
// /Transaction/addrKey:[key]/id:[id]             "\x01k"+[key]+"txn-"+[id]
// /Local/Max                                     "\x02"
//
// /Meta1/[key]                                   "\x02"+[key]
// /Meta2/[key]                                   "\x03"+[key]
// /System/...                                    "\x04"
// /StatusNode/[key]                              "\x04status-node-"+[key]
// /System/Max                                    "\x05"
//
// /Table/[key]                                   [key]
//
// /Min                                           ""
// /Max                                           "\xff\xff"
func PrettyPrint(key roachpb.Key) string {
	for _, k := range constKeyDict {
		if key.Equal(k.value) {
			return k.name
		}
	}

	for _, k := range keyOfKeyDict {
		if bytes.HasPrefix(key, k.prefix) {
			key = key[len(k.prefix):]
			str, formatted := prettyPrintInternal(key)
			if formatted {
				return k.name + str
			}
			return k.name + "/" + str
		}
	}
	str, _ := prettyPrintInternal(key)
	return str
}
// validateRangeMetaKey validates that the given key is a valid Range Metadata
// key. This checks only the constraints common to forward and backwards scans:
// correct prefix and not exceeding KeyMax.
func validateRangeMetaKey(key roachpb.Key) error {
	// KeyMin is a valid key.
	if key.Equal(roachpb.KeyMin) {
		return nil
	}
	// Key must be at least as long as Meta1Prefix.
	if len(key) < len(Meta1Prefix) {
		return NewInvalidRangeMetaKeyError("too short", key)
	}

	prefix, body := roachpb.Key(key[:len(Meta1Prefix)]), roachpb.Key(key[len(Meta1Prefix):])
	if !prefix.Equal(Meta2Prefix) && !prefix.Equal(Meta1Prefix) {
		return NewInvalidRangeMetaKeyError("not a meta key", key)
	}

	if roachpb.KeyMax.Less(body) {
		return NewInvalidRangeMetaKeyError("body of meta key range lookup is > KeyMax", key)
	}
	return nil
}
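// Illustrative sketch (not from the original source): the prefix/body split
// performed by validateRangeMetaKey, assuming Meta1Prefix = "\x02" and
// Meta2Prefix = "\x03" (consistent with the PrettyPrint table above) and
// KeyMax = "\xff\xff".
package main

import (
	"bytes"
	"fmt"
)

var (
	meta1Prefix = []byte("\x02")
	meta2Prefix = []byte("\x03")
	keyMax      = []byte("\xff\xff")
)

func validate(key []byte) error {
	if len(key) == 0 {
		return nil // KeyMin is a valid key
	}
	if len(key) < len(meta1Prefix) {
		return fmt.Errorf("too short: %q", key)
	}
	prefix, body := key[:len(meta1Prefix)], key[len(meta1Prefix):]
	if !bytes.Equal(prefix, meta1Prefix) && !bytes.Equal(prefix, meta2Prefix) {
		return fmt.Errorf("not a meta key: %q", key)
	}
	if bytes.Compare(keyMax, body) < 0 {
		return fmt.Errorf("body of meta key is > KeyMax: %q", key)
	}
	return nil
}

func main() {
	fmt.Println(validate([]byte("\x02foo"))) // <nil>
	fmt.Println(validate([]byte("\x05foo"))) // not a meta key
}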
// getCachedRangeDescriptorLocked is a helper function to retrieve the
// descriptor of the range which contains the given key, if present in the
// cache. It is assumed that the caller holds a read lock on rdc.rangeCacheMu.
func (rdc *rangeDescriptorCache) getCachedRangeDescriptorLocked(key roachpb.Key, inclusive bool) (
	rangeCacheKey, *roachpb.RangeDescriptor) {
	// The cache is indexed using the end-key of the range, but the
	// end-key is non-inclusive by default.
	var metaKey roachpb.Key
	if !inclusive {
		metaKey = keys.RangeMetaKey(key.Next())
	} else {
		metaKey = keys.RangeMetaKey(key)
	}

	k, v, ok := rdc.rangeCache.Ceil(rangeCacheKey(metaKey))
	if !ok {
		return nil, nil
	}
	metaEndKey := k.(rangeCacheKey)
	rd := v.(*roachpb.RangeDescriptor)

	// Check that key actually belongs to the range.
	if !rd.ContainsKey(key) {
		// The key is the EndKey and we're inclusive, so just return the
		// range descriptor.
		if inclusive && key.Equal(rd.EndKey) {
			return metaEndKey, rd
		}
		return nil, nil
	}

	// The key is the StartKey and we're inclusive, so we need the previous
	// range descriptor, which is not in the cache.
	if inclusive && key.Equal(rd.StartKey) {
		return nil, nil
	}
	return metaEndKey, rd
}
// MetaReverseScanBounds returns the range [start,end) within which the
// desired meta record can be found by means of a reverse engine scan. The
// given key must be a valid RangeMetaKey as defined by validateRangeMetaKey.
func MetaReverseScanBounds(key roachpb.Key) (roachpb.Key, roachpb.Key, error) {
	if err := validateRangeMetaKey(key); err != nil {
		return nil, nil, err
	}

	if key.Equal(roachpb.KeyMin) || key.Equal(Meta1Prefix) {
		return nil, nil, NewInvalidRangeMetaKeyError("KeyMin and Meta1Prefix can't be used as the key of reverse scan", key)
	}
	if key.Equal(Meta2Prefix) {
		// Special case Meta2Prefix: this is the first key in Meta2, and the
		// scan interval covers all of Meta1.
		return Meta1Prefix, key.Next(), nil
	}

	// Otherwise find the first entry greater than the given key and find the
	// last entry in the same prefix. For MVCCReverseScan the endKey is
	// exclusive, so if we want to find the range descriptor the given key
	// specifies, we need to set key.Next() as the MVCCReverseScan's endKey.
	// For example: if we have ranges [a,f) and [f,z), then we'll have
	// corresponding meta records at f and z. If you're looking for the meta
	// record for key f, then you want the second record (exclusive in
	// MVCCReverseScan), hence key.Next() below.
	return key[:len(Meta1Prefix)], key.Next(), nil
}
// addKeyRange adds the specified key range to the interval cache,
// taking care not to add this range if existing entries already
// completely cover the range.
func (tm *txnMetadata) addKeyRange(start, end roachpb.Key) {
	// This gives us a memory-efficient end key if end is empty.
	// The most common case for keys in the intents interval map
	// is for single keys. However, the interval cache requires
	// a non-empty interval, so we create two key slices which
	// share the same underlying byte array.
	if len(end) == 0 {
		end = start.Next()
		start = end[:len(start)]
	}
	key := tm.keys.NewKey(start, end)
	for _, o := range tm.keys.GetOverlaps(start, end) {
		if o.Key.Contains(key) {
			return
		} else if key.Contains(o.Key) {
			tm.keys.Del(o.Key)
		}
	}

	// Since no existing key range fully covered this range, add it now.
	tm.keys.Add(key, nil)
}
// verifyBinarySearchTree checks to ensure that all keys to the left of the
// root node are less than it, and all nodes to the right of the root node are
// greater than it. It recursively walks the tree to perform this same check.
func verifyBinarySearchTree(t *testing.T, tc *treeContext, testName string, node *roachpb.RangeTreeNode, keyMin, keyMax roachpb.Key) {
	if !node.Key.Less(keyMax) {
		t.Errorf("%s: Failed Property BST - The key %s is not less than %s.", testName, node.Key, keyMax)
	}
	if !keyMin.Less(node.Key) {
		t.Errorf("%s: Failed Property BST - The key %s is not greater than %s.", testName, node.Key, keyMin)
	}

	if node.LeftKey != nil {
		left, err := tc.getNode(node.LeftKey)
		if err != nil {
			t.Fatal(err)
		}
		verifyBinarySearchTree(t, tc, testName, left, keyMin, node.Key)
	}
	if node.RightKey != nil {
		right, err := tc.getNode(node.RightKey)
		if err != nil {
			t.Fatal(err)
		}
		verifyBinarySearchTree(t, tc, testName, right, node.Key, keyMax)
	}
}
// PrettyPrint prints the key in a human readable format:
//
// Key's Format                                   Key's Value
// /Local/...                                     "\x01"+...
// /Store/...                                     "\x01s"+...
// /RangeID/...                                   "\x01s"+[rangeid]
// /[rangeid]/SequenceCache/[id]/seq:[seq]        "\x01s"+[rangeid]+"res-"+[id]+[seq]
// /[rangeid]/RaftLeaderLease                     "\x01s"+[rangeid]+"rfll"
// /[rangeid]/RaftTombstone                       "\x01s"+[rangeid]+"rftb"
// /[rangeid]/RaftHardState                       "\x01s"+[rangeid]+"rfth"
// /[rangeid]/RaftAppliedIndex                    "\x01s"+[rangeid]+"rfta"
// /[rangeid]/RaftLog/logIndex:[logIndex]         "\x01s"+[rangeid]+"rftl"+[logIndex]
// /[rangeid]/RaftTruncatedState                  "\x01s"+[rangeid]+"rftt"
// /[rangeid]/RaftLastIndex                       "\x01s"+[rangeid]+"rfti"
// /[rangeid]/RangeLastVerificationTimestamp      "\x01s"+[rangeid]+"rlvt"
// /[rangeid]/RangeStats                          "\x01s"+[rangeid]+"stat"
// /Range/...                                     "\x01k"+...
// /RangeDescriptor/[key]                         "\x01k"+[key]+"rdsc"
// /RangeTreeNode/[key]                           "\x01k"+[key]+"rtn-"
// /Transaction/addrKey:[key]/id:[id]             "\x01k"+[key]+"txn-"+[id]
// /Local/Max                                     "\x02"
//
// /Meta1/[key]                                   "\x02"+[key]
// /Meta2/[key]                                   "\x03"+[key]
// /System/...                                    "\x04"
// /StatusStore/[key]                             "\x04status-store-"+[key]
// /StatusNode/[key]                              "\x04status-node-"+[key]
// /System/Max                                    "\x05"
//
// /Table/[key]                                   [key]
//
// /Min                                           ""
// /Max                                           "\xff\xff"
func PrettyPrint(key roachpb.Key) string {
	if bytes.Equal(key, MaxKey) {
		return "/Max"
	} else if bytes.Equal(key, MinKey) {
		return "/Min"
	}

	var buf bytes.Buffer
	for _, k := range keyDict {
		if key.Compare(k.start) >= 0 && (k.end == nil || key.Compare(k.end) <= 0) {
			fmt.Fprintf(&buf, "%s", k.name)
			if k.end != nil && k.end.Compare(key) == 0 {
				fmt.Fprintf(&buf, "/Max")
				return buf.String()
			}

			hasPrefix := false
			for _, e := range k.entries {
				if bytes.HasPrefix(key, e.prefix) {
					hasPrefix = true
					key = key[len(e.prefix):]
					fmt.Fprintf(&buf, "%s%s", e.name, e.ppFunc(key))
					break
				}
			}
			if !hasPrefix {
				key = key[len(k.start):]
				fmt.Fprintf(&buf, "/%q", []byte(key))
			}

			return buf.String()
		}
	}

	return fmt.Sprintf("%q", []byte(key))
}
// MetaScanBounds returns the range [start,end) within which the desired meta
// record can be found by means of an engine scan. The given key must be a
// valid RangeMetaKey as defined by validateRangeMetaKey.
func MetaScanBounds(key roachpb.Key) (roachpb.Key, roachpb.Key, error) {
	if err := validateRangeMetaKey(key); err != nil {
		return nil, nil, err
	}

	if key.Equal(Meta2KeyMax) {
		return nil, nil, NewInvalidRangeMetaKeyError("Meta2KeyMax can't be used as the key of scan", key)
	}

	if key.Equal(roachpb.KeyMin) {
		// Special case KeyMin: find the first entry in meta1.
		return Meta1Prefix, Meta1Prefix.PrefixEnd(), nil
	}
	if key.Equal(Meta1KeyMax) {
		// Special case Meta1KeyMax: this is the last key in Meta1, we don't
		// want to start at Next().
		return key, Meta1Prefix.PrefixEnd(), nil
	}

	// Otherwise find the first entry greater than the given key in the same
	// meta prefix.
	return key.Next(), roachpb.Key(key[:len(Meta1Prefix)]).PrefixEnd(), nil
}
func (sc *SchemaChanger) truncateAndBackfillColumnsChunk(
	added []sqlbase.ColumnDescriptor,
	dropped []sqlbase.ColumnDescriptor,
	defaultExprs []parser.TypedExpr,
	evalCtx *parser.EvalContext,
	sp sqlbase.Span,
) (roachpb.Key, bool, error) {
	var curIndexKey roachpb.Key
	done := false
	err := sc.db.Txn(func(txn *client.Txn) error {
		tableDesc, err := getTableDescFromID(txn, sc.tableID)
		if err != nil {
			return err
		}
		// Short circuit the backfill if the table has been deleted.
		if tableDesc.Deleted() {
			done = true
			return nil
		}

		updateCols := append(added, dropped...)
		fkTables := TablesNeededForFKs(*tableDesc, CheckUpdates)
		for k := range fkTables {
			if fkTables[k], err = getTableDescFromID(txn, k); err != nil {
				return err
			}
		}
		// TODO(dan): Tighten up the bound on the requestedCols parameter to
		// makeRowUpdater.
		requestedCols := make([]sqlbase.ColumnDescriptor, 0, len(tableDesc.Columns)+len(added))
		requestedCols = append(requestedCols, tableDesc.Columns...)
		requestedCols = append(requestedCols, added...)
		ru, err := makeRowUpdater(
			txn, tableDesc, fkTables, updateCols, requestedCols, rowUpdaterOnlyColumns,
		)
		if err != nil {
			return err
		}

		// TODO(dan): This check is an unfortunate bleeding of the internals of
		// rowUpdater. Extract the sql row to k/v mapping logic out into
		// something usable here.
		if !ru.isColumnOnlyUpdate() {
			panic("only column data should be modified, but the rowUpdater is configured otherwise")
		}

		// Run a scan across the table using the primary key. Running
		// the scan and applying the changes in many transactions is
		// fine because the schema change is in the correct state to
		// handle intermediate OLTP commands which delete and add
		// values during the scan.
		var rf sqlbase.RowFetcher
		colIDtoRowIndex := colIDtoRowIndexFromCols(tableDesc.Columns)
		valNeededForCol := make([]bool, len(tableDesc.Columns))
		for i := range valNeededForCol {
			_, valNeededForCol[i] = ru.fetchColIDtoRowIndex[tableDesc.Columns[i].ID]
		}
		err = rf.Init(tableDesc, colIDtoRowIndex, &tableDesc.PrimaryIndex, false, false,
			tableDesc.Columns, valNeededForCol)
		if err != nil {
			return err
		}
		// StartScan uses 0 as a sentinel for the default limit of entries
		// scanned.
		if err := rf.StartScan(txn, sqlbase.Spans{sp}, 0); err != nil {
			return err
		}

		indexKeyPrefix := sqlbase.MakeIndexKeyPrefix(tableDesc, tableDesc.PrimaryIndex.ID)
		oldValues := make(parser.DTuple, len(ru.fetchCols))
		updateValues := make(parser.DTuple, len(updateCols))
		writeBatch := &client.Batch{}
		var i int
		for ; i < ColumnTruncateAndBackfillChunkSize; i++ {
			row, err := rf.NextRow()
			if err != nil {
				return err
			}
			if row == nil {
				break // Done
			}
			curIndexKey, _, err = sqlbase.EncodeIndexKey(
				tableDesc, &tableDesc.PrimaryIndex, colIDtoRowIndex, row, indexKeyPrefix)
			if err != nil {
				return err
			}
			for j, col := range added {
				if defaultExprs == nil || defaultExprs[j] == nil {
					updateValues[j] = parser.DNull
				} else {
					updateValues[j], err = defaultExprs[j].Eval(evalCtx)
					if err != nil {
						return err
					}
				}
				if !col.Nullable && updateValues[j].Compare(parser.DNull) == 0 {
					return sqlbase.NewNonNullViolationError(col.Name)
				}
			}
			for j := range dropped {
				updateValues[j+len(added)] = parser.DNull
			}
			copy(oldValues, row)
			for j := len(row); j < len(oldValues); j++ {
				oldValues[j] = parser.DNull
			}
			if _, err := ru.updateRow(writeBatch, oldValues, updateValues); err != nil {
				return err
			}
		}
		if i < ColumnTruncateAndBackfillChunkSize {
			done = true
		}
		if err := txn.Run(writeBatch); err != nil {
			return convertBackfillError(tableDesc, writeBatch)
		}
		return nil
	})
	return curIndexKey.PrefixEnd(), done, err
}
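// Illustrative sketch (not from the original source): the driver-loop shape
// that a chunked backfill like the one above plugs into. `runChunk` is a
// hypothetical callback standing in for truncateAndBackfillColumnsChunk: it
// returns the key to resume from and whether the backfill is complete.
package main

import "fmt"

func backfill(start string, runChunk func(start string) (next string, done bool, err error)) error {
	for {
		next, done, err := runChunk(start)
		if err != nil {
			return err
		}
		if done {
			return nil
		}
		start = next // each chunk runs in its own transaction
	}
}

func main() {
	i := 0
	_ = backfill("", func(start string) (string, bool, error) {
		i++
		fmt.Printf("chunk %d starting at %q\n", i, start)
		return fmt.Sprintf("key%d", i), i >= 3, nil
	})
}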
// fetch retrieves spans from the KV layer.
func (f *kvFetcher) fetch() error {
	batchSize := f.getBatchSize()

	b := &client.Batch{}
	b.Header.MaxScanResults = batchSize

	var resumeKey roachpb.Key
	if len(f.kvs) > 0 {
		resumeKey = f.kvs[len(f.kvs)-1].Key
		// To resume forward scans we will set the (inclusive) scan start to
		// the Next of the last received key. To resume reverse scans we will
		// set the (exclusive) scan end to the last received key.
		if !f.reverse {
			resumeKey = resumeKey.Next()
		}
	}

	atEnd := true
	if !f.reverse {
		for i := 0; i < len(f.spans); i++ {
			start := f.spans[i].Start
			if resumeKey != nil {
				if resumeKey.Compare(f.spans[i].End) >= 0 {
					// We are resuming from a key after this span.
					continue
				}
				if resumeKey.Compare(start) > 0 {
					// We are resuming from a key inside this span.
					// In this case we should technically reduce the max count
					// for the span; but since this count is only an
					// optimization it's not incorrect to retrieve more keys
					// for the span.
					start = resumeKey
				}
			}
			atEnd = false
			b.Scan(start, f.spans[i].End, f.spans[i].Count)
		}
	} else {
		for i := len(f.spans) - 1; i >= 0; i-- {
			end := f.spans[i].End
			if resumeKey != nil {
				if resumeKey.Compare(f.spans[i].Start) <= 0 {
					// We are resuming from a key before this span.
					continue
				}
				if resumeKey.Compare(end) < 0 {
					// We are resuming from a key inside this span.
					// In this case we should technically reduce the max count
					// for the span; but since this count is only an
					// optimization it's not incorrect to retrieve more keys
					// for the span.
					end = resumeKey
				}
			}
			atEnd = false
			b.ReverseScan(f.spans[i].Start, end, f.spans[i].Count)
		}
	}

	if atEnd {
		// The last scan happened to finish just at the end of the last span.
		f.kvs = nil
		f.fetchEnd = true
		return nil
	}

	if err := f.txn.Run(b); err != nil {
		return err
	}

	if f.kvs == nil {
		numResults := 0
		for _, result := range b.Results {
			numResults += len(result.Rows)
		}
		f.kvs = make([]client.KeyValue, 0, numResults)
	} else {
		f.kvs = f.kvs[:0]
	}
	for _, result := range b.Results {
		f.kvs = append(f.kvs, result.Rows...)
	}

	f.batchIdx++
	f.totalFetched += int64(len(f.kvs))
	f.kvIndex = 0

	if int64(len(f.kvs)) < batchSize {
		f.fetchEnd = true
	}

	// TODO(radu): We should fetch the next chunk in the background instead of
	// waiting for the next call to fetch(). We can use a pool of workers to
	// issue the KV ops which will also limit the total number of fetches that
	// happen in parallel (and thus the amount of resources we use).
	return nil
}
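// Illustrative sketch (not from the original source): the forward-scan
// resume logic above, reduced to clipping a span list against a resume key.
// Types are simplified stand-ins for the real span/batch machinery.
package main

import (
	"bytes"
	"fmt"
)

type span struct{ start, end []byte }

func next(k []byte) []byte { return append(append([]byte(nil), k...), 0) }

func remainingSpans(spans []span, lastKey []byte) []span {
	resumeKey := next(lastKey) // forward scans resume after the last received key
	var out []span
	for _, s := range spans {
		if bytes.Compare(resumeKey, s.end) >= 0 {
			continue // resuming from a key after this span: fully consumed
		}
		if bytes.Compare(resumeKey, s.start) > 0 {
			s.start = resumeKey // resuming from a key inside this span
		}
		out = append(out, s)
	}
	return out
}

func main() {
	spans := []span{
		{[]byte("a"), []byte("c")},
		{[]byte("d"), []byte("g")},
	}
	// Last key received was "d": the first span is done, the second
	// resumes at "d\x00".
	for _, s := range remainingSpans(spans, []byte("d")) {
		fmt.Printf("[%q, %q)\n", s.start, s.end)
	}
}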
func (sc *SchemaChanger) truncateAndBackfillColumnsChunk(
	added []sqlbase.ColumnDescriptor,
	dropped []sqlbase.ColumnDescriptor,
	nonNullableColumn string,
	defaultExprs []parser.TypedExpr,
	evalCtx parser.EvalContext,
	sp sqlbase.Span,
) (roachpb.Key, bool, error) {
	var curSentinel roachpb.Key
	done := false
	err := sc.db.Txn(func(txn *client.Txn) error {
		tableDesc, err := getTableDescFromID(txn, sc.tableID)
		if err != nil {
			return err
		}
		// Short circuit the backfill if the table has been deleted.
		if tableDesc.Deleted {
			done = true
			return nil
		}

		// Run a scan across the table using the primary key. Running
		// the scan and applying the changes in many transactions is
		// fine because the schema change is in the correct state to
		// handle intermediate OLTP commands which delete and add
		// values during the scan.
		b := &client.Batch{}
		b.Scan(sp.Start, sp.End, ColumnTruncateAndBackfillChunkSize)
		if err := txn.Run(b); err != nil {
			return err
		}

		// Use a different batch to truncate/backfill columns.
		writeBatch := &client.Batch{}
		marshalled := make([]roachpb.Value, len(defaultExprs))
		done = true
		for _, result := range b.Results {
			var sentinelKey roachpb.Key
			for _, kv := range result.Rows {
				// Still processing table.
				done = false
				if nonNullableColumn != "" {
					return fmt.Errorf("column %s contains null values", nonNullableColumn)
				}
				if sentinelKey == nil || !bytes.HasPrefix(kv.Key, sentinelKey) {
					// Sentinel keys have a 0 suffix indicating 0 bytes of
					// column ID. Strip off that suffix to determine the
					// prefix shared with the other keys for the row.
					sentinelKey = sqlbase.StripColumnIDLength(kv.Key)
					// Store away key for the next table row as the point from
					// which to start from.
					curSentinel = sentinelKey

					// Delete the dropped columns entirely. This used to use
					// SQL UPDATE in the past to update the dropped column to
					// NULL; but a column in the process of being dropped is
					// placed in the table descriptor mutations, and a SQL
					// UPDATE of a column in mutations will fail.
					for _, columnDesc := range dropped {
						// Delete the dropped column.
						colKey := keys.MakeColumnKey(sentinelKey, uint32(columnDesc.ID))
						if log.V(2) {
							log.Infof("Del %s", colKey)
						}
						writeBatch.Del(colKey)
					}

					// Add the new columns and backfill the values.
					for i, expr := range defaultExprs {
						if expr == nil {
							continue
						}
						col := added[i]
						colKey := keys.MakeColumnKey(sentinelKey, uint32(col.ID))
						d, err := expr.Eval(evalCtx)
						if err != nil {
							return err
						}
						marshalled[i], err = sqlbase.MarshalColumnValue(col, d)
						if err != nil {
							return err
						}

						if log.V(2) {
							log.Infof("Put %s -> %v", colKey, d)
						}
						// Insert default value into the column. If this row
						// was recently added the default value might have
						// already been populated, because the
						// ColumnDescriptor is in the WRITE_ONLY state.
						// Reinserting the default value is not a big deal.
						//
						// Note: a column in the WRITE_ONLY state cannot be
						// populated directly through SQL. A SQL INSERT cannot
						// directly reference the column, and the INSERT
						// populates the column with the default value.
						writeBatch.Put(colKey, &marshalled[i])
					}
				}
			}
		}
		if err := txn.Run(writeBatch); err != nil {
			for _, r := range writeBatch.Results {
				if r.PErr != nil {
					return convertBackfillError(tableDesc, writeBatch, r.PErr)
				}
			}
			return err
		}
		return nil
	})
	return curSentinel.PrefixEnd(), done, err
}
// Add the specified timestamp to the cache as covering the range of
// keys from start to end. If end is nil, the range covers the start
// key only. txnID is nil for no transaction. readTSCache specifies
// whether the command adding this timestamp should update the read
// timestamp; false to update the write timestamp cache.
func (tc *TimestampCache) Add(start, end roachpb.Key, timestamp roachpb.Timestamp, txnID *uuid.UUID, readTSCache bool) {
	// This gives us a memory-efficient end key if end is empty.
	if len(end) == 0 {
		end = start.Next()
		start = end[:len(start)]
	}
	if tc.latest.Less(timestamp) {
		tc.latest = timestamp
	}
	// Only add to the cache if the timestamp is more recent than the
	// low water mark.
	if tc.lowWater.Less(timestamp) {
		cache := tc.wCache
		if readTSCache {
			cache = tc.rCache
		}

		addRange := func(r interval.Range) {
			value := cacheValue{timestamp: timestamp, txnID: txnID}
			key := cache.MakeKey(r.Start, r.End)
			entry := makeCacheEntry(key, value)
			cache.AddEntry(entry)
		}

		r := interval.Range{
			Start: interval.Comparable(start),
			End:   interval.Comparable(end),
		}

		// Check existing, overlapping entries and truncate/split/remove if
		// superseded and in the past. If existing entries are in the future,
		// subtract from the range/ranges that need to be added to cache.
		for _, o := range cache.GetOverlaps(r.Start, r.End) {
			cv := o.Value.(*cacheValue)
			sCmp := r.Start.Compare(o.Key.Start)
			eCmp := r.End.Compare(o.Key.End)
			if !timestamp.Less(cv.timestamp) {
				// The existing interval has a timestamp less than or equal to
				// the new interval. Compare interval ranges to determine how
				// to modify existing interval.
				switch {
				case sCmp == 0 && eCmp == 0:
					// New and old are equal; replace old with new and avoid
					// the need to insert new.
					//
					// New: ------------
					// Old: ------------
					//
					// New: ------------
					*cv = cacheValue{timestamp: timestamp, txnID: txnID}
					cache.MoveToEnd(o.Entry)
					return
				case sCmp <= 0 && eCmp >= 0:
					// New contains or is equal to old; delete old.
					//
					// New: ------------      ------------      ------------
					// Old:   --------    or  ----------    or    ----------
					//
					// Old:
					cache.DelEntry(o.Entry)
				case sCmp > 0 && eCmp < 0:
					// Old contains new; split up old into two.
					//
					// New:     ----
					// Old: ------------
					//
					// Old: ----    ----
					oldEnd := o.Key.End
					o.Key.End = r.Start

					key := cache.MakeKey(r.End, oldEnd)
					entry := makeCacheEntry(key, *cv)
					cache.AddEntryAfter(entry, o.Entry)
				case eCmp >= 0:
					// Left partial overlap; truncate old end.
					//
					// New:     --------          --------
					// Old: --------      or  ------------
					//
					// Old: ----              ----
					o.Key.End = r.Start
				case sCmp <= 0:
					// Right partial overlap; truncate old start.
					//
					// New: --------          --------
					// Old:     --------  or  ------------
					//
					// Old:         ----              ----
					o.Key.Start = r.End
				default:
					panic(fmt.Sprintf("no overlap between %v and %v", o.Key.Range, r))
				}
			} else {
				// The existing interval has a timestamp greater than the new
				// interval. Compare interval ranges to determine how to
				// modify new interval before adding it to the timestamp
				// cache.
				switch {
				case sCmp >= 0 && eCmp <= 0:
					// Old contains or is equal to new; no need to add.
					//
					// Old: -----------      -----------      -----------      -----------
					// New:    -----     or  -----------  or  --------     or     --------
					//
					// New:
					return
				case sCmp < 0 && eCmp > 0:
					// New contains old; split up new into two. We can add the
					// left piece immediately because it is guaranteed to be
					// before the rest of the overlaps.
					//
					// Old:     ------
					// New: ------------
					//
					// New: ---      ---
					lr := interval.Range{Start: r.Start, End: o.Key.Start}
					addRange(lr)

					r.Start = o.Key.End
				case eCmp > 0:
					// Left partial overlap; truncate new start.
					//
					// Old: --------          --------
					// New:     --------  or  ------------
					//
					// New:         ----              ----
					r.Start = o.Key.End
				case sCmp < 0:
					// Right partial overlap; truncate new end.
					//
					// Old:     --------          --------
					// New: --------      or  ------------
					//
					// New: ----              ----
					r.End = o.Key.Start
				default:
					panic(fmt.Sprintf("no overlap between %v and %v", o.Key.Range, r))
				}
			}
		}
		addRange(r)
	}
}
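// Illustrative sketch (not from the original source): the endpoint-comparison
// case analysis Add uses to classify how a new interval overlaps an existing
// one, extracted into a standalone function over byte-slice endpoints.
package main

import (
	"bytes"
	"fmt"
)

func classify(newStart, newEnd, oldStart, oldEnd []byte) string {
	sCmp := bytes.Compare(newStart, oldStart)
	eCmp := bytes.Compare(newEnd, oldEnd)
	switch {
	case sCmp == 0 && eCmp == 0:
		return "equal"
	case sCmp <= 0 && eCmp >= 0:
		return "new contains old"
	case sCmp > 0 && eCmp < 0:
		return "old contains new (split old in two)"
	case eCmp >= 0:
		return "left partial overlap (truncate old end)"
	case sCmp <= 0:
		return "right partial overlap (truncate old start)"
	}
	return "no overlap"
}

func main() {
	fmt.Println(classify([]byte("b"), []byte("e"), []byte("a"), []byte("c")))
	// left partial overlap (truncate old end)
	fmt.Println(classify([]byte("a"), []byte("c"), []byte("b"), []byte("d")))
	// right partial overlap (truncate old start)
}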