// Filter makes decisions about garbage collection based on the // garbage collection policy for batches of values for the same key. // The GC policy is determined via the policyFn specified when the // GarbageCollector was created. Returns a slice of deletions, one // per incoming keys. If an index in the returned array is set to // true, then that value will be garbage collected. func (gc *GarbageCollector) Filter(keys []Key, values [][]byte) []bool { if len(keys) == 1 { return nil } // Look up the policy which applies to this set of MVCC values. _, decKey := encoding.DecodeBinary(keys[0]) policy := gc.policyFn(decKey) if policy == nil || policy.TTLSeconds <= 0 { return nil } toDelete := make([]bool, len(keys)) expiration := gc.now expiration.WallTime -= int64(policy.TTLSeconds) * 1E9 var survivors bool for i, key := range keys { _, ts, isValue := mvccDecodeKey(key) if i == 0 { if isValue { log.Errorf("unexpected MVCC value encountered: %q", key) return make([]bool, len(keys)) } continue } if !isValue { log.Errorf("unexpected MVCC metadata encountered: %q", key) return make([]bool, len(keys)) } mvccVal := proto.MVCCValue{} if err := gogoproto.Unmarshal(values[i], &mvccVal); err != nil { log.Errorf("unable to unmarshal MVCC value %q: %v", key, err) return make([]bool, len(keys)) } if i == 1 { // If the first value isn't a deletion tombstone, set survivors to true. if !mvccVal.Deleted { survivors = true } } else { if ts.Less(expiration) { // If we encounter a version older than our GC timestamp, mark for deletion. toDelete[i] = true } else if !mvccVal.Deleted { // Otherwise, if not marked for GC and not a tombstone, set survivors true. survivors = true } } } // If there are no remaining non-deleted, versioned entries, mark // all keys for deletion, including the MVCC metadata entry. if !survivors { for i := range keys { toDelete[i] = true } } return toDelete }
// ResolveWriteIntentRange commits or aborts (rolls back) the range of // write intents specified by start and end keys for a given txnID // according to commit parameter. ResolveWriteIntentRange will skip // write intents of other txnIDs. Specify max=0 for unbounded // resolves. func (mvcc *MVCC) ResolveWriteIntentRange(key Key, endKey Key, max int64, txnID []byte, commit bool) (int64, error) { if len(txnID) == 0 { return 0, util.Error("missing txnID in request") } binKey := encoding.EncodeBinary(nil, key) binEndKey := encoding.EncodeBinary(nil, endKey) nextKey := binKey num := int64(0) for { kvs, err := mvcc.engine.Scan(nextKey, binEndKey, 1) if err != nil { return num, err } // No more keys exists in the given range. if len(kvs) == 0 { break } remainder, currentKey := encoding.DecodeBinary(kvs[0].Key) if len(remainder) != 0 { return 0, util.Errorf("expected an MVCC metadata key: %s", kvs[0].Key) } _, _, existingTxnID, err := mvcc.getInternal(kvs[0].Key, proto.MaxTimestamp, txnID) // Return the error unless its a writeIntentError, which // will occur in the event we scan a key with a write // intent belonging to a different transaction. if _, ok := err.(*writeIntentError); err != nil && !ok { return num, err } // ResolveWriteIntent only needs to deal with the write // intents for the given txnID. if err == nil && bytes.Equal(existingTxnID, txnID) { // commits or aborts (rolls back) the write intent of // the given txnID. err = mvcc.ResolveWriteIntent(currentKey, txnID, commit) if err != nil { return num, err } num++ } if max != 0 && max == num { break } // In order to efficiently skip the possibly long list of // old versions for this key; refer to Scan for details. nextKey = encoding.EncodeBinary(nil, NextKey(currentKey)) } return num, nil }
// Scan scans the key range specified by start key through end key up // to some maximum number of results. Specify max=0 for unbounded scans. func (mvcc *MVCC) Scan(key Key, endKey Key, max int64, timestamp proto.Timestamp, txn *proto.Transaction) ([]proto.KeyValue, error) { binKey := encoding.EncodeBinary(nil, key) binEndKey := encoding.EncodeBinary(nil, endKey) nextKey := binKey res := []proto.KeyValue{} for { kvs, err := mvcc.engine.Scan(nextKey, binEndKey, 1) if err != nil { return nil, err } // No more keys exists in the given range. if len(kvs) == 0 { break } remainder, currentKey := encoding.DecodeBinary(kvs[0].Key) if len(remainder) != 0 { return nil, util.Errorf("expected an MVCC metadata key: %s", kvs[0].Key) } value, err := mvcc.Get(currentKey, timestamp, txn) if err != nil { return res, err } if value != nil { res = append(res, proto.KeyValue{Key: currentKey, Value: *value}) } if max != 0 && max == int64(len(res)) { break } // In order to efficiently skip the possibly long list of // old versions for this key, we move instead to the next // highest key and the for loop continues by scanning again // with nextKey. // Let's say you have: // a // a<T=2> // a<T=1> // aa // aa<T=3> // aa<T=2> // b // b<T=5> // In this case, if we scan from "a"-"b", we wish to skip // a<T=2> and a<T=1> and find "aa'. nextKey = encoding.EncodeBinary(nil, NextKey(currentKey)) } return res, nil }
// mvccDecodeKey decodes encodedKey into key and Timestamp. The final // returned bool is true if this is an MVCC value and false if this is // MVCC metadata. Note that the returned key is exactly the value of // key passed to mvccEncodeKey. A separate DecodeBinary step must be // carried out to decode it if necessary. // If a decode process fails, a panic ensues. func mvccDecodeKey(encodedKey []byte) (Key, proto.Timestamp, bool) { tsBytes, _ := encoding.DecodeBinary(encodedKey) key := encodedKey[:len(encodedKey)-len(tsBytes)] if len(tsBytes) == 0 { return key, proto.Timestamp{}, false } tsBytes, walltime := encoding.DecodeUint64Decreasing(tsBytes) tsBytes, logical := encoding.DecodeUint32Decreasing(tsBytes) if len(tsBytes) > 0 { panic(fmt.Sprintf("leftover bytes on mvcc key decode: %v", tsBytes)) } return key, proto.Timestamp{WallTime: int64(walltime), Logical: int32(logical)}, true }
// FindSplitKey suggests a split key from the given user-space key range that // aims to roughly cut into half the total number of bytes used (in raw key and // value byte strings) in both subranges. It will operate on a snapshot of the // underlying engine if a snapshotID is given, and in that case may safely be // invoked in a goroutine. // TODO(Tobias): leverage the work done here anyways to gather stats. func (mvcc *MVCC) FindSplitKey(key Key, endKey Key, snapshotID string) (Key, error) { rs := util.NewWeightedReservoirSample(splitReservoirSize, nil) h := rs.Heap.(*util.WeightedValueHeap) // We expect most keys to contain anywhere between 2^4 to 2^14 bytes, so we // normalize to obtain typical weights that are numerically unproblematic. // The relevant expression is rand(0,1)**(1/weight). normalize := float64(1 << 6) binStartKey := encoding.EncodeBinary(nil, key) binEndKey := encoding.EncodeBinary(nil, endKey) totalSize := 0 err := iterateRangeSnapshot(mvcc.engine, binStartKey, binEndKey, splitScanRowCount, snapshotID, func(kvs []proto.RawKeyValue) error { for _, kv := range kvs { byteCount := len(kv.Key) + len(kv.Value) rs.ConsiderWeighted(splitSampleItem{kv.Key, totalSize}, float64(byteCount)/normalize) totalSize += byteCount } return nil }) if err != nil { return nil, err } if totalSize == 0 { return nil, util.Errorf("the range is empty") } // Inspect the sample to get the closest candidate that has sizeBefore >= totalSize/2. candidate := (*h)[0].Value.(splitSampleItem) cb := candidate.sizeBefore halfSize := totalSize / 2 for i := 1; i < len(*h); i++ { if sb := (*h)[i].Value.(splitSampleItem).sizeBefore; (cb < halfSize && cb < sb) || (cb > halfSize && cb > sb && sb > halfSize) { // The current candidate hasn't yet cracked 50% and the this value // is closer to doing so or we're already above but now we can // decrese the gap. candidate = (*h)[i].Value.(splitSampleItem) cb = candidate.sizeBefore } } // The key is an MVCC key, so to avoid corrupting MVCC we get the // associated sentinel metadata key, which is fine to split in front of. decodedKey, _, _ := mvccDecodeKey(candidate.Key) rest, humanKey := encoding.DecodeBinary(decodedKey) if len(rest) > 0 { return nil, util.Errorf("corrupt key encountered") } return humanKey, nil }
// ResolveWriteIntentRange commits or aborts (rolls back) the range of // write intents specified by start and end keys for a given txn // according to commit parameter. ResolveWriteIntentRange will skip // write intents of other txns. Specify max=0 for unbounded resolves. func (mvcc *MVCC) ResolveWriteIntentRange(key Key, endKey Key, max int64, txn *proto.Transaction, commit bool) (int64, error) { if txn == nil { return 0, util.Error("no txn specified") } binKey := encoding.EncodeBinary(nil, key) binEndKey := encoding.EncodeBinary(nil, endKey) nextKey := binKey num := int64(0) for { kvs, err := mvcc.engine.Scan(nextKey, binEndKey, 1) if err != nil { return num, err } // No more keys exists in the given range. if len(kvs) == 0 { break } remainder, currentKey := encoding.DecodeBinary(kvs[0].Key) if len(remainder) != 0 { return 0, util.Errorf("expected an MVCC metadata key: %s", kvs[0].Key) } err = mvcc.ResolveWriteIntent(currentKey, txn, commit) if err != nil { log.Warningf("failed to resolve intent for key %q: %v", currentKey, err) } else { num++ if max != 0 && max == num { break } } // In order to efficiently skip the possibly long list of // old versions for this key; refer to Scan for details. nextKey = encoding.EncodeBinary(nil, NextKey(currentKey)) } return num, nil }