func (rc *ResponseCache) decodeResponseCacheKey(encKey proto.EncodedKey) (proto.ClientCmdID, error) {
	ret := proto.ClientCmdID{}
	key, _, isValue := engine.MVCCDecodeKey(encKey)
	if isValue {
		return ret, util.Errorf("key %s is not a raw MVCC value", encKey)
	}
	if !bytes.HasPrefix(key, keys.LocalRangeIDPrefix) {
		return ret, util.Errorf("key %s does not have %s prefix", key, keys.LocalRangeIDPrefix)
	}
	// Cut the prefix and the Raft ID.
	b := key[len(keys.LocalRangeIDPrefix):]
	b, _ = encoding.DecodeUvarint(b)
	if !bytes.HasPrefix(b, keys.LocalResponseCacheSuffix) {
		return ret, util.Errorf("key %s does not contain the response cache suffix %s", key, keys.LocalResponseCacheSuffix)
	}
	// Cut the response cache suffix.
	b = b[len(keys.LocalResponseCacheSuffix):]
	// Now, decode the command ID.
	b, wt := encoding.DecodeUvarint(b)
	b, rd := encoding.DecodeUint64(b)
	if len(b) > 0 {
		return ret, util.Errorf("key %s has leftover bytes after decode: %s; indicates corrupt key", encKey, b)
	}
	ret.WallTime = int64(wt)
	ret.Random = int64(rd)
	return ret, nil
}
func (rc *ResponseCache) decodeResponseCacheKey(encKey engine.MVCCKey) ([]byte, error) {
	key, _, isValue, err := engine.MVCCDecodeKey(encKey)
	if err != nil {
		return nil, err
	}
	if isValue {
		return nil, util.Errorf("key %s is not a raw MVCC value", encKey)
	}
	if !bytes.HasPrefix(key, keys.LocalRangeIDPrefix) {
		return nil, util.Errorf("key %s does not have %s prefix", key, keys.LocalRangeIDPrefix)
	}
	// Cut the prefix and the Range ID.
	b := key[len(keys.LocalRangeIDPrefix):]
	b, _, err = encoding.DecodeUvarint(b)
	if err != nil {
		return nil, err
	}
	if !bytes.HasPrefix(b, keys.LocalResponseCacheSuffix) {
		return nil, util.Errorf("key %s does not contain the response cache suffix %s", key, keys.LocalResponseCacheSuffix)
	}
	// Cut the response cache suffix.
	b = b[len(keys.LocalResponseCacheSuffix):]
	// Decode the family.
	b, fm, err := encoding.DecodeBytes(b, nil)
	if err != nil {
		return nil, err
	}
	if len(b) > 0 {
		return nil, util.Errorf("key %s has leftover bytes after decode: %s; indicates corrupt key", encKey, b)
	}
	return fm, nil
}
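// For reference, a minimal sketch of the key layout that decodeResponseCacheKey
// above inverts. encodeResponseCacheKeyExample is hypothetical and not part of
// the original source; it assumes encoding.EncodeUvarint/EncodeBytes append to
// their first argument, mirroring the Decode* calls above.
func encodeResponseCacheKeyExample(rangeID uint64, family []byte) engine.MVCCKey {
	key := append([]byte(nil), keys.LocalRangeIDPrefix...) // range-local prefix
	key = encoding.EncodeUvarint(key, rangeID)             // Range ID
	key = append(key, keys.LocalResponseCacheSuffix...)    // response cache suffix
	key = encoding.EncodeBytes(key, family)                // family payload
	// Encode as a bare MVCC metadata key (no timestamp component).
	return engine.MVCCEncodeKey(key)
}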
// TestGCQueueIntentResolution verifies intent resolution with many
// intents spanning just two transactions.
func TestGCQueueIntentResolution(t *testing.T) {
	defer leaktest.AfterTest(t)
	tc := testContext{}
	tc.Start(t)
	defer tc.Stop()

	const now int64 = 48 * 60 * 60 * 1E9 // 2d past the epoch
	tc.manualClock.Set(now)

	txns := []*proto.Transaction{
		newTransaction("txn1", proto.Key("0-00000"), 1, proto.SERIALIZABLE, tc.clock),
		newTransaction("txn2", proto.Key("1-00000"), 1, proto.SERIALIZABLE, tc.clock),
	}
	intentResolveTS := makeTS(now-intentAgeThreshold.Nanoseconds(), 0)
	txns[0].OrigTimestamp = intentResolveTS
	txns[0].Timestamp = intentResolveTS
	txns[1].OrigTimestamp = intentResolveTS
	txns[1].Timestamp = intentResolveTS

	// Two transactions.
	for i := 0; i < 2; i++ {
		// 5 puts per transaction.
		// TODO(spencerkimball): benchmark with ~50k.
		for j := 0; j < 5; j++ {
			pArgs := putArgs(proto.Key(fmt.Sprintf("%d-%05d", i, j)), []byte("value"), tc.rng.Desc().RangeID, tc.store.StoreID())
			pArgs.Timestamp = makeTS(1, 0)
			pArgs.Txn = txns[i]
			if _, err := tc.rng.AddCmd(tc.rng.context(), &pArgs); err != nil {
				t.Fatalf("%d: could not put data: %s", i, err)
			}
		}
	}

	// Process through a scan queue.
	gcQ := newGCQueue()
	if err := gcQ.process(tc.clock.Now(), tc.rng); err != nil {
		t.Fatal(err)
	}

	// Iterate through all values to ensure intents have been fully resolved.
	meta := &engine.MVCCMetadata{}
	err := tc.store.Engine().Iterate(engine.MVCCEncodeKey(proto.KeyMin), engine.MVCCEncodeKey(proto.KeyMax), func(kv proto.RawKeyValue) (bool, error) {
		if key, _, isValue := engine.MVCCDecodeKey(kv.Key); !isValue {
			if err := gogoproto.Unmarshal(kv.Value, meta); err != nil {
				t.Fatalf("unable to unmarshal mvcc metadata for key %s", key)
			}
			if meta.Txn != nil {
				t.Fatalf("non-nil Txn after GC for key %s", key)
			}
		}
		return false, nil
	})
	if err != nil {
		t.Fatal(err)
	}
}
func decodeSequenceCacheMVCCKey(encKey engine.MVCCKey, dest []byte) ([]byte, uint32, uint32, error) {
	key, _, isValue, err := engine.MVCCDecodeKey(encKey)
	if err != nil {
		return nil, 0, 0, err
	}
	if isValue {
		return nil, 0, 0, util.Errorf("key %s is not a raw MVCC value", encKey)
	}
	return decodeSequenceCacheKey(key, dest)
}
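// A minimal usage sketch, assuming decodeSequenceCacheKey returns the
// transaction ID bytes followed by epoch and sequence number (as the uint32
// pair above suggests); reportSequenceCacheEntry is hypothetical and not part
// of the original source.
func reportSequenceCacheEntry(encKey engine.MVCCKey) error {
	id, epoch, seq, err := decodeSequenceCacheMVCCKey(encKey, nil)
	if err != nil {
		return err
	}
	log.Infof("sequence cache entry: txn %x epoch=%d seq=%d", id, epoch, seq)
	return nil
}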
// TestReplicaDataIterator creates three ranges {"a"-"b" (pre), "b"-"c"
// (main test range), "c"-"d" (post)} and fills each with data. It
// first verifies the contents of the "b"-"c" range, then deletes it
// and verifies it's empty. Finally, it verifies the pre and post
// ranges still contain the expected data.
func TestReplicaDataIterator(t *testing.T) {
	defer leaktest.AfterTest(t)
	tc := testContext{
		bootstrapMode: bootstrapRangeOnly,
	}
	tc.Start(t)
	defer tc.Stop()

	// See notes in EmptyRange test method for adjustment to descriptor.
	newDesc := *tc.rng.Desc()
	newDesc.StartKey = roachpb.RKey("b")
	newDesc.EndKey = roachpb.RKey("c")
	if err := tc.rng.setDesc(&newDesc); err != nil {
		t.Fatal(err)
	}
	// Create two more ranges, one before the test range and one after.
	preRng := createRange(tc.store, 2, roachpb.RKeyMin, roachpb.RKey("b"))
	if err := tc.store.AddReplicaTest(preRng); err != nil {
		t.Fatal(err)
	}
	postRng := createRange(tc.store, 3, roachpb.RKey("c"), roachpb.RKeyMax)
	if err := tc.store.AddReplicaTest(postRng); err != nil {
		t.Fatal(err)
	}

	// Create range data for all three ranges.
	preKeys := createRangeData(preRng, t)
	curKeys := createRangeData(tc.rng, t)
	postKeys := createRangeData(postRng, t)

	iter := newReplicaDataIterator(tc.rng.Desc(), tc.rng.store.Engine())
	defer iter.Close()
	i := 0
	for ; iter.Valid(); iter.Next() {
		if err := iter.Error(); err != nil {
			t.Fatal(err)
		}
		if i >= len(curKeys) {
			t.Fatal("there are more keys in the iteration than expected")
		}
		if key := iter.Key(); !key.Equal(curKeys[i]) {
			k1, ts1, _, err := engine.MVCCDecodeKey(key)
			if err != nil {
				t.Fatal(err)
			}
			k2, ts2, _, err := engine.MVCCDecodeKey(curKeys[i])
			if err != nil {
				t.Fatal(err)
			}
			t.Errorf("%d: expected %q(%d); got %q(%d)", i, k2, ts2, k1, ts1)
		}
		i++
	}
	if i != len(curKeys) {
		t.Fatal("there are fewer keys in the iteration than expected")
	}

	// Destroy range and verify that its data has been completely cleared.
	if err := tc.rng.Destroy(); err != nil {
		t.Fatal(err)
	}
	iter = newReplicaDataIterator(tc.rng.Desc(), tc.rng.store.Engine())
	defer iter.Close()
	if iter.Valid() {
		// If the range is destroyed, only a tombstone key should be there.
		k1, _, _, err := engine.MVCCDecodeKey(iter.Key())
		if err != nil {
			t.Fatal(err)
		}
		if tombstoneKey := keys.RaftTombstoneKey(tc.rng.Desc().RangeID); !bytes.Equal(k1, tombstoneKey) {
			t.Errorf("expected a tombstone key %q, but found %q", tombstoneKey, k1)
		}
		if iter.Next(); iter.Valid() {
			t.Errorf("expected a destroyed replica to have only a tombstone key, but found more")
		}
	} else {
		t.Errorf("expected a tombstone key, but got an empty iteration")
	}

	// Verify the keys in pre & post ranges.
	for _, test := range []struct {
		r    *Replica
		keys []roachpb.EncodedKey
	}{
		{preRng, preKeys},
		{postRng, postKeys},
	} {
		iter = newReplicaDataIterator(test.r.Desc(), test.r.store.Engine())
		defer iter.Close()
		i = 0
		for ; iter.Valid(); iter.Next() {
			k1, ts1, _, err := engine.MVCCDecodeKey(iter.Key())
			if err != nil {
				t.Fatal(err)
			}
			if bytes.HasPrefix(k1, keys.StatusPrefix) {
				// Some data is written into the system prefix by Store.BootstrapRange,
				// but it is not in our expected key list so skip it.
				// TODO(bdarnell): validate this data instead of skipping it.
				continue
			}
			if key := iter.Key(); !key.Equal(test.keys[i]) {
				k2, ts2, _, err := engine.MVCCDecodeKey(test.keys[i])
				if err != nil {
					t.Fatal(err)
				}
				t.Errorf("%d: key mismatch %q(%d) != %q(%d)", i, k1, ts1, k2, ts2)
			}
			i++
		}
		if i != len(test.keys) {
			t.Fatal("there are fewer keys in the iteration than expected")
		}
	}
}
// process iterates through all keys in a replica's range, calling the garbage
// collector for each key and associated set of values. GC'd keys are batched
// into GC calls. Extant intents are resolved if intents are older than
// intentAgeThreshold.
func (gcq *gcQueue) process(now roachpb.Timestamp, repl *Replica, sysCfg *config.SystemConfig) error {
	snap := repl.rm.Engine().NewSnapshot()
	desc := repl.Desc()
	iter := newRangeDataIterator(desc, snap)
	defer iter.Close()
	defer snap.Close()

	// Lookup the GC policy for the zone containing this key range.
	zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey)
	if err != nil {
		return fmt.Errorf("could not find GC policy for range %s: %s", repl, err)
	}
	policy := zone.GC

	gcMeta := roachpb.NewGCMetadata(now.WallTime)
	gc := engine.NewGarbageCollector(now, *policy)

	// Compute intent expiration (intent age at which we attempt to resolve).
	intentExp := now
	intentExp.WallTime -= intentAgeThreshold.Nanoseconds()

	// TODO(tschottdorf): execution will use a leader-assigned local
	// timestamp to compute intent age. While this should be fine, could
	// consider adding a Now timestamp to GCRequest which would be used
	// instead.
	gcArgs := &roachpb.GCRequest{
		RequestHeader: roachpb.RequestHeader{
			RangeID: desc.RangeID,
		},
	}
	var mu sync.Mutex
	var oldestIntentNanos int64 = math.MaxInt64
	var expBaseKey roachpb.Key
	var keys []roachpb.EncodedKey
	var vals [][]byte

	// Maps from txn ID to txn and intent key slice.
	txnMap := map[string]*roachpb.Transaction{}
	intentMap := map[string][]roachpb.Intent{}

	// updateOldestIntent atomically updates the oldest intent.
	updateOldestIntent := func(intentNanos int64) {
		mu.Lock()
		defer mu.Unlock()
		if intentNanos < oldestIntentNanos {
			oldestIntentNanos = intentNanos
		}
	}

	// processKeysAndValues is invoked with each key and its set of
	// values. Intents older than the intent age threshold are sent for
	// resolution and values after the MVCC metadata, and possible
	// intent, are sent for garbage collection.
	processKeysAndValues := func() {
		// If there's more than a single value for the key, possibly send for GC.
		if len(keys) > 1 {
			meta := &engine.MVCCMetadata{}
			if err := proto.Unmarshal(vals[0], meta); err != nil {
				log.Errorf("unable to unmarshal MVCC metadata for key %q: %s", keys[0], err)
			} else {
				// In the event that there's an active intent, send for
				// intent resolution if older than the threshold.
				startIdx := 1
				if meta.Txn != nil {
					// Keep track of intent to resolve if older than the intent
					// expiration threshold.
					if meta.Timestamp.Less(intentExp) {
						id := string(meta.Txn.ID)
						txnMap[id] = meta.Txn
						intentMap[id] = append(intentMap[id], roachpb.Intent{Key: expBaseKey})
					} else {
						updateOldestIntent(meta.Txn.OrigTimestamp.WallTime)
					}
					// With an active intent, GC ignores MVCC metadata & intent value.
					startIdx = 2
				}
				// See if any values may be GC'd.
				if gcTS := gc.Filter(keys[startIdx:], vals[startIdx:]); !gcTS.Equal(roachpb.ZeroTimestamp) {
					// TODO(spencer): need to split the requests up into
					// multiple requests in the event that more than X keys
					// are added to the request.
					gcArgs.Keys = append(gcArgs.Keys, roachpb.GCRequest_GCKey{Key: expBaseKey, Timestamp: gcTS})
				}
			}
		}
	}

	// Iterate through the keys and values of this replica's range.
	for ; iter.Valid(); iter.Next() {
		baseKey, ts, isValue, err := engine.MVCCDecodeKey(iter.Key())
		if err != nil {
			log.Errorf("unable to decode MVCC key: %q: %v", iter.Key(), err)
			continue
		}
		if !isValue {
			// Moving to the next key (& values).
			processKeysAndValues()
			expBaseKey = baseKey
			keys = []roachpb.EncodedKey{iter.Key()}
			vals = [][]byte{iter.Value()}
		} else {
			if !baseKey.Equal(expBaseKey) {
				log.Errorf("unexpectedly found a value for %q with ts=%s; expected key %q", baseKey, ts, expBaseKey)
				continue
			}
			keys = append(keys, iter.Key())
			vals = append(vals, iter.Value())
		}
	}
	if iter.Error() != nil {
		return iter.Error()
	}
	// Handle last collected set of keys/vals.
	processKeysAndValues()

	// Process push transactions in parallel.
	var wg sync.WaitGroup
	for _, txn := range txnMap {
		wg.Add(1)
		go gcq.pushTxn(repl, now, txn, updateOldestIntent, &wg)
	}
	wg.Wait()

	// Resolve all intents.
	var intents []roachpb.Intent
	for id, txn := range txnMap {
		if txn.Status != roachpb.PENDING {
			for _, intent := range intentMap[id] {
				intent.Txn = *txn
				intents = append(intents, intent)
			}
		}
	}

	done := true
	if len(intents) > 0 {
		done = false
		repl.resolveIntents(repl.context(), intents)
	}

	// Set start and end keys.
	if len(gcArgs.Keys) > 0 {
		done = false
		gcArgs.Key = gcArgs.Keys[0].Key
		gcArgs.EndKey = gcArgs.Keys[len(gcArgs.Keys)-1].Key.Next()
	}

	if done {
		return nil
	}

	// Send GC request through range.
	gcMeta.OldestIntentNanos = proto.Int64(oldestIntentNanos)
	gcArgs.GCMeta = *gcMeta
	if _, err := client.SendWrapped(repl, repl.context(), gcArgs); err != nil {
		return err
	}

	// Store current timestamp as last verification for this replica, as
	// we've just successfully scanned.
	if err := repl.SetLastVerificationTimestamp(now); err != nil {
		log.Errorf("failed to set last verification timestamp for replica %s: %s", repl, err)
	}
	return nil
}
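// A self-contained sketch of the grouping pattern used by the iteration above:
// an MVCC scan yields a bare metadata key followed by that key's versioned
// values, so runs sharing a base key can be batched in a single linear pass.
// The types and the decode/process helpers are simplified stand-ins for
// MVCCDecodeKey and processKeysAndValues, not part of the original source.
func groupByBaseKey(
	encKeys [][]byte,
	decode func(encKey []byte) (baseKey []byte, isValue bool),
	process func(baseKey []byte, run [][]byte),
) {
	var curBase []byte
	var run [][]byte
	for _, ek := range encKeys {
		base, isValue := decode(ek)
		if !isValue {
			// A bare metadata key starts the run for the next base key;
			// flush the previous run first.
			if len(run) > 0 {
				process(curBase, run)
			}
			curBase, run = base, [][]byte{ek}
		} else {
			// A versioned value belongs to the current run.
			run = append(run, ek)
		}
	}
	// Flush the final run, mirroring the trailing processKeysAndValues call.
	if len(run) > 0 {
		process(curBase, run)
	}
}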
// process iterates through all keys in a replica's range, calling the garbage
// collector for each key and associated set of values. GC'd keys are batched
// into GC calls. Extant intents are resolved if intents are older than
// intentAgeThreshold.
func (gcq *gcQueue) process(now roachpb.Timestamp, repl *Replica, sysCfg *config.SystemConfig) error {
	snap := repl.store.Engine().NewSnapshot()
	desc := repl.Desc()
	iter := newReplicaDataIterator(desc, snap)
	defer iter.Close()
	defer snap.Close()

	// Lookup the GC policy for the zone containing this key range.
	zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey)
	if err != nil {
		return fmt.Errorf("could not find GC policy for range %s: %s", repl, err)
	}
	policy := zone.GC

	gcMeta := roachpb.NewGCMetadata(now.WallTime)
	gc := engine.NewGarbageCollector(now, *policy)

	// Compute intent expiration (intent age at which we attempt to resolve).
	intentExp := now
	intentExp.WallTime -= intentAgeThreshold.Nanoseconds()
	txnExp := now
	txnExp.WallTime -= txnCleanupThreshold.Nanoseconds()

	gcArgs := &roachpb.GCRequest{}
	// TODO(tschottdorf): This is one of these instances in which we want
	// to be more careful that the request ends up on the correct Replica,
	// and we might have to worry about mixing range-local and global keys
	// in a batch which might end up spanning Ranges by the time it executes.
	gcArgs.Key = desc.StartKey.AsRawKey()
	gcArgs.EndKey = desc.EndKey.AsRawKey()

	var expBaseKey roachpb.Key
	var keys []engine.MVCCKey
	var vals [][]byte

	// Maps from txn ID to txn and intent key slice.
	txnMap := map[string]*roachpb.Transaction{}
	intentSpanMap := map[string][]roachpb.Span{}

	// processKeysAndValues is invoked with each key and its set of
	// values. Intents older than the intent age threshold are sent for
	// resolution and values after the MVCC metadata, and possible
	// intent, are sent for garbage collection.
	processKeysAndValues := func() {
		// If there's more than a single value for the key, possibly send for GC.
		if len(keys) > 1 {
			meta := &engine.MVCCMetadata{}
			if err := proto.Unmarshal(vals[0], meta); err != nil {
				log.Errorf("unable to unmarshal MVCC metadata for key %q: %s", keys[0], err)
			} else {
				// In the event that there's an active intent, send for
				// intent resolution if older than the threshold.
				startIdx := 1
				if meta.Txn != nil {
					// Keep track of intent to resolve if older than the intent
					// expiration threshold.
					if meta.Timestamp.Less(intentExp) {
						id := string(meta.Txn.ID)
						txnMap[id] = meta.Txn
						intentSpanMap[id] = append(intentSpanMap[id], roachpb.Span{Key: expBaseKey})
					}
					// With an active intent, GC ignores MVCC metadata & intent value.
					startIdx = 2
				}
				// See if any values may be GC'd.
				if gcTS := gc.Filter(keys[startIdx:], vals[startIdx:]); !gcTS.Equal(roachpb.ZeroTimestamp) {
					// TODO(spencer): need to split the requests up into
					// multiple requests in the event that more than X keys
					// are added to the request.
					gcArgs.Keys = append(gcArgs.Keys, roachpb.GCRequest_GCKey{Key: expBaseKey, Timestamp: gcTS})
				}
			}
		}
	}

	// Iterate through the keys and values of this replica's range.
	for ; iter.Valid(); iter.Next() {
		baseKey, ts, isValue, err := engine.MVCCDecodeKey(iter.Key())
		if err != nil {
			log.Errorf("unable to decode MVCC key: %q: %v", iter.Key(), err)
			continue
		}
		if !isValue {
			// Moving to the next key (& values).
			processKeysAndValues()
			expBaseKey = baseKey
			keys = []engine.MVCCKey{iter.Key()}
			vals = [][]byte{iter.Value()}
		} else {
			if !baseKey.Equal(expBaseKey) {
				log.Errorf("unexpectedly found a value for %q with ts=%s; expected key %q", baseKey, ts, expBaseKey)
				continue
			}
			keys = append(keys, iter.Key())
			vals = append(vals, iter.Value())
		}
	}
	if iter.Error() != nil {
		return iter.Error()
	}
	// Handle last collected set of keys/vals.
	processKeysAndValues()

	txnKeys, err := processTransactionTable(repl, txnMap, txnExp)
	if err != nil {
		return err
	}
	// From now on, all newly added keys are range-local.
	// TODO(tschottdorf): Might need to use two requests at some point since we
	// hard-coded the full non-local key range in the header, but that does
	// not take into account the range-local keys. It will be OK as long as
	// we send directly to the Replica, though.
	gcArgs.Keys = append(gcArgs.Keys, txnKeys...)

	// Process push transactions in parallel.
	var wg sync.WaitGroup
	for _, txn := range txnMap {
		if txn.Status != roachpb.PENDING {
			continue
		}
		wg.Add(1)
		go pushTxn(repl, now, txn, roachpb.ABORT_TXN, &wg)
	}
	wg.Wait()

	// Resolve all intents.
	var intents []roachpb.Intent
	for id, txn := range txnMap {
		if txn.Status != roachpb.PENDING {
			for _, intent := range intentSpanMap[id] {
				intents = append(intents, roachpb.Intent{Span: intent, Txn: *txn})
			}
		}
	}

	if err := repl.resolveIntents(repl.context(), intents, true /* wait */, false /* !poison */); err != nil {
		return err
	}

	// Deal with any leftover sequence cache keys. There shouldn't be many of
	// them.
	gcArgs.Keys = append(gcArgs.Keys, processSequenceCache(repl, now, txnExp, txnMap)...)

	// Send GC request through range.
	gcArgs.GCMeta = *gcMeta

	var ba roachpb.BatchRequest
	// Technically not needed since we're talking directly to the Range.
	ba.RangeID = desc.RangeID
	ba.Timestamp = now
	ba.Add(gcArgs)
	if _, pErr := repl.Send(repl.context(), ba); pErr != nil {
		return pErr.GoError()
	}

	// Store current timestamp as last verification for this replica, as
	// we've just successfully scanned.
	if err := repl.SetLastVerificationTimestamp(now); err != nil {
		log.Errorf("failed to set last verification timestamp for replica %s: %s", repl, err)
	}
	return nil
}
// process iterates through all keys in a replica's range, calling the garbage
// collector for each key and associated set of values. GC'd keys are batched
// into GC calls. Extant intents are resolved if intents are older than
// intentAgeThreshold.
func (gcq *gcQueue) process(now proto.Timestamp, repl *Replica) error {
	snap := repl.rm.Engine().NewSnapshot()
	iter := newRangeDataIterator(repl.Desc(), snap)
	defer iter.Close()
	defer snap.Close()

	// Lookup the GC policy for the zone containing this key range.
	policy, err := gcq.lookupGCPolicy(repl)
	if err != nil {
		return err
	}

	gcMeta := proto.NewGCMetadata(now.WallTime)
	gc := engine.NewGarbageCollector(now, policy)

	// Compute intent expiration (intent age at which we attempt to resolve).
	intentExp := now
	intentExp.WallTime -= intentAgeThreshold.Nanoseconds()

	gcArgs := &proto.GCRequest{
		RequestHeader: proto.RequestHeader{
			Timestamp: now,
			RangeID:   repl.Desc().RangeID,
		},
	}
	var mu sync.Mutex
	var oldestIntentNanos int64 = math.MaxInt64
	var expBaseKey proto.Key
	var keys []proto.EncodedKey
	var vals [][]byte

	// Maps from txn ID to txn and intent key slice.
	txnMap := map[string]*proto.Transaction{}
	intentMap := map[string][]proto.Key{}

	// updateOldestIntent atomically updates the oldest intent.
	updateOldestIntent := func(intentNanos int64) {
		mu.Lock()
		defer mu.Unlock()
		if intentNanos < oldestIntentNanos {
			oldestIntentNanos = intentNanos
		}
	}

	// processKeysAndValues is invoked with each key and its set of
	// values. Intents older than the intent age threshold are sent for
	// resolution and values after the MVCC metadata, and possible
	// intent, are sent for garbage collection.
	processKeysAndValues := func() {
		// If there's more than a single value for the key, possibly send for GC.
		if len(keys) > 1 {
			meta := &engine.MVCCMetadata{}
			if err := gogoproto.Unmarshal(vals[0], meta); err != nil {
				log.Errorf("unable to unmarshal MVCC metadata for key %q: %s", keys[0], err)
			} else {
				// In the event that there's an active intent, send for
				// intent resolution if older than the threshold.
				startIdx := 1
				if meta.Txn != nil {
					// Keep track of intent to resolve if older than the intent
					// expiration threshold.
					if meta.Timestamp.Less(intentExp) {
						id := string(meta.Txn.ID)
						txnMap[id] = meta.Txn
						intentMap[id] = append(intentMap[id], expBaseKey)
					} else {
						updateOldestIntent(meta.Txn.OrigTimestamp.WallTime)
					}
					// With an active intent, GC ignores MVCC metadata & intent value.
					startIdx = 2
				}
				// See if any values may be GC'd.
				if gcTS := gc.Filter(keys[startIdx:], vals[startIdx:]); !gcTS.Equal(proto.ZeroTimestamp) {
					// TODO(spencer): need to split the requests up into
					// multiple requests in the event that more than X keys
					// are added to the request.
					gcArgs.Keys = append(gcArgs.Keys, proto.GCRequest_GCKey{Key: expBaseKey, Timestamp: gcTS})
				}
			}
		}
	}

	// Iterate through the keys and values of this replica's range.
	for ; iter.Valid(); iter.Next() {
		baseKey, ts, isValue := engine.MVCCDecodeKey(iter.Key())
		if !isValue {
			// Moving to the next key (& values).
			processKeysAndValues()
			expBaseKey = baseKey
			keys = []proto.EncodedKey{iter.Key()}
			vals = [][]byte{iter.Value()}
		} else {
			if !baseKey.Equal(expBaseKey) {
				log.Errorf("unexpectedly found a value for %q with ts=%s; expected key %q", baseKey, ts, expBaseKey)
				continue
			}
			keys = append(keys, iter.Key())
			vals = append(vals, iter.Value())
		}
	}
	if iter.Error() != nil {
		return iter.Error()
	}
	// Handle last collected set of keys/vals.
	processKeysAndValues()

	// Set start and end keys: a half-open span covering all collected GC keys.
	// Note that EndKey is exclusive, so it must be the successor of the last
	// GC key for that key to be covered.
	if len(gcArgs.Keys) == 0 {
		return nil
	}
	gcArgs.Key = gcArgs.Keys[0].Key
	gcArgs.EndKey = gcArgs.Keys[len(gcArgs.Keys)-1].Key.Next()

	// Process push transactions in parallel.
	var wg sync.WaitGroup
	for _, txn := range txnMap {
		wg.Add(1)
		go gcq.pushTxn(repl, now, txn, updateOldestIntent, &wg)
	}
	wg.Wait()

	// Resolve all intents.
	// TODO(spencer): use a batch here when available.
	for id, txn := range txnMap {
		if txn.Status != proto.PENDING {
			// The transaction was successfully pushed, so resolve the intents.
			for _, key := range intentMap[id] {
				resolveArgs := &proto.ResolveIntentRequest{
					RequestHeader: proto.RequestHeader{
						Timestamp: now,
						Key:       key,
						User:      security.RootUser,
						Txn:       txn,
					},
				}
				if _, err := repl.AddCmd(repl.context(), resolveArgs); err != nil {
					log.Warningf("resolve of key %q failed: %s", key, err)
					updateOldestIntent(txn.OrigTimestamp.WallTime)
				}
			}
		}
	}

	// Send GC request through range.
	gcMeta.OldestIntentNanos = gogoproto.Int64(oldestIntentNanos)
	gcArgs.GCMeta = *gcMeta
	if _, err := repl.AddCmd(repl.context(), gcArgs); err != nil {
		return err
	}

	// Store current timestamp as last verification for this replica, as
	// we've just successfully scanned.
	if err := repl.SetLastVerificationTimestamp(now); err != nil {
		log.Errorf("failed to set last verification timestamp for replica %s: %s", repl, err)
	}
	return nil
}
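// Key.Next() returns the smallest key sorting strictly after its receiver
// (conceptually the key with a zero byte appended), which is what lets the
// half-open [Key, EndKey) span above cover its last GC key. A sketch over
// plain byte slices (gcSpan is illustrative, not part of the original source):
func gcSpan(gcKeys [][]byte) (start, end []byte) {
	if len(gcKeys) == 0 {
		return nil, nil
	}
	start = gcKeys[0]
	last := gcKeys[len(gcKeys)-1]
	// Append a zero byte to a copy of the last key; equivalent to last.Next().
	end = append(append([]byte(nil), last...), 0)
	return start, end
}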
// TestGCQueueProcess creates test data in the range over various time
// scales and verifies that the GC queue's process method properly GCs
// test data.
func TestGCQueueProcess(t *testing.T) {
	defer leaktest.AfterTest(t)
	tc := testContext{}
	tc.Start(t)
	defer tc.Stop()

	const now int64 = 48 * 60 * 60 * 1E9 // 2d past the epoch
	tc.manualClock.Set(now)

	ts1 := makeTS(now-2*24*60*60*1E9+1, 0)                     // 2d old (add one nanosecond so we're not using zero timestamp)
	ts2 := makeTS(now-25*60*60*1E9, 0)                         // GC will occur at time=25 hours
	ts3 := makeTS(now-(intentAgeThreshold.Nanoseconds()+1), 0) // 2h+1ns old
	ts4 := makeTS(now-(intentAgeThreshold.Nanoseconds()-1), 0) // 2h-1ns old
	ts5 := makeTS(now-1E9, 0)                                  // 1s old
	key1 := proto.Key("a")
	key2 := proto.Key("b")
	key3 := proto.Key("c")
	key4 := proto.Key("d")
	key5 := proto.Key("e")
	key6 := proto.Key("f")
	key7 := proto.Key("g")
	key8 := proto.Key("h")
	key9 := proto.Key("i")

	data := []struct {
		key proto.Key
		ts  proto.Timestamp
		del bool
		txn bool
	}{
		// For key1, we expect first two values to GC.
		{key1, ts1, false, false},
		{key1, ts2, false, false},
		{key1, ts5, false, false},
		// For key2, we expect all values to GC, because most recent is deletion.
		{key2, ts1, false, false},
		{key2, ts2, false, false},
		{key2, ts5, true, false},
		// For key3, we expect just ts1 to GC, because most recent deletion is intent.
		{key3, ts1, false, false},
		{key3, ts2, false, false},
		{key3, ts5, true, true},
		// For key4, expect oldest value to GC.
		{key4, ts1, false, false},
		{key4, ts2, false, false},
		// For key5, expect all values to GC (most recent value deleted).
		{key5, ts1, false, false},
		{key5, ts2, true, false},
		// For key6, expect no values to GC because most recent value is intent.
		{key6, ts1, false, false},
		{key6, ts5, true, true},
		// For key7, expect no values to GC because intent is exactly 2h old.
		{key7, ts2, false, false},
		{key7, ts4, true, true},
		// For key8, expect most recent value to resolve by aborting, which will clean it up.
		{key8, ts2, false, false},
		{key8, ts3, true, true},
		// For key9, resolve naked intent with no remaining values.
		{key9, ts3, true, false},
	}

	for i, datum := range data {
		if datum.del {
			dArgs, dReply := deleteArgs(datum.key, tc.rng.Desc().RaftID, tc.store.StoreID())
			dArgs.Timestamp = datum.ts
			if datum.txn {
				dArgs.Txn = newTransaction("test", datum.key, 1, proto.SERIALIZABLE, tc.clock)
				dArgs.Txn.Timestamp = datum.ts
			}
			if err := tc.rng.AddCmd(tc.rng.context(), proto.Call{Args: dArgs, Reply: dReply}); err != nil {
				t.Fatalf("%d: could not delete data: %s", i, err)
			}
		} else {
			pArgs, pReply := putArgs(datum.key, []byte("value"), tc.rng.Desc().RaftID, tc.store.StoreID())
			pArgs.Timestamp = datum.ts
			if datum.txn {
				pArgs.Txn = newTransaction("test", datum.key, 1, proto.SERIALIZABLE, tc.clock)
				pArgs.Txn.Timestamp = datum.ts
			}
			if err := tc.rng.AddCmd(tc.rng.context(), proto.Call{Args: pArgs, Reply: pReply}); err != nil {
				t.Fatalf("%d: could not put data: %s", i, err)
			}
		}
	}

	// Process through a scan queue.
	gcQ := newGCQueue()
	if err := gcQ.process(tc.clock.Now(), tc.rng); err != nil {
		t.Error(err)
	}

	expKVs := []struct {
		key proto.Key
		ts  proto.Timestamp
	}{
		{key1, proto.ZeroTimestamp},
		{key1, ts5},
		{key3, proto.ZeroTimestamp},
		{key3, ts5},
		{key3, ts2},
		{key4, proto.ZeroTimestamp},
		{key4, ts2},
		{key6, proto.ZeroTimestamp},
		{key6, ts5},
		{key6, ts1},
		{key7, proto.ZeroTimestamp},
		{key7, ts4},
		{key7, ts2},
		{key8, proto.ZeroTimestamp},
		{key8, ts2},
	}
	// Read data directly from engine to avoid intent errors from MVCC.
	kvs, err := engine.Scan(tc.store.Engine(), engine.MVCCEncodeKey(key1), engine.MVCCEncodeKey(proto.KeyMax), 0)
	if err != nil {
		t.Fatal(err)
	}
	for i, kv := range kvs {
		if key, ts, isValue := engine.MVCCDecodeKey(kv.Key); isValue {
			if log.V(1) {
				log.Infof("%d: %q, ts=%s", i, key, ts)
			}
		} else {
			if log.V(1) {
				log.Infof("%d: %q meta", i, key)
			}
		}
	}
	if len(kvs) != len(expKVs) {
		t.Fatalf("expected length %d; got %d", len(expKVs), len(kvs))
	}
	for i, kv := range kvs {
		key, ts, isValue := engine.MVCCDecodeKey(kv.Key)
		if !key.Equal(expKVs[i].key) {
			t.Errorf("%d: expected key %q; got %q", i, expKVs[i].key, key)
		}
		if !ts.Equal(expKVs[i].ts) {
			t.Errorf("%d: expected ts=%s; got %s", i, expKVs[i].ts, ts)
		}
		if isValue {
			if log.V(1) {
				log.Infof("%d: %q, ts=%s", i, key, ts)
			}
		} else {
			if log.V(1) {
				log.Infof("%d: %q meta", i, key)
			}
		}
	}

	// Verify the oldest extant intent age.
	gcMeta, err := tc.rng.GetGCMetadata()
	if err != nil {
		t.Fatal(err)
	}
	if gcMeta.LastScanNanos != now {
		t.Errorf("expected last scan nanos=%d; got %d", now, gcMeta.LastScanNanos)
	}
	if *gcMeta.OldestIntentNanos != ts4.WallTime {
		t.Errorf("expected oldest intent nanos=%d; got %d", ts4.WallTime, *gcMeta.OldestIntentNanos)
	}

	// Verify that the last verification timestamp was updated as whole range was scanned.
	ts, err := tc.rng.GetLastVerificationTimestamp()
	if err != nil {
		t.Fatal(err)
	}
	if gcMeta.LastScanNanos != ts.WallTime {
		t.Errorf("expected walltime nanos %d; got %d", gcMeta.LastScanNanos, ts.WallTime)
	}
}
// TestGCQueueIntentResolution verifies intent resolution with many
// intents spanning just two transactions.
func TestGCQueueIntentResolution(t *testing.T) {
	defer leaktest.AfterTest(t)
	tc := testContext{}
	tc.Start(t)
	defer tc.Stop()

	const now int64 = 48 * 60 * 60 * 1E9 // 2d past the epoch
	tc.manualClock.Set(now)

	txns := []*roachpb.Transaction{
		newTransaction("txn1", roachpb.Key("0-00000"), 1, roachpb.SERIALIZABLE, tc.clock),
		newTransaction("txn2", roachpb.Key("1-00000"), 1, roachpb.SERIALIZABLE, tc.clock),
	}
	intentResolveTS := makeTS(now-intentAgeThreshold.Nanoseconds(), 0)
	txns[0].OrigTimestamp = intentResolveTS
	txns[0].Timestamp = intentResolveTS
	txns[1].OrigTimestamp = intentResolveTS
	txns[1].Timestamp = intentResolveTS

	// Two transactions.
	for i := 0; i < 2; i++ {
		// 5 puts per transaction.
		// TODO(spencerkimball): benchmark with ~50k.
		for j := 0; j < 5; j++ {
			pArgs := putArgs(roachpb.Key(fmt.Sprintf("%d-%05d", i, j)), []byte("value"))
			if _, err := client.SendWrappedWith(tc.Sender(), tc.rng.context(), roachpb.BatchRequest_Header{
				Txn: txns[i],
			}, &pArgs); err != nil {
				t.Fatalf("%d: could not put data: %s", i, err)
			}
		}
	}

	cfg := tc.gossip.GetSystemConfig()
	if cfg == nil {
		t.Fatal("nil config")
	}

	// Process through a scan queue.
	gcQ := newGCQueue(tc.gossip)
	if err := gcQ.process(tc.clock.Now(), tc.rng, cfg); err != nil {
		t.Fatal(err)
	}

	// Iterate through all values to ensure intents have been fully resolved.
	meta := &engine.MVCCMetadata{}
	err := tc.store.Engine().Iterate(engine.MVCCEncodeKey(roachpb.KeyMin), engine.MVCCEncodeKey(roachpb.KeyMax), func(kv roachpb.RawKeyValue) (bool, error) {
		if key, _, isValue, err := engine.MVCCDecodeKey(kv.Key); err != nil {
			return false, err
		} else if !isValue {
			if err := proto.Unmarshal(kv.Value, meta); err != nil {
				return false, err
			}
			if meta.Txn != nil {
				return false, util.Errorf("non-nil Txn after GC for key %s", key)
			}
		}
		return false, nil
	})
	if err != nil {
		t.Fatal(err)
	}
}
// TestRangeDataIterator creates three ranges {"a"-"b" (pre), "b"-"c"
// (main test range), "c"-"d" (post)} and fills each with data. It
// first verifies the contents of the "b"-"c" range, then deletes it
// and verifies it's empty. Finally, it verifies the pre and post
// ranges still contain the expected data.
//
// TODO(tschottdorf): This test is disabled because a leader is auto-elected
// upon creation of the group, so the Raft storage is written to during the
// test, which confuses the iterator (the test relies on Raft not writing
// anything for its duration). Setting tc.dormantRaft = true isn't enough
// since there are two more ranges added below, and those also get started
// automatically.
func disabledTestRangeDataIterator(t *testing.T) {
	defer leaktest.AfterTest(t)
	tc := testContext{
		bootstrapMode: bootstrapRangeOnly,
	}
	tc.Start(t)
	defer tc.Stop()

	// See notes in EmptyRange test method for adjustment to descriptor.
	newDesc := *tc.rng.Desc()
	newDesc.StartKey = proto.Key("b")
	newDesc.EndKey = proto.Key("c")
	if err := tc.rng.setDesc(&newDesc); err != nil {
		t.Fatal(err)
	}
	// Create two more ranges, one before the test range and one after.
	preRng := createRange(tc.store, 2, proto.KeyMin, proto.Key("b"))
	if err := tc.store.AddRangeTest(preRng); err != nil {
		t.Fatal(err)
	}
	postRng := createRange(tc.store, 3, proto.Key("c"), proto.KeyMax)
	if err := tc.store.AddRangeTest(postRng); err != nil {
		t.Fatal(err)
	}

	// Create range data for all three ranges.
	preKeys := createRangeData(preRng, t)
	curKeys := createRangeData(tc.rng, t)
	postKeys := createRangeData(postRng, t)

	iter := newRangeDataIterator(tc.rng.Desc(), tc.rng.rm.Engine())
	defer iter.Close()
	i := 0
	for ; iter.Valid(); iter.Next() {
		if err := iter.Error(); err != nil {
			t.Fatal(err)
		}
		if i >= len(curKeys) {
			t.Fatal("there are more keys in the iteration than expected")
		}
		if key := iter.Key(); !key.Equal(curKeys[i]) {
			k1, ts1, _ := engine.MVCCDecodeKey(key)
			k2, ts2, _ := engine.MVCCDecodeKey(curKeys[i])
			t.Errorf("%d: expected %q(%d); got %q(%d)", i, k2, ts2, k1, ts1)
		}
		i++
	}
	if i != len(curKeys) {
		t.Fatal("there are fewer keys in the iteration than expected")
	}

	// Destroy range and verify that its data has been completely cleared.
	if err := tc.rng.Destroy(); err != nil {
		t.Fatal(err)
	}
	iter = newRangeDataIterator(tc.rng.Desc(), tc.rng.rm.Engine())
	defer iter.Close()
	if iter.Valid() {
		t.Errorf("expected empty iteration; got first key %q", iter.Key())
	}

	// Verify the keys in pre & post ranges.
	for _, test := range []struct {
		r    *Range
		keys []proto.EncodedKey
	}{
		{preRng, preKeys},
		{postRng, postKeys},
	} {
		iter = newRangeDataIterator(test.r.Desc(), test.r.rm.Engine())
		defer iter.Close()
		i = 0
		for ; iter.Valid(); iter.Next() {
			k1, ts1, _ := engine.MVCCDecodeKey(iter.Key())
			if bytes.HasPrefix(k1, keys.ConfigAccountingPrefix) ||
				bytes.HasPrefix(k1, keys.ConfigPermissionPrefix) ||
				bytes.HasPrefix(k1, keys.ConfigZonePrefix) ||
				bytes.HasPrefix(k1, keys.StatusPrefix) {
				// Some data is written into the system prefix by Store.BootstrapRange,
				// but it is not in our expected key list so skip it.
				// TODO(bdarnell): validate this data instead of skipping it.
				continue
			}
			if key := iter.Key(); !key.Equal(test.keys[i]) {
				k2, ts2, _ := engine.MVCCDecodeKey(test.keys[i])
				t.Errorf("%d: key mismatch %q(%d) != %q(%d)", i, k1, ts1, k2, ts2)
			}
			i++
		}
		if i != len(test.keys) {
			t.Fatal("there are fewer keys in the iteration than expected")
		}
	}
}
// process iterates through all keys in a range, calling the garbage
// collector for each key and associated set of values. GC'd keys are
// batched into InternalGC calls. Extant intents are resolved if
// intents are older than intentAgeThreshold.
func (gcq *gcQueue) process(now proto.Timestamp, rng *Range) error {
	snap := rng.rm.Engine().NewSnapshot()
	iter := newRangeDataIterator(rng.Desc(), snap)
	defer iter.Close()
	defer snap.Close()

	// Lookup the GC policy for the zone containing this key range.
	policy, err := gcq.lookupGCPolicy(rng)
	if err != nil {
		return err
	}

	gcMeta := proto.NewGCMetadata(now.WallTime)
	gc := engine.NewGarbageCollector(now, policy)

	// Compute intent expiration (intent age at which we attempt to resolve).
	intentExp := now
	intentExp.WallTime -= intentAgeThreshold.Nanoseconds()

	gcArgs := &proto.InternalGCRequest{
		RequestHeader: proto.RequestHeader{
			Timestamp: now,
			RaftID:    rng.Desc().RaftID,
		},
	}
	var mu sync.Mutex
	var oldestIntentNanos int64 = math.MaxInt64
	var wg sync.WaitGroup
	var expBaseKey proto.Key
	var keys []proto.EncodedKey
	var vals [][]byte

	// updateOldestIntent atomically updates the oldest intent.
	updateOldestIntent := func(intentNanos int64) {
		mu.Lock()
		defer mu.Unlock()
		if intentNanos < oldestIntentNanos {
			oldestIntentNanos = intentNanos
		}
	}

	// processKeysAndValues is invoked with each key and its set of
	// values. Intents older than the intent age threshold are sent for
	// resolution and values after the MVCC metadata, and possible
	// intent, are sent for garbage collection.
	processKeysAndValues := func() {
		// If there's more than a single value for the key, possibly send for GC.
		if len(keys) > 1 {
			meta := &engine.MVCCMetadata{}
			if err := gogoproto.Unmarshal(vals[0], meta); err != nil {
				log.Errorf("unable to unmarshal MVCC metadata for key %q: %s", keys[0], err)
			} else {
				// In the event that there's an active intent, send for
				// intent resolution if older than the threshold.
				startIdx := 1
				if meta.Txn != nil {
					// Resolve intent asynchronously in a goroutine if the intent
					// is older than the intent expiration threshold.
					if meta.Timestamp.Less(intentExp) {
						wg.Add(1)
						go gcq.resolveIntent(rng, expBaseKey, meta, updateOldestIntent, &wg)
					} else {
						updateOldestIntent(meta.Timestamp.WallTime)
					}
					// With an active intent, GC ignores MVCC metadata & intent value.
					startIdx = 2
				}
				// See if any values may be GC'd.
				if gcTS := gc.Filter(keys[startIdx:], vals[startIdx:]); !gcTS.Equal(proto.ZeroTimestamp) {
					// TODO(spencer): need to split the requests up into
					// multiple requests in the event that more than X keys
					// are added to the request.
					gcArgs.Keys = append(gcArgs.Keys, proto.InternalGCRequest_GCKey{Key: expBaseKey, Timestamp: gcTS})
				}
			}
		}
	}

	// Iterate through this range's keys and values.
	for ; iter.Valid(); iter.Next() {
		baseKey, ts, isValue := engine.MVCCDecodeKey(iter.Key())
		if !isValue {
			// Moving to the next key (& values).
			processKeysAndValues()
			expBaseKey = baseKey
			keys = []proto.EncodedKey{iter.Key()}
			vals = [][]byte{iter.Value()}
		} else {
			if !baseKey.Equal(expBaseKey) {
				log.Errorf("unexpectedly found a value for %q with ts=%s; expected key %q", baseKey, ts, expBaseKey)
				continue
			}
			keys = append(keys, iter.Key())
			vals = append(vals, iter.Value())
		}
	}
	if iter.Error() != nil {
		return iter.Error()
	}
	// Handle last collected set of keys/vals.
	processKeysAndValues()

	// Set start and end keys: a half-open span covering all collected GC keys.
	// Note that EndKey is exclusive, so it must be the successor of the last
	// GC key for that key to be covered.
	if len(gcArgs.Keys) == 0 {
		return nil
	}
	gcArgs.Key = gcArgs.Keys[0].Key
	gcArgs.EndKey = gcArgs.Keys[len(gcArgs.Keys)-1].Key.Next()

	// Wait for any outstanding intent resolves and set oldest extant intent.
	wg.Wait()
	gcMeta.OldestIntentNanos = gogoproto.Int64(oldestIntentNanos)

	// Send GC request through range.
	gcArgs.GCMeta = *gcMeta
	if err := rng.AddCmd(rng.context(), proto.Call{Args: gcArgs, Reply: &proto.InternalGCResponse{}}); err != nil {
		return err
	}

	// Store current timestamp as last verification for this range, as
	// we've just successfully scanned.
	if err := rng.SetLastVerificationTimestamp(now); err != nil {
		log.Errorf("failed to set last verification timestamp for range %s: %s", rng, err)
	}
	return nil
}
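// The updateOldestIntent closure used throughout the process versions above
// implements a mutex-guarded running minimum that concurrent resolvers can
// feed. The same pattern in isolation (minWatermark is illustrative, not part
// of the original source):
type minWatermark struct {
	mu  sync.Mutex
	min int64
}

// newMinWatermark starts at math.MaxInt64 so that any update lowers it.
func newMinWatermark() *minWatermark {
	return &minWatermark{min: math.MaxInt64}
}

// update lowers the watermark if v is smaller than the current minimum;
// safe for concurrent use.
func (w *minWatermark) update(v int64) {
	w.mu.Lock()
	defer w.mu.Unlock()
	if v < w.min {
		w.min = v
	}
}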