Example #1
0
func (rc *ResponseCache) decodeResponseCacheKey(encKey proto.EncodedKey) (proto.ClientCmdID, error) {
	ret := proto.ClientCmdID{}
	key, _, isValue := engine.MVCCDecodeKey(encKey)
	if isValue {
		return ret, util.Errorf("key %s is not a raw MVCC value", encKey)
	}
	if !bytes.HasPrefix(key, keys.LocalRangeIDPrefix) {
		return ret, util.Errorf("key %s does not have %s prefix", key, keys.LocalRangeIDPrefix)
	}
	// Cut the prefix and the Range ID.
	b := key[len(keys.LocalRangeIDPrefix):]
	b, _ = encoding.DecodeUvarint(b)
	if !bytes.HasPrefix(b, keys.LocalResponseCacheSuffix) {
		return ret, util.Errorf("key %s does not contain the response cache suffix %s",
			key, keys.LocalResponseCacheSuffix)
	}
	// Cut the response cache suffix.
	b = b[len(keys.LocalResponseCacheSuffix):]
	// Now, decode the command ID.
	b, wt := encoding.DecodeUvarint(b)
	b, rd := encoding.DecodeUint64(b)
	if len(b) > 0 {
		return ret, util.Errorf("key %s has leftover bytes after decode: %s; indicates corrupt key",
			encKey, b)
	}
	ret.WallTime = int64(wt)
	ret.Random = int64(rd)
	return ret, nil
}
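For orientation, here is a minimal sketch of the key layout the decoder above walks: range-ID prefix, uvarint range ID, response cache suffix, uvarint wall time, then a fixed-width random component. The prefix/suffix bytes and the use of encoding/binary below are illustration-only stand-ins, not the cockroach keys/encoding packages, so the output is not byte-compatible with real response cache keys.
// encodeResponseCacheKeySketch illustrates the assumed key layout only.
// Assumes: import ("bytes"; "encoding/binary"). The prefix/suffix bytes are
// placeholders for keys.LocalRangeIDPrefix and keys.LocalResponseCacheSuffix.
func encodeResponseCacheKeySketch(rangeID, wallTime, random uint64) []byte {
	localRangeIDPrefix := []byte("\x01rid")    // placeholder prefix
	localResponseCacheSuffix := []byte("res-") // placeholder suffix
	var buf bytes.Buffer
	tmp := make([]byte, binary.MaxVarintLen64)
	buf.Write(localRangeIDPrefix)
	buf.Write(tmp[:binary.PutUvarint(tmp, rangeID)]) // range ID
	buf.Write(localResponseCacheSuffix)
	buf.Write(tmp[:binary.PutUvarint(tmp, wallTime)]) // command ID wall time
	binary.BigEndian.PutUint64(tmp[:8], random) // command ID random component
	buf.Write(tmp[:8])
	return buf.Bytes()
}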
Example #2
0
func (rc *ResponseCache) decodeResponseCacheKey(encKey engine.MVCCKey) ([]byte, error) {
	key, _, isValue, err := engine.MVCCDecodeKey(encKey)
	if err != nil {
		return nil, err
	}
	if isValue {
		return nil, util.Errorf("key %s is not a raw MVCC value", encKey)
	}
	if !bytes.HasPrefix(key, keys.LocalRangeIDPrefix) {
		return nil, util.Errorf("key %s does not have %s prefix", key, keys.LocalRangeIDPrefix)
	}
	// Cut the prefix and the Range ID.
	b := key[len(keys.LocalRangeIDPrefix):]
	b, _, err = encoding.DecodeUvarint(b)
	if err != nil {
		return nil, err
	}
	if !bytes.HasPrefix(b, keys.LocalResponseCacheSuffix) {
		return nil, util.Errorf("key %s does not contain the response cache suffix %s",
			key, keys.LocalResponseCacheSuffix)
	}
	// Cut the response cache suffix.
	b = b[len(keys.LocalResponseCacheSuffix):]
	// Decode the family.
	b, fm, err := encoding.DecodeBytes(b, nil)
	if err != nil {
		return nil, err
	}
	if len(b) > 0 {
		return nil, util.Errorf("key %s has leftover bytes after decode: %s; indicates corrupt key",
			encKey, b)
	}
	return fm, nil
}
Example #3
0
// TestGCQueueIntentResolution verifies intent resolution with many
// intents spanning just two transactions.
func TestGCQueueIntentResolution(t *testing.T) {
	defer leaktest.AfterTest(t)
	tc := testContext{}
	tc.Start(t)
	defer tc.Stop()

	const now int64 = 48 * 60 * 60 * 1E9 // 2d past the epoch
	tc.manualClock.Set(now)

	txns := []*proto.Transaction{
		newTransaction("txn1", proto.Key("0-00000"), 1, proto.SERIALIZABLE, tc.clock),
		newTransaction("txn2", proto.Key("1-00000"), 1, proto.SERIALIZABLE, tc.clock),
	}
	intentResolveTS := makeTS(now-intentAgeThreshold.Nanoseconds(), 0)
	txns[0].OrigTimestamp = intentResolveTS
	txns[0].Timestamp = intentResolveTS
	txns[1].OrigTimestamp = intentResolveTS
	txns[1].Timestamp = intentResolveTS

	// Two transactions.
	for i := 0; i < 2; i++ {
		// 5 puts per transaction.
		// TODO(spencerkimball): benchmark with ~50k.
		for j := 0; j < 5; j++ {
			pArgs := putArgs(proto.Key(fmt.Sprintf("%d-%05d", i, j)), []byte("value"), tc.rng.Desc().RangeID, tc.store.StoreID())
			pArgs.Timestamp = makeTS(1, 0)
			pArgs.Txn = txns[i]
			if _, err := tc.rng.AddCmd(tc.rng.context(), &pArgs); err != nil {
				t.Fatalf("%d: could not put data: %s", i, err)
			}
		}
	}

	// Process through a scan queue.
	gcQ := newGCQueue()
	if err := gcQ.process(tc.clock.Now(), tc.rng); err != nil {
		t.Fatal(err)
	}

	// Iterate through all values to ensure intents have been fully resolved.
	meta := &engine.MVCCMetadata{}
	err := tc.store.Engine().Iterate(engine.MVCCEncodeKey(proto.KeyMin), engine.MVCCEncodeKey(proto.KeyMax), func(kv proto.RawKeyValue) (bool, error) {
		if key, _, isValue := engine.MVCCDecodeKey(kv.Key); !isValue {
			if err := gogoproto.Unmarshal(kv.Value, meta); err != nil {
				t.Fatalf("unable to unmarshal mvcc metadata for key %s", key)
			}
			if meta.Txn != nil {
				t.Fatalf("non-nil Txn after GC for key %s", key)
			}
		}
		return false, nil
	})
	if err != nil {
		t.Fatal(err)
	}
}
Example #4
0
func decodeSequenceCacheMVCCKey(encKey engine.MVCCKey, dest []byte) ([]byte, uint32, uint32, error) {
	key, _, isValue, err := engine.MVCCDecodeKey(encKey)
	if err != nil {
		return nil, 0, 0, err
	}
	if isValue {
		return nil, 0, 0, util.Errorf("key %s is not a raw MVCC value", encKey)
	}
	return decodeSequenceCacheKey(key, dest)
}
// TestReplicaDataIterator creates three ranges {"a"-"b" (pre), "b"-"c"
// (main test range), "c"-"d" (post)} and fills each with data. It
// first verifies the contents of the "b"-"c" range, then deletes it
// and verifies it's empty. Finally, it verifies the pre and post
// ranges still contain the expected data.
func TestReplicaDataIterator(t *testing.T) {
	defer leaktest.AfterTest(t)
	tc := testContext{
		bootstrapMode: bootstrapRangeOnly,
	}
	tc.Start(t)
	defer tc.Stop()

	// See notes in EmptyRange test method for adjustment to descriptor.
	newDesc := *tc.rng.Desc()
	newDesc.StartKey = roachpb.RKey("b")
	newDesc.EndKey = roachpb.RKey("c")
	if err := tc.rng.setDesc(&newDesc); err != nil {
		t.Fatal(err)
	}
	// Create two more ranges, one before the test range and one after.
	preRng := createRange(tc.store, 2, roachpb.RKeyMin, roachpb.RKey("b"))
	if err := tc.store.AddReplicaTest(preRng); err != nil {
		t.Fatal(err)
	}
	postRng := createRange(tc.store, 3, roachpb.RKey("c"), roachpb.RKeyMax)
	if err := tc.store.AddReplicaTest(postRng); err != nil {
		t.Fatal(err)
	}

	// Create range data for all three ranges.
	preKeys := createRangeData(preRng, t)
	curKeys := createRangeData(tc.rng, t)
	postKeys := createRangeData(postRng, t)

	iter := newReplicaDataIterator(tc.rng.Desc(), tc.rng.store.Engine())
	defer iter.Close()
	i := 0
	for ; iter.Valid(); iter.Next() {
		if err := iter.Error(); err != nil {
			t.Fatal(err)
		}
		if i >= len(curKeys) {
			t.Fatal("there are more keys in the iteration than expected")
		}
		if key := iter.Key(); !key.Equal(curKeys[i]) {
			k1, ts1, _, err := engine.MVCCDecodeKey(key)
			if err != nil {
				t.Fatal(err)
			}
			k2, ts2, _, err := engine.MVCCDecodeKey(curKeys[i])
			if err != nil {
				t.Fatal(err)
			}
			t.Errorf("%d: expected %q(%d); got %q(%d)", i, k2, ts2, k1, ts1)
		}
		i++
	}
	if i != len(curKeys) {
		t.Fatal("there are fewer keys in the iteration than expected")
	}

	// Destroy range and verify that its data has been completely cleared.
	if err := tc.rng.Destroy(); err != nil {
		t.Fatal(err)
	}
	iter = newReplicaDataIterator(tc.rng.Desc(), tc.rng.store.Engine())
	defer iter.Close()
	if iter.Valid() {
		// If the range is destroyed, only a tombstone key should be there.
		k1, _, _, err := engine.MVCCDecodeKey(iter.Key())
		if err != nil {
			t.Fatal(err)
		}
		if tombstoneKey := keys.RaftTombstoneKey(tc.rng.Desc().RangeID); !bytes.Equal(k1, tombstoneKey) {
			t.Errorf("expected a tombstone key %q, but found %q", tombstoneKey, k1)
		}

		if iter.Next(); iter.Valid() {
			t.Errorf("expected a destroyed replica to have only a tombstone key, but found more")
		}
	} else {
		t.Errorf("expected a tombstone key, but got an empty iteration")
	}

	// Verify the keys in pre & post ranges.
	for _, test := range []struct {
		r    *Replica
		keys []roachpb.EncodedKey
	}{
		{preRng, preKeys},
		{postRng, postKeys},
	} {
		iter = newReplicaDataIterator(test.r.Desc(), test.r.store.Engine())
		defer iter.Close()
		i = 0
		for ; iter.Valid(); iter.Next() {
			k1, ts1, _, err := engine.MVCCDecodeKey(iter.Key())
			if err != nil {
				t.Fatal(err)
			}
			if bytes.HasPrefix(k1, keys.StatusPrefix) {
				// Some data is written into the system prefix by Store.BootstrapRange,
				// but it is not in our expected key list so skip it.
				// TODO(bdarnell): validate this data instead of skipping it.
				continue
			}
			if key := iter.Key(); !key.Equal(test.keys[i]) {
				k2, ts2, _, err := engine.MVCCDecodeKey(test.keys[i])
				if err != nil {
					t.Fatal(err)
				}
				t.Errorf("%d: key mismatch %q(%d) != %q(%d)", i, k1, ts1, k2, ts2)
			}
			i++
		}
		if i != len(test.keys) {
			t.Fatal("there are fewer keys in the iteration than expected")
		}
	}
}
Example #6
0
// process iterates through all keys in a replica's range, calling the garbage
// collector for each key and associated set of values. GC'd keys are batched
// into GC calls. Extant intents are resolved if intents are older than
// intentAgeThreshold.
func (gcq *gcQueue) process(now roachpb.Timestamp, repl *Replica,
	sysCfg *config.SystemConfig) error {

	snap := repl.rm.Engine().NewSnapshot()
	desc := repl.Desc()
	iter := newRangeDataIterator(desc, snap)
	defer iter.Close()
	defer snap.Close()

	// Lookup the GC policy for the zone containing this key range.
	zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey)
	if err != nil {
		return fmt.Errorf("could not find GC policy for range %s: %s", repl, err)
	}
	policy := zone.GC

	gcMeta := roachpb.NewGCMetadata(now.WallTime)
	gc := engine.NewGarbageCollector(now, *policy)

	// Compute intent expiration (intent age at which we attempt to resolve).
	intentExp := now
	intentExp.WallTime -= intentAgeThreshold.Nanoseconds()

	// TODO(tschottdorf): execution will use a leader-assigned local
	// timestamp to compute intent age. While this should be fine, could
	// consider adding a Now timestamp to GCRequest which would be used
	// instead.
	gcArgs := &roachpb.GCRequest{
		RequestHeader: roachpb.RequestHeader{
			RangeID: desc.RangeID,
		},
	}
	var mu sync.Mutex
	var oldestIntentNanos int64 = math.MaxInt64
	var expBaseKey roachpb.Key
	var keys []roachpb.EncodedKey
	var vals [][]byte

	// Maps from txn ID to txn and intent key slice.
	txnMap := map[string]*roachpb.Transaction{}
	intentMap := map[string][]roachpb.Intent{}

	// updateOldestIntent atomically updates the oldest intent.
	updateOldestIntent := func(intentNanos int64) {
		mu.Lock()
		defer mu.Unlock()
		if intentNanos < oldestIntentNanos {
			oldestIntentNanos = intentNanos
		}
	}

	// processKeysAndValues is invoked with each key and its set of
	// values. Intents older than the intent age threshold are sent for
	// resolution and values after the MVCC metadata, and possible
	// intent, are sent for garbage collection.
	processKeysAndValues := func() {
		// If there's more than a single value for the key, possibly send for GC.
		if len(keys) > 1 {
			meta := &engine.MVCCMetadata{}
			if err := proto.Unmarshal(vals[0], meta); err != nil {
				log.Errorf("unable to unmarshal MVCC metadata for key %q: %s", keys[0], err)
			} else {
				// In the event that there's an active intent, send for
				// intent resolution if older than the threshold.
				startIdx := 1
				if meta.Txn != nil {
					// Keep track of intent to resolve if older than the intent
					// expiration threshold.
					if meta.Timestamp.Less(intentExp) {
						id := string(meta.Txn.ID)
						txnMap[id] = meta.Txn
						intentMap[id] = append(intentMap[id], roachpb.Intent{Key: expBaseKey})
					} else {
						updateOldestIntent(meta.Txn.OrigTimestamp.WallTime)
					}
					// With an active intent, GC ignores MVCC metadata & intent value.
					startIdx = 2
				}
				// See if any values may be GC'd.
				if gcTS := gc.Filter(keys[startIdx:], vals[startIdx:]); !gcTS.Equal(roachpb.ZeroTimestamp) {
					// TODO(spencer): need to split the requests up into
					// multiple requests in the event that more than X keys
					// are added to the request.
					gcArgs.Keys = append(gcArgs.Keys, roachpb.GCRequest_GCKey{Key: expBaseKey, Timestamp: gcTS})
				}
			}
		}
	}

	// Iterate through the keys and values of this replica's range.
	for ; iter.Valid(); iter.Next() {
		baseKey, ts, isValue, err := engine.MVCCDecodeKey(iter.Key())
		if err != nil {
			log.Errorf("unable to decode MVCC key: %q: %v", iter.Key(), err)
			continue
		}
		if !isValue {
			// Moving to the next key (& values).
			processKeysAndValues()
			expBaseKey = baseKey
			keys = []roachpb.EncodedKey{iter.Key()}
			vals = [][]byte{iter.Value()}
		} else {
			if !baseKey.Equal(expBaseKey) {
				log.Errorf("unexpectedly found a value for %q with ts=%s; expected key %q", baseKey, ts, expBaseKey)
				continue
			}
			keys = append(keys, iter.Key())
			vals = append(vals, iter.Value())
		}
	}
	if iter.Error() != nil {
		return iter.Error()
	}
	// Handle last collected set of keys/vals.
	processKeysAndValues()

	// Process push transactions in parallel.
	var wg sync.WaitGroup
	for _, txn := range txnMap {
		wg.Add(1)
		go gcq.pushTxn(repl, now, txn, updateOldestIntent, &wg)
	}
	wg.Wait()

	// Resolve all intents.
	var intents []roachpb.Intent
	for id, txn := range txnMap {
		if txn.Status != roachpb.PENDING {
			for _, intent := range intentMap[id] {
				intent.Txn = *txn
				intents = append(intents, intent)
			}
		}
	}

	done := true
	if len(intents) > 0 {
		done = false
		repl.resolveIntents(repl.context(), intents)
	}

	// Set start and end keys.
	if len(gcArgs.Keys) > 0 {
		done = false
		gcArgs.Key = gcArgs.Keys[0].Key
		gcArgs.EndKey = gcArgs.Keys[len(gcArgs.Keys)-1].Key.Next()
	}

	if done {
		return nil
	}

	// Send GC request through range.
	gcMeta.OldestIntentNanos = proto.Int64(oldestIntentNanos)
	gcArgs.GCMeta = *gcMeta
	if _, err := client.SendWrapped(repl, repl.context(), gcArgs); err != nil {
		return err
	}

	// Store current timestamp as last verification for this replica, as
	// we've just successfully scanned.
	if err := repl.SetLastVerificationTimestamp(now); err != nil {
		log.Errorf("failed to set last verification timestamp for replica %s: %s", repl, err)
	}

	return nil
}
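As an aside, the grouping that processKeysAndValues depends on can be summarized in a few lines: the iterator yields the MVCC metadata record first (isValue == false), then that key's versioned values, and GC candidates start after the metadata, or after the metadata plus the provisional intent value when an intent is present. The types and names below (mvccEntry, gcCandidates) are hypothetical and exist only to illustrate that flow; they are not part of the cockroach API.
// mvccEntry is a hypothetical stand-in for one decoded iterator position.
type mvccEntry struct {
	key     []byte
	isValue bool // false for the MVCC metadata record
	hasTxn  bool // metadata only: an intent is present
}

// gcCandidates mirrors the grouping above: collect each base key's run of
// entries (metadata first, then versions) and return, per key, the versions
// that would be offered to the GC filter, skipping the metadata record and,
// when an intent exists, the provisional value as well.
func gcCandidates(entries []mvccEntry) map[string][]mvccEntry {
	out := map[string][]mvccEntry{}
	var run []mvccEntry
	flush := func() {
		if len(run) > 1 {
			startIdx := 1
			if run[0].hasTxn {
				startIdx = 2 // skip metadata and the intent's value
			}
			if startIdx < len(run) {
				out[string(run[0].key)] = run[startIdx:]
			}
		}
		run = nil
	}
	for _, e := range entries {
		if !e.isValue {
			flush() // a metadata record starts the next key's run
		}
		run = append(run, e)
	}
	flush()
	return out
}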
Example #7
0
// process iterates through all keys in a replica's range, calling the garbage
// collector for each key and associated set of values. GC'd keys are batched
// into GC calls. Extant intents are resolved if intents are older than
// intentAgeThreshold.
func (gcq *gcQueue) process(now roachpb.Timestamp, repl *Replica,
	sysCfg *config.SystemConfig) error {

	snap := repl.store.Engine().NewSnapshot()
	desc := repl.Desc()
	iter := newReplicaDataIterator(desc, snap)
	defer iter.Close()
	defer snap.Close()

	// Lookup the GC policy for the zone containing this key range.
	zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey)
	if err != nil {
		return fmt.Errorf("could not find GC policy for range %s: %s", repl, err)
	}
	policy := zone.GC

	gcMeta := roachpb.NewGCMetadata(now.WallTime)
	gc := engine.NewGarbageCollector(now, *policy)

	// Compute intent expiration (intent age at which we attempt to resolve).
	intentExp := now
	intentExp.WallTime -= intentAgeThreshold.Nanoseconds()
	txnExp := now
	txnExp.WallTime -= txnCleanupThreshold.Nanoseconds()

	gcArgs := &roachpb.GCRequest{}
	// TODO(tschottdorf): This is one of these instances in which we want
	// to be more careful that the request ends up on the correct Replica,
	// and we might have to worry about mixing range-local and global keys
	// in a batch which might end up spanning Ranges by the time it executes.
	gcArgs.Key = desc.StartKey.AsRawKey()
	gcArgs.EndKey = desc.EndKey.AsRawKey()

	var expBaseKey roachpb.Key
	var keys []engine.MVCCKey
	var vals [][]byte

	// Maps from txn ID to txn and intent key slice.
	txnMap := map[string]*roachpb.Transaction{}
	intentSpanMap := map[string][]roachpb.Span{}

	// processKeysAndValues is invoked with each key and its set of
	// values. Intents older than the intent age threshold are sent for
	// resolution and values after the MVCC metadata, and possible
	// intent, are sent for garbage collection.
	processKeysAndValues := func() {
		// If there's more than a single value for the key, possibly send for GC.
		if len(keys) > 1 {
			meta := &engine.MVCCMetadata{}
			if err := proto.Unmarshal(vals[0], meta); err != nil {
				log.Errorf("unable to unmarshal MVCC metadata for key %q: %s", keys[0], err)
			} else {
				// In the event that there's an active intent, send for
				// intent resolution if older than the threshold.
				startIdx := 1
				if meta.Txn != nil {
					// Keep track of intent to resolve if older than the intent
					// expiration threshold.
					if meta.Timestamp.Less(intentExp) {
						id := string(meta.Txn.ID)
						txnMap[id] = meta.Txn
						intentSpanMap[id] = append(intentSpanMap[id], roachpb.Span{Key: expBaseKey})
					}
					// With an active intent, GC ignores MVCC metadata & intent value.
					startIdx = 2
				}
				// See if any values may be GC'd.
				if gcTS := gc.Filter(keys[startIdx:], vals[startIdx:]); !gcTS.Equal(roachpb.ZeroTimestamp) {
					// TODO(spencer): need to split the requests up into
					// multiple requests in the event that more than X keys
					// are added to the request.
					gcArgs.Keys = append(gcArgs.Keys, roachpb.GCRequest_GCKey{Key: expBaseKey, Timestamp: gcTS})
				}
			}
		}
	}

	// Iterate through the keys and values of this replica's range.
	for ; iter.Valid(); iter.Next() {
		baseKey, ts, isValue, err := engine.MVCCDecodeKey(iter.Key())
		if err != nil {
			log.Errorf("unable to decode MVCC key: %q: %v", iter.Key(), err)
			continue
		}
		if !isValue {
			// Moving to the next key (& values).
			processKeysAndValues()
			expBaseKey = baseKey
			keys = []engine.MVCCKey{iter.Key()}
			vals = [][]byte{iter.Value()}
		} else {
			if !baseKey.Equal(expBaseKey) {
				log.Errorf("unexpectedly found a value for %q with ts=%s; expected key %q", baseKey, ts, expBaseKey)
				continue
			}
			keys = append(keys, iter.Key())
			vals = append(vals, iter.Value())
		}
	}
	if iter.Error() != nil {
		return iter.Error()
	}
	// Handle last collected set of keys/vals.
	processKeysAndValues()

	txnKeys, err := processTransactionTable(repl, txnMap, txnExp)
	if err != nil {
		return err
	}

	// From now on, all newly added keys are range-local.
	// TODO(tschottdorf): Might need to use two requests at some point since we
	// hard-coded the full non-local key range in the header, but that does
	// not take into account the range-local keys. It will be OK as long as
	// we send directly to the Replica, though.
	gcArgs.Keys = append(gcArgs.Keys, txnKeys...)

	// Process push transactions in parallel.
	var wg sync.WaitGroup
	for _, txn := range txnMap {
		if txn.Status != roachpb.PENDING {
			continue
		}
		wg.Add(1)
		go pushTxn(repl, now, txn, roachpb.ABORT_TXN, &wg)
	}
	wg.Wait()

	// Resolve all intents.
	var intents []roachpb.Intent
	for id, txn := range txnMap {
		if txn.Status != roachpb.PENDING {
			for _, intent := range intentSpanMap[id] {
				intents = append(intents, roachpb.Intent{Span: intent, Txn: *txn})
			}
		}
	}

	if err := repl.resolveIntents(repl.context(), intents, true /* wait */, false /* !poison */); err != nil {
		return err
	}

	// Deal with any leftover sequence cache keys. There shouldn't be many of
	// them.
	gcArgs.Keys = append(gcArgs.Keys, processSequenceCache(repl, now, txnExp, txnMap)...)

	// Send GC request through range.
	gcArgs.GCMeta = *gcMeta

	var ba roachpb.BatchRequest
	// Technically not needed since we're talking directly to the Range.
	ba.RangeID = desc.RangeID
	ba.Timestamp = now
	ba.Add(gcArgs)
	if _, pErr := repl.Send(repl.context(), ba); pErr != nil {
		return pErr.GoError()
	}

	// Store current timestamp as last verification for this replica, as
	// we've just successfully scanned.
	if err := repl.SetLastVerificationTimestamp(now); err != nil {
		log.Errorf("failed to set last verification timestamp for replica %s: %s", repl, err)
	}

	return nil
}
Example #8
0
// process iterates through all keys in a replica's range, calling the garbage
// collector for each key and associated set of values. GC'd keys are batched
// into GC calls. Extant intents are resolved if intents are older than
// intentAgeThreshold.
func (gcq *gcQueue) process(now proto.Timestamp, repl *Replica) error {
	snap := repl.rm.Engine().NewSnapshot()
	iter := newRangeDataIterator(repl.Desc(), snap)
	defer iter.Close()
	defer snap.Close()

	// Lookup the GC policy for the zone containing this key range.
	policy, err := gcq.lookupGCPolicy(repl)
	if err != nil {
		return err
	}

	gcMeta := proto.NewGCMetadata(now.WallTime)
	gc := engine.NewGarbageCollector(now, policy)

	// Compute intent expiration (intent age at which we attempt to resolve).
	intentExp := now
	intentExp.WallTime -= intentAgeThreshold.Nanoseconds()

	gcArgs := &proto.GCRequest{
		RequestHeader: proto.RequestHeader{
			Timestamp: now,
			RangeID:   repl.Desc().RangeID,
		},
	}
	var mu sync.Mutex
	var oldestIntentNanos int64 = math.MaxInt64
	var expBaseKey proto.Key
	var keys []proto.EncodedKey
	var vals [][]byte

	// Maps from txn ID to txn and intent key slice.
	txnMap := map[string]*proto.Transaction{}
	intentMap := map[string][]proto.Key{}

	// updateOldestIntent atomically updates the oldest intent.
	updateOldestIntent := func(intentNanos int64) {
		mu.Lock()
		defer mu.Unlock()
		if intentNanos < oldestIntentNanos {
			oldestIntentNanos = intentNanos
		}
	}

	// processKeysAndValues is invoked with each key and its set of
	// values. Intents older than the intent age threshold are sent for
	// resolution and values after the MVCC metadata, and possible
	// intent, are sent for garbage collection.
	processKeysAndValues := func() {
		// If there's more than a single value for the key, possibly send for GC.
		if len(keys) > 1 {
			meta := &engine.MVCCMetadata{}
			if err := gogoproto.Unmarshal(vals[0], meta); err != nil {
				log.Errorf("unable to unmarshal MVCC metadata for key %q: %s", keys[0], err)
			} else {
				// In the event that there's an active intent, send for
				// intent resolution if older than the threshold.
				startIdx := 1
				if meta.Txn != nil {
					// Keep track of intent to resolve if older than the intent
					// expiration threshold.
					if meta.Timestamp.Less(intentExp) {
						id := string(meta.Txn.ID)
						txnMap[id] = meta.Txn
						intentMap[id] = append(intentMap[id], expBaseKey)
					} else {
						updateOldestIntent(meta.Txn.OrigTimestamp.WallTime)
					}
					// With an active intent, GC ignores MVCC metadata & intent value.
					startIdx = 2
				}
				// See if any values may be GC'd.
				if gcTS := gc.Filter(keys[startIdx:], vals[startIdx:]); !gcTS.Equal(proto.ZeroTimestamp) {
					// TODO(spencer): need to split the requests up into
					// multiple requests in the event that more than X keys
					// are added to the request.
					gcArgs.Keys = append(gcArgs.Keys, proto.GCRequest_GCKey{Key: expBaseKey, Timestamp: gcTS})
				}
			}
		}
	}

	// Iterate through the keys and values of this replica's range.
	for ; iter.Valid(); iter.Next() {
		baseKey, ts, isValue := engine.MVCCDecodeKey(iter.Key())
		if !isValue {
			// Moving to the next key (& values).
			processKeysAndValues()
			expBaseKey = baseKey
			keys = []proto.EncodedKey{iter.Key()}
			vals = [][]byte{iter.Value()}
		} else {
			if !baseKey.Equal(expBaseKey) {
				log.Errorf("unexpectedly found a value for %q with ts=%s; expected key %q", baseKey, ts, expBaseKey)
				continue
			}
			keys = append(keys, iter.Key())
			vals = append(vals, iter.Value())
		}
	}
	if iter.Error() != nil {
		return iter.Error()
	}
	// Handle last collected set of keys/vals.
	processKeysAndValues()

	// Set start and end keys.
	switch len(gcArgs.Keys) {
	case 0:
		return nil
	case 1:
		gcArgs.Key = gcArgs.Keys[0].Key
		gcArgs.EndKey = gcArgs.Key.Next()
	default:
		gcArgs.Key = gcArgs.Keys[0].Key
		gcArgs.EndKey = gcArgs.Keys[len(gcArgs.Keys)-1].Key
	}

	// Process push transactions in parallel.
	var wg sync.WaitGroup
	for _, txn := range txnMap {
		wg.Add(1)
		go gcq.pushTxn(repl, now, txn, updateOldestIntent, &wg)
	}
	wg.Wait()

	// Resolve all intents.
	// TODO(spencer): use a batch here when available.
	for id, txn := range txnMap {
		if txn.Status != proto.PENDING {
			// The transaction was successfully pushed, so resolve the intents.
			for _, key := range intentMap[id] {
				resolveArgs := &proto.ResolveIntentRequest{
					RequestHeader: proto.RequestHeader{
						Timestamp: now,
						Key:       key,
						User:      security.RootUser,
						Txn:       txn,
					},
				}
				if _, err := repl.AddCmd(repl.context(), resolveArgs); err != nil {
					log.Warningf("resolve of key %q failed: %s", key, err)
					updateOldestIntent(txn.OrigTimestamp.WallTime)
				}
			}
		}
	}

	// Send GC request through range.
	gcMeta.OldestIntentNanos = gogoproto.Int64(oldestIntentNanos)
	gcArgs.GCMeta = *gcMeta
	if _, err := repl.AddCmd(repl.context(), gcArgs); err != nil {
		return err
	}

	// Store current timestamp as last verification for this replica, as
	// we've just successfully scanned.
	if err := repl.SetLastVerificationTimestamp(now); err != nil {
		log.Errorf("failed to set last verification timestamp for replica %s: %s", repl, err)
	}

	return nil
}
Example #9
0
// TestGCQueueProcess creates test data in the range over various time
// scales and verifies that scan queue process properly GCs test data.
func TestGCQueueProcess(t *testing.T) {
	defer leaktest.AfterTest(t)
	tc := testContext{}
	tc.Start(t)
	defer tc.Stop()

	const now int64 = 48 * 60 * 60 * 1E9 // 2d past the epoch
	tc.manualClock.Set(now)

	ts1 := makeTS(now-2*24*60*60*1E9+1, 0)                     // 2d old (add one nanosecond so we're not using zero timestamp)
	ts2 := makeTS(now-25*60*60*1E9, 0)                         // GC will occur at time=25 hours
	ts3 := makeTS(now-(intentAgeThreshold.Nanoseconds()+1), 0) // 2h+1ns old
	ts4 := makeTS(now-(intentAgeThreshold.Nanoseconds()-1), 0) // 2h-1ns old
	ts5 := makeTS(now-1E9, 0)                                  // 1s old
	key1 := proto.Key("a")
	key2 := proto.Key("b")
	key3 := proto.Key("c")
	key4 := proto.Key("d")
	key5 := proto.Key("e")
	key6 := proto.Key("f")
	key7 := proto.Key("g")
	key8 := proto.Key("h")
	key9 := proto.Key("i")

	data := []struct {
		key proto.Key
		ts  proto.Timestamp
		del bool
		txn bool
	}{
		// For key1, we expect first two values to GC.
		{key1, ts1, false, false},
		{key1, ts2, false, false},
		{key1, ts5, false, false},
		// For key2, we expect all values to GC, because most recent is deletion.
		{key2, ts1, false, false},
		{key2, ts2, false, false},
		{key2, ts5, true, false},
		// For key3, we expect just ts1 to GC, because most recent deletion is intent.
		{key3, ts1, false, false},
		{key3, ts2, false, false},
		{key3, ts5, true, true},
		// For key4, expect oldest value to GC.
		{key4, ts1, false, false},
		{key4, ts2, false, false},
		// For key5, expect all values to GC (most recent value deleted).
		{key5, ts1, false, false},
		{key5, ts2, true, false},
		// For key6, expect no values to GC because most recent value is intent.
		{key6, ts1, false, false},
		{key6, ts5, true, true},
		// For key7, expect no values to GC because intent is exactly 2h old.
		{key7, ts2, false, false},
		{key7, ts4, true, true},
		// For key8, expect most recent value to resolve by aborting, which will clean it up.
		{key8, ts2, false, false},
		{key8, ts3, true, true},
		// For key9, resolve naked intent with no remaining values.
		{key9, ts3, true, false},
	}

	for i, datum := range data {
		if datum.del {
			dArgs, dReply := deleteArgs(datum.key, tc.rng.Desc().RaftID, tc.store.StoreID())
			dArgs.Timestamp = datum.ts
			if datum.txn {
				dArgs.Txn = newTransaction("test", datum.key, 1, proto.SERIALIZABLE, tc.clock)
				dArgs.Txn.Timestamp = datum.ts
			}
			if err := tc.rng.AddCmd(tc.rng.context(), proto.Call{Args: dArgs, Reply: dReply}); err != nil {
				t.Fatalf("%d: could not delete data: %s", i, err)
			}
		} else {
			pArgs, pReply := putArgs(datum.key, []byte("value"), tc.rng.Desc().RaftID, tc.store.StoreID())
			pArgs.Timestamp = datum.ts
			if datum.txn {
				pArgs.Txn = newTransaction("test", datum.key, 1, proto.SERIALIZABLE, tc.clock)
				pArgs.Txn.Timestamp = datum.ts
			}
			if err := tc.rng.AddCmd(tc.rng.context(), proto.Call{Args: pArgs, Reply: pReply}); err != nil {
				t.Fatalf("%d: could not put data: %s", i, err)
			}
		}
	}

	// Process through a scan queue.
	gcQ := newGCQueue()
	if err := gcQ.process(tc.clock.Now(), tc.rng); err != nil {
		t.Error(err)
	}

	expKVs := []struct {
		key proto.Key
		ts  proto.Timestamp
	}{
		{key1, proto.ZeroTimestamp},
		{key1, ts5},
		{key3, proto.ZeroTimestamp},
		{key3, ts5},
		{key3, ts2},
		{key4, proto.ZeroTimestamp},
		{key4, ts2},
		{key6, proto.ZeroTimestamp},
		{key6, ts5},
		{key6, ts1},
		{key7, proto.ZeroTimestamp},
		{key7, ts4},
		{key7, ts2},
		{key8, proto.ZeroTimestamp},
		{key8, ts2},
	}
	// Read data directly from engine to avoid intent errors from MVCC.
	kvs, err := engine.Scan(tc.store.Engine(), engine.MVCCEncodeKey(key1), engine.MVCCEncodeKey(proto.KeyMax), 0)
	if err != nil {
		t.Fatal(err)
	}
	for i, kv := range kvs {
		if key, ts, isValue := engine.MVCCDecodeKey(kv.Key); isValue {
			if log.V(1) {
				log.Infof("%d: %q, ts=%s", i, key, ts)
			}
		} else {
			if log.V(1) {
				log.Infof("%d: %q meta", i, key)
			}
		}
	}
	if len(kvs) != len(expKVs) {
		t.Fatalf("expected length %d; got %d", len(expKVs), len(kvs))
	}
	for i, kv := range kvs {
		key, ts, isValue := engine.MVCCDecodeKey(kv.Key)
		if !key.Equal(expKVs[i].key) {
			t.Errorf("%d: expected key %q; got %q", i, expKVs[i].key, key)
		}
		if !ts.Equal(expKVs[i].ts) {
			t.Errorf("%d: expected ts=%s; got %s", i, expKVs[i].ts, ts)
		}
		if isValue {
			if log.V(1) {
				log.Infof("%d: %q, ts=%s", i, key, ts)
			}
		} else {
			if log.V(1) {
				log.Infof("%d: %q meta", i, key)
			}
		}
	}

	// Verify the oldest extant intent age.
	gcMeta, err := tc.rng.GetGCMetadata()
	if err != nil {
		t.Fatal(err)
	}
	if gcMeta.LastScanNanos != now {
		t.Errorf("expected last scan nanos=%d; got %d", now, gcMeta.LastScanNanos)
	}
	if *gcMeta.OldestIntentNanos != ts4.WallTime {
		t.Errorf("expected oldest intent nanos=%d; got %d", ts4.WallTime, gcMeta.OldestIntentNanos)
	}

	// Verify that the last verification timestamp was updated as whole range was scanned.
	ts, err := tc.rng.GetLastVerificationTimestamp()
	if err != nil {
		t.Fatal(err)
	}
	if gcMeta.LastScanNanos != ts.WallTime {
		t.Errorf("expected walltime nanos %d; got %d", gcMeta.LastScanNanos, ts.WallTime)
	}
}
Example #10
0
// TestGCQueueIntentResolution verifies intent resolution with many
// intents spanning just two transactions.
func TestGCQueueIntentResolution(t *testing.T) {
	defer leaktest.AfterTest(t)
	tc := testContext{}
	tc.Start(t)
	defer tc.Stop()

	const now int64 = 48 * 60 * 60 * 1E9 // 2d past the epoch
	tc.manualClock.Set(now)

	txns := []*roachpb.Transaction{
		newTransaction("txn1", roachpb.Key("0-00000"), 1, roachpb.SERIALIZABLE, tc.clock),
		newTransaction("txn2", roachpb.Key("1-00000"), 1, roachpb.SERIALIZABLE, tc.clock),
	}
	intentResolveTS := makeTS(now-intentAgeThreshold.Nanoseconds(), 0)
	txns[0].OrigTimestamp = intentResolveTS
	txns[0].Timestamp = intentResolveTS
	txns[1].OrigTimestamp = intentResolveTS
	txns[1].Timestamp = intentResolveTS

	// Two transactions.
	for i := 0; i < 2; i++ {
		// 5 puts per transaction.
		// TODO(spencerkimball): benchmark with ~50k.
		for j := 0; j < 5; j++ {
			pArgs := putArgs(roachpb.Key(fmt.Sprintf("%d-%05d", i, j)), []byte("value"))
			if _, err := client.SendWrappedWith(tc.Sender(), tc.rng.context(), roachpb.BatchRequest_Header{
				Txn: txns[i],
			}, &pArgs); err != nil {
				t.Fatalf("%d: could not put data: %s", i, err)
			}
		}
	}

	cfg := tc.gossip.GetSystemConfig()
	if cfg == nil {
		t.Fatal("nil config")
	}

	// Process through a scan queue.
	gcQ := newGCQueue(tc.gossip)
	if err := gcQ.process(tc.clock.Now(), tc.rng, cfg); err != nil {
		t.Fatal(err)
	}

	// Iterate through all values to ensure intents have been fully resolved.
	meta := &engine.MVCCMetadata{}
	err := tc.store.Engine().Iterate(engine.MVCCEncodeKey(roachpb.KeyMin), engine.MVCCEncodeKey(roachpb.KeyMax), func(kv roachpb.RawKeyValue) (bool, error) {
		if key, _, isValue, err := engine.MVCCDecodeKey(kv.Key); err != nil {
			return false, err
		} else if !isValue {
			if err := proto.Unmarshal(kv.Value, meta); err != nil {
				return false, err
			}
			if meta.Txn != nil {
				return false, util.Errorf("non-nil Txn after GC for key %s", key)
			}
		}
		return false, nil
	})
	if err != nil {
		t.Fatal(err)
	}
}
// TestRangeDataIterator creates three ranges {"a"-"b" (pre), "b"-"c"
// (main test range), "c"-"d" (post)} and fills each with data. It
// first verifies the contents of the "b"-"c" range, then deletes it
// and verifies it's empty. Finally, it verifies the pre and post
// ranges still contain the expected data.
//
// TODO This test fails since we automatically elect a leader upon
// creation of the group. It's relying on the Raft storage not having written
// anything during the duration of the test.
//
// TODO(tschottdorf): Since leaders are auto-elected upon creating the range,
// the group storage is written to and confuses the iterator test.
// Setting tc.dormantRaft = true isn't enough since there are two more ranges
// added below, and those also get started automatically.
func disabledTestRangeDataIterator(t *testing.T) {
	defer leaktest.AfterTest(t)
	tc := testContext{
		bootstrapMode: bootstrapRangeOnly,
	}
	tc.Start(t)
	defer tc.Stop()

	// See notes in EmptyRange test method for adjustment to descriptor.
	newDesc := *tc.rng.Desc()
	newDesc.StartKey = proto.Key("b")
	newDesc.EndKey = proto.Key("c")
	if err := tc.rng.setDesc(&newDesc); err != nil {
		t.Fatal(err)
	}

	// Create two more ranges, one before the test range and one after.
	preRng := createRange(tc.store, 2, proto.KeyMin, proto.Key("b"))
	if err := tc.store.AddRangeTest(preRng); err != nil {
		t.Fatal(err)
	}
	postRng := createRange(tc.store, 3, proto.Key("c"), proto.KeyMax)
	if err := tc.store.AddRangeTest(postRng); err != nil {
		t.Fatal(err)
	}

	// Create range data for all three ranges.
	preKeys := createRangeData(preRng, t)
	curKeys := createRangeData(tc.rng, t)
	postKeys := createRangeData(postRng, t)

	iter := newRangeDataIterator(tc.rng.Desc(), tc.rng.rm.Engine())
	defer iter.Close()
	i := 0
	for ; iter.Valid(); iter.Next() {
		if err := iter.Error(); err != nil {
			t.Fatal(err)
		}
		if i >= len(curKeys) {
			t.Fatal("there are more keys in the iteration than expected")
		}
		if key := iter.Key(); !key.Equal(curKeys[i]) {
			k1, ts1, _ := engine.MVCCDecodeKey(key)
			k2, ts2, _ := engine.MVCCDecodeKey(curKeys[i])
			t.Errorf("%d: expected %q(%d); got %q(%d)", i, k2, ts2, k1, ts1)
		}
		i++
	}
	if i != len(curKeys) {
		t.Fatal("there are fewer keys in the iteration than expected")
	}

	// Destroy range and verify that its data has been completely cleared.
	if err := tc.rng.Destroy(); err != nil {
		t.Fatal(err)
	}
	iter = newRangeDataIterator(tc.rng.Desc(), tc.rng.rm.Engine())
	defer iter.Close()
	if iter.Valid() {
		t.Errorf("expected empty iteration; got first key %q", iter.Key())
	}

	// Verify the keys in pre & post ranges.
	for _, test := range []struct {
		r    *Range
		keys []proto.EncodedKey
	}{
		{preRng, preKeys},
		{postRng, postKeys},
	} {
		iter = newRangeDataIterator(test.r.Desc(), test.r.rm.Engine())
		defer iter.Close()
		i = 0
		for ; iter.Valid(); iter.Next() {
			k1, ts1, _ := engine.MVCCDecodeKey(iter.Key())
			if bytes.HasPrefix(k1, keys.ConfigAccountingPrefix) ||
				bytes.HasPrefix(k1, keys.ConfigPermissionPrefix) ||
				bytes.HasPrefix(k1, keys.ConfigZonePrefix) ||
				bytes.HasPrefix(k1, keys.StatusPrefix) {
				// Some data is written into the system prefix by Store.BootstrapRange,
				// but it is not in our expected key list so skip it.
				// TODO(bdarnell): validate this data instead of skipping it.
				continue
			}
			if key := iter.Key(); !key.Equal(test.keys[i]) {
				k2, ts2, _ := engine.MVCCDecodeKey(test.keys[i])
				t.Errorf("%d: key mismatch %q(%d) != %q(%d)", i, k1, ts1, k2, ts2)
			}
			i++
		}
		if i != len(test.keys) {
			t.Fatal("there are fewer keys in the iteration than expected")
		}
	}
}
Example #12
0
// process iterates through all keys in a range, calling the garbage
// collector for each key and associated set of values. GC'd keys are
// batched into InternalGC calls. Extant intents are resolved if
// intents are older than intentAgeThreshold.
func (gcq *gcQueue) process(now proto.Timestamp, rng *Range) error {
	snap := rng.rm.Engine().NewSnapshot()
	iter := newRangeDataIterator(rng.Desc(), snap)
	defer iter.Close()
	defer snap.Close()

	// Lookup the GC policy for the zone containing this key range.
	policy, err := gcq.lookupGCPolicy(rng)
	if err != nil {
		return err
	}

	gcMeta := proto.NewGCMetadata(now.WallTime)
	gc := engine.NewGarbageCollector(now, policy)

	// Compute intent expiration (intent age at which we attempt to resolve).
	intentExp := now
	intentExp.WallTime -= intentAgeThreshold.Nanoseconds()

	gcArgs := &proto.InternalGCRequest{
		RequestHeader: proto.RequestHeader{
			Timestamp: now,
			RaftID:    rng.Desc().RaftID,
		},
	}
	var mu sync.Mutex
	var oldestIntentNanos int64 = math.MaxInt64
	var wg sync.WaitGroup
	var expBaseKey proto.Key
	var keys []proto.EncodedKey
	var vals [][]byte

	// updateOldestIntent atomically updates the oldest intent.
	updateOldestIntent := func(intentNanos int64) {
		mu.Lock()
		defer mu.Unlock()
		if intentNanos < oldestIntentNanos {
			oldestIntentNanos = intentNanos
		}
	}

	// processKeysAndValues is invoked with each key and its set of
	// values. Intents older than the intent age threshold are sent for
	// resolution and values after the MVCC metadata, and possible
	// intent, are sent for garbage collection.
	processKeysAndValues := func() {
		// If there's more than a single value for the key, possibly send for GC.
		if len(keys) > 1 {
			meta := &engine.MVCCMetadata{}
			if err := gogoproto.Unmarshal(vals[0], meta); err != nil {
				log.Errorf("unable to unmarshal MVCC metadata for key %q: %s", keys[0], err)
			} else {
				// In the event that there's an active intent, send for
				// intent resolution if older than the threshold.
				startIdx := 1
				if meta.Txn != nil {
					// Resolve intent asynchronously in a goroutine if the intent
					// is older than the intent expiration threshold.
					if meta.Timestamp.Less(intentExp) {
						wg.Add(1)
						go gcq.resolveIntent(rng, expBaseKey, meta, updateOldestIntent, &wg)
					} else {
						updateOldestIntent(meta.Timestamp.WallTime)
					}
					// With an active intent, GC ignores MVCC metadata & intent value.
					startIdx = 2
				}
				// See if any values may be GC'd.
				if gcTS := gc.Filter(keys[startIdx:], vals[startIdx:]); !gcTS.Equal(proto.ZeroTimestamp) {
					// TODO(spencer): need to split the requests up into
					// multiple requests in the event that more than X keys
					// are added to the request.
					gcArgs.Keys = append(gcArgs.Keys, proto.InternalGCRequest_GCKey{Key: expBaseKey, Timestamp: gcTS})
				}
			}
		}
	}

	// Iterate through this range's keys and values.
	for ; iter.Valid(); iter.Next() {
		baseKey, ts, isValue := engine.MVCCDecodeKey(iter.Key())
		if !isValue {
			// Moving to the next key (& values).
			processKeysAndValues()
			expBaseKey = baseKey
			keys = []proto.EncodedKey{iter.Key()}
			vals = [][]byte{iter.Value()}
		} else {
			if !baseKey.Equal(expBaseKey) {
				log.Errorf("unexpectedly found a value for %q with ts=%s; expected key %q", baseKey, ts, expBaseKey)
				continue
			}
			keys = append(keys, iter.Key())
			vals = append(vals, iter.Value())
		}
	}
	if iter.Error() != nil {
		return iter.Error()
	}
	// Handle last collected set of keys/vals.
	processKeysAndValues()

	// Set start and end keys.
	switch len(gcArgs.Keys) {
	case 0:
		return nil
	case 1:
		gcArgs.Key = gcArgs.Keys[0].Key
		gcArgs.EndKey = gcArgs.Key.Next()
	default:
		gcArgs.Key = gcArgs.Keys[0].Key
		gcArgs.EndKey = gcArgs.Keys[len(gcArgs.Keys)-1].Key
	}

	// Wait for any outstanding intent resolves and set oldest extant intent.
	wg.Wait()
	gcMeta.OldestIntentNanos = gogoproto.Int64(oldestIntentNanos)

	// Send GC request through range.
	gcArgs.GCMeta = *gcMeta
	if err := rng.AddCmd(rng.context(), proto.Call{Args: gcArgs, Reply: &proto.InternalGCResponse{}}); err != nil {
		return err
	}

	// Store current timestamp as last verification for this range, as
	// we've just successfully scanned.
	if err := rng.SetLastVerificationTimestamp(now); err != nil {
		log.Errorf("failed to set last verification timestamp for range %s: %s", rng, err)
	}

	return nil
}
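Finally, the span computation in the switch above reads as follows. gcSpan is a hypothetical illustration-only helper; the collected keys are assumed to be in ascending order, and Key.Next() is modeled as appending a zero byte.
// gcSpan mirrors the switch above: no keys means nothing to send, a single
// key yields a span covering just that key, and otherwise the span runs from
// the first to the last collected key.
func gcSpan(gcKeys [][]byte) (start, end []byte, ok bool) {
	switch len(gcKeys) {
	case 0:
		return nil, nil, false
	case 1:
		start = gcKeys[0]
		end = append(append([]byte(nil), start...), 0) // modeled Key.Next()
		return start, end, true
	default:
		return gcKeys[0], gcKeys[len(gcKeys)-1], true
	}
}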