func runDebugRaftLog(cmd *cobra.Command, args []string) error { stopper := stop.NewStopper() defer stopper.Stop() if len(args) != 2 { return errors.New("required arguments: dir range_id") } db, err := openStore(cmd, args[0], stopper) if err != nil { return err } rangeID, err := parseRangeID(args[1]) if err != nil { return err } start := engine.MakeMVCCMetadataKey(keys.RaftLogPrefix(rangeID)) end := engine.MakeMVCCMetadataKey(keys.RaftLogPrefix(rangeID).PrefixEnd()) if err := db.Iterate(start, end, printRaftLogEntry); err != nil { return err } return nil }
func loadRangeDescriptor( db engine.Engine, rangeID roachpb.RangeID, ) (roachpb.RangeDescriptor, error) { var desc roachpb.RangeDescriptor handleKV := func(kv engine.MVCCKeyValue) (bool, error) { if kv.Key.Timestamp == hlc.ZeroTimestamp { // We only want values, not MVCCMetadata. return false, nil } if err := checkRangeDescriptorKey(kv.Key); err != nil { // Range descriptor keys are interleaved with others, so if it // doesn't parse as a range descriptor just skip it. return false, nil } if err := getProtoValue(kv.Value, &desc); err != nil { return false, err } return desc.RangeID == rangeID, nil } // Range descriptors are stored by key, so we have to scan over the // range-local data to find the one for this RangeID. start := engine.MakeMVCCMetadataKey(keys.LocalRangePrefix) end := engine.MakeMVCCMetadataKey(keys.LocalRangeMax) if err := db.Iterate(start, end, handleKV); err != nil { return roachpb.RangeDescriptor{}, err } if desc.RangeID == rangeID { return desc, nil } return roachpb.RangeDescriptor{}, fmt.Errorf("range descriptor %d not found", rangeID) }
// CopyFrom copies all the persisted results from the originRangeID // sequence cache into this one. Note that the cache will not be // locked while copying is in progress. Failures decoding individual // entries return an error. The copy is done directly using the engine // instead of interpreting values through MVCC for efficiency. func (sc *SequenceCache) CopyFrom(e engine.Engine, originRangeID roachpb.RangeID) error { originMin := engine.MakeMVCCMetadataKey( keys.SequenceCacheKey(originRangeID, txnIDMin, math.MaxUint32, math.MaxUint32)) originMax := engine.MakeMVCCMetadataKey( keys.SequenceCacheKey(originRangeID, txnIDMax, 0, 0)) return copySeqCache(e, originRangeID, sc.rangeID, originMin, originMax) }
// CopyInto copies all the results from this abort cache into the destRangeID // abort cache. Failures decoding individual cache entries return an error. // On success, returns the number of entries (key-value pairs) copied. func (sc *AbortCache) CopyInto( e engine.Engine, ms *engine.MVCCStats, destRangeID roachpb.RangeID, ) (int, error) { return copySeqCache(e, ms, sc.rangeID, destRangeID, engine.MakeMVCCMetadataKey(sc.min()), engine.MakeMVCCMetadataKey(sc.max())) }
// ClearData removes all persisted items stored in the cache. func (sc *AbortCache) ClearData(e engine.Engine) error { b := e.NewBatch() defer b.Close() _, err := engine.ClearRange(b, engine.MakeMVCCMetadataKey(sc.min()), engine.MakeMVCCMetadataKey(sc.max())) if err != nil { return err } return b.Commit() }
// CopyFrom copies all the persisted results from the originRangeID // abort cache into this one. Note that the cache will not be // locked while copying is in progress. Failures decoding individual // entries return an error. The copy is done directly using the engine // instead of interpreting values through MVCC for efficiency. // On success, returns the number of entries (key-value pairs) copied. func (sc *AbortCache) CopyFrom( ctx context.Context, e engine.Engine, ms *engine.MVCCStats, originRangeID roachpb.RangeID, ) (int, error) { originMin := engine.MakeMVCCMetadataKey(keys.AbortCacheKey(originRangeID, txnIDMin)) originMax := engine.MakeMVCCMetadataKey(keys.AbortCacheKey(originRangeID, txnIDMax)) return copySeqCache(e, ms, originRangeID, sc.rangeID, originMin, originMax) }
func runDebugKeys(cmd *cobra.Command, args []string) error { stopper := stop.NewStopper() defer stopper.Stop() if len(args) != 1 { return errors.New("one argument is required") } db, err := openStore(cmd, args[0], stopper) if err != nil { return err } d := cliContext.debug from := engine.NilKey to := engine.MVCCKeyMax if d.raw { if len(d.startKey) > 0 { from = engine.MakeMVCCMetadataKey(roachpb.Key(d.startKey)) } if len(d.endKey) > 0 { to = engine.MakeMVCCMetadataKey(roachpb.Key(d.endKey)) } } else { if len(d.startKey) > 0 { startKey, err := keys.UglyPrint(d.startKey) if err != nil { return err } from = engine.MakeMVCCMetadataKey(startKey) } if len(d.endKey) > 0 { endKey, err := keys.UglyPrint(d.endKey) if err != nil { return err } to = engine.MakeMVCCMetadataKey(endKey) } } printer := printKey if d.values { printer = printKeyValue } if err := db.Iterate(from, to, printer); err != nil { return err } return nil }
func copySeqCache(e engine.Engine, srcID, dstID roachpb.RangeID, keyMin, keyMax engine.MVCCKey) error { var scratch [64]byte return e.Iterate(keyMin, keyMax, func(kv engine.MVCCKeyValue) (bool, error) { // Decode the key into a cmd, skipping on error. Otherwise, // write it to the corresponding key in the new cache. id, epoch, seq, err := decodeSequenceCacheMVCCKey(kv.Key, scratch[:0]) if err != nil { return false, util.Errorf("could not decode a sequence cache key %s: %s", kv.Key, err) } key := keys.SequenceCacheKey(dstID, id, epoch, seq) encKey := engine.MakeMVCCMetadataKey(key) // Decode the value, update the checksum and re-encode. meta := &engine.MVCCMetadata{} if err := proto.Unmarshal(kv.Value, meta); err != nil { return false, util.Errorf("could not decode sequence cache value %s [% x]: %s", kv.Key, kv.Value, err) } value := meta.Value() value.ClearChecksum() value.InitChecksum(key) meta.RawBytes = value.RawBytes _, _, err = engine.PutProto(e, encKey, meta) return false, err }) }
func runDebugRangeDescriptors(cmd *cobra.Command, args []string) error { stopper := stop.NewStopper() defer stopper.Stop() db, err := openStore(cmd, args, stopper) if err != nil { return err } start := engine.MakeMVCCMetadataKey(keys.LocalRangePrefix) end := engine.MakeMVCCMetadataKey(keys.LocalRangeMax) if err := db.Iterate(start, end, printRangeDescriptor); err != nil { return err } return nil }
func runDebugRangeDescriptors(cmd *cobra.Command, args []string) error { stopper := stop.NewStopper() defer stopper.Stop() if len(args) != 1 { return errors.New("one argument required: dir") } db, err := openStore(cmd, args[0], stopper) if err != nil { return err } start := engine.MakeMVCCMetadataKey(keys.LocalRangePrefix) end := engine.MakeMVCCMetadataKey(keys.LocalRangeMax) return db.Iterate(start, end, printRangeDescriptor) }
func (k *mvccKey) Set(value string) error { var typ keyType var keyStr string i := strings.IndexByte(value, ':') if i == -1 { keyStr = value } else { var err error typ, err = parseKeyType(value[:i]) if err != nil { return err } keyStr = value[i+1:] } switch typ { case raw: *k = mvccKey(engine.MakeMVCCMetadataKey(roachpb.Key(keyStr))) case human: key, err := keys.UglyPrint(keyStr) if err != nil { return err } *k = mvccKey(engine.MakeMVCCMetadataKey(key)) case rangeID: fromID, err := parseRangeID(keyStr) if err != nil { return err } *k = mvccKey(engine.MakeMVCCMetadataKey(keys.MakeRangeIDPrefix(fromID))) default: return fmt.Errorf("unknown key type %s", typ) } return nil }
func copySeqCache( e engine.Engine, ms *engine.MVCCStats, srcID, dstID roachpb.RangeID, keyMin, keyMax engine.MVCCKey, ) (int, error) { var scratch [64]byte var count int var meta engine.MVCCMetadata // TODO(spencer): look into making this an MVCCIteration and writing // the values using MVCC so we can avoid the ugliness of updating // the MVCCStats by hand below. err := e.Iterate(keyMin, keyMax, func(kv engine.MVCCKeyValue) (bool, error) { // Decode the key, skipping on error. Otherwise, write it to the // corresponding key in the new cache. txnID, err := decodeAbortCacheMVCCKey(kv.Key, scratch[:0]) if err != nil { return false, util.Errorf("could not decode an abort cache key %s: %s", kv.Key, err) } key := keys.AbortCacheKey(dstID, txnID) encKey := engine.MakeMVCCMetadataKey(key) // Decode the MVCCMetadata value. if err := proto.Unmarshal(kv.Value, &meta); err != nil { return false, util.Errorf("could not decode mvcc metadata %s [% x]: %s", kv.Key, kv.Value, err) } value := meta.Value() value.ClearChecksum() value.InitChecksum(key) meta.RawBytes = value.RawBytes keyBytes, valBytes, err := engine.PutProto(e, encKey, &meta) if err != nil { return false, err } count++ if ms != nil { ms.SysBytes += keyBytes + valBytes ms.SysCount++ } return false, nil }) return count, err }
func verifyCleanup(key roachpb.Key, coord *TxnCoordSender, eng engine.Engine, t *testing.T) { util.SucceedsWithin(t, 500*time.Millisecond, func() error { coord.Lock() l := len(coord.txns) coord.Unlock() if l != 0 { return fmt.Errorf("expected empty transactions map; got %d", l) } meta := &engine.MVCCMetadata{} ok, _, _, err := eng.GetProto(engine.MakeMVCCMetadataKey(key), meta) if err != nil { return fmt.Errorf("error getting MVCC metadata: %s", err) } if ok && meta.Txn != nil { return fmt.Errorf("found unexpected write intent: %s", meta) } return nil }) }
func makeReplicaKeyRanges(d *roachpb.RangeDescriptor) []keyRange { // The first range in the keyspace starts at KeyMin, which includes the // node-local space. We need the original StartKey to find the range // metadata, but the actual data starts at LocalMax. dataStartKey := d.StartKey.AsRawKey() if d.StartKey.Equal(roachpb.RKeyMin) { dataStartKey = keys.LocalMax } return []keyRange{ { start: engine.MakeMVCCMetadataKey(keys.MakeRangeIDPrefix(d.RangeID)), end: engine.MakeMVCCMetadataKey(keys.MakeRangeIDPrefix(d.RangeID + 1)), }, { start: engine.MakeMVCCMetadataKey(keys.MakeRangeKeyPrefix(d.StartKey)), end: engine.MakeMVCCMetadataKey(keys.MakeRangeKeyPrefix(d.EndKey)), }, { start: engine.MakeMVCCMetadataKey(dataStartKey), end: engine.MakeMVCCMetadataKey(d.EndKey.AsRawKey()), }, } }
// TestGCQueueIntentResolution verifies intent resolution with many // intents spanning just two transactions. func TestGCQueueIntentResolution(t *testing.T) { defer leaktest.AfterTest(t)() tc := testContext{} tc.Start(t) defer tc.Stop() const now int64 = 48 * 60 * 60 * 1E9 // 2d past the epoch tc.manualClock.Set(now) txns := []*roachpb.Transaction{ newTransaction("txn1", roachpb.Key("0-00000"), 1, enginepb.SERIALIZABLE, tc.clock), newTransaction("txn2", roachpb.Key("1-00000"), 1, enginepb.SERIALIZABLE, tc.clock), } intentResolveTS := makeTS(now-intentAgeThreshold.Nanoseconds(), 0) txns[0].OrigTimestamp = intentResolveTS txns[0].Timestamp = intentResolveTS txns[1].OrigTimestamp = intentResolveTS txns[1].Timestamp = intentResolveTS // Two transactions. for i := 0; i < 2; i++ { // 5 puts per transaction. // TODO(spencerkimball): benchmark with ~50k. for j := 0; j < 5; j++ { pArgs := putArgs(roachpb.Key(fmt.Sprintf("%d-%05d", i, j)), []byte("value")) if _, err := tc.SendWrappedWith(roachpb.Header{ Txn: txns[i], }, &pArgs); err != nil { t.Fatalf("%d: could not put data: %s", i, err) } txns[i].Sequence++ } } cfg, ok := tc.gossip.GetSystemConfig() if !ok { t.Fatal("config not set") } // Process through a scan queue. gcQ := newGCQueue(tc.gossip) if err := gcQ.process(tc.clock.Now(), tc.rng, cfg); err != nil { t.Fatal(err) } // Iterate through all values to ensure intents have been fully resolved. meta := &enginepb.MVCCMetadata{} err := tc.store.Engine().Iterate(engine.MakeMVCCMetadataKey(roachpb.KeyMin), engine.MakeMVCCMetadataKey(roachpb.KeyMax), func(kv engine.MVCCKeyValue) (bool, error) { if !kv.Key.IsValue() { if err := proto.Unmarshal(kv.Value, meta); err != nil { return false, err } if meta.Txn != nil { return false, util.Errorf("non-nil Txn after GC for key %s", kv.Key) } } return false, nil }) if err != nil { t.Fatal(err) } }
// TestGCQueueProcess creates test data in the range over various time
// scales and verifies that scan queue process properly GCs test data.
//
// The test writes a table of puts and deletes (some of them transactional,
// i.e. intents) at fixed timestamps, runs the GC queue once, and then scans
// the engine directly to compare the surviving keys and timestamps against
// an expected list.
func TestGCQueueProcess(t *testing.T) {
	defer leaktest.AfterTest(t)()
	tc := testContext{}
	tc.Start(t)
	defer tc.Stop()

	const now int64 = 48 * 60 * 60 * 1E9 // 2d past the epoch
	tc.manualClock.Set(now)

	ts1 := makeTS(now-2*24*60*60*1E9+1, 0)                     // 2d old (add one nanosecond so we're not using zero timestamp)
	ts2 := makeTS(now-25*60*60*1E9, 0)                         // GC will occur at time=25 hours
	ts2m1 := ts2.Prev()                                        // ts2 - 1 so we have something not right at the GC time
	ts3 := makeTS(now-intentAgeThreshold.Nanoseconds(), 0)     // 2h old
	ts4 := makeTS(now-(intentAgeThreshold.Nanoseconds()-1), 0) // 2h-1ns old
	ts5 := makeTS(now-1E9, 0)                                  // 1s old
	key1 := roachpb.Key("a")
	key2 := roachpb.Key("b")
	key3 := roachpb.Key("c")
	key4 := roachpb.Key("d")
	key5 := roachpb.Key("e")
	key6 := roachpb.Key("f")
	key7 := roachpb.Key("g")
	key8 := roachpb.Key("h")
	key9 := roachpb.Key("i")
	key10 := roachpb.Key("j")
	key11 := roachpb.Key("k")

	// Each entry is one write: del selects a delete instead of a put, and
	// txn makes the write transactional (leaving an intent).
	data := []struct {
		key roachpb.Key
		ts  hlc.Timestamp
		del bool
		txn bool
	}{
		// For key1, we expect first value to GC.
		{key1, ts1, false, false},
		{key1, ts2, false, false},
		{key1, ts5, false, false},
		// For key2, we expect values to GC, even though most recent is deletion.
		{key2, ts1, false, false},
		{key2, ts2m1, false, false}, // use a value < the GC time to verify it's kept
		{key2, ts5, true, false},
		// For key3, we expect just ts1 to GC, because most recent deletion is intent.
		{key3, ts1, false, false},
		{key3, ts2, false, false},
		{key3, ts5, true, true},
		// For key4, expect oldest value to GC.
		{key4, ts1, false, false},
		{key4, ts2, false, false},
		// For key5, expect all values to GC (most recent value deleted).
		{key5, ts1, false, false},
		{key5, ts2, true, false}, // deleted, so GC
		// For key6, expect no values to GC because most recent value is intent.
		{key6, ts1, false, false},
		{key6, ts5, false, true},
		// For key7, expect no values to GC because intent is exactly 2h old.
		{key7, ts2, false, false},
		{key7, ts4, false, true},
		// For key8, expect most recent value to resolve by aborting, which will clean it up.
		{key8, ts2, false, false},
		{key8, ts3, true, true},
		// For key9, resolve naked intent with no remaining values.
		{key9, ts3, false, true},
		// For key10, GC ts1 because it's a delete but not ts3 because it's above the threshold.
		{key10, ts1, true, false},
		{key10, ts3, true, false},
		{key10, ts4, false, false},
		{key10, ts5, false, false},
		// For key11, we can't GC anything because ts1 isn't a delete.
		{key11, ts1, false, false},
		{key11, ts3, true, false},
		{key11, ts4, true, false},
		{key11, ts5, true, false},
	}

	// Apply the writes in table order. Transactional writes get a fresh
	// transaction whose timestamps are pinned to the entry's timestamp.
	for i, datum := range data {
		if datum.del {
			dArgs := deleteArgs(datum.key)
			var txn *roachpb.Transaction
			if datum.txn {
				txn = newTransaction("test", datum.key, 1, enginepb.SERIALIZABLE, tc.clock)
				txn.OrigTimestamp = datum.ts
				txn.Timestamp = datum.ts
			}
			if _, err := tc.SendWrappedWith(roachpb.Header{
				Timestamp: datum.ts,
				Txn:       txn,
			}, &dArgs); err != nil {
				t.Fatalf("%d: could not delete data: %s", i, err)
			}
		} else {
			pArgs := putArgs(datum.key, []byte("value"))
			var txn *roachpb.Transaction
			if datum.txn {
				txn = newTransaction("test", datum.key, 1, enginepb.SERIALIZABLE, tc.clock)
				txn.OrigTimestamp = datum.ts
				txn.Timestamp = datum.ts
			}
			if _, err := tc.SendWrappedWith(roachpb.Header{
				Timestamp: datum.ts,
				Txn:       txn,
			}, &pArgs); err != nil {
				t.Fatalf("%d: could not put data: %s", i, err)
			}
		}
	}

	cfg, ok := tc.gossip.GetSystemConfig()
	if !ok {
		t.Fatal("config not set")
	}

	// Process through a scan queue.
	gcQ := newGCQueue(tc.gossip)
	if err := gcQ.process(tc.clock.Now(), tc.rng, cfg); err != nil {
		t.Fatal(err)
	}

	// Entries expected to remain after GC, in the order the engine scan
	// returns them. A zero timestamp denotes the MVCC metadata entry that
	// accompanies a surviving intent.
	expKVs := []struct {
		key roachpb.Key
		ts  hlc.Timestamp
	}{
		{key1, ts5},
		{key1, ts2},
		{key2, ts5},
		{key2, ts2m1},
		{key3, hlc.ZeroTimestamp},
		{key3, ts5},
		{key3, ts2},
		{key4, ts2},
		{key6, hlc.ZeroTimestamp},
		{key6, ts5},
		{key6, ts1},
		{key7, hlc.ZeroTimestamp},
		{key7, ts4},
		{key7, ts2},
		{key8, ts2},
		{key10, ts5},
		{key10, ts4},
		{key10, ts3},
		{key11, ts5},
		{key11, ts4},
		{key11, ts3},
		{key11, ts1},
	}
	// Read data directly from engine to avoid intent errors from MVCC.
	kvs, err := engine.Scan(tc.store.Engine(), engine.MakeMVCCMetadataKey(key1), engine.MakeMVCCMetadataKey(keys.MaxKey), 0)
	if err != nil {
		t.Fatal(err)
	}
	// Log every scanned key up front (at verbosity 1) so the full result is
	// visible even if the length check below fails the test.
	for i, kv := range kvs {
		if log.V(1) {
			log.Infof("%d: %s", i, kv.Key)
		}
	}
	if len(kvs) != len(expKVs) {
		t.Fatalf("expected length %d; got %d", len(expKVs), len(kvs))
	}
	// Compare key and timestamp position by position.
	for i, kv := range kvs {
		if !kv.Key.Key.Equal(expKVs[i].key) {
			t.Errorf("%d: expected key %q; got %q", i, expKVs[i].key, kv.Key.Key)
		}
		if !kv.Key.Timestamp.Equal(expKVs[i].ts) {
			t.Errorf("%d: expected ts=%s; got %s", i, expKVs[i].ts, kv.Key.Timestamp)
		}
		if log.V(1) {
			log.Infof("%d: %s", i, kv.Key)
		}
	}

	// Verify that the last verification timestamp was updated as whole range was scanned.
	if _, err := tc.rng.getLastVerificationTimestamp(); err != nil {
		t.Fatal(err)
	}
}
// TestTxnCoordSenderGCWithCancel verifies that the coordinator cleans up extant // transactions and intents after transaction context is cancelled. func TestTxnCoordSenderGCWithCancel(t *testing.T) { defer leaktest.AfterTest(t)() s, sender := createTestDB(t) defer s.Stop() // Set heartbeat interval to 1ms for testing. sender.heartbeatInterval = 1 * time.Millisecond ctx, cancel := context.WithCancel(context.Background()) txn := client.NewTxn(ctx, *s.DB) key := roachpb.Key("a") if pErr := txn.Put(key, []byte("value")); pErr != nil { t.Fatal(pErr) } // Now, advance clock past the default client timeout. // Locking the TxnCoordSender to prevent a data race. sender.Lock() s.Manual.Set(defaultClientTimeout.Nanoseconds() + 1) sender.Unlock() txnID := *txn.Proto.ID // Verify that the transaction is alive despite the timeout having been // exceeded. errStillActive := errors.New("transaction is still active") // TODO(dan): Figure out how to run the heartbeat manually instead of this. if err := util.RetryForDuration(1*time.Second, func() error { // Locking the TxnCoordSender to prevent a data race. sender.Lock() _, ok := sender.txns[txnID] sender.Unlock() if !ok { return nil } meta := &engine.MVCCMetadata{} ok, _, _, err := s.Eng.GetProto(engine.MakeMVCCMetadataKey(key), meta) if err != nil { t.Fatalf("error getting MVCC metadata: %s", err) } if !ok || meta.Txn == nil { return nil } return errStillActive }); err != errStillActive { t.Fatalf("expected transaction to be active, got: %v", err) } // After the context is cancelled, the transaction should be cleaned up. cancel() util.SucceedsSoon(t, func() error { // Locking the TxnCoordSender to prevent a data race. sender.Lock() _, ok := sender.txns[txnID] sender.Unlock() if ok { return util.Errorf("expected garbage collection") } return nil }) verifyCleanup(key, sender, s.Eng, t) }
// ClearData removes all persisted items stored in the cache. func (sc *AbortCache) ClearData(e engine.Engine) error { _, err := engine.ClearRange(e, engine.MakeMVCCMetadataKey(sc.min()), engine.MakeMVCCMetadataKey(sc.max())) return err }
// RunGC runs garbage collection for the specified descriptor on the provided // Engine (which is not mutated). It uses the provided functions pushTxn and // resolveIntents to clarify the true status of and clean up after encountered // transactions. It returns a slice of gc'able keys from the data, transaction, // and abort spans. func RunGC(ctx context.Context, desc *roachpb.RangeDescriptor, snap engine.Engine, now roachpb.Timestamp, policy config.GCPolicy, pushTxn pushFunc, resolveIntents resolveFunc) ([]roachpb.GCRequest_GCKey, GCInfo, error) { iter := newReplicaDataIterator(desc, snap, true /* replicatedOnly */) defer iter.Close() var infoMu = lockableGCInfo{} infoMu.Policy = policy infoMu.Now = now { realResolveIntents := resolveIntents resolveIntents = func(intents []roachpb.Intent, poison bool, wait bool) (err error) { defer func() { infoMu.Lock() infoMu.ResolveTotal += len(intents) if err == nil { infoMu.ResolveSuccess += len(intents) } infoMu.Unlock() }() return realResolveIntents(intents, poison, wait) } realPushTxn := pushTxn pushTxn = func(ts roachpb.Timestamp, txn *roachpb.Transaction, typ roachpb.PushTxnType) { infoMu.Lock() infoMu.PushTxn++ infoMu.Unlock() realPushTxn(ts, txn, typ) } } gc := engine.MakeGarbageCollector(now, policy) var gcKeys []roachpb.GCRequest_GCKey // Compute intent expiration (intent age at which we attempt to resolve). intentExp := now intentExp.WallTime -= intentAgeThreshold.Nanoseconds() txnExp := now txnExp.WallTime -= txnCleanupThreshold.Nanoseconds() var expBaseKey roachpb.Key var keys []engine.MVCCKey var vals [][]byte // Maps from txn ID to txn and intent key slice. txnMap := map[uuid.UUID]*roachpb.Transaction{} intentSpanMap := map[uuid.UUID][]roachpb.Span{} // processKeysAndValues is invoked with each key and its set of // values. Intents older than the intent age threshold are sent for // resolution and values after the MVCC metadata, and possible // intent, are sent for garbage collection. 
processKeysAndValues := func() { // If there's more than a single value for the key, possibly send for GC. if len(keys) > 1 { meta := &engine.MVCCMetadata{} if err := proto.Unmarshal(vals[0], meta); err != nil { log.Errorf("unable to unmarshal MVCC metadata for key %q: %s", keys[0], err) } else { // In the event that there's an active intent, send for // intent resolution if older than the threshold. startIdx := 1 if meta.Txn != nil { // Keep track of intent to resolve if older than the intent // expiration threshold. if meta.Timestamp.Less(intentExp) { txnID := *meta.Txn.ID txn := &roachpb.Transaction{ TxnMeta: *meta.Txn, } txnMap[txnID] = txn infoMu.IntentsConsidered++ intentSpanMap[txnID] = append(intentSpanMap[txnID], roachpb.Span{Key: expBaseKey}) } // With an active intent, GC ignores MVCC metadata & intent value. startIdx = 2 } // See if any values may be GC'd. if gcTS := gc.Filter(keys[startIdx:], vals[startIdx:]); !gcTS.Equal(roachpb.ZeroTimestamp) { // TODO(spencer): need to split the requests up into // multiple requests in the event that more than X keys // are added to the request. gcKeys = append(gcKeys, roachpb.GCRequest_GCKey{Key: expBaseKey, Timestamp: gcTS}) } } } } // Iterate through the keys and values of this replica's range. for ; iter.Valid(); iter.Next() { iterKey := iter.Key() if !iterKey.IsValue() || !iterKey.Key.Equal(expBaseKey) { // Moving to the next key (& values). processKeysAndValues() expBaseKey = iterKey.Key if !iterKey.IsValue() { keys = []engine.MVCCKey{iter.Key()} vals = [][]byte{iter.Value()} continue } // An implicit metadata. keys = []engine.MVCCKey{engine.MakeMVCCMetadataKey(iterKey.Key)} // A nil value for the encoded MVCCMetadata. This will unmarshal to an // empty MVCCMetadata which is sufficient for processKeysAndValues to // determine that there is no intent. 
vals = [][]byte{nil} } keys = append(keys, iter.Key()) vals = append(vals, iter.Value()) } if iter.Error() != nil { return nil, GCInfo{}, iter.Error() } // Handle last collected set of keys/vals. processKeysAndValues() infoMu.IntentTxns = len(txnMap) infoMu.GCKeys = len(gcKeys) txnKeys, err := processTransactionTable(ctx, snap, desc, txnMap, txnExp, &infoMu, resolveIntents) if err != nil { return nil, GCInfo{}, err } // From now on, all newly added keys are range-local. // TODO(tschottdorf): Might need to use two requests at some point since we // hard-coded the full non-local key range in the header, but that does // not take into account the range-local keys. It will be OK as long as // we send directly to the Replica, though. gcKeys = append(gcKeys, txnKeys...) // Process push transactions in parallel. var wg sync.WaitGroup sem := make(chan struct{}, gcTaskLimit) for _, txn := range txnMap { if txn.Status != roachpb.PENDING { continue } wg.Add(1) sem <- struct{}{} // Avoid passing loop variable into closure. txnCopy := txn go func() { defer func() { <-sem wg.Done() }() pushTxn(now, txnCopy, roachpb.PUSH_ABORT) }() } wg.Wait() // Resolve all intents. var intents []roachpb.Intent for txnID, txn := range txnMap { if txn.Status != roachpb.PENDING { for _, intent := range intentSpanMap[txnID] { intents = append(intents, roachpb.Intent{Span: intent, Status: txn.Status, Txn: txn.TxnMeta}) } } } if err := resolveIntents(intents, true /* wait */, false /* !poison */); err != nil { return nil, GCInfo{}, err } // Clean up the abort cache. gcKeys = append(gcKeys, processAbortCache(ctx, snap, desc.RangeID, now, abortCacheAgeThreshold, &infoMu, pushTxn)...) return gcKeys, infoMu.GCInfo, nil }
// process iterates through all keys in a replica's range, calling the garbage // collector for each key and associated set of values. GC'd keys are batched // into GC calls. Extant intents are resolved if intents are older than // intentAgeThreshold. The transaction and sequence cache records are also // scanned and old entries evicted. During normal operation, both of these // records are cleaned up when their respective transaction finishes, so the // amount of work done here is expected to be small. // // Some care needs to be taken to avoid cyclic recreation of entries during GC: // * a Push initiated due to an intent may recreate a transaction entry // * resolving an intent may write a new sequence cache entry // * obtaining the transaction for a sequence cache entry requires a Push // // The following order is taken below: // 1) collect all intents with sufficiently old txn record // 2) collect these intents' transactions // 3) scan the transaction table, collecting abandoned or completed txns // 4) push all of these transactions (possibly recreating entries) // 5) resolve all intents (unless the txn is still PENDING), which will recreate // sequence cache entries (but with the txn timestamp; i.e. likely gc'able) // 6) scan the sequence table for old entries // 7) push these transactions (again, recreating txn entries). // 8) send a GCRequest. func (gcq *gcQueue) process(now roachpb.Timestamp, repl *Replica, sysCfg config.SystemConfig) error { snap := repl.store.Engine().NewSnapshot() desc := repl.Desc() iter := newReplicaDataIterator(desc, snap, true /* replicatedOnly */) defer iter.Close() defer snap.Close() // Lookup the GC policy for the zone containing this key range. zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey) if err != nil { return util.Errorf("could not find zone config for range %s: %s", repl, err) } gc := engine.NewGarbageCollector(now, zone.GC) // Compute intent expiration (intent age at which we attempt to resolve). 
intentExp := now intentExp.WallTime -= intentAgeThreshold.Nanoseconds() txnExp := now txnExp.WallTime -= txnCleanupThreshold.Nanoseconds() gcArgs := &roachpb.GCRequest{} // TODO(tschottdorf): This is one of these instances in which we want // to be more careful that the request ends up on the correct Replica, // and we might have to worry about mixing range-local and global keys // in a batch which might end up spanning Ranges by the time it executes. gcArgs.Key = desc.StartKey.AsRawKey() gcArgs.EndKey = desc.EndKey.AsRawKey() var expBaseKey roachpb.Key var keys []engine.MVCCKey var vals [][]byte // Maps from txn ID to txn and intent key slice. txnMap := map[uuid.UUID]*roachpb.Transaction{} intentSpanMap := map[uuid.UUID][]roachpb.Span{} // processKeysAndValues is invoked with each key and its set of // values. Intents older than the intent age threshold are sent for // resolution and values after the MVCC metadata, and possible // intent, are sent for garbage collection. var intentCount int processKeysAndValues := func() { // If there's more than a single value for the key, possibly send for GC. if len(keys) > 1 { meta := &engine.MVCCMetadata{} if err := proto.Unmarshal(vals[0], meta); err != nil { log.Errorf("unable to unmarshal MVCC metadata for key %q: %s", keys[0], err) } else { // In the event that there's an active intent, send for // intent resolution if older than the threshold. startIdx := 1 if meta.Txn != nil { // Keep track of intent to resolve if older than the intent // expiration threshold. if meta.Timestamp.Less(intentExp) { txnID := *meta.Txn.ID txn := &roachpb.Transaction{ TxnMeta: *meta.Txn, } txnMap[txnID] = txn intentCount++ intentSpanMap[txnID] = append(intentSpanMap[txnID], roachpb.Span{Key: expBaseKey}) } // With an active intent, GC ignores MVCC metadata & intent value. startIdx = 2 } // See if any values may be GC'd. 
if gcTS := gc.Filter(keys[startIdx:], vals[startIdx:]); !gcTS.Equal(roachpb.ZeroTimestamp) { // TODO(spencer): need to split the requests up into // multiple requests in the event that more than X keys // are added to the request. gcArgs.Keys = append(gcArgs.Keys, roachpb.GCRequest_GCKey{Key: expBaseKey, Timestamp: gcTS}) } } } } // Iterate through the keys and values of this replica's range. for ; iter.Valid(); iter.Next() { iterKey := iter.Key() if !iterKey.IsValue() || !iterKey.Key.Equal(expBaseKey) { // Moving to the next key (& values). processKeysAndValues() expBaseKey = iterKey.Key if !iterKey.IsValue() { keys = []engine.MVCCKey{iter.Key()} vals = [][]byte{iter.Value()} continue } // An implicit metadata. keys = []engine.MVCCKey{engine.MakeMVCCMetadataKey(iterKey.Key)} // A nil value for the encoded MVCCMetadata. This will unmarshal to an // empty MVCCMetadata which is sufficient for processKeysAndValues to // determine that there is no intent. vals = [][]byte{nil} } keys = append(keys, iter.Key()) vals = append(vals, iter.Value()) } if iter.Error() != nil { return iter.Error() } // Handle last collected set of keys/vals. processKeysAndValues() gcq.eventLog.Infof(true, "assembled %d transactions from %d old intents; found %d gc'able keys", len(txnMap), intentCount, len(gcArgs.Keys)) txnKeys, err := gcq.processTransactionTable(repl, txnMap, txnExp) if err != nil { return err } // From now on, all newly added keys are range-local. // TODO(tschottdorf): Might need to use two requests at some point since we // hard-coded the full non-local key range in the header, but that does // not take into account the range-local keys. It will be OK as long as // we send directly to the Replica, though. gcArgs.Keys = append(gcArgs.Keys, txnKeys...) // Process push transactions in parallel. 
var wg sync.WaitGroup gcq.eventLog.Infof(true, "pushing %d txns", len(txnMap)) for _, txn := range txnMap { if txn.Status != roachpb.PENDING { continue } wg.Add(1) go gcq.pushTxn(repl, now, txn, roachpb.PUSH_ABORT, &wg) } wg.Wait() // Resolve all intents. var intents []roachpb.Intent for txnID, txn := range txnMap { if txn.Status != roachpb.PENDING { for _, intent := range intentSpanMap[txnID] { intents = append(intents, roachpb.Intent{Span: intent, Status: txn.Status, Txn: txn.TxnMeta}) } } } gcq.eventLog.Infof(true, "resolving %d intents", len(intents)) if pErr := repl.store.intentResolver.resolveIntents(repl.context(), repl, intents, true /* wait */, false /* !poison */); pErr != nil { return pErr.GoError() } // Deal with any leftover sequence cache keys. There shouldn't be many of // them. leftoverSeqCacheKeys := gcq.processSequenceCache(repl, now, txnExp, txnMap) gcq.eventLog.Infof(true, "collected %d leftover sequence cache keys", len(leftoverSeqCacheKeys)) gcArgs.Keys = append(gcArgs.Keys, leftoverSeqCacheKeys...) gcq.eventLog.Infof(true, "sending gc request for %d keys", len(gcArgs.Keys)) var ba roachpb.BatchRequest // Technically not needed since we're talking directly to the Range. ba.RangeID = desc.RangeID ba.Timestamp = now ba.Add(gcArgs) if _, pErr := repl.Send(repl.context(), ba); pErr != nil { return pErr.GoError() } return nil }
// ClearData removes all persisted items stored in the cache. func (sc *SequenceCache) ClearData(e engine.Engine) error { _, err := engine.ClearRange(e, engine.MakeMVCCMetadataKey(sc.min), engine.MakeMVCCMetadataKey(sc.max)) return err }
// CopyInto copies all the results from this sequence cache into the destRangeID // sequence cache. Failures decoding individual cache entries return an error. func (sc *SequenceCache) CopyInto(e engine.Engine, destRangeID roachpb.RangeID) error { return copySeqCache(e, sc.rangeID, destRangeID, engine.MakeMVCCMetadataKey(sc.min), engine.MakeMVCCMetadataKey(sc.max)) }