// process synchronously invokes admin split for each proposed split key.
func (sq *splitQueue) process(now roachpb.Timestamp, rng *Replica, sysCfg *config.SystemConfig) error {
	// First handle case of splitting due to zone config maps.
	desc := rng.Desc()
	splitKeys := sysCfg.ComputeSplitKeys(desc.StartKey, desc.EndKey)
	if len(splitKeys) > 0 {
		log.Infof("splitting %s at keys %v", rng, splitKeys)
		for _, splitKey := range splitKeys {
			if err := sq.db.AdminSplit(splitKey.AsRawKey()); err != nil {
				return util.Errorf("unable to split %s at key %q: %s", rng, splitKey, err)
			}
		}
		return nil
	}

	// Next handle case of splitting due to size.
	zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey)
	if err != nil {
		return err
	}
	// FIXME: why is this implementation not the same as the one above?
	if float64(rng.stats.GetSize())/float64(zone.RangeMaxBytes) > 1 {
		log.Infof("splitting %s size=%d max=%d", rng, rng.stats.GetSize(), zone.RangeMaxBytes)
		if _, pErr := client.SendWrapped(rng, rng.context(), &roachpb.AdminSplitRequest{
			Span: roachpb.Span{Key: desc.StartKey.AsRawKey()},
		}); pErr != nil {
			return pErr.GoError()
		}
	}
	return nil
}
// shouldQueue determines whether a replica should be queued for garbage
// collection, and if so, at what priority. Returns true for shouldQ
// in the event that the cumulative ages of GC'able bytes or extant
// intents exceed thresholds.
func (gcq *gcQueue) shouldQueue(now roachpb.Timestamp, repl *Replica, sysCfg *config.SystemConfig) (shouldQ bool, priority float64) {
	desc := repl.Desc()
	zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey)
	if err != nil {
		log.Errorf("could not find GC policy for range %s: %s", repl, err)
		return
	}
	policy := zone.GC

	// GC score is the total GC'able bytes age normalized by 1 MB * the replica's TTL in seconds.
	gcScore := float64(repl.stats.GetGCBytesAge(now.WallTime)) / float64(policy.TTLSeconds) / float64(gcByteCountNormalization)

	// Intent score. This computes the average age of outstanding intents
	// and normalizes.
	intentScore := repl.stats.GetAvgIntentAge(now.WallTime) / float64(intentAgeNormalization.Nanoseconds()/1E9)

	// Compute priority.
	if gcScore > 1 {
		priority += gcScore
	}
	if intentScore > 1 {
		priority += intentScore
	}
	shouldQ = priority > 0
	return
}
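To make the scoring concrete, here is a minimal, self-contained arithmetic sketch of how the two scores above combine into a priority. The numbers are hypothetical; the 1 MiB byte normalization is taken from the comment above, while the 24-hour intent normalization is an assumption for illustration only.

// Sketch only: hypothetical inputs, assumed normalization constants.
package main

import "fmt"

func main() {
	const (
		gcBytesAge   = 100 << 20 * 3600 // 100 MiB of garbage, each byte ~1h old (byte*seconds)
		avgIntentAge = 6 * 3600.0       // average intent age: 6 hours, in seconds
		ttlSeconds   = 3600             // zone GC TTL
		byteNorm     = 1 << 20          // 1 MiB normalization (per the comment above)
		intentNorm   = 24 * 3600.0      // assumed 24h intent-age normalization
	)
	gcScore := float64(gcBytesAge) / float64(ttlSeconds) / float64(byteNorm)
	intentScore := avgIntentAge / intentNorm

	var priority float64
	if gcScore > 1 {
		priority += gcScore
	}
	if intentScore > 1 {
		priority += intentScore
	}
	fmt.Printf("gcScore=%.2f intentScore=%.2f shouldQueue=%t priority=%.2f\n",
		gcScore, intentScore, priority > 0, priority)
	// Output: gcScore=100.00 intentScore=0.25 shouldQueue=true priority=100.00
}

With these inputs only the GC score crosses the threshold, so it alone contributes to the priority; the relatively young intents do not.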
func (rq *replicateQueue) shouldQueue(now roachpb.Timestamp, repl *Replica, sysCfg config.SystemConfig) (shouldQ bool, priority float64) {
	if repl.needsSplitBySize() {
		// If the range exceeds the split threshold, let that finish
		// first. Ranges must fit in memory on both sender and receiver
		// nodes while being replicated. This supplements the check
		// provided by acceptsUnsplitRanges, which looks at zone config
		// boundaries rather than data size.
		return
	}

	// Find the zone config for this range.
	desc := repl.Desc()
	zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey)
	if err != nil {
		log.Error(err)
		return
	}

	action, priority := rq.allocator.ComputeAction(*zone, desc)
	if action != AllocatorNoop {
		return true, priority
	}
	// See if there is a rebalancing opportunity present.
	shouldRebalance := rq.allocator.ShouldRebalance(repl.store.StoreID())
	return shouldRebalance, 0
}
// shouldQueue determines whether a replica should be queued for garbage
// collection, and if so, at what priority. Returns true for shouldQ
// in the event that the cumulative ages of GC'able bytes or extant
// intents exceed thresholds.
func (*gcQueue) shouldQueue(now hlc.Timestamp, repl *Replica, sysCfg config.SystemConfig) (shouldQ bool, priority float64) {
	desc := repl.Desc()
	zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey)
	if err != nil {
		log.Errorf(context.TODO(), "could not find zone config for range %s: %s", repl, err)
		return
	}

	ms := repl.GetMVCCStats()
	// GC score is the total GC'able bytes age normalized by 1 MB * the replica's TTL in seconds.
	gcScore := float64(ms.GCByteAge(now.WallTime)) / float64(zone.GC.TTLSeconds) / float64(gcByteCountNormalization)

	// Intent score. This computes the average age of outstanding intents
	// and normalizes.
	intentScore := ms.AvgIntentAge(now.WallTime) / float64(intentAgeNormalization.Nanoseconds()/1E9)

	// Compute priority.
	if gcScore >= considerThreshold {
		priority += gcScore
	}
	if intentScore >= considerThreshold {
		priority += intentScore
	}
	shouldQ = priority > 0
	return
}
func (rq *replicateQueue) shouldQueue(
	now hlc.Timestamp, repl *Replica, sysCfg config.SystemConfig,
) (shouldQ bool, priority float64) {
	if !repl.store.splitQueue.Disabled() && repl.needsSplitBySize() {
		// If the range exceeds the split threshold, let that finish first.
		// Ranges must fit in memory on both sender and receiver nodes while
		// being replicated. This supplements the check provided by
		// acceptsUnsplitRanges, which looks at zone config boundaries rather
		// than data size.
		//
		// This check is ignored if the split queue is disabled, since in that
		// case, the split will never come.
		return
	}

	// Find the zone config for this range.
	desc := repl.Desc()
	zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey)
	if err != nil {
		log.Error(err)
		return
	}

	action, priority := rq.allocator.ComputeAction(*zone, desc)
	if action != AllocatorNoop {
		return true, priority
	}
	// See if there is a rebalancing opportunity present.
	shouldRebalance := rq.allocator.ShouldRebalance(repl.store.StoreID())
	return shouldRebalance, 0
}
// process iterates through all keys in a replica's range, calling the garbage
// collector for each key and associated set of values. GC'd keys are batched
// into GC calls. Extant intents are resolved if intents are older than
// intentAgeThreshold. The transaction and abort cache records are also
// scanned and old entries evicted. During normal operation, both of these
// records are cleaned up when their respective transaction finishes, so the
// amount of work done here is expected to be small.
//
// Some care needs to be taken to avoid cyclic recreation of entries during GC:
// * a Push initiated due to an intent may recreate a transaction entry
// * resolving an intent may write a new abort cache entry
// * obtaining the transaction for an abort cache entry requires a Push
//
// The following order is taken below:
// 1) collect all intents with sufficiently old txn record
// 2) collect these intents' transactions
// 3) scan the transaction table, collecting abandoned or completed txns
// 4) push all of these transactions (possibly recreating entries)
// 5) resolve all intents (unless the txn is still PENDING), which will recreate
//    abort cache entries (but with the txn timestamp; i.e. likely gc'able)
// 6) scan the abort cache table for old entries
// 7) push these transactions (again, recreating txn entries).
// 8) send a GCRequest.
func (gcq *gcQueue) process(
	ctx context.Context, now hlc.Timestamp, repl *Replica, sysCfg config.SystemConfig,
) error {
	snap := repl.store.Engine().NewSnapshot()
	desc := repl.Desc()
	defer snap.Close()

	// Lookup the GC policy for the zone containing this key range.
	zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey)
	if err != nil {
		return errors.Errorf("could not find zone config for range %s: %s", repl, err)
	}

	gcKeys, info, err := RunGC(ctx, desc, snap, now, zone.GC,
		func(now hlc.Timestamp, txn *roachpb.Transaction, typ roachpb.PushTxnType) {
			pushTxn(gcq.store.DB(), now, txn, typ)
		},
		func(intents []roachpb.Intent, poison bool, wait bool) error {
			return repl.store.intentResolver.resolveIntents(ctx, intents, poison, wait)
		})
	if err != nil {
		return err
	}

	gcq.eventLog.VInfof(true, "completed with stats %+v", info)

	var ba roachpb.BatchRequest
	var gcArgs roachpb.GCRequest
	// TODO(tschottdorf): This is one of these instances in which we want
	// to be more careful that the request ends up on the correct Replica,
	// and we might have to worry about mixing range-local and global keys
	// in a batch which might end up spanning Ranges by the time it executes.
	gcArgs.Key = desc.StartKey.AsRawKey()
	gcArgs.EndKey = desc.EndKey.AsRawKey()
	gcArgs.Keys = gcKeys
	gcArgs.Threshold = info.Threshold

	// Technically not needed since we're talking directly to the Range.
	ba.RangeID = desc.RangeID
	ba.Timestamp = now
	ba.Add(&gcArgs)
	if _, pErr := repl.Send(ctx, ba); pErr != nil {
		return pErr.GoError()
	}
	return nil
}
// process synchronously invokes admin split for each proposed split key.
func (sq *splitQueue) process(
	ctx context.Context, now hlc.Timestamp, rng *Replica, sysCfg config.SystemConfig,
) error {
	// First handle case of splitting due to zone config maps.
	desc := rng.Desc()
	splitKeys := sysCfg.ComputeSplitKeys(desc.StartKey, desc.EndKey)
	if len(splitKeys) > 0 {
		log.Infof("splitting %s at keys %v", rng, splitKeys)
		log.Trace(ctx, fmt.Sprintf("splitting at keys %v", splitKeys))
		for _, splitKey := range splitKeys {
			if err := sq.db.AdminSplit(splitKey.AsRawKey()); err != nil {
				return errors.Errorf("unable to split %s at key %q: %s", rng, splitKey, err)
			}
		}
		return nil
	}

	// Next handle case of splitting due to size.
	zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey)
	if err != nil {
		return err
	}
	size := rng.GetMVCCStats().Total()
	// FIXME: why is this implementation not the same as the one above?
	if float64(size)/float64(zone.RangeMaxBytes) > 1 {
		log.Infof("splitting %s size=%d max=%d", rng, size, zone.RangeMaxBytes)
		log.Trace(ctx, fmt.Sprintf("splitting size=%d max=%d", size, zone.RangeMaxBytes))
		if _, pErr := client.SendWrappedWith(rng, ctx, roachpb.Header{
			Timestamp: now,
		}, &roachpb.AdminSplitRequest{
			Span: roachpb.Span{Key: desc.StartKey.AsRawKey()},
		}); pErr != nil {
			return pErr.GoError()
		}
	}
	return nil
}
// shouldQueue determines whether a range should be queued for
// splitting. This is true if the range is intersected by a zone config
// prefix or if the range's size in bytes exceeds the limit for the zone.
func (*splitQueue) shouldQueue(now roachpb.Timestamp, rng *Replica, sysCfg *config.SystemConfig) (shouldQ bool, priority float64) {
	desc := rng.Desc()
	if len(sysCfg.ComputeSplitKeys(desc.StartKey, desc.EndKey)) > 0 {
		// Set priority to 1 in the event the range is split by zone configs.
		priority = 1
		shouldQ = true
	}

	// Add priority based on the size of range compared to the max
	// size for the zone it's in.
	zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey)
	if err != nil {
		log.Error(err)
		return
	}
	if ratio := float64(rng.stats.GetSize()) / float64(zone.RangeMaxBytes); ratio > 1 {
		priority += ratio
		shouldQ = true
	}
	return
}
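The size-based branch above simply uses the fullness ratio as the priority once the range exceeds its zone's maximum. A minimal sketch with hypothetical numbers (a 96 MiB range in a zone whose RangeMaxBytes is 64 MiB):

// Sketch only: hypothetical range size and zone limit.
package main

import "fmt"

func main() {
	const (
		rangeSize     = 96 << 20 // current range size in bytes
		rangeMaxBytes = 64 << 20 // hypothetical zone.RangeMaxBytes
	)
	var priority float64
	shouldQ := false
	if ratio := float64(rangeSize) / float64(rangeMaxBytes); ratio > 1 {
		priority += ratio // 1.5: the fuller the range, the higher the priority
		shouldQ = true
	}
	fmt.Printf("shouldQueue=%t priority=%.2f\n", shouldQ, priority)
	// Output: shouldQueue=true priority=1.50
}

A range that also straddles a zone-config boundary would start from priority 1 and add the ratio on top, so config-driven splits and badly oversized ranges both sort ahead of marginal ones.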
func (rq *replicateQueue) shouldQueue(now roachpb.Timestamp, repl *Replica, sysCfg config.SystemConfig) (shouldQ bool, priority float64) {
	desc := repl.Desc()
	if len(sysCfg.ComputeSplitKeys(desc.StartKey, desc.EndKey)) > 0 {
		// If the replica's range needs splitting, wait until done.
		return
	}

	// Find the zone config for this range.
	zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey)
	if err != nil {
		log.Error(err)
		return
	}

	action, priority := rq.allocator.ComputeAction(*zone, desc)
	if action != AllocatorNoop {
		return true, priority
	}
	// See if there is a rebalancing opportunity present.
	shouldRebalance := rq.allocator.ShouldRebalance(repl.store.StoreID())
	return shouldRebalance, 0
}
// process iterates through all keys in a replica's range, calling the garbage
// collector for each key and associated set of values. GC'd keys are batched
// into GC calls. Extant intents are resolved if intents are older than
// intentAgeThreshold.
func (gcq *gcQueue) process(now roachpb.Timestamp, repl *Replica, sysCfg *config.SystemConfig) error {
	snap := repl.rm.Engine().NewSnapshot()
	desc := repl.Desc()
	iter := newRangeDataIterator(desc, snap)
	defer iter.Close()
	defer snap.Close()

	// Lookup the GC policy for the zone containing this key range.
	zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey)
	if err != nil {
		return fmt.Errorf("could not find GC policy for range %s: %s", repl, err)
	}
	policy := zone.GC

	gcMeta := roachpb.NewGCMetadata(now.WallTime)
	gc := engine.NewGarbageCollector(now, *policy)

	// Compute intent expiration (intent age at which we attempt to resolve).
	intentExp := now
	intentExp.WallTime -= intentAgeThreshold.Nanoseconds()

	// TODO(tschottdorf): execution will use a leader-assigned local
	// timestamp to compute intent age. While this should be fine, could
	// consider adding a Now timestamp to GCRequest which would be used
	// instead.
	gcArgs := &roachpb.GCRequest{
		RequestHeader: roachpb.RequestHeader{
			RangeID: desc.RangeID,
		},
	}
	var mu sync.Mutex
	var oldestIntentNanos int64 = math.MaxInt64
	var expBaseKey roachpb.Key
	var keys []roachpb.EncodedKey
	var vals [][]byte

	// Maps from txn ID to txn and intent key slice.
	txnMap := map[string]*roachpb.Transaction{}
	intentMap := map[string][]roachpb.Intent{}

	// updateOldestIntent atomically updates the oldest intent.
	updateOldestIntent := func(intentNanos int64) {
		mu.Lock()
		defer mu.Unlock()
		if intentNanos < oldestIntentNanos {
			oldestIntentNanos = intentNanos
		}
	}

	// processKeysAndValues is invoked with each key and its set of
	// values. Intents older than the intent age threshold are sent for
	// resolution and values after the MVCC metadata, and possible
	// intent, are sent for garbage collection.
	processKeysAndValues := func() {
		// If there's more than a single value for the key, possibly send for GC.
		if len(keys) > 1 {
			meta := &engine.MVCCMetadata{}
			if err := proto.Unmarshal(vals[0], meta); err != nil {
				log.Errorf("unable to unmarshal MVCC metadata for key %q: %s", keys[0], err)
			} else {
				// In the event that there's an active intent, send for
				// intent resolution if older than the threshold.
				startIdx := 1
				if meta.Txn != nil {
					// Keep track of intent to resolve if older than the intent
					// expiration threshold.
					if meta.Timestamp.Less(intentExp) {
						id := string(meta.Txn.ID)
						txnMap[id] = meta.Txn
						intentMap[id] = append(intentMap[id], roachpb.Intent{Key: expBaseKey})
					} else {
						updateOldestIntent(meta.Txn.OrigTimestamp.WallTime)
					}
					// With an active intent, GC ignores MVCC metadata & intent value.
					startIdx = 2
				}
				// See if any values may be GC'd.
				if gcTS := gc.Filter(keys[startIdx:], vals[startIdx:]); !gcTS.Equal(roachpb.ZeroTimestamp) {
					// TODO(spencer): need to split the requests up into
					// multiple requests in the event that more than X keys
					// are added to the request.
					gcArgs.Keys = append(gcArgs.Keys, roachpb.GCRequest_GCKey{Key: expBaseKey, Timestamp: gcTS})
				}
			}
		}
	}

	// Iterate through the keys and values of this replica's range.
	for ; iter.Valid(); iter.Next() {
		baseKey, ts, isValue, err := engine.MVCCDecodeKey(iter.Key())
		if err != nil {
			log.Errorf("unable to decode MVCC key: %q: %v", iter.Key(), err)
			continue
		}
		if !isValue {
			// Moving to the next key (& values).
			processKeysAndValues()
			expBaseKey = baseKey
			keys = []roachpb.EncodedKey{iter.Key()}
			vals = [][]byte{iter.Value()}
		} else {
			if !baseKey.Equal(expBaseKey) {
				log.Errorf("unexpectedly found a value for %q with ts=%s; expected key %q", baseKey, ts, expBaseKey)
				continue
			}
			keys = append(keys, iter.Key())
			vals = append(vals, iter.Value())
		}
	}
	if iter.Error() != nil {
		return iter.Error()
	}
	// Handle last collected set of keys/vals.
	processKeysAndValues()

	// Process push transactions in parallel.
	var wg sync.WaitGroup
	for _, txn := range txnMap {
		wg.Add(1)
		go gcq.pushTxn(repl, now, txn, updateOldestIntent, &wg)
	}
	wg.Wait()

	// Resolve all intents.
	var intents []roachpb.Intent
	for id, txn := range txnMap {
		if txn.Status != roachpb.PENDING {
			for _, intent := range intentMap[id] {
				intent.Txn = *txn
				intents = append(intents, intent)
			}
		}
	}

	done := true
	if len(intents) > 0 {
		done = false
		repl.resolveIntents(repl.context(), intents)
	}

	// Set start and end keys.
	if len(gcArgs.Keys) > 0 {
		done = false
		gcArgs.Key = gcArgs.Keys[0].Key
		gcArgs.EndKey = gcArgs.Keys[len(gcArgs.Keys)-1].Key.Next()
	}

	if done {
		return nil
	}

	// Send GC request through range.
	gcMeta.OldestIntentNanos = proto.Int64(oldestIntentNanos)
	gcArgs.GCMeta = *gcMeta
	if _, err := client.SendWrapped(repl, repl.context(), gcArgs); err != nil {
		return err
	}

	// Store current timestamp as last verification for this replica, as
	// we've just successfully scanned.
	if err := repl.SetLastVerificationTimestamp(now); err != nil {
		log.Errorf("failed to set last verification timestamp for replica %s: %s", repl, err)
	}
	return nil
}
// process iterates through all keys in a replica's range, calling the garbage
// collector for each key and associated set of values. GC'd keys are batched
// into GC calls. Extant intents are resolved if intents are older than
// intentAgeThreshold. The transaction and sequence cache records are also
// scanned and old entries evicted. During normal operation, both of these
// records are cleaned up when their respective transaction finishes, so the
// amount of work done here is expected to be small.
//
// Some care needs to be taken to avoid cyclic recreation of entries during GC:
// * a Push initiated due to an intent may recreate a transaction entry
// * resolving an intent may write a new sequence cache entry
// * obtaining the transaction for a sequence cache entry requires a Push
//
// The following order is taken below:
// 1) collect all intents with sufficiently old txn record
// 2) collect these intents' transactions
// 3) scan the transaction table, collecting abandoned or completed txns
// 4) push all of these transactions (possibly recreating entries)
// 5) resolve all intents (unless the txn is still PENDING), which will recreate
//    sequence cache entries (but with the txn timestamp; i.e. likely gc'able)
// 6) scan the sequence table for old entries
// 7) push these transactions (again, recreating txn entries).
// 8) send a GCRequest.
func (gcq *gcQueue) process(now roachpb.Timestamp, repl *Replica, sysCfg config.SystemConfig) error {
	snap := repl.store.Engine().NewSnapshot()
	desc := repl.Desc()
	iter := newReplicaDataIterator(desc, snap, true /* replicatedOnly */)
	defer iter.Close()
	defer snap.Close()

	// Lookup the GC policy for the zone containing this key range.
	zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey)
	if err != nil {
		return util.Errorf("could not find zone config for range %s: %s", repl, err)
	}

	gc := engine.NewGarbageCollector(now, zone.GC)

	// Compute intent expiration (intent age at which we attempt to resolve).
	intentExp := now
	intentExp.WallTime -= intentAgeThreshold.Nanoseconds()
	txnExp := now
	txnExp.WallTime -= txnCleanupThreshold.Nanoseconds()

	gcArgs := &roachpb.GCRequest{}
	// TODO(tschottdorf): This is one of these instances in which we want
	// to be more careful that the request ends up on the correct Replica,
	// and we might have to worry about mixing range-local and global keys
	// in a batch which might end up spanning Ranges by the time it executes.
	gcArgs.Key = desc.StartKey.AsRawKey()
	gcArgs.EndKey = desc.EndKey.AsRawKey()

	var expBaseKey roachpb.Key
	var keys []engine.MVCCKey
	var vals [][]byte

	// Maps from txn ID to txn and intent key slice.
	txnMap := map[uuid.UUID]*roachpb.Transaction{}
	intentSpanMap := map[uuid.UUID][]roachpb.Span{}

	// processKeysAndValues is invoked with each key and its set of
	// values. Intents older than the intent age threshold are sent for
	// resolution and values after the MVCC metadata, and possible
	// intent, are sent for garbage collection.
	var intentCount int
	processKeysAndValues := func() {
		// If there's more than a single value for the key, possibly send for GC.
		if len(keys) > 1 {
			meta := &engine.MVCCMetadata{}
			if err := proto.Unmarshal(vals[0], meta); err != nil {
				log.Errorf("unable to unmarshal MVCC metadata for key %q: %s", keys[0], err)
			} else {
				// In the event that there's an active intent, send for
				// intent resolution if older than the threshold.
				startIdx := 1
				if meta.Txn != nil {
					// Keep track of intent to resolve if older than the intent
					// expiration threshold.
					if meta.Timestamp.Less(intentExp) {
						txnID := *meta.Txn.ID
						txn := &roachpb.Transaction{
							TxnMeta: *meta.Txn,
						}
						txnMap[txnID] = txn
						intentCount++
						intentSpanMap[txnID] = append(intentSpanMap[txnID], roachpb.Span{Key: expBaseKey})
					}
					// With an active intent, GC ignores MVCC metadata & intent value.
					startIdx = 2
				}
				// See if any values may be GC'd.
				if gcTS := gc.Filter(keys[startIdx:], vals[startIdx:]); !gcTS.Equal(roachpb.ZeroTimestamp) {
					// TODO(spencer): need to split the requests up into
					// multiple requests in the event that more than X keys
					// are added to the request.
					gcArgs.Keys = append(gcArgs.Keys, roachpb.GCRequest_GCKey{Key: expBaseKey, Timestamp: gcTS})
				}
			}
		}
	}

	// Iterate through the keys and values of this replica's range.
	for ; iter.Valid(); iter.Next() {
		iterKey := iter.Key()
		if !iterKey.IsValue() || !iterKey.Key.Equal(expBaseKey) {
			// Moving to the next key (& values).
			processKeysAndValues()
			expBaseKey = iterKey.Key
			if !iterKey.IsValue() {
				keys = []engine.MVCCKey{iter.Key()}
				vals = [][]byte{iter.Value()}
				continue
			}
			// An implicit metadata.
			keys = []engine.MVCCKey{engine.MakeMVCCMetadataKey(iterKey.Key)}
			// A nil value for the encoded MVCCMetadata. This will unmarshal to an
			// empty MVCCMetadata which is sufficient for processKeysAndValues to
			// determine that there is no intent.
			vals = [][]byte{nil}
		}
		keys = append(keys, iter.Key())
		vals = append(vals, iter.Value())
	}
	if iter.Error() != nil {
		return iter.Error()
	}
	// Handle last collected set of keys/vals.
	processKeysAndValues()

	gcq.eventLog.Infof(true, "assembled %d transactions from %d old intents; found %d gc'able keys",
		len(txnMap), intentCount, len(gcArgs.Keys))

	txnKeys, err := gcq.processTransactionTable(repl, txnMap, txnExp)
	if err != nil {
		return err
	}

	// From now on, all newly added keys are range-local.
	// TODO(tschottdorf): Might need to use two requests at some point since we
	// hard-coded the full non-local key range in the header, but that does
	// not take into account the range-local keys. It will be OK as long as
	// we send directly to the Replica, though.
	gcArgs.Keys = append(gcArgs.Keys, txnKeys...)

	// Process push transactions in parallel.
	var wg sync.WaitGroup
	gcq.eventLog.Infof(true, "pushing %d txns", len(txnMap))
	for _, txn := range txnMap {
		if txn.Status != roachpb.PENDING {
			continue
		}
		wg.Add(1)
		go gcq.pushTxn(repl, now, txn, roachpb.PUSH_ABORT, &wg)
	}
	wg.Wait()

	// Resolve all intents.
	var intents []roachpb.Intent
	for txnID, txn := range txnMap {
		if txn.Status != roachpb.PENDING {
			for _, intent := range intentSpanMap[txnID] {
				intents = append(intents, roachpb.Intent{Span: intent, Status: txn.Status, Txn: txn.TxnMeta})
			}
		}
	}

	gcq.eventLog.Infof(true, "resolving %d intents", len(intents))

	if pErr := repl.store.intentResolver.resolveIntents(repl.context(), repl, intents,
		true /* wait */, false /* !poison */); pErr != nil {
		return pErr.GoError()
	}

	// Deal with any leftover sequence cache keys. There shouldn't be many of
	// them.
	leftoverSeqCacheKeys := gcq.processSequenceCache(repl, now, txnExp, txnMap)
	gcq.eventLog.Infof(true, "collected %d leftover sequence cache keys", len(leftoverSeqCacheKeys))
	gcArgs.Keys = append(gcArgs.Keys, leftoverSeqCacheKeys...)

	gcq.eventLog.Infof(true, "sending gc request for %d keys", len(gcArgs.Keys))

	var ba roachpb.BatchRequest
	// Technically not needed since we're talking directly to the Range.
	ba.RangeID = desc.RangeID
	ba.Timestamp = now
	ba.Add(gcArgs)
	if _, pErr := repl.Send(repl.context(), ba); pErr != nil {
		return pErr.GoError()
	}

	return nil
}
// process iterates through all keys in a replica's range, calling the garbage
// collector for each key and associated set of values. GC'd keys are batched
// into GC calls. Extant intents are resolved if intents are older than
// intentAgeThreshold.
func (gcq *gcQueue) process(now roachpb.Timestamp, repl *Replica, sysCfg *config.SystemConfig) error {
	snap := repl.store.Engine().NewSnapshot()
	desc := repl.Desc()
	iter := newReplicaDataIterator(desc, snap)
	defer iter.Close()
	defer snap.Close()

	// Lookup the GC policy for the zone containing this key range.
	zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey)
	if err != nil {
		return fmt.Errorf("could not find GC policy for range %s: %s", repl, err)
	}
	policy := zone.GC

	gcMeta := roachpb.NewGCMetadata(now.WallTime)
	gc := engine.NewGarbageCollector(now, *policy)

	// Compute intent expiration (intent age at which we attempt to resolve).
	intentExp := now
	intentExp.WallTime -= intentAgeThreshold.Nanoseconds()
	txnExp := now
	txnExp.WallTime -= txnCleanupThreshold.Nanoseconds()

	gcArgs := &roachpb.GCRequest{}
	// TODO(tschottdorf): This is one of these instances in which we want
	// to be more careful that the request ends up on the correct Replica,
	// and we might have to worry about mixing range-local and global keys
	// in a batch which might end up spanning Ranges by the time it executes.
	gcArgs.Key = desc.StartKey.AsRawKey()
	gcArgs.EndKey = desc.EndKey.AsRawKey()

	var expBaseKey roachpb.Key
	var keys []engine.MVCCKey
	var vals [][]byte

	// Maps from txn ID to txn and intent key slice.
	txnMap := map[string]*roachpb.Transaction{}
	intentSpanMap := map[string][]roachpb.Span{}

	// processKeysAndValues is invoked with each key and its set of
	// values. Intents older than the intent age threshold are sent for
	// resolution and values after the MVCC metadata, and possible
	// intent, are sent for garbage collection.
	processKeysAndValues := func() {
		// If there's more than a single value for the key, possibly send for GC.
		if len(keys) > 1 {
			meta := &engine.MVCCMetadata{}
			if err := proto.Unmarshal(vals[0], meta); err != nil {
				log.Errorf("unable to unmarshal MVCC metadata for key %q: %s", keys[0], err)
			} else {
				// In the event that there's an active intent, send for
				// intent resolution if older than the threshold.
				startIdx := 1
				if meta.Txn != nil {
					// Keep track of intent to resolve if older than the intent
					// expiration threshold.
					if meta.Timestamp.Less(intentExp) {
						id := string(meta.Txn.ID)
						txnMap[id] = meta.Txn
						intentSpanMap[id] = append(intentSpanMap[id], roachpb.Span{Key: expBaseKey})
					}
					// With an active intent, GC ignores MVCC metadata & intent value.
					startIdx = 2
				}
				// See if any values may be GC'd.
				if gcTS := gc.Filter(keys[startIdx:], vals[startIdx:]); !gcTS.Equal(roachpb.ZeroTimestamp) {
					// TODO(spencer): need to split the requests up into
					// multiple requests in the event that more than X keys
					// are added to the request.
					gcArgs.Keys = append(gcArgs.Keys, roachpb.GCRequest_GCKey{Key: expBaseKey, Timestamp: gcTS})
				}
			}
		}
	}

	// Iterate through the keys and values of this replica's range.
	for ; iter.Valid(); iter.Next() {
		baseKey, ts, isValue, err := engine.MVCCDecodeKey(iter.Key())
		if err != nil {
			log.Errorf("unable to decode MVCC key: %q: %v", iter.Key(), err)
			continue
		}
		if !isValue {
			// Moving to the next key (& values).
			processKeysAndValues()
			expBaseKey = baseKey
			keys = []engine.MVCCKey{iter.Key()}
			vals = [][]byte{iter.Value()}
		} else {
			if !baseKey.Equal(expBaseKey) {
				log.Errorf("unexpectedly found a value for %q with ts=%s; expected key %q", baseKey, ts, expBaseKey)
				continue
			}
			keys = append(keys, iter.Key())
			vals = append(vals, iter.Value())
		}
	}
	if iter.Error() != nil {
		return iter.Error()
	}
	// Handle last collected set of keys/vals.
	processKeysAndValues()

	txnKeys, err := processTransactionTable(repl, txnMap, txnExp)
	if err != nil {
		return err
	}

	// From now on, all newly added keys are range-local.
	// TODO(tschottdorf): Might need to use two requests at some point since we
	// hard-coded the full non-local key range in the header, but that does
	// not take into account the range-local keys. It will be OK as long as
	// we send directly to the Replica, though.
	gcArgs.Keys = append(gcArgs.Keys, txnKeys...)

	// Process push transactions in parallel.
	var wg sync.WaitGroup
	for _, txn := range txnMap {
		if txn.Status != roachpb.PENDING {
			continue
		}
		wg.Add(1)
		go pushTxn(repl, now, txn, roachpb.ABORT_TXN, &wg)
	}
	wg.Wait()

	// Resolve all intents.
	var intents []roachpb.Intent
	for id, txn := range txnMap {
		if txn.Status != roachpb.PENDING {
			for _, intent := range intentSpanMap[id] {
				intents = append(intents, roachpb.Intent{Span: intent, Txn: *txn})
			}
		}
	}

	if err := repl.resolveIntents(repl.context(), intents, true /* wait */, false /* !poison */); err != nil {
		return err
	}

	// Deal with any leftover sequence cache keys. There shouldn't be many of
	// them.
	gcArgs.Keys = append(gcArgs.Keys, processSequenceCache(repl, now, txnExp, txnMap)...)

	// Send GC request through range.
	gcArgs.GCMeta = *gcMeta

	var ba roachpb.BatchRequest
	// Technically not needed since we're talking directly to the Range.
	ba.RangeID = desc.RangeID
	ba.Timestamp = now
	ba.Add(gcArgs)
	if _, pErr := repl.Send(repl.context(), ba); pErr != nil {
		return pErr.GoError()
	}

	// Store current timestamp as last verification for this replica, as
	// we've just successfully scanned.
	if err := repl.SetLastVerificationTimestamp(now); err != nil {
		log.Errorf("failed to set last verification timestamp for replica %s: %s", repl, err)
	}
	return nil
}
func (rq *replicateQueue) process(now roachpb.Timestamp, repl *Replica, sysCfg config.SystemConfig) error {
	desc := repl.Desc()
	// Find the zone config for this range.
	zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey)
	if err != nil {
		return err
	}
	action, _ := rq.allocator.ComputeAction(*zone, desc)

	// Avoid taking action if the range has too many dead replicas to make
	// quorum.
	deadReplicas := rq.allocator.storePool.deadReplicas(desc.Replicas)
	quorum := computeQuorum(len(desc.Replicas))
	liveReplicaCount := len(desc.Replicas) - len(deadReplicas)
	if liveReplicaCount < quorum {
		return util.Errorf("range requires a replication change, but lacks a quorum of live nodes.")
	}

	switch action {
	case AllocatorAdd:
		newStore, err := rq.allocator.AllocateTarget(zone.ReplicaAttrs[0], desc.Replicas, true, nil)
		if err != nil {
			return err
		}
		newReplica := roachpb.ReplicaDescriptor{
			NodeID:  newStore.Node.NodeID,
			StoreID: newStore.StoreID,
		}
		if err = repl.ChangeReplicas(roachpb.ADD_REPLICA, newReplica, desc); err != nil {
			return err
		}
	case AllocatorRemove:
		removeReplica, err := rq.allocator.RemoveTarget(desc.Replicas)
		if err != nil {
			return err
		}
		if err = repl.ChangeReplicas(roachpb.REMOVE_REPLICA, removeReplica, desc); err != nil {
			return err
		}
		// Do not requeue if we removed ourselves.
		if removeReplica.StoreID == repl.store.StoreID() {
			return nil
		}
	case AllocatorRemoveDead:
		if len(deadReplicas) == 0 {
			if log.V(1) {
				log.Warningf("Range of replica %s was identified as having dead replicas, but no dead replicas were found.", repl)
			}
			break
		}
		if err = repl.ChangeReplicas(roachpb.REMOVE_REPLICA, deadReplicas[0], desc); err != nil {
			return err
		}
	case AllocatorNoop:
		// The Noop case will result if this replica was queued in order to
		// rebalance. Attempt to find a rebalancing target.
		rebalanceStore := rq.allocator.RebalanceTarget(repl.store.StoreID(), zone.ReplicaAttrs[0], desc.Replicas)
		if rebalanceStore == nil {
			// No action was necessary and no rebalance target was found. Return
			// without re-queuing this replica.
			return nil
		}
		rebalanceReplica := roachpb.ReplicaDescriptor{
			NodeID:  rebalanceStore.Node.NodeID,
			StoreID: rebalanceStore.StoreID,
		}
		if err = repl.ChangeReplicas(roachpb.ADD_REPLICA, rebalanceReplica, desc); err != nil {
			return err
		}
	}

	// Enqueue this replica again to see if there are more changes to be made.
	rq.MaybeAdd(repl, rq.clock.Now())
	return nil
}
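The quorum guard above refuses to make replication changes when too many replicas are dead. A minimal sketch of that check, assuming the usual majority quorum of (n/2)+1 and hypothetical replica counts (the real computeQuorum helper lives elsewhere in the package):

// Sketch only: assumed quorum definition and hypothetical counts.
package main

import "fmt"

// computeQuorumSketch stands in for the package's computeQuorum helper,
// assuming a simple majority: (nodes / 2) + 1.
func computeQuorumSketch(nodes int) int {
	return (nodes / 2) + 1
}

func main() {
	totalReplicas := 3
	deadReplicas := 2

	quorum := computeQuorumSketch(totalReplicas) // 2
	live := totalReplicas - deadReplicas         // 1
	if live < quorum {
		fmt.Println("skip replication change: range lacks a quorum of live replicas")
		return
	}
	fmt.Println("safe to proceed with replication change")
}

With 2 of 3 replicas dead, the single live replica cannot form a majority, so the queue bails out with an error rather than attempting a ChangeReplicas that could not commit.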
func (rq *replicateQueue) process(
	ctx context.Context, now hlc.Timestamp, repl *Replica, sysCfg config.SystemConfig,
) error {
	desc := repl.Desc()
	// Find the zone config for this range.
	zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey)
	if err != nil {
		return err
	}
	action, _ := rq.allocator.ComputeAction(zone, desc)

	// Avoid taking action if the range has too many dead replicas to make
	// quorum.
	deadReplicas := rq.allocator.storePool.deadReplicas(repl.RangeID, desc.Replicas)
	quorum := computeQuorum(len(desc.Replicas))
	liveReplicaCount := len(desc.Replicas) - len(deadReplicas)
	if liveReplicaCount < quorum {
		return errors.Errorf("range requires a replication change, but lacks a quorum of live nodes.")
	}

	switch action {
	case AllocatorAdd:
		log.Trace(ctx, "adding a new replica")
		newStore, err := rq.allocator.AllocateTarget(zone.ReplicaAttrs[0], desc.Replicas, true)
		if err != nil {
			return err
		}
		newReplica := roachpb.ReplicaDescriptor{
			NodeID:  newStore.Node.NodeID,
			StoreID: newStore.StoreID,
		}

		log.VTracef(1, ctx, "%s: adding replica to %+v due to under-replication", repl, newReplica)
		if err = repl.ChangeReplicas(ctx, roachpb.ADD_REPLICA, newReplica, desc); err != nil {
			return err
		}
	case AllocatorRemove:
		log.Trace(ctx, "removing a replica")
		// We require the lease in order to process replicas, so
		// repl.store.StoreID() corresponds to the lease-holder's store ID.
		removeReplica, err := rq.allocator.RemoveTarget(desc.Replicas, repl.store.StoreID())
		if err != nil {
			return err
		}
		log.VTracef(1, ctx, "%s: removing replica %+v due to over-replication", repl, removeReplica)
		if err = repl.ChangeReplicas(ctx, roachpb.REMOVE_REPLICA, removeReplica, desc); err != nil {
			return err
		}
		// Do not requeue if we removed ourselves.
		if removeReplica.StoreID == repl.store.StoreID() {
			return nil
		}
	case AllocatorRemoveDead:
		log.Trace(ctx, "removing a dead replica")
		if len(deadReplicas) == 0 {
			if log.V(1) {
				log.Warningf(ctx, "Range of replica %s was identified as having dead replicas, but no dead replicas were found.", repl)
			}
			break
		}
		deadReplica := deadReplicas[0]
		log.VTracef(1, ctx, "%s: removing dead replica %+v from store", repl, deadReplica)
		if err = repl.ChangeReplicas(ctx, roachpb.REMOVE_REPLICA, deadReplica, desc); err != nil {
			return err
		}
	case AllocatorNoop:
		log.Trace(ctx, "considering a rebalance")
		// The Noop case will result if this replica was queued in order to
		// rebalance. Attempt to find a rebalancing target.
		//
		// We require the lease in order to process replicas, so
		// repl.store.StoreID() corresponds to the lease-holder's store ID.
		rebalanceStore := rq.allocator.RebalanceTarget(
			zone.ReplicaAttrs[0], desc.Replicas, repl.store.StoreID())
		if rebalanceStore == nil {
			log.VTracef(1, ctx, "%s: no suitable rebalance target", repl)
			// No action was necessary and no rebalance target was found. Return
			// without re-queuing this replica.
			return nil
		}
		rebalanceReplica := roachpb.ReplicaDescriptor{
			NodeID:  rebalanceStore.Node.NodeID,
			StoreID: rebalanceStore.StoreID,
		}
		log.VTracef(1, ctx, "%s: rebalancing to %+v", repl, rebalanceReplica)
		if err = repl.ChangeReplicas(ctx, roachpb.ADD_REPLICA, rebalanceReplica, desc); err != nil {
			return err
		}
	}

	// Enqueue this replica again to see if there are more changes to be made.
	rq.MaybeAdd(repl, rq.clock.Now())
	return nil
}