示例#1
0
// process synchronously invokes admin split for each proposed split key.
func (sq *splitQueue) process(now roachpb.Timestamp, rng *Replica,
	sysCfg *config.SystemConfig) error {

	// First handle case of splitting due to zone config maps.
	desc := rng.Desc()
	splitKeys := sysCfg.ComputeSplitKeys(desc.StartKey, desc.EndKey)
	if len(splitKeys) > 0 {
		log.Infof("splitting %s at keys %v", rng, splitKeys)
		for _, splitKey := range splitKeys {
			if err := sq.db.AdminSplit(splitKey.AsRawKey()); err != nil {
				return util.Errorf("unable to split %s at key %q: %s", rng, splitKey, err)
			}
		}
		return nil
	}

	// Next handle case of splitting due to size.
	zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey)
	if err != nil {
		return err
	}
	// FIXME: why is this implementation not the same as the one above?
	if float64(rng.stats.GetSize())/float64(zone.RangeMaxBytes) > 1 {
		log.Infof("splitting %s size=%d max=%d", rng, rng.stats.GetSize(), zone.RangeMaxBytes)
		if _, pErr := client.SendWrapped(rng, rng.context(), &roachpb.AdminSplitRequest{
			Span: roachpb.Span{Key: desc.StartKey.AsRawKey()},
		}); pErr != nil {
			return pErr.GoError()
		}
	}
	return nil
}
示例#2
0
// shouldQueue determines whether a replica should be queued for garbage
// collection, and if so, at what priority. Returns true for shouldQ
// in the event that the cumulative ages of GC'able bytes or extant
// intents exceed thresholds.
func (gcq *gcQueue) shouldQueue(now roachpb.Timestamp, repl *Replica,
	sysCfg *config.SystemConfig) (shouldQ bool, priority float64) {

	desc := repl.Desc()
	zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey)
	if err != nil {
		log.Errorf("could not find GC policy for range %s: %s", repl, err)
		return
	}
	policy := zone.GC

	// GC score is the total GC'able bytes age normalized by 1 MB * the replica's TTL in seconds.
	gcScore := float64(repl.stats.GetGCBytesAge(now.WallTime)) / float64(policy.TTLSeconds) / float64(gcByteCountNormalization)

	// Intent score. This computes the average age of outstanding intents
	// and normalizes.
	intentScore := repl.stats.GetAvgIntentAge(now.WallTime) / float64(intentAgeNormalization.Nanoseconds()/1E9)

	// Compute priority.
	if gcScore > 1 {
		priority += gcScore
	}
	if intentScore > 1 {
		priority += intentScore
	}
	shouldQ = priority > 0
	return
}
示例#3
0
func (rq *replicateQueue) shouldQueue(now roachpb.Timestamp, repl *Replica,
	sysCfg config.SystemConfig) (shouldQ bool, priority float64) {

	if repl.needsSplitBySize() {
		// If the range exceeds the split threshold, let that finish
		// first. Ranges must fit in memory on both sender and receiver
		// nodes while being replicated. This supplements the check
		// provided by acceptsUnsplitRanges, which looks at zone config
		// boundaries rather than data size.
		return
	}

	// Find the zone config for this range.
	desc := repl.Desc()
	zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey)
	if err != nil {
		log.Error(err)
		return
	}

	action, priority := rq.allocator.ComputeAction(*zone, desc)
	if action != AllocatorNoop {
		return true, priority
	}
	// See if there is a rebalancing opportunity present.
	shouldRebalance := rq.allocator.ShouldRebalance(repl.store.StoreID())
	return shouldRebalance, 0
}
示例#4
0
// shouldQueue determines whether a replica should be queued for garbage
// collection, and if so, at what priority. Returns true for shouldQ
// in the event that the cumulative ages of GC'able bytes or extant
// intents exceed thresholds.
func (*gcQueue) shouldQueue(now hlc.Timestamp, repl *Replica,
	sysCfg config.SystemConfig) (shouldQ bool, priority float64) {
	desc := repl.Desc()
	zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey)
	if err != nil {
		log.Errorf(context.TODO(), "could not find zone config for range %s: %s", repl, err)
		return
	}

	ms := repl.GetMVCCStats()
	// GC score is the total GC'able bytes age normalized by 1 MB * the replica's TTL in seconds.
	gcScore := float64(ms.GCByteAge(now.WallTime)) / float64(zone.GC.TTLSeconds) / float64(gcByteCountNormalization)

	// Intent score. This computes the average age of outstanding intents
	// and normalizes.
	intentScore := ms.AvgIntentAge(now.WallTime) / float64(intentAgeNormalization.Nanoseconds()/1E9)

	// Compute priority.
	if gcScore >= considerThreshold {
		priority += gcScore
	}
	if intentScore >= considerThreshold {
		priority += intentScore
	}
	shouldQ = priority > 0
	return
}
示例#5
0
func (rq *replicateQueue) shouldQueue(
	now hlc.Timestamp,
	repl *Replica,
	sysCfg config.SystemConfig,
) (shouldQ bool, priority float64) {
	if !repl.store.splitQueue.Disabled() && repl.needsSplitBySize() {
		// If the range exceeds the split threshold, let that finish first.
		// Ranges must fit in memory on both sender and receiver nodes while
		// being replicated. This supplements the check provided by
		// acceptsUnsplitRanges, which looks at zone config boundaries rather
		// than data size.
		//
		// This check is ignored if the split queue is disabled, since in that
		// case, the split will never come.
		return
	}

	// Find the zone config for this range.
	desc := repl.Desc()
	zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey)
	if err != nil {
		log.Error(err)
		return
	}

	action, priority := rq.allocator.ComputeAction(*zone, desc)
	if action != AllocatorNoop {
		return true, priority
	}
	// See if there is a rebalancing opportunity present.
	shouldRebalance := rq.allocator.ShouldRebalance(repl.store.StoreID())
	return shouldRebalance, 0
}
示例#6
0
// process iterates through all keys in a replica's range, calling the garbage
// collector for each key and associated set of values. GC'd keys are batched
// into GC calls. Extant intents are resolved if intents are older than
// intentAgeThreshold. The transaction and abort cache records are also
// scanned and old entries evicted. During normal operation, both of these
// records are cleaned up when their respective transaction finishes, so the
// amount of work done here is expected to be small.
//
// Some care needs to be taken to avoid cyclic recreation of entries during GC:
// * a Push initiated due to an intent may recreate a transaction entry
// * resolving an intent may write a new abort cache entry
// * obtaining the transaction for a abort cache entry requires a Push
//
// The following order is taken below:
// 1) collect all intents with sufficiently old txn record
// 2) collect these intents' transactions
// 3) scan the transaction table, collecting abandoned or completed txns
// 4) push all of these transactions (possibly recreating entries)
// 5) resolve all intents (unless the txn is still PENDING), which will recreate
//    abort cache entries (but with the txn timestamp; i.e. likely gc'able)
// 6) scan the abort cache table for old entries
// 7) push these transactions (again, recreating txn entries).
// 8) send a GCRequest.
func (gcq *gcQueue) process(
	ctx context.Context,
	now hlc.Timestamp,
	repl *Replica,
	sysCfg config.SystemConfig,
) error {
	snap := repl.store.Engine().NewSnapshot()
	desc := repl.Desc()
	defer snap.Close()

	// Lookup the GC policy for the zone containing this key range.
	zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey)
	if err != nil {
		return errors.Errorf("could not find zone config for range %s: %s", repl, err)
	}

	gcKeys, info, err := RunGC(ctx, desc, snap, now, zone.GC,
		func(now hlc.Timestamp, txn *roachpb.Transaction, typ roachpb.PushTxnType) {
			pushTxn(gcq.store.DB(), now, txn, typ)
		},
		func(intents []roachpb.Intent, poison bool, wait bool) error {
			return repl.store.intentResolver.resolveIntents(ctx, intents, poison, wait)
		})

	if err != nil {
		return err
	}

	gcq.eventLog.VInfof(true, "completed with stats %+v", info)

	var ba roachpb.BatchRequest
	var gcArgs roachpb.GCRequest
	// TODO(tschottdorf): This is one of these instances in which we want
	// to be more careful that the request ends up on the correct Replica,
	// and we might have to worry about mixing range-local and global keys
	// in a batch which might end up spanning Ranges by the time it executes.
	gcArgs.Key = desc.StartKey.AsRawKey()
	gcArgs.EndKey = desc.EndKey.AsRawKey()
	gcArgs.Keys = gcKeys
	gcArgs.Threshold = info.Threshold

	// Technically not needed since we're talking directly to the Range.
	ba.RangeID = desc.RangeID
	ba.Timestamp = now
	ba.Add(&gcArgs)
	if _, pErr := repl.Send(ctx, ba); pErr != nil {
		return pErr.GoError()
	}
	return nil
}
示例#7
0
// process synchronously invokes admin split for each proposed split key.
func (sq *splitQueue) process(
	ctx context.Context,
	now hlc.Timestamp,
	rng *Replica,
	sysCfg config.SystemConfig,
) error {
	// First handle case of splitting due to zone config maps.
	desc := rng.Desc()
	splitKeys := sysCfg.ComputeSplitKeys(desc.StartKey, desc.EndKey)
	if len(splitKeys) > 0 {
		log.Infof("splitting %s at keys %v", rng, splitKeys)
		log.Trace(ctx, fmt.Sprintf("splitting at keys %v", splitKeys))
		for _, splitKey := range splitKeys {
			if err := sq.db.AdminSplit(splitKey.AsRawKey()); err != nil {
				return errors.Errorf("unable to split %s at key %q: %s", rng, splitKey, err)
			}
		}
		return nil
	}

	// Next handle case of splitting due to size.
	zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey)
	if err != nil {
		return err
	}
	size := rng.GetMVCCStats().Total()
	// FIXME: why is this implementation not the same as the one above?
	if float64(size)/float64(zone.RangeMaxBytes) > 1 {
		log.Infof("splitting %s size=%d max=%d", rng, size, zone.RangeMaxBytes)
		log.Trace(ctx, fmt.Sprintf("splitting size=%d max=%d", size, zone.RangeMaxBytes))
		if _, pErr := client.SendWrappedWith(rng, ctx, roachpb.Header{
			Timestamp: now,
		}, &roachpb.AdminSplitRequest{
			Span: roachpb.Span{Key: desc.StartKey.AsRawKey()},
		}); pErr != nil {
			return pErr.GoError()
		}
	}
	return nil
}
示例#8
0
// shouldQueue determines whether a range should be queued for
// splitting. This is true if the range is intersected by a zone config
// prefix or if the range's size in bytes exceeds the limit for the zone.
func (*splitQueue) shouldQueue(now roachpb.Timestamp, rng *Replica,
	sysCfg *config.SystemConfig) (shouldQ bool, priority float64) {

	desc := rng.Desc()
	if len(sysCfg.ComputeSplitKeys(desc.StartKey, desc.EndKey)) > 0 {
		// Set priority to 1 in the event the range is split by zone configs.
		priority = 1
		shouldQ = true
	}

	// Add priority based on the size of range compared to the max
	// size for the zone it's in.
	zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey)
	if err != nil {
		log.Error(err)
		return
	}

	if ratio := float64(rng.stats.GetSize()) / float64(zone.RangeMaxBytes); ratio > 1 {
		priority += ratio
		shouldQ = true
	}
	return
}
示例#9
0
func (rq *replicateQueue) shouldQueue(now roachpb.Timestamp, repl *Replica,
	sysCfg config.SystemConfig) (shouldQ bool, priority float64) {

	desc := repl.Desc()
	if len(sysCfg.ComputeSplitKeys(desc.StartKey, desc.EndKey)) > 0 {
		// If the replica's range needs splitting, wait until done.
		return
	}

	// Find the zone config for this range.
	zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey)
	if err != nil {
		log.Error(err)
		return
	}

	action, priority := rq.allocator.ComputeAction(*zone, desc)
	if action != AllocatorNoop {
		return true, priority
	}
	// See if there is a rebalancing opportunity present.
	shouldRebalance := rq.allocator.ShouldRebalance(repl.store.StoreID())
	return shouldRebalance, 0
}
示例#10
0
// process iterates through all keys in a replica's range, calling the garbage
// collector for each key and associated set of values. GC'd keys are batched
// into GC calls. Extant intents are resolved if intents are older than
// intentAgeThreshold.
func (gcq *gcQueue) process(now roachpb.Timestamp, repl *Replica,
	sysCfg *config.SystemConfig) error {

	snap := repl.rm.Engine().NewSnapshot()
	desc := repl.Desc()
	iter := newRangeDataIterator(desc, snap)
	defer iter.Close()
	defer snap.Close()

	// Lookup the GC policy for the zone containing this key range.
	zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey)
	if err != nil {
		return fmt.Errorf("could not find GC policy for range %s: %s", repl, err)
	}
	policy := zone.GC

	gcMeta := roachpb.NewGCMetadata(now.WallTime)
	gc := engine.NewGarbageCollector(now, *policy)

	// Compute intent expiration (intent age at which we attempt to resolve).
	intentExp := now
	intentExp.WallTime -= intentAgeThreshold.Nanoseconds()

	// TODO(tschottdorf): execution will use a leader-assigned local
	// timestamp to compute intent age. While this should be fine, could
	// consider adding a Now timestamp to GCRequest which would be used
	// instead.
	gcArgs := &roachpb.GCRequest{
		RequestHeader: roachpb.RequestHeader{
			RangeID: desc.RangeID,
		},
	}
	var mu sync.Mutex
	var oldestIntentNanos int64 = math.MaxInt64
	var expBaseKey roachpb.Key
	var keys []roachpb.EncodedKey
	var vals [][]byte

	// Maps from txn ID to txn and intent key slice.
	txnMap := map[string]*roachpb.Transaction{}
	intentMap := map[string][]roachpb.Intent{}

	// updateOldestIntent atomically updates the oldest intent.
	updateOldestIntent := func(intentNanos int64) {
		mu.Lock()
		defer mu.Unlock()
		if intentNanos < oldestIntentNanos {
			oldestIntentNanos = intentNanos
		}
	}

	// processKeysAndValues is invoked with each key and its set of
	// values. Intents older than the intent age threshold are sent for
	// resolution and values after the MVCC metadata, and possible
	// intent, are sent for garbage collection.
	processKeysAndValues := func() {
		// If there's more than a single value for the key, possibly send for GC.
		if len(keys) > 1 {
			meta := &engine.MVCCMetadata{}
			if err := proto.Unmarshal(vals[0], meta); err != nil {
				log.Errorf("unable to unmarshal MVCC metadata for key %q: %s", keys[0], err)
			} else {
				// In the event that there's an active intent, send for
				// intent resolution if older than the threshold.
				startIdx := 1
				if meta.Txn != nil {
					// Keep track of intent to resolve if older than the intent
					// expiration threshold.
					if meta.Timestamp.Less(intentExp) {
						id := string(meta.Txn.ID)
						txnMap[id] = meta.Txn
						intentMap[id] = append(intentMap[id], roachpb.Intent{Key: expBaseKey})
					} else {
						updateOldestIntent(meta.Txn.OrigTimestamp.WallTime)
					}
					// With an active intent, GC ignores MVCC metadata & intent value.
					startIdx = 2
				}
				// See if any values may be GC'd.
				if gcTS := gc.Filter(keys[startIdx:], vals[startIdx:]); !gcTS.Equal(roachpb.ZeroTimestamp) {
					// TODO(spencer): need to split the requests up into
					// multiple requests in the event that more than X keys
					// are added to the request.
					gcArgs.Keys = append(gcArgs.Keys, roachpb.GCRequest_GCKey{Key: expBaseKey, Timestamp: gcTS})
				}
			}
		}
	}

	// Iterate through the keys and values of this replica's range.
	for ; iter.Valid(); iter.Next() {
		baseKey, ts, isValue, err := engine.MVCCDecodeKey(iter.Key())
		if err != nil {
			log.Errorf("unable to decode MVCC key: %q: %v", iter.Key(), err)
			continue
		}
		if !isValue {
			// Moving to the next key (& values).
			processKeysAndValues()
			expBaseKey = baseKey
			keys = []roachpb.EncodedKey{iter.Key()}
			vals = [][]byte{iter.Value()}
		} else {
			if !baseKey.Equal(expBaseKey) {
				log.Errorf("unexpectedly found a value for %q with ts=%s; expected key %q", baseKey, ts, expBaseKey)
				continue
			}
			keys = append(keys, iter.Key())
			vals = append(vals, iter.Value())
		}
	}
	if iter.Error() != nil {
		return iter.Error()
	}
	// Handle last collected set of keys/vals.
	processKeysAndValues()

	// Process push transactions in parallel.
	var wg sync.WaitGroup
	for _, txn := range txnMap {
		wg.Add(1)
		go gcq.pushTxn(repl, now, txn, updateOldestIntent, &wg)
	}
	wg.Wait()

	// Resolve all intents.
	var intents []roachpb.Intent
	for id, txn := range txnMap {
		if txn.Status != roachpb.PENDING {
			for _, intent := range intentMap[id] {
				intent.Txn = *txn
				intents = append(intents, intent)
			}
		}
	}

	done := true
	if len(intents) > 0 {
		done = false
		repl.resolveIntents(repl.context(), intents)
	}

	// Set start and end keys.
	if len(gcArgs.Keys) > 0 {
		done = false
		gcArgs.Key = gcArgs.Keys[0].Key
		gcArgs.EndKey = gcArgs.Keys[len(gcArgs.Keys)-1].Key.Next()
	}

	if done {
		return nil
	}

	// Send GC request through range.
	gcMeta.OldestIntentNanos = proto.Int64(oldestIntentNanos)
	gcArgs.GCMeta = *gcMeta
	if _, err := client.SendWrapped(repl, repl.context(), gcArgs); err != nil {
		return err
	}

	// Store current timestamp as last verification for this replica, as
	// we've just successfully scanned.
	if err := repl.SetLastVerificationTimestamp(now); err != nil {
		log.Errorf("failed to set last verification timestamp for replica %s: %s", repl, err)
	}

	return nil
}
示例#11
0
// process iterates through all keys in a replica's range, calling the garbage
// collector for each key and associated set of values. GC'd keys are batched
// into GC calls. Extant intents are resolved if intents are older than
// intentAgeThreshold. The transaction and sequence cache records are also
// scanned and old entries evicted. During normal operation, both of these
// records are cleaned up when their respective transaction finishes, so the
// amount of work done here is expected to be small.
//
// Some care needs to be taken to avoid cyclic recreation of entries during GC:
// * a Push initiated due to an intent may recreate a transaction entry
// * resolving an intent may write a new sequence cache entry
// * obtaining the transaction for a sequence cache entry requires a Push
//
// The following order is taken below:
// 1) collect all intents with sufficiently old txn record
// 2) collect these intents' transactions
// 3) scan the transaction table, collecting abandoned or completed txns
// 4) push all of these transactions (possibly recreating entries)
// 5) resolve all intents (unless the txn is still PENDING), which will recreate
//    sequence cache entries (but with the txn timestamp; i.e. likely gc'able)
// 6) scan the sequence table for old entries
// 7) push these transactions (again, recreating txn entries).
// 8) send a GCRequest.
func (gcq *gcQueue) process(now roachpb.Timestamp, repl *Replica,
	sysCfg config.SystemConfig) error {

	snap := repl.store.Engine().NewSnapshot()
	desc := repl.Desc()
	iter := newReplicaDataIterator(desc, snap, true /* replicatedOnly */)
	defer iter.Close()
	defer snap.Close()

	// Lookup the GC policy for the zone containing this key range.
	zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey)
	if err != nil {
		return util.Errorf("could not find zone config for range %s: %s", repl, err)
	}

	gc := engine.NewGarbageCollector(now, zone.GC)

	// Compute intent expiration (intent age at which we attempt to resolve).
	intentExp := now
	intentExp.WallTime -= intentAgeThreshold.Nanoseconds()
	txnExp := now
	txnExp.WallTime -= txnCleanupThreshold.Nanoseconds()

	gcArgs := &roachpb.GCRequest{}
	// TODO(tschottdorf): This is one of these instances in which we want
	// to be more careful that the request ends up on the correct Replica,
	// and we might have to worry about mixing range-local and global keys
	// in a batch which might end up spanning Ranges by the time it executes.
	gcArgs.Key = desc.StartKey.AsRawKey()
	gcArgs.EndKey = desc.EndKey.AsRawKey()

	var expBaseKey roachpb.Key
	var keys []engine.MVCCKey
	var vals [][]byte

	// Maps from txn ID to txn and intent key slice.
	txnMap := map[uuid.UUID]*roachpb.Transaction{}
	intentSpanMap := map[uuid.UUID][]roachpb.Span{}

	// processKeysAndValues is invoked with each key and its set of
	// values. Intents older than the intent age threshold are sent for
	// resolution and values after the MVCC metadata, and possible
	// intent, are sent for garbage collection.
	var intentCount int
	processKeysAndValues := func() {
		// If there's more than a single value for the key, possibly send for GC.
		if len(keys) > 1 {
			meta := &engine.MVCCMetadata{}
			if err := proto.Unmarshal(vals[0], meta); err != nil {
				log.Errorf("unable to unmarshal MVCC metadata for key %q: %s", keys[0], err)
			} else {
				// In the event that there's an active intent, send for
				// intent resolution if older than the threshold.
				startIdx := 1
				if meta.Txn != nil {
					// Keep track of intent to resolve if older than the intent
					// expiration threshold.
					if meta.Timestamp.Less(intentExp) {
						txnID := *meta.Txn.ID
						txn := &roachpb.Transaction{
							TxnMeta: *meta.Txn,
						}
						txnMap[txnID] = txn
						intentCount++
						intentSpanMap[txnID] = append(intentSpanMap[txnID], roachpb.Span{Key: expBaseKey})
					}
					// With an active intent, GC ignores MVCC metadata & intent value.
					startIdx = 2
				}
				// See if any values may be GC'd.
				if gcTS := gc.Filter(keys[startIdx:], vals[startIdx:]); !gcTS.Equal(roachpb.ZeroTimestamp) {
					// TODO(spencer): need to split the requests up into
					// multiple requests in the event that more than X keys
					// are added to the request.
					gcArgs.Keys = append(gcArgs.Keys, roachpb.GCRequest_GCKey{Key: expBaseKey, Timestamp: gcTS})
				}
			}
		}
	}

	// Iterate through the keys and values of this replica's range.
	for ; iter.Valid(); iter.Next() {
		iterKey := iter.Key()
		if !iterKey.IsValue() || !iterKey.Key.Equal(expBaseKey) {
			// Moving to the next key (& values).
			processKeysAndValues()
			expBaseKey = iterKey.Key
			if !iterKey.IsValue() {
				keys = []engine.MVCCKey{iter.Key()}
				vals = [][]byte{iter.Value()}
				continue
			}
			// An implicit metadata.
			keys = []engine.MVCCKey{engine.MakeMVCCMetadataKey(iterKey.Key)}
			// A nil value for the encoded MVCCMetadata. This will unmarshal to an
			// empty MVCCMetadata which is sufficient for processKeysAndValues to
			// determine that there is no intent.
			vals = [][]byte{nil}
		}
		keys = append(keys, iter.Key())
		vals = append(vals, iter.Value())
	}
	if iter.Error() != nil {
		return iter.Error()
	}
	// Handle last collected set of keys/vals.
	processKeysAndValues()
	gcq.eventLog.Infof(true, "assembled %d transactions from %d old intents; found %d gc'able keys", len(txnMap), intentCount, len(gcArgs.Keys))

	txnKeys, err := gcq.processTransactionTable(repl, txnMap, txnExp)
	if err != nil {
		return err
	}

	// From now on, all newly added keys are range-local.
	// TODO(tschottdorf): Might need to use two requests at some point since we
	// hard-coded the full non-local key range in the header, but that does
	// not take into account the range-local keys. It will be OK as long as
	// we send directly to the Replica, though.
	gcArgs.Keys = append(gcArgs.Keys, txnKeys...)

	// Process push transactions in parallel.
	var wg sync.WaitGroup
	gcq.eventLog.Infof(true, "pushing %d txns", len(txnMap))
	for _, txn := range txnMap {
		if txn.Status != roachpb.PENDING {
			continue
		}
		wg.Add(1)
		go gcq.pushTxn(repl, now, txn, roachpb.PUSH_ABORT, &wg)
	}
	wg.Wait()

	// Resolve all intents.
	var intents []roachpb.Intent
	for txnID, txn := range txnMap {
		if txn.Status != roachpb.PENDING {
			for _, intent := range intentSpanMap[txnID] {
				intents = append(intents, roachpb.Intent{Span: intent, Status: txn.Status, Txn: txn.TxnMeta})
			}
		}
	}
	gcq.eventLog.Infof(true, "resolving %d intents", len(intents))

	if pErr := repl.store.intentResolver.resolveIntents(repl.context(), repl, intents,
		true /* wait */, false /* !poison */); pErr != nil {
		return pErr.GoError()
	}

	// Deal with any leftover sequence cache keys. There shouldn't be many of
	// them.
	leftoverSeqCacheKeys := gcq.processSequenceCache(repl, now, txnExp, txnMap)
	gcq.eventLog.Infof(true, "collected %d leftover sequence cache keys", len(leftoverSeqCacheKeys))
	gcArgs.Keys = append(gcArgs.Keys, leftoverSeqCacheKeys...)
	gcq.eventLog.Infof(true, "sending gc request for %d keys", len(gcArgs.Keys))

	var ba roachpb.BatchRequest
	// Technically not needed since we're talking directly to the Range.
	ba.RangeID = desc.RangeID
	ba.Timestamp = now
	ba.Add(gcArgs)
	if _, pErr := repl.Send(repl.context(), ba); pErr != nil {
		return pErr.GoError()
	}

	return nil
}
示例#12
0
// process iterates through all keys in a replica's range, calling the garbage
// collector for each key and associated set of values. GC'd keys are batched
// into GC calls. Extant intents are resolved if intents are older than
// intentAgeThreshold.
func (gcq *gcQueue) process(now roachpb.Timestamp, repl *Replica,
	sysCfg *config.SystemConfig) error {

	snap := repl.store.Engine().NewSnapshot()
	desc := repl.Desc()
	iter := newReplicaDataIterator(desc, snap)
	defer iter.Close()
	defer snap.Close()

	// Lookup the GC policy for the zone containing this key range.
	zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey)
	if err != nil {
		return fmt.Errorf("could not find GC policy for range %s: %s", repl, err)
	}
	policy := zone.GC

	gcMeta := roachpb.NewGCMetadata(now.WallTime)
	gc := engine.NewGarbageCollector(now, *policy)

	// Compute intent expiration (intent age at which we attempt to resolve).
	intentExp := now
	intentExp.WallTime -= intentAgeThreshold.Nanoseconds()
	txnExp := now
	txnExp.WallTime -= txnCleanupThreshold.Nanoseconds()

	gcArgs := &roachpb.GCRequest{}
	// TODO(tschottdorf): This is one of these instances in which we want
	// to be more careful that the request ends up on the correct Replica,
	// and we might have to worry about mixing range-local and global keys
	// in a batch which might end up spanning Ranges by the time it executes.
	gcArgs.Key = desc.StartKey.AsRawKey()
	gcArgs.EndKey = desc.EndKey.AsRawKey()

	var expBaseKey roachpb.Key
	var keys []engine.MVCCKey
	var vals [][]byte

	// Maps from txn ID to txn and intent key slice.
	txnMap := map[string]*roachpb.Transaction{}
	intentSpanMap := map[string][]roachpb.Span{}

	// processKeysAndValues is invoked with each key and its set of
	// values. Intents older than the intent age threshold are sent for
	// resolution and values after the MVCC metadata, and possible
	// intent, are sent for garbage collection.
	processKeysAndValues := func() {
		// If there's more than a single value for the key, possibly send for GC.
		if len(keys) > 1 {
			meta := &engine.MVCCMetadata{}
			if err := proto.Unmarshal(vals[0], meta); err != nil {
				log.Errorf("unable to unmarshal MVCC metadata for key %q: %s", keys[0], err)
			} else {
				// In the event that there's an active intent, send for
				// intent resolution if older than the threshold.
				startIdx := 1
				if meta.Txn != nil {
					// Keep track of intent to resolve if older than the intent
					// expiration threshold.
					if meta.Timestamp.Less(intentExp) {
						id := string(meta.Txn.ID)
						txnMap[id] = meta.Txn
						intentSpanMap[id] = append(intentSpanMap[id], roachpb.Span{Key: expBaseKey})
					}
					// With an active intent, GC ignores MVCC metadata & intent value.
					startIdx = 2
				}
				// See if any values may be GC'd.
				if gcTS := gc.Filter(keys[startIdx:], vals[startIdx:]); !gcTS.Equal(roachpb.ZeroTimestamp) {
					// TODO(spencer): need to split the requests up into
					// multiple requests in the event that more than X keys
					// are added to the request.
					gcArgs.Keys = append(gcArgs.Keys, roachpb.GCRequest_GCKey{Key: expBaseKey, Timestamp: gcTS})
				}
			}
		}
	}

	// Iterate through the keys and values of this replica's range.
	for ; iter.Valid(); iter.Next() {
		baseKey, ts, isValue, err := engine.MVCCDecodeKey(iter.Key())
		if err != nil {
			log.Errorf("unable to decode MVCC key: %q: %v", iter.Key(), err)
			continue
		}
		if !isValue {
			// Moving to the next key (& values).
			processKeysAndValues()
			expBaseKey = baseKey
			keys = []engine.MVCCKey{iter.Key()}
			vals = [][]byte{iter.Value()}
		} else {
			if !baseKey.Equal(expBaseKey) {
				log.Errorf("unexpectedly found a value for %q with ts=%s; expected key %q", baseKey, ts, expBaseKey)
				continue
			}
			keys = append(keys, iter.Key())
			vals = append(vals, iter.Value())
		}
	}
	if iter.Error() != nil {
		return iter.Error()
	}
	// Handle last collected set of keys/vals.
	processKeysAndValues()

	txnKeys, err := processTransactionTable(repl, txnMap, txnExp)
	if err != nil {
		return err
	}

	// From now on, all newly added keys are range-local.
	// TODO(tschottdorf): Might need to use two requests at some point since we
	// hard-coded the full non-local key range in the header, but that does
	// not take into account the range-local keys. It will be OK as long as
	// we send directly to the Replica, though.
	gcArgs.Keys = append(gcArgs.Keys, txnKeys...)

	// Process push transactions in parallel.
	var wg sync.WaitGroup
	for _, txn := range txnMap {
		if txn.Status != roachpb.PENDING {
			continue
		}
		wg.Add(1)
		go pushTxn(repl, now, txn, roachpb.ABORT_TXN, &wg)
	}
	wg.Wait()

	// Resolve all intents.
	var intents []roachpb.Intent
	for id, txn := range txnMap {
		if txn.Status != roachpb.PENDING {
			for _, intent := range intentSpanMap[id] {
				intents = append(intents, roachpb.Intent{Span: intent, Txn: *txn})
			}
		}
	}

	if err := repl.resolveIntents(repl.context(), intents, true /* wait */, false /* !poison */); err != nil {
		return err
	}

	// Deal with any leftover sequence cache keys. There shouldn't be many of
	// them.
	gcArgs.Keys = append(gcArgs.Keys, processSequenceCache(repl, now, txnExp, txnMap)...)

	// Send GC request through range.
	gcArgs.GCMeta = *gcMeta

	var ba roachpb.BatchRequest
	// Technically not needed since we're talking directly to the Range.
	ba.RangeID = desc.RangeID
	ba.Timestamp = now
	ba.Add(gcArgs)
	if _, pErr := repl.Send(repl.context(), ba); pErr != nil {
		return pErr.GoError()
	}

	// Store current timestamp as last verification for this replica, as
	// we've just successfully scanned.
	if err := repl.SetLastVerificationTimestamp(now); err != nil {
		log.Errorf("failed to set last verification timestamp for replica %s: %s", repl, err)
	}

	return nil
}
示例#13
0
func (rq *replicateQueue) process(now roachpb.Timestamp, repl *Replica, sysCfg config.SystemConfig) error {
	desc := repl.Desc()
	// Find the zone config for this range.
	zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey)
	if err != nil {
		return err
	}
	action, _ := rq.allocator.ComputeAction(*zone, desc)

	// Avoid taking action if the range has too many dead replicas to make
	// quorum.
	deadReplicas := rq.allocator.storePool.deadReplicas(desc.Replicas)
	quorum := computeQuorum(len(desc.Replicas))
	liveReplicaCount := len(desc.Replicas) - len(deadReplicas)
	if liveReplicaCount < quorum {
		return util.Errorf("range requires a replication change, but lacks a quorum of live nodes.")
	}

	switch action {
	case AllocatorAdd:
		newStore, err := rq.allocator.AllocateTarget(zone.ReplicaAttrs[0], desc.Replicas, true, nil)
		if err != nil {
			return err
		}
		newReplica := roachpb.ReplicaDescriptor{
			NodeID:  newStore.Node.NodeID,
			StoreID: newStore.StoreID,
		}
		if err = repl.ChangeReplicas(roachpb.ADD_REPLICA, newReplica, desc); err != nil {
			return err
		}
	case AllocatorRemove:
		removeReplica, err := rq.allocator.RemoveTarget(desc.Replicas)
		if err != nil {
			return err
		}
		if err = repl.ChangeReplicas(roachpb.REMOVE_REPLICA, removeReplica, desc); err != nil {
			return err
		}
		// Do not requeue if we removed ourselves.
		if removeReplica.StoreID == repl.store.StoreID() {
			return nil
		}
	case AllocatorRemoveDead:
		if len(deadReplicas) == 0 {
			if log.V(1) {
				log.Warningf("Range of replica %s was identified as having dead replicas, but no dead replicas were found.", repl)
			}
			break
		}
		if err = repl.ChangeReplicas(roachpb.REMOVE_REPLICA, deadReplicas[0], desc); err != nil {
			return err
		}
	case AllocatorNoop:
		// The Noop case will result if this replica was queued in order to
		// rebalance. Attempt to find a rebalancing target.
		rebalanceStore := rq.allocator.RebalanceTarget(repl.store.StoreID(), zone.ReplicaAttrs[0], desc.Replicas)
		if rebalanceStore == nil {
			// No action was necessary and no rebalance target was found. Return
			// without re-queuing this replica.
			return nil
		}
		rebalanceReplica := roachpb.ReplicaDescriptor{
			NodeID:  rebalanceStore.Node.NodeID,
			StoreID: rebalanceStore.StoreID,
		}
		if err = repl.ChangeReplicas(roachpb.ADD_REPLICA, rebalanceReplica, desc); err != nil {
			return err
		}
	}

	// Enqueue this replica again to see if there are more changes to be made.
	rq.MaybeAdd(repl, rq.clock.Now())
	return nil
}
示例#14
0
func (rq *replicateQueue) process(
	ctx context.Context,
	now hlc.Timestamp,
	repl *Replica,
	sysCfg config.SystemConfig,
) error {
	desc := repl.Desc()
	// Find the zone config for this range.
	zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey)
	if err != nil {
		return err
	}
	action, _ := rq.allocator.ComputeAction(zone, desc)

	// Avoid taking action if the range has too many dead replicas to make
	// quorum.
	deadReplicas := rq.allocator.storePool.deadReplicas(repl.RangeID, desc.Replicas)
	quorum := computeQuorum(len(desc.Replicas))
	liveReplicaCount := len(desc.Replicas) - len(deadReplicas)
	if liveReplicaCount < quorum {
		return errors.Errorf("range requires a replication change, but lacks a quorum of live nodes.")
	}

	switch action {
	case AllocatorAdd:
		log.Trace(ctx, "adding a new replica")
		newStore, err := rq.allocator.AllocateTarget(zone.ReplicaAttrs[0], desc.Replicas, true)
		if err != nil {
			return err
		}
		newReplica := roachpb.ReplicaDescriptor{
			NodeID:  newStore.Node.NodeID,
			StoreID: newStore.StoreID,
		}

		log.VTracef(1, ctx, "%s: adding replica to %+v due to under-replication", repl, newReplica)
		if err = repl.ChangeReplicas(ctx, roachpb.ADD_REPLICA, newReplica, desc); err != nil {
			return err
		}
	case AllocatorRemove:
		log.Trace(ctx, "removing a replica")
		// We require the lease in order to process replicas, so
		// repl.store.StoreID() corresponds to the lease-holder's store ID.
		removeReplica, err := rq.allocator.RemoveTarget(desc.Replicas, repl.store.StoreID())
		if err != nil {
			return err
		}
		log.VTracef(1, ctx, "%s: removing replica %+v due to over-replication", repl, removeReplica)
		if err = repl.ChangeReplicas(ctx, roachpb.REMOVE_REPLICA, removeReplica, desc); err != nil {
			return err
		}
		// Do not requeue if we removed ourselves.
		if removeReplica.StoreID == repl.store.StoreID() {
			return nil
		}
	case AllocatorRemoveDead:
		log.Trace(ctx, "removing a dead replica")
		if len(deadReplicas) == 0 {
			if log.V(1) {
				log.Warningf(ctx, "Range of replica %s was identified as having dead replicas, but no dead replicas were found.", repl)
			}
			break
		}
		deadReplica := deadReplicas[0]
		log.VTracef(1, ctx, "%s: removing dead replica %+v from store", repl, deadReplica)
		if err = repl.ChangeReplicas(ctx, roachpb.REMOVE_REPLICA, deadReplica, desc); err != nil {
			return err
		}
	case AllocatorNoop:
		log.Trace(ctx, "considering a rebalance")
		// The Noop case will result if this replica was queued in order to
		// rebalance. Attempt to find a rebalancing target.
		//
		// We require the lease in order to process replicas, so
		// repl.store.StoreID() corresponds to the lease-holder's store ID.
		rebalanceStore := rq.allocator.RebalanceTarget(
			zone.ReplicaAttrs[0], desc.Replicas, repl.store.StoreID())
		if rebalanceStore == nil {
			log.VTracef(1, ctx, "%s: no suitable rebalance target", repl)
			// No action was necessary and no rebalance target was found. Return
			// without re-queuing this replica.
			return nil
		}
		rebalanceReplica := roachpb.ReplicaDescriptor{
			NodeID:  rebalanceStore.Node.NodeID,
			StoreID: rebalanceStore.StoreID,
		}
		log.VTracef(1, ctx, "%s: rebalancing to %+v", repl, rebalanceReplica)
		if err = repl.ChangeReplicas(ctx, roachpb.ADD_REPLICA, rebalanceReplica, desc); err != nil {
			return err
		}
	}

	// Enqueue this replica again to see if there are more changes to be made.
	rq.MaybeAdd(repl, rq.clock.Now())
	return nil
}