// process synchronously invokes admin split for each proposed split key.
func (sq *splitQueue) process(
	ctx context.Context, now hlc.Timestamp, r *Replica, sysCfg config.SystemConfig,
) error {
	// First handle case of splitting due to zone config maps.
	desc := r.Desc()
	splitKeys := sysCfg.ComputeSplitKeys(desc.StartKey, desc.EndKey)
	if len(splitKeys) > 0 {
		log.Infof(ctx, "splitting at keys %v", splitKeys)
		for _, splitKey := range splitKeys {
			if err := sq.db.AdminSplit(ctx, splitKey.AsRawKey()); err != nil {
				return errors.Errorf("unable to split %s at key %q: %s", r, splitKey, err)
			}
		}
		return nil
	}

	// Next handle case of splitting due to size.
	zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey)
	if err != nil {
		return err
	}
	size := r.GetMVCCStats().Total()
	// FIXME: why is this implementation not the same as the one above?
	if float64(size)/float64(zone.RangeMaxBytes) > 1 {
		log.Infof(ctx, "splitting size=%d max=%d", size, zone.RangeMaxBytes)
		if _, pErr := client.SendWrappedWith(ctx, r, roachpb.Header{
			Timestamp: now,
		}, &roachpb.AdminSplitRequest{
			Span: roachpb.Span{Key: desc.StartKey.AsRawKey()},
		}); pErr != nil {
			return pErr.GoError()
		}
	}
	return nil
}
// shouldQueue determines whether a replica should be queued for garbage
// collection, and if so, at what priority. Returns true for shouldQ
// in the event that the cumulative ages of GC'able bytes or extant
// intents exceed thresholds.
func (gcq *gcQueue) shouldQueue(
	ctx context.Context, now hlc.Timestamp, repl *Replica, sysCfg config.SystemConfig,
) (shouldQ bool, priority float64) {
	desc := repl.Desc()
	zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey)
	if err != nil {
		log.Errorf(ctx, "could not find zone config for range %s: %s", repl, err)
		return
	}

	ms := repl.GetMVCCStats()
	// GC score is the total GC'able bytes age normalized by 1 MB * the replica's TTL in seconds.
	gcScore := float64(ms.GCByteAge(now.WallTime)) / float64(zone.GC.TTLSeconds) / float64(gcByteCountNormalization)

	// Intent score. This computes the average age of outstanding intents
	// and normalizes.
	intentScore := ms.AvgIntentAge(now.WallTime) / float64(intentAgeNormalization.Nanoseconds()/1E9)

	// Compute priority.
	if gcScore >= considerThreshold {
		priority += gcScore
	}
	if intentScore >= considerThreshold {
		priority += intentScore
	}
	shouldQ = priority > 0
	return
}
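// Illustrative aside: the score arithmetic above is easier to follow with
// concrete numbers. The standalone sketch below reproduces the same
// normalization with made-up MVCC stats; gcByteCountNormalization,
// intentAgeNormalization, and considerThreshold are assumed values chosen
// for illustration only, not the queue's actual tuning constants.
package main

import (
	"fmt"
	"time"
)

const (
	gcByteCountNormalization = 1 << 20 // assumed: normalize GC'able byte-age by 1 MiB
	considerThreshold        = 1.0     // assumed: minimum score worth queueing for
)

var intentAgeNormalization = 24 * time.Hour // assumed intent-age normalization

func main() {
	// Hypothetical stats for one replica.
	gcByteAge := float64(5 << 40)       // ~5.5e12 byte·seconds of GC'able byte-age
	avgIntentAge := float64(3600)       // average intent age of one hour, in seconds
	ttlSeconds := float64(24 * 60 * 60) // zone GC TTL of one day

	gcScore := gcByteAge / ttlSeconds / float64(gcByteCountNormalization)
	intentScore := avgIntentAge / float64(intentAgeNormalization.Nanoseconds()/1e9)

	priority := 0.0
	if gcScore >= considerThreshold {
		priority += gcScore
	}
	if intentScore >= considerThreshold {
		priority += intentScore
	}
	// With these numbers: gcScore ≈ 60.7, intentScore ≈ 0.04, so only the GC
	// score contributes and the replica would be queued.
	fmt.Printf("gcScore=%.2f intentScore=%.3f shouldQueue=%t priority=%.2f\n",
		gcScore, intentScore, priority > 0, priority)
}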
// process iterates through all keys in a replica's range, calling the garbage
// collector for each key and associated set of values. GC'd keys are batched
// into GC calls. Extant intents are resolved if intents are older than
// intentAgeThreshold. The transaction and abort cache records are also
// scanned and old entries evicted. During normal operation, both of these
// records are cleaned up when their respective transaction finishes, so the
// amount of work done here is expected to be small.
//
// Some care needs to be taken to avoid cyclic recreation of entries during GC:
// * a Push initiated due to an intent may recreate a transaction entry
// * resolving an intent may write a new abort cache entry
// * obtaining the transaction for an abort cache entry requires a Push
//
// The following order is taken below:
// 1) collect all intents with sufficiently old txn record
// 2) collect these intents' transactions
// 3) scan the transaction table, collecting abandoned or completed txns
// 4) push all of these transactions (possibly recreating entries)
// 5) resolve all intents (unless the txn is still PENDING), which will recreate
//    abort cache entries (but with the txn timestamp; i.e. likely gc'able)
// 6) scan the abort cache table for old entries
// 7) push these transactions (again, recreating txn entries)
// 8) send a GCRequest
func (gcq *gcQueue) process(
	ctx context.Context, now hlc.Timestamp, repl *Replica, sysCfg config.SystemConfig,
) error {
	snap := repl.store.Engine().NewSnapshot()
	desc := repl.Desc()
	defer snap.Close()

	// Lookup the GC policy for the zone containing this key range.
	zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey)
	if err != nil {
		return errors.Errorf("could not find zone config for range %s: %s", repl, err)
	}

	gcKeys, info, err := RunGC(ctx, desc, snap, now, zone.GC,
		func(now hlc.Timestamp, txn *roachpb.Transaction, typ roachpb.PushTxnType) {
			pushTxn(ctx, gcq.store.DB(), now, txn, typ)
		},
		func(intents []roachpb.Intent, poison bool, wait bool) error {
			return repl.store.intentResolver.resolveIntents(ctx, intents, poison, wait)
		})
	if err != nil {
		return err
	}

	log.VEventf(ctx, 1, "completed with stats %+v", info)

	info.updateMetrics(gcq.store.metrics)

	var ba roachpb.BatchRequest
	var gcArgs roachpb.GCRequest
	// TODO(tschottdorf): This is one of these instances in which we want
	// to be more careful that the request ends up on the correct Replica,
	// and we might have to worry about mixing range-local and global keys
	// in a batch which might end up spanning Ranges by the time it executes.
	gcArgs.Key = desc.StartKey.AsRawKey()
	gcArgs.EndKey = desc.EndKey.AsRawKey()
	gcArgs.Keys = gcKeys
	gcArgs.Threshold = info.Threshold
	gcArgs.TxnSpanGCThreshold = info.TxnSpanGCThreshold

	// Technically not needed since we're talking directly to the Range.
	ba.RangeID = desc.RangeID
	ba.Timestamp = now
	ba.Add(&gcArgs)
	if _, pErr := repl.Send(ctx, ba); pErr != nil {
		log.ErrEvent(ctx, pErr.String())
		return pErr.GoError()
	}
	return nil
}
func (rq *replicateQueue) shouldQueue(
	ctx context.Context, now hlc.Timestamp, repl *Replica, sysCfg config.SystemConfig,
) (shouldQ bool, priority float64) {
	if !repl.store.splitQueue.Disabled() && repl.needsSplitBySize() {
		// If the range exceeds the split threshold, let that finish first.
		// Ranges must fit in memory on both sender and receiver nodes while
		// being replicated. This supplements the check provided by
		// acceptsUnsplitRanges, which looks at zone config boundaries rather
		// than data size.
		//
		// This check is ignored if the split queue is disabled, since in that
		// case, the split will never come.
		return
	}

	// Find the zone config for this range.
	desc := repl.Desc()
	zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey)
	if err != nil {
		log.Error(ctx, err)
		return
	}

	action, priority := rq.allocator.ComputeAction(zone, desc)
	if action != AllocatorNoop {
		if log.V(2) {
			log.Infof(ctx, "%s repair needed (%s), enqueuing", repl, action)
		}
		return true, priority
	}

	// See if there is a rebalancing opportunity present.
	leaseStoreID := repl.store.StoreID()
	if lease, _ := repl.getLease(); lease != nil {
		leaseStoreID = lease.Replica.StoreID
	}
	target := rq.allocator.RebalanceTarget(
		zone.Constraints, desc.Replicas, leaseStoreID, desc.RangeID,
	)
	if log.V(2) {
		if target != nil {
			log.Infof(ctx, "%s rebalance target found, enqueuing", repl)
		} else {
			log.Infof(ctx, "%s no rebalance target found, not enqueuing", repl)
		}
	}
	return target != nil, 0
}
// process synchronously invokes admin split for each proposed split key.
func (sq *splitQueue) process(
	ctx context.Context, now hlc.Timestamp, r *Replica, sysCfg config.SystemConfig,
) error {
	// First handle case of splitting due to zone config maps.
	desc := r.Desc()
	splitKeys := sysCfg.ComputeSplitKeys(desc.StartKey, desc.EndKey)
	if len(splitKeys) > 0 {
		log.Infof(ctx, "splitting at keys %v", splitKeys)
		for _, splitKey := range splitKeys {
			if _, pErr := r.adminSplitWithDescriptor(
				ctx,
				roachpb.AdminSplitRequest{
					SplitKey: splitKey.AsRawKey(),
				},
				desc,
			); pErr != nil {
				return errors.Wrapf(pErr.GoError(), "unable to split %s at key %q", r, splitKey)
			}
		}
		return nil
	}

	// Next handle case of splitting due to size.
	zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey)
	if err != nil {
		return err
	}
	size := r.GetMVCCStats().Total()
	if float64(size)/float64(zone.RangeMaxBytes) > 1 {
		log.Infof(ctx, "splitting size=%d max=%d", size, zone.RangeMaxBytes)
		if _, pErr := r.adminSplitWithDescriptor(
			ctx,
			roachpb.AdminSplitRequest{},
			desc,
		); pErr != nil {
			return pErr.GoError()
		}
	}
	return nil
}
// shouldQueue determines whether a range should be queued for
// splitting. This is true if the range is intersected by a zone config
// prefix or if the range's size in bytes exceeds the limit for the zone.
func (sq *splitQueue) shouldQueue(
	ctx context.Context, now hlc.Timestamp, repl *Replica, sysCfg config.SystemConfig,
) (shouldQ bool, priority float64) {
	desc := repl.Desc()
	if len(sysCfg.ComputeSplitKeys(desc.StartKey, desc.EndKey)) > 0 {
		// Set priority to 1 in the event the range is split by zone configs.
		priority = 1
		shouldQ = true
	}

	// Add priority based on the size of range compared to the max
	// size for the zone it's in.
	zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey)
	if err != nil {
		log.Error(ctx, err)
		return
	}

	if ratio := float64(repl.GetMVCCStats().Total()) / float64(zone.RangeMaxBytes); ratio > 1 {
		priority += ratio
		shouldQ = true
	}
	return
}
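// Illustrative aside: a small standalone sketch of how the size-based
// priority above behaves. The byte counts are hypothetical; the point is
// that the priority grows with the ratio of the range's size to the zone's
// maximum, on top of the flat priority of 1 used for zone-config splits.
package main

import "fmt"

func main() {
	// Hypothetical numbers: a 96 MiB range in a zone whose RangeMaxBytes is 64 MiB.
	totalBytes := int64(96 << 20)
	rangeMaxBytes := int64(64 << 20)

	shouldQueue := false
	priority := 0.0

	// Size-based priority: the ratio of actual size to the zone maximum.
	if ratio := float64(totalBytes) / float64(rangeMaxBytes); ratio > 1 {
		priority += ratio // 96/64 = 1.5
		shouldQueue = true
	}
	fmt.Printf("shouldQueue=%t priority=%.2f\n", shouldQueue, priority) // shouldQueue=true priority=1.50
}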
func (rq *replicateQueue) shouldQueue(
	ctx context.Context, now hlc.Timestamp, repl *Replica, sysCfg config.SystemConfig,
) (shouldQ bool, priority float64) {
	if !repl.store.splitQueue.Disabled() && repl.needsSplitBySize() {
		// If the range exceeds the split threshold, let that finish first.
		// Ranges must fit in memory on both sender and receiver nodes while
		// being replicated. This supplements the check provided by
		// acceptsUnsplitRanges, which looks at zone config boundaries rather
		// than data size.
		//
		// This check is ignored if the split queue is disabled, since in that
		// case, the split will never come.
		return
	}

	// Find the zone config for this range.
	desc := repl.Desc()
	zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey)
	if err != nil {
		log.Error(ctx, err)
		return
	}

	action, priority := rq.allocator.ComputeAction(zone, desc)
	if action != AllocatorNoop {
		if log.V(2) {
			log.Infof(ctx, "%s repair needed (%s), enqueuing", repl, action)
		}
		return true, priority
	}

	// If we hold the lease, check to see if we should transfer it.
	var leaseStoreID roachpb.StoreID
	if lease, _ := repl.getLease(); lease != nil && lease.Covers(now) {
		leaseStoreID = lease.Replica.StoreID
		if rq.canTransferLease() &&
			rq.allocator.ShouldTransferLease(zone.Constraints, leaseStoreID, desc.RangeID) {
			if log.V(2) {
				log.Infof(ctx, "%s lease transfer needed, enqueuing", repl)
			}
			return true, 0
		}
	}

	// Check for a rebalancing opportunity. Note that leaseStoreID will be 0 if
	// the range doesn't currently have a lease, which will allow the current
	// replica to be considered a rebalancing source.
	target, err := rq.allocator.RebalanceTarget(
		zone.Constraints, desc.Replicas, leaseStoreID, desc.RangeID,
	)
	if err != nil {
		log.ErrEventf(ctx, "rebalance target failed: %s", err)
		return false, 0
	}
	if log.V(2) {
		if target != nil {
			log.Infof(ctx, "%s rebalance target found, enqueuing", repl)
		} else {
			log.Infof(ctx, "%s no rebalance target found, not enqueuing", repl)
		}
	}
	return target != nil, 0
}
func (rq *replicateQueue) processOneChange(
	ctx context.Context, now hlc.Timestamp, repl *Replica, sysCfg config.SystemConfig,
) error {
	desc := repl.Desc()

	// Find the zone config for this range.
	zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey)
	if err != nil {
		return err
	}

	action, _ := rq.allocator.ComputeAction(zone, desc)

	// Avoid taking action if the range has too many dead replicas to make
	// quorum.
	deadReplicas := rq.allocator.storePool.deadReplicas(desc.RangeID, desc.Replicas)
	quorum := computeQuorum(len(desc.Replicas))
	liveReplicaCount := len(desc.Replicas) - len(deadReplicas)
	if liveReplicaCount < quorum {
		return errors.Errorf("range requires a replication change, but lacks a quorum of live nodes.")
	}

	switch action {
	case AllocatorAdd:
		log.Event(ctx, "adding a new replica")
		newStore, err := rq.allocator.AllocateTarget(
			zone.Constraints,
			desc.Replicas,
			desc.RangeID,
			true,
		)
		if err != nil {
			return err
		}
		newReplica := roachpb.ReplicaDescriptor{
			NodeID:  newStore.Node.NodeID,
			StoreID: newStore.StoreID,
		}

		log.VEventf(ctx, 1, "adding replica to %+v due to under-replication", newReplica)
		if err := rq.addReplica(ctx, repl, newReplica, desc); err != nil {
			return err
		}
	case AllocatorRemove:
		log.Event(ctx, "removing a replica")
		// If the lease holder (our local store) is an overfull store (in terms of
		// leases) allow transferring the lease away.
		leaseHolderStoreID := repl.store.StoreID()
		if rq.allocator.ShouldTransferLease(zone.Constraints, leaseHolderStoreID, desc.RangeID) {
			leaseHolderStoreID = 0
		}
		removeReplica, err := rq.allocator.RemoveTarget(
			zone.Constraints,
			desc.Replicas,
			leaseHolderStoreID,
		)
		if err != nil {
			return err
		}
		if removeReplica.StoreID == repl.store.StoreID() {
			// The local replica was selected as the removal target, but that replica
			// is the leaseholder, so transfer the lease instead. We don't check that
			// the current store has too many leases in this case under the
			// assumption that replica balance is a greater concern. Also note that
			// AllocatorRemove action takes preference over AllocatorNoop
			// (rebalancing) which is where lease transfer would otherwise occur. We
			// need to be able to transfer leases in AllocatorRemove in order to get
			// out of situations where this store is overfull and yet holds all the
			// leases.
			candidates := filterBehindReplicas(repl.RaftStatus(), desc.Replicas)
			target := rq.allocator.TransferLeaseTarget(
				zone.Constraints, candidates, repl.store.StoreID(), desc.RangeID,
				false /* checkTransferLeaseSource */)
			if target != (roachpb.ReplicaDescriptor{}) {
				log.VEventf(ctx, 1, "transferring lease to s%d", target.StoreID)
				if err := repl.AdminTransferLease(target.StoreID); err != nil {
					return errors.Wrapf(err, "%s: unable to transfer lease to s%d", repl, target.StoreID)
				}
				rq.lastLeaseTransfer.Store(timeutil.Now())
				// Do not requeue as we transferred our lease away.
				return nil
			}
		} else {
			log.VEventf(ctx, 1, "removing replica %+v due to over-replication", removeReplica)
			if err := rq.removeReplica(ctx, repl, removeReplica, desc); err != nil {
				return err
			}
		}
	case AllocatorRemoveDead:
		log.Event(ctx, "removing a dead replica")
		if len(deadReplicas) == 0 {
			if log.V(1) {
				log.Warningf(ctx, "Range of replica %s was identified as having dead replicas, but no dead replicas were found.", repl)
			}
			break
		}
		deadReplica := deadReplicas[0]
		log.VEventf(ctx, 1, "removing dead replica %+v from store", deadReplica)
		if err := repl.ChangeReplicas(ctx, roachpb.REMOVE_REPLICA, deadReplica, desc); err != nil {
			return err
		}
	case AllocatorNoop:
		// The Noop case will result if this replica was queued in order to
		// rebalance. Attempt to find a rebalancing target.
		log.Event(ctx, "considering a rebalance")

		if rq.canTransferLease() {
			// We require the lease in order to process replicas, so
			// repl.store.StoreID() corresponds to the lease-holder's store ID.
			candidates := filterBehindReplicas(repl.RaftStatus(), desc.Replicas)
			target := rq.allocator.TransferLeaseTarget(
				zone.Constraints, candidates, repl.store.StoreID(), desc.RangeID,
				true /* checkTransferLeaseSource */)
			if target != (roachpb.ReplicaDescriptor{}) {
				log.VEventf(ctx, 1, "transferring lease to s%d", target.StoreID)
				if err := repl.AdminTransferLease(target.StoreID); err != nil {
					return errors.Wrapf(err, "%s: unable to transfer lease to s%d", repl, target.StoreID)
				}
				rq.lastLeaseTransfer.Store(timeutil.Now())
				// Do not requeue as we transferred our lease away.
				return nil
			}
		}

		rebalanceStore, err := rq.allocator.RebalanceTarget(
			zone.Constraints,
			desc.Replicas,
			repl.store.StoreID(),
			desc.RangeID,
		)
		if err != nil {
			log.ErrEventf(ctx, "rebalance target failed %s", err)
			return nil
		}
		if rebalanceStore == nil {
			log.VEventf(ctx, 1, "no suitable rebalance target")
			// No action was necessary and no rebalance target was found. Return
			// without re-queuing this replica.
			return nil
		}
		rebalanceReplica := roachpb.ReplicaDescriptor{
			NodeID:  rebalanceStore.Node.NodeID,
			StoreID: rebalanceStore.StoreID,
		}
		log.VEventf(ctx, 1, "rebalancing to %+v", rebalanceReplica)
		if err := rq.addReplica(ctx, repl, rebalanceReplica, desc); err != nil {
			return err
		}
	}

	return nil
}
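// Illustrative aside: the quorum gate near the top of processOneChange relies
// on computeQuorum, which is defined elsewhere in the package. The sketch
// below shows a plausible majority-quorum definition and how the live-replica
// check built on it would play out; the helper body is an assumption, not the
// repository's code.
package main

import "fmt"

// computeQuorum returns the number of replicas needed for a majority.
// Sketch only: assumes the usual n/2+1 majority rule.
func computeQuorum(nodes int) int {
	return nodes/2 + 1
}

func main() {
	// Hypothetical range with 3 replicas, 2 of which live on dead stores.
	replicaCount := 3
	deadCount := 2

	quorum := computeQuorum(replicaCount) // 2
	live := replicaCount - deadCount      // 1
	if live < quorum {
		// Mirrors the early return in processOneChange: no replication change
		// is attempted when the surviving replicas cannot form a quorum.
		fmt.Println("skip replication change: not enough live replicas for quorum")
		return
	}
	fmt.Println("safe to proceed with the replication change")
}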
func (rq *replicateQueue) process(
	ctx context.Context, now hlc.Timestamp, repl *Replica, sysCfg config.SystemConfig,
) error {
	desc := repl.Desc()

	// Find the zone config for this range.
	zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey)
	if err != nil {
		return err
	}

	action, _ := rq.allocator.ComputeAction(zone, desc)

	// Avoid taking action if the range has too many dead replicas to make
	// quorum.
	deadReplicas := rq.allocator.storePool.deadReplicas(desc.RangeID, desc.Replicas)
	quorum := computeQuorum(len(desc.Replicas))
	liveReplicaCount := len(desc.Replicas) - len(deadReplicas)
	if liveReplicaCount < quorum {
		return errors.Errorf("range requires a replication change, but lacks a quorum of live nodes.")
	}

	switch action {
	case AllocatorAdd:
		log.Event(ctx, "adding a new replica")
		newStore, err := rq.allocator.AllocateTarget(
			zone.Constraints,
			desc.Replicas,
			desc.RangeID,
			true,
		)
		if err != nil {
			return err
		}
		newReplica := roachpb.ReplicaDescriptor{
			NodeID:  newStore.Node.NodeID,
			StoreID: newStore.StoreID,
		}
		log.VEventf(ctx, 1, "adding replica to %+v due to under-replication", newReplica)
		if err = repl.ChangeReplicas(ctx, roachpb.ADD_REPLICA, newReplica, desc); err != nil {
			return err
		}
	case AllocatorRemove:
		log.Event(ctx, "removing a replica")
		// We require the lease in order to process replicas, so
		// repl.store.StoreID() corresponds to the lease-holder's store ID.
		removeReplica, err := rq.allocator.RemoveTarget(desc.Replicas, repl.store.StoreID())
		if err != nil {
			return err
		}
		log.VEventf(ctx, 1, "removing replica %+v due to over-replication", removeReplica)
		if err = repl.ChangeReplicas(ctx, roachpb.REMOVE_REPLICA, removeReplica, desc); err != nil {
			return err
		}
		// Do not requeue if we removed ourselves.
		if removeReplica.StoreID == repl.store.StoreID() {
			return nil
		}
	case AllocatorRemoveDead:
		log.Event(ctx, "removing a dead replica")
		if len(deadReplicas) == 0 {
			if log.V(1) {
				log.Warningf(ctx, "Range of replica %s was identified as having dead replicas, but no dead replicas were found.", repl)
			}
			break
		}
		deadReplica := deadReplicas[0]
		log.VEventf(ctx, 1, "removing dead replica %+v from store", deadReplica)
		if err = repl.ChangeReplicas(ctx, roachpb.REMOVE_REPLICA, deadReplica, desc); err != nil {
			return err
		}
	case AllocatorNoop:
		log.Event(ctx, "considering a rebalance")
		// The Noop case will result if this replica was queued in order to
		// rebalance. Attempt to find a rebalancing target.
		//
		// We require the lease in order to process replicas, so
		// repl.store.StoreID() corresponds to the lease-holder's store ID.
		rebalanceStore := rq.allocator.RebalanceTarget(
			zone.Constraints,
			desc.Replicas,
			repl.store.StoreID(),
			desc.RangeID,
		)
		if rebalanceStore == nil {
			log.VEventf(ctx, 1, "no suitable rebalance target")
			// No action was necessary and no rebalance target was found. Return
			// without re-queuing this replica.
			return nil
		}
		rebalanceReplica := roachpb.ReplicaDescriptor{
			NodeID:  rebalanceStore.Node.NodeID,
			StoreID: rebalanceStore.StoreID,
		}
		log.VEventf(ctx, 1, "rebalancing to %+v", rebalanceReplica)
		if err = repl.ChangeReplicas(ctx, roachpb.ADD_REPLICA, rebalanceReplica, desc); err != nil {
			return err
		}
	}

	// Enqueue this replica again to see if there are more changes to be made.
	rq.MaybeAdd(repl, rq.clock.Now())
	return nil
}