// process synchronously invokes admin split for each proposed split key.
func (sq *splitQueue) process(
	ctx context.Context, now hlc.Timestamp, r *Replica, sysCfg config.SystemConfig,
) error {
	// First handle case of splitting due to zone config maps.
	desc := r.Desc()
	splitKeys := sysCfg.ComputeSplitKeys(desc.StartKey, desc.EndKey)
	if len(splitKeys) > 0 {
		log.Infof(ctx, "splitting at keys %v", splitKeys)
		for _, splitKey := range splitKeys {
			if err := sq.db.AdminSplit(ctx, splitKey.AsRawKey()); err != nil {
				return errors.Errorf("unable to split %s at key %q: %s", r, splitKey, err)
			}
		}
		return nil
	}

	// Next handle case of splitting due to size.
	zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey)
	if err != nil {
		return err
	}
	size := r.GetMVCCStats().Total()
	// FIXME: why is this implementation not the same as the one above?
	if float64(size)/float64(zone.RangeMaxBytes) > 1 {
		log.Infof(ctx, "splitting size=%d max=%d", size, zone.RangeMaxBytes)
		if _, pErr := client.SendWrappedWith(ctx, r, roachpb.Header{
			Timestamp: now,
		}, &roachpb.AdminSplitRequest{
			Span: roachpb.Span{Key: desc.StartKey.AsRawKey()},
		}); pErr != nil {
			return pErr.GoError()
		}
	}
	return nil
}
// GetZoneConfig returns the zone config for the object with 'id'.
func GetZoneConfig(cfg config.SystemConfig, id uint32) (config.ZoneConfig, bool, error) {
	// Look in the zones table.
	if zoneVal := cfg.GetValue(sqlbase.MakeZoneKey(sqlbase.ID(id))); zoneVal != nil {
		// We're done.
		zone, err := config.MigrateZoneConfig(zoneVal)
		return zone, true, err
	}

	// No zone config for this ID. We need to figure out if it's a database
	// or table. Look up its descriptor.
	if descVal := cfg.GetValue(sqlbase.MakeDescMetadataKey(sqlbase.ID(id))); descVal != nil {
		// Determine whether this is a database or table.
		var desc sqlbase.Descriptor
		if err := descVal.GetProto(&desc); err != nil {
			return config.ZoneConfig{}, false, err
		}
		if tableDesc := desc.GetTable(); tableDesc != nil {
			// This is a table descriptor. Look up its parent database's zone config.
			return GetZoneConfig(cfg, uint32(tableDesc.ParentID))
		}
	}

	// Retrieve the default zone config, but only as long as that wasn't the ID
	// we were trying to retrieve (avoid infinite recursion).
	if id != keys.RootNamespaceID {
		return GetZoneConfig(cfg, keys.RootNamespaceID)
	}

	// No descriptor or not a table.
	return config.ZoneConfig{}, false, nil
}
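A minimal, self-contained sketch of the fallback chain GetZoneConfig implements (the object's own zone, then its parent database's zone, then the root default). The IDs, the parentOf map, and the zoneConfig type below are invented for illustration; they are not the CockroachDB APIs.

package main

import "fmt"

type zoneConfig struct{ rangeMaxBytes int64 }

func main() {
	const rootID = uint32(0)
	zones := map[uint32]zoneConfig{
		rootID: {rangeMaxBytes: 64 << 20},  // cluster-wide default
		50:     {rangeMaxBytes: 256 << 20}, // explicit config on database 50
	}
	parentOf := map[uint32]uint32{51: 50} // table 51 belongs to database 50

	// Resolve in the same order as GetZoneConfig: own zone, then parent,
	// then the root default.
	var lookup func(id uint32) zoneConfig
	lookup = func(id uint32) zoneConfig {
		if z, ok := zones[id]; ok {
			return z
		}
		if parent, ok := parentOf[id]; ok {
			return lookup(parent)
		}
		return zones[rootID]
	}

	fmt.Println(lookup(51).rangeMaxBytes) // 268435456, inherited from database 50
}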
// shouldQueue determines whether a replica should be queued for garbage
// collection, and if so, at what priority. Returns true for shouldQ
// in the event that the cumulative ages of GC'able bytes or extant
// intents exceed thresholds.
func (gcq *gcQueue) shouldQueue(
	ctx context.Context, now hlc.Timestamp, repl *Replica, sysCfg config.SystemConfig,
) (shouldQ bool, priority float64) {
	desc := repl.Desc()
	zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey)
	if err != nil {
		log.Errorf(ctx, "could not find zone config for range %s: %s", repl, err)
		return
	}

	ms := repl.GetMVCCStats()
	// GC score is the total GC'able bytes age normalized by 1 MB * the replica's TTL in seconds.
	gcScore := float64(ms.GCByteAge(now.WallTime)) / float64(zone.GC.TTLSeconds) / float64(gcByteCountNormalization)

	// Intent score. This computes the average age of outstanding intents
	// and normalizes.
	intentScore := ms.AvgIntentAge(now.WallTime) / float64(intentAgeNormalization.Nanoseconds()/1E9)

	// Compute priority.
	if gcScore >= considerThreshold {
		priority += gcScore
	}
	if intentScore >= considerThreshold {
		priority += intentScore
	}
	shouldQ = priority > 0
	return
}
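A hedged worked example of the gcScore expression above, using made-up stats and assuming the 1 MiB normalization mentioned in the comment; the real gcByteCountNormalization and considerThreshold constants live in the gcQueue implementation.

package main

import "fmt"

func main() {
	const gcByteCountNormalization = 1 << 20 // assumed: 1 MiB, per the comment above
	gcByteAge := float64(48 << 30)           // hypothetical ms.GCByteAge(now.WallTime)
	ttlSeconds := float64(24 * 60 * 60)      // zone.GC.TTLSeconds for a one-day TTL

	// Same shape as the gcScore expression above.
	gcScore := gcByteAge / ttlSeconds / gcByteCountNormalization
	fmt.Printf("gcScore = %.2f\n", gcScore) // ~0.57; only queued if it clears the threshold
}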
// GetTableDesc returns the table descriptor for the table with 'id'.
// Returns nil if the descriptor is not present, or is present but is not a
// table.
func GetTableDesc(cfg config.SystemConfig, id sqlbase.ID) (*sqlbase.TableDescriptor, error) {
	if descVal := cfg.GetValue(sqlbase.MakeDescMetadataKey(id)); descVal != nil {
		desc := &sqlbase.Descriptor{}
		if err := descVal.GetProto(desc); err != nil {
			return nil, err
		}
		return desc.GetTable(), nil
	}
	return nil, nil
}
func (bq *baseQueue) requiresSplit(cfg config.SystemConfig, repl *Replica) bool {
	if bq.acceptsUnsplitRanges {
		return false
	}
	// If there's no store (as is the case in some narrow unit tests),
	// the "required" split will never come. In that case, pretend we
	// don't require the split.
	if store := repl.store; store == nil {
		return false
	}
	desc := repl.Desc()
	return cfg.NeedsSplit(desc.StartKey, desc.EndKey)
}
func isDeleted(tableID sqlbase.ID, cfg config.SystemConfig) bool {
	descKey := sqlbase.MakeDescMetadataKey(tableID)
	val := cfg.GetValue(descKey)
	if val == nil {
		return false
	}
	var descriptor sqlbase.Descriptor
	if err := val.GetProto(&descriptor); err != nil {
		panic("unable to unmarshal table descriptor")
	}
	table := descriptor.GetTable()
	return table.Dropped()
}
// process iterates through all keys in a replica's range, calling the garbage
// collector for each key and associated set of values. GC'd keys are batched
// into GC calls. Extant intents are resolved if intents are older than
// intentAgeThreshold. The transaction and abort cache records are also
// scanned and old entries evicted. During normal operation, both of these
// records are cleaned up when their respective transaction finishes, so the
// amount of work done here is expected to be small.
//
// Some care needs to be taken to avoid cyclic recreation of entries during GC:
// * a Push initiated due to an intent may recreate a transaction entry
// * resolving an intent may write a new abort cache entry
// * obtaining the transaction for an abort cache entry requires a Push
//
// The following order is taken below:
// 1) collect all intents with sufficiently old txn record
// 2) collect these intents' transactions
// 3) scan the transaction table, collecting abandoned or completed txns
// 4) push all of these transactions (possibly recreating entries)
// 5) resolve all intents (unless the txn is still PENDING), which will recreate
//    abort cache entries (but with the txn timestamp; i.e. likely gc'able)
// 6) scan the abort cache table for old entries
// 7) push these transactions (again, recreating txn entries).
// 8) send a GCRequest.
func (gcq *gcQueue) process(
	ctx context.Context, now hlc.Timestamp, repl *Replica, sysCfg config.SystemConfig,
) error {
	snap := repl.store.Engine().NewSnapshot()
	desc := repl.Desc()
	defer snap.Close()

	// Look up the GC policy for the zone containing this key range.
	zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey)
	if err != nil {
		return errors.Errorf("could not find zone config for range %s: %s", repl, err)
	}

	gcKeys, info, err := RunGC(ctx, desc, snap, now, zone.GC,
		func(now hlc.Timestamp, txn *roachpb.Transaction, typ roachpb.PushTxnType) {
			pushTxn(ctx, gcq.store.DB(), now, txn, typ)
		},
		func(intents []roachpb.Intent, poison bool, wait bool) error {
			return repl.store.intentResolver.resolveIntents(ctx, intents, poison, wait)
		})
	if err != nil {
		return err
	}

	log.VEventf(ctx, 1, "completed with stats %+v", info)
	info.updateMetrics(gcq.store.metrics)

	var ba roachpb.BatchRequest
	var gcArgs roachpb.GCRequest
	// TODO(tschottdorf): This is one of these instances in which we want
	// to be more careful that the request ends up on the correct Replica,
	// and we might have to worry about mixing range-local and global keys
	// in a batch which might end up spanning Ranges by the time it executes.
	gcArgs.Key = desc.StartKey.AsRawKey()
	gcArgs.EndKey = desc.EndKey.AsRawKey()
	gcArgs.Keys = gcKeys
	gcArgs.Threshold = info.Threshold
	gcArgs.TxnSpanGCThreshold = info.TxnSpanGCThreshold

	// Technically not needed since we're talking directly to the Range.
	ba.RangeID = desc.RangeID
	ba.Timestamp = now
	ba.Add(&gcArgs)
	if _, pErr := repl.Send(ctx, ba); pErr != nil {
		log.ErrEvent(ctx, pErr.String())
		return pErr.GoError()
	}
	return nil
}
func expectDescriptorID(systemConfig config.SystemConfig, idKey roachpb.Key, id sqlbase.ID) error {
	idValue := systemConfig.GetValue(idKey)
	if idValue == nil {
		return errStaleMetadata
	}
	cachedID, err := idValue.GetInt()
	if err != nil {
		return err
	}
	if sqlbase.ID(cachedID) != id {
		return errStaleMetadata
	}
	return nil
}
func (rq *replicateQueue) shouldQueue(
	ctx context.Context, now hlc.Timestamp, repl *Replica, sysCfg config.SystemConfig,
) (shouldQ bool, priority float64) {
	if !repl.store.splitQueue.Disabled() && repl.needsSplitBySize() {
		// If the range exceeds the split threshold, let that finish first.
		// Ranges must fit in memory on both sender and receiver nodes while
		// being replicated. This supplements the check provided by
		// acceptsUnsplitRanges, which looks at zone config boundaries rather
		// than data size.
		//
		// This check is ignored if the split queue is disabled, since in that
		// case, the split will never come.
		return
	}

	// Find the zone config for this range.
	desc := repl.Desc()
	zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey)
	if err != nil {
		log.Error(ctx, err)
		return
	}

	action, priority := rq.allocator.ComputeAction(zone, desc)
	if action != AllocatorNoop {
		if log.V(2) {
			log.Infof(ctx, "%s repair needed (%s), enqueuing", repl, action)
		}
		return true, priority
	}

	// See if there is a rebalancing opportunity present.
	leaseStoreID := repl.store.StoreID()
	if lease, _ := repl.getLease(); lease != nil {
		leaseStoreID = lease.Replica.StoreID
	}
	target := rq.allocator.RebalanceTarget(
		zone.Constraints, desc.Replicas, leaseStoreID, desc.RangeID,
	)
	if log.V(2) {
		if target != nil {
			log.Infof(ctx, "%s rebalance target found, enqueuing", repl)
		} else {
			log.Infof(ctx, "%s no rebalance target found, not enqueuing", repl)
		}
	}
	return target != nil, 0
}
func expectDescriptor(
	systemConfig config.SystemConfig, idKey roachpb.Key, desc *sqlbase.Descriptor,
) error {
	descValue := systemConfig.GetValue(idKey)
	if descValue == nil {
		return errStaleMetadata
	}
	var cachedDesc sqlbase.Descriptor
	if err := descValue.GetProto(&cachedDesc); err != nil {
		return err
	}
	if !proto.Equal(&cachedDesc, desc) {
		return errStaleMetadata
	}
	return nil
}
// isRenamed tests whether a descriptor has been updated by gossip to the
// specified name and version.
func isRenamed(
	tableID sqlbase.ID,
	expectedName string,
	expectedVersion sqlbase.DescriptorVersion,
	cfg config.SystemConfig,
) bool {
	descKey := sqlbase.MakeDescMetadataKey(tableID)
	val := cfg.GetValue(descKey)
	if val == nil {
		return false
	}
	var descriptor sqlbase.Descriptor
	if err := val.GetProto(&descriptor); err != nil {
		panic("unable to unmarshal table descriptor")
	}
	table := descriptor.GetTable()
	return table.Name == expectedName && table.Version == expectedVersion
}
func TestGet(t *testing.T) {
	defer leaktest.AfterTest(t)()

	emptyKeys := []roachpb.KeyValue{}
	someKeys := []roachpb.KeyValue{
		plainKV("a", "vala"),
		plainKV("c", "valc"),
		plainKV("d", "vald"),
	}

	aVal := roachpb.MakeValueFromString("vala")
	cVal := roachpb.MakeValueFromString("valc")
	dVal := roachpb.MakeValueFromString("vald")

	testCases := []struct {
		values []roachpb.KeyValue
		key    string
		value  *roachpb.Value
	}{
		{emptyKeys, "a", nil},
		{emptyKeys, "b", nil},
		{emptyKeys, "c", nil},
		{emptyKeys, "d", nil},
		{emptyKeys, "e", nil},
		{someKeys, "", nil},
		{someKeys, "b", nil},
		{someKeys, "e", nil},
		{someKeys, "a0", nil},
		{someKeys, "a", &aVal},
		{someKeys, "c", &cVal},
		{someKeys, "d", &dVal},
	}

	cfg := config.SystemConfig{}
	for tcNum, tc := range testCases {
		cfg.Values = tc.values
		if val := cfg.GetValue([]byte(tc.key)); !proto.Equal(val, tc.value) {
			t.Errorf("#%d: expected=%s, found=%s", tcNum, tc.value, val)
		}
	}
}
func waitForConfigChange(t *testing.T, s *server.TestServer) config.SystemConfig {
	var foundDesc sqlbase.Descriptor
	var cfg config.SystemConfig
	testutils.SucceedsSoon(t, func() error {
		var ok bool
		if cfg, ok = s.Gossip().GetSystemConfig(); ok {
			if val := cfg.GetValue(configDescKey); val != nil {
				if err := val.GetProto(&foundDesc); err != nil {
					t.Fatal(err)
				}
				if id := foundDesc.GetDatabase().GetID(); id != configID {
					return errors.Errorf("expected database id %d; got %d", configID, id)
				}
				return nil
			}
		}
		return errors.Errorf("got nil system config")
	})
	return cfg
}
// process synchronously invokes admin split for each proposed split key.
func (sq *splitQueue) process(
	ctx context.Context, now hlc.Timestamp, r *Replica, sysCfg config.SystemConfig,
) error {
	// First handle case of splitting due to zone config maps.
	desc := r.Desc()
	splitKeys := sysCfg.ComputeSplitKeys(desc.StartKey, desc.EndKey)
	if len(splitKeys) > 0 {
		log.Infof(ctx, "splitting at keys %v", splitKeys)
		for _, splitKey := range splitKeys {
			if _, pErr := r.adminSplitWithDescriptor(
				ctx,
				roachpb.AdminSplitRequest{
					SplitKey: splitKey.AsRawKey(),
				},
				desc,
			); pErr != nil {
				return errors.Wrapf(pErr.GoError(), "unable to split %s at key %q", r, splitKey)
			}
		}
		return nil
	}

	// Next handle case of splitting due to size.
	zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey)
	if err != nil {
		return err
	}
	size := r.GetMVCCStats().Total()
	if float64(size)/float64(zone.RangeMaxBytes) > 1 {
		log.Infof(ctx, "splitting size=%d max=%d", size, zone.RangeMaxBytes)
		if _, pErr := r.adminSplitWithDescriptor(
			ctx,
			roachpb.AdminSplitRequest{},
			desc,
		); pErr != nil {
			return pErr.GoError()
		}
	}
	return nil
}
// shouldQueue determines whether a range should be queued for
// splitting. This is true if the range is intersected by a zone config
// prefix or if the range's size in bytes exceeds the limit for the zone.
func (sq *splitQueue) shouldQueue(
	ctx context.Context, now hlc.Timestamp, repl *Replica, sysCfg config.SystemConfig,
) (shouldQ bool, priority float64) {
	desc := repl.Desc()
	if len(sysCfg.ComputeSplitKeys(desc.StartKey, desc.EndKey)) > 0 {
		// Set priority to 1 in the event the range is split by zone configs.
		priority = 1
		shouldQ = true
	}

	// Add priority based on the size of range compared to the max
	// size for the zone it's in.
	zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey)
	if err != nil {
		log.Error(ctx, err)
		return
	}
	if ratio := float64(repl.GetMVCCStats().Total()) / float64(zone.RangeMaxBytes); ratio > 1 {
		priority += ratio
		shouldQ = true
	}
	return
}
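A small standalone sketch of the size-based priority above, with made-up numbers: a replica holding 96 MiB in a zone whose RangeMaxBytes is 64 MiB yields a ratio of 1.5, so it queues for splitting with priority 1.5 (plus 1 if a zone-config split is also pending).

package main

import "fmt"

func main() {
	// Assumed example values; the real inputs are repl.GetMVCCStats().Total()
	// and zone.RangeMaxBytes.
	const rangeMaxBytes = 64 << 20 // 64 MiB
	size := int64(96 << 20)        // 96 MiB of range data

	if ratio := float64(size) / float64(rangeMaxBytes); ratio > 1 {
		fmt.Printf("queue for split, priority += %.1f\n", ratio) // 1.5
	}
}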
func TestGetLargestID(t *testing.T) {
	defer leaktest.AfterTest(t)()

	testCases := []struct {
		values  []roachpb.KeyValue
		largest uint32
		maxID   uint32
		errStr  string
	}{
		// No data.
		{nil, 0, 0, "descriptor table not found"},
		// Some data, but not from the system span.
		{[]roachpb.KeyValue{plainKV("a", "b")}, 0, 0, "descriptor table not found"},
		// Some real data, but no descriptors.
		{[]roachpb.KeyValue{
			sqlKV(keys.NamespaceTableID, 1, 1),
			sqlKV(keys.NamespaceTableID, 1, 2),
			sqlKV(keys.UsersTableID, 1, 3),
		}, 0, 0, "descriptor table not found"},
		// Single correct descriptor entry.
		{[]roachpb.KeyValue{sqlKV(keys.DescriptorTableID, 1, 1)}, 1, 0, ""},
		// Surrounded by other data.
		{[]roachpb.KeyValue{
			sqlKV(keys.NamespaceTableID, 1, 20),
			sqlKV(keys.NamespaceTableID, 1, 30),
			sqlKV(keys.DescriptorTableID, 1, 8),
			sqlKV(keys.ZonesTableID, 1, 40),
		}, 8, 0, ""},
		// Descriptors with holes. Index ID does not matter.
		{[]roachpb.KeyValue{
			sqlKV(keys.DescriptorTableID, 1, 1),
			sqlKV(keys.DescriptorTableID, 2, 5),
			sqlKV(keys.DescriptorTableID, 3, 8),
			sqlKV(keys.DescriptorTableID, 4, 12),
		}, 12, 0, ""},
		// Real SQL layout.
		{sqlbase.MakeMetadataSchema().GetInitialValues(), keys.MaxSystemConfigDescID + 4, 0, ""},
		// Test non-zero max.
		{[]roachpb.KeyValue{
			sqlKV(keys.DescriptorTableID, 1, 1),
			sqlKV(keys.DescriptorTableID, 2, 5),
			sqlKV(keys.DescriptorTableID, 3, 8),
			sqlKV(keys.DescriptorTableID, 4, 12),
		}, 8, 8, ""},
		// Test non-zero max.
		{[]roachpb.KeyValue{
			sqlKV(keys.DescriptorTableID, 1, 1),
			sqlKV(keys.DescriptorTableID, 2, 5),
			sqlKV(keys.DescriptorTableID, 3, 8),
			sqlKV(keys.DescriptorTableID, 4, 12),
		}, 5, 7, ""},
	}

	cfg := config.SystemConfig{}
	for tcNum, tc := range testCases {
		cfg.Values = tc.values
		ret, err := cfg.GetLargestObjectID(tc.maxID)
		if !testutils.IsError(err, tc.errStr) {
			t.Errorf("#%d: expected err=%q, got %v", tcNum, tc.errStr, err)
			continue
		}
		if err != nil {
			continue
		}
		if ret != tc.largest {
			t.Errorf("#%d: expected largest=%d, got %d", tcNum, tc.largest, ret)
		}
	}
}
func (rq *replicateQueue) process(
	ctx context.Context, now hlc.Timestamp, repl *Replica, sysCfg config.SystemConfig,
) error {
	desc := repl.Desc()

	// Find the zone config for this range.
	zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey)
	if err != nil {
		return err
	}

	action, _ := rq.allocator.ComputeAction(zone, desc)

	// Avoid taking action if the range has too many dead replicas to make
	// quorum.
	deadReplicas := rq.allocator.storePool.deadReplicas(desc.RangeID, desc.Replicas)
	quorum := computeQuorum(len(desc.Replicas))
	liveReplicaCount := len(desc.Replicas) - len(deadReplicas)
	if liveReplicaCount < quorum {
		return errors.Errorf("range requires a replication change, but lacks a quorum of live nodes.")
	}

	switch action {
	case AllocatorAdd:
		log.Event(ctx, "adding a new replica")
		newStore, err := rq.allocator.AllocateTarget(
			zone.Constraints,
			desc.Replicas,
			desc.RangeID,
			true,
		)
		if err != nil {
			return err
		}
		newReplica := roachpb.ReplicaDescriptor{
			NodeID:  newStore.Node.NodeID,
			StoreID: newStore.StoreID,
		}

		log.VEventf(ctx, 1, "adding replica to %+v due to under-replication", newReplica)
		if err = repl.ChangeReplicas(ctx, roachpb.ADD_REPLICA, newReplica, desc); err != nil {
			return err
		}
	case AllocatorRemove:
		log.Event(ctx, "removing a replica")
		// We require the lease in order to process replicas, so
		// repl.store.StoreID() corresponds to the lease-holder's store ID.
		removeReplica, err := rq.allocator.RemoveTarget(desc.Replicas, repl.store.StoreID())
		if err != nil {
			return err
		}
		log.VEventf(ctx, 1, "removing replica %+v due to over-replication", removeReplica)
		if err = repl.ChangeReplicas(ctx, roachpb.REMOVE_REPLICA, removeReplica, desc); err != nil {
			return err
		}
		// Do not requeue if we removed ourselves.
		if removeReplica.StoreID == repl.store.StoreID() {
			return nil
		}
	case AllocatorRemoveDead:
		log.Event(ctx, "removing a dead replica")
		if len(deadReplicas) == 0 {
			if log.V(1) {
				log.Warningf(ctx, "Range of replica %s was identified as having dead replicas, but no dead replicas were found.", repl)
			}
			break
		}
		deadReplica := deadReplicas[0]
		log.VEventf(ctx, 1, "removing dead replica %+v from store", deadReplica)
		if err = repl.ChangeReplicas(ctx, roachpb.REMOVE_REPLICA, deadReplica, desc); err != nil {
			return err
		}
	case AllocatorNoop:
		log.Event(ctx, "considering a rebalance")
		// The Noop case will result if this replica was queued in order to
		// rebalance. Attempt to find a rebalancing target.
		//
		// We require the lease in order to process replicas, so
		// repl.store.StoreID() corresponds to the lease-holder's store ID.
		rebalanceStore := rq.allocator.RebalanceTarget(
			zone.Constraints,
			desc.Replicas,
			repl.store.StoreID(),
			desc.RangeID,
		)
		if rebalanceStore == nil {
			log.VEventf(ctx, 1, "no suitable rebalance target")
			// No action was necessary and no rebalance target was found. Return
			// without re-queuing this replica.
			return nil
		}
		rebalanceReplica := roachpb.ReplicaDescriptor{
			NodeID:  rebalanceStore.Node.NodeID,
			StoreID: rebalanceStore.StoreID,
		}
		log.VEventf(ctx, 1, "rebalancing to %+v", rebalanceReplica)
		if err = repl.ChangeReplicas(ctx, roachpb.ADD_REPLICA, rebalanceReplica, desc); err != nil {
			return err
		}
	}

	// Enqueue this replica again to see if there are more changes to be made.
	rq.MaybeAdd(repl, rq.clock.Now())
	return nil
}
func expectDeleted(systemConfig config.SystemConfig, key roachpb.Key) error {
	if systemConfig.GetValue(key) != nil {
		return errStaleMetadata
	}
	return nil
}
func (rq *replicateQueue) processOneChange(
	ctx context.Context, now hlc.Timestamp, repl *Replica, sysCfg config.SystemConfig,
) error {
	desc := repl.Desc()

	// Find the zone config for this range.
	zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey)
	if err != nil {
		return err
	}

	action, _ := rq.allocator.ComputeAction(zone, desc)

	// Avoid taking action if the range has too many dead replicas to make
	// quorum.
	deadReplicas := rq.allocator.storePool.deadReplicas(desc.RangeID, desc.Replicas)
	quorum := computeQuorum(len(desc.Replicas))
	liveReplicaCount := len(desc.Replicas) - len(deadReplicas)
	if liveReplicaCount < quorum {
		return errors.Errorf("range requires a replication change, but lacks a quorum of live nodes.")
	}

	switch action {
	case AllocatorAdd:
		log.Event(ctx, "adding a new replica")
		newStore, err := rq.allocator.AllocateTarget(
			zone.Constraints,
			desc.Replicas,
			desc.RangeID,
			true,
		)
		if err != nil {
			return err
		}
		newReplica := roachpb.ReplicaDescriptor{
			NodeID:  newStore.Node.NodeID,
			StoreID: newStore.StoreID,
		}

		log.VEventf(ctx, 1, "adding replica to %+v due to under-replication", newReplica)
		if err := rq.addReplica(ctx, repl, newReplica, desc); err != nil {
			return err
		}
	case AllocatorRemove:
		log.Event(ctx, "removing a replica")
		// If the lease holder (our local store) is an overfull store (in terms of
		// leases) allow transferring the lease away.
		leaseHolderStoreID := repl.store.StoreID()
		if rq.allocator.ShouldTransferLease(zone.Constraints, leaseHolderStoreID, desc.RangeID) {
			leaseHolderStoreID = 0
		}
		removeReplica, err := rq.allocator.RemoveTarget(
			zone.Constraints,
			desc.Replicas,
			leaseHolderStoreID,
		)
		if err != nil {
			return err
		}
		if removeReplica.StoreID == repl.store.StoreID() {
			// The local replica was selected as the removal target, but that replica
			// is the leaseholder, so transfer the lease instead. We don't check that
			// the current store has too many leases in this case under the
			// assumption that replica balance is a greater concern. Also note that
			// AllocatorRemove action takes preference over AllocatorNoop
			// (rebalancing) which is where lease transfer would otherwise occur. We
			// need to be able to transfer leases in AllocatorRemove in order to get
			// out of situations where this store is overfull and yet holds all the
			// leases.
			candidates := filterBehindReplicas(repl.RaftStatus(), desc.Replicas)
			target := rq.allocator.TransferLeaseTarget(
				zone.Constraints, candidates, repl.store.StoreID(), desc.RangeID,
				false /* checkTransferLeaseSource */)
			if target != (roachpb.ReplicaDescriptor{}) {
				log.VEventf(ctx, 1, "transferring lease to s%d", target.StoreID)
				if err := repl.AdminTransferLease(target.StoreID); err != nil {
					return errors.Wrapf(err, "%s: unable to transfer lease to s%d", repl, target.StoreID)
				}
				rq.lastLeaseTransfer.Store(timeutil.Now())
				// Do not requeue as we transferred our lease away.
				return nil
			}
		} else {
			log.VEventf(ctx, 1, "removing replica %+v due to over-replication", removeReplica)
			if err := rq.removeReplica(ctx, repl, removeReplica, desc); err != nil {
				return err
			}
		}
	case AllocatorRemoveDead:
		log.Event(ctx, "removing a dead replica")
		if len(deadReplicas) == 0 {
			if log.V(1) {
				log.Warningf(ctx, "Range of replica %s was identified as having dead replicas, but no dead replicas were found.", repl)
			}
			break
		}
		deadReplica := deadReplicas[0]
		log.VEventf(ctx, 1, "removing dead replica %+v from store", deadReplica)
		if err := repl.ChangeReplicas(ctx, roachpb.REMOVE_REPLICA, deadReplica, desc); err != nil {
			return err
		}
	case AllocatorNoop:
		// The Noop case will result if this replica was queued in order to
		// rebalance. Attempt to find a rebalancing target.
		log.Event(ctx, "considering a rebalance")

		if rq.canTransferLease() {
			// We require the lease in order to process replicas, so
			// repl.store.StoreID() corresponds to the lease-holder's store ID.
			candidates := filterBehindReplicas(repl.RaftStatus(), desc.Replicas)
			target := rq.allocator.TransferLeaseTarget(
				zone.Constraints, candidates, repl.store.StoreID(), desc.RangeID,
				true /* checkTransferLeaseSource */)
			if target != (roachpb.ReplicaDescriptor{}) {
				log.VEventf(ctx, 1, "transferring lease to s%d", target.StoreID)
				if err := repl.AdminTransferLease(target.StoreID); err != nil {
					return errors.Wrapf(err, "%s: unable to transfer lease to s%d", repl, target.StoreID)
				}
				rq.lastLeaseTransfer.Store(timeutil.Now())
				// Do not requeue as we transferred our lease away.
				return nil
			}
		}

		rebalanceStore, err := rq.allocator.RebalanceTarget(
			zone.Constraints,
			desc.Replicas,
			repl.store.StoreID(),
			desc.RangeID,
		)
		if err != nil {
			log.ErrEventf(ctx, "rebalance target failed %s", err)
			return nil
		}
		if rebalanceStore == nil {
			log.VEventf(ctx, 1, "no suitable rebalance target")
			// No action was necessary and no rebalance target was found. Return
			// without re-queuing this replica.
			return nil
		}
		rebalanceReplica := roachpb.ReplicaDescriptor{
			NodeID:  rebalanceStore.Node.NodeID,
			StoreID: rebalanceStore.StoreID,
		}
		log.VEventf(ctx, 1, "rebalancing to %+v", rebalanceReplica)
		if err := rq.addReplica(ctx, repl, rebalanceReplica, desc); err != nil {
			return err
		}
	}

	return nil
}
func (rq *replicateQueue) shouldQueue(
	ctx context.Context, now hlc.Timestamp, repl *Replica, sysCfg config.SystemConfig,
) (shouldQ bool, priority float64) {
	if !repl.store.splitQueue.Disabled() && repl.needsSplitBySize() {
		// If the range exceeds the split threshold, let that finish first.
		// Ranges must fit in memory on both sender and receiver nodes while
		// being replicated. This supplements the check provided by
		// acceptsUnsplitRanges, which looks at zone config boundaries rather
		// than data size.
		//
		// This check is ignored if the split queue is disabled, since in that
		// case, the split will never come.
		return
	}

	// Find the zone config for this range.
	desc := repl.Desc()
	zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey)
	if err != nil {
		log.Error(ctx, err)
		return
	}

	action, priority := rq.allocator.ComputeAction(zone, desc)
	if action != AllocatorNoop {
		if log.V(2) {
			log.Infof(ctx, "%s repair needed (%s), enqueuing", repl, action)
		}
		return true, priority
	}

	// If we hold the lease, check to see if we should transfer it.
	var leaseStoreID roachpb.StoreID
	if lease, _ := repl.getLease(); lease != nil && lease.Covers(now) {
		leaseStoreID = lease.Replica.StoreID
		if rq.canTransferLease() &&
			rq.allocator.ShouldTransferLease(zone.Constraints, leaseStoreID, desc.RangeID) {
			if log.V(2) {
				log.Infof(ctx, "%s lease transfer needed, enqueuing", repl)
			}
			return true, 0
		}
	}

	// Check for a rebalancing opportunity. Note that leaseStoreID will be 0 if
	// the range doesn't currently have a lease which will allow the current
	// replica to be considered a rebalancing source.
	target, err := rq.allocator.RebalanceTarget(
		zone.Constraints, desc.Replicas, leaseStoreID, desc.RangeID,
	)
	if err != nil {
		log.ErrEventf(ctx, "rebalance target failed: %s", err)
		return false, 0
	}
	if log.V(2) {
		if target != nil {
			log.Infof(ctx, "%s rebalance target found, enqueuing", repl)
		} else {
			log.Infof(ctx, "%s no rebalance target found, not enqueuing", repl)
		}
	}
	return target != nil, 0
}
func TestComputeSplits(t *testing.T) {
	defer leaktest.AfterTest(t)()

	const (
		start         = keys.MaxReservedDescID + 1
		reservedStart = keys.MaxSystemConfigDescID + 1
	)

	schema := sqlbase.MakeMetadataSchema()
	// Real system tables only.
	baseSql := schema.GetInitialValues()
	// Real system tables plus some user stuff.
	allSql := append(schema.GetInitialValues(),
		descriptor(start), descriptor(start+1), descriptor(start+5))
	sort.Sort(roachpb.KeyValueByKey(allSql))

	allUserSplits := []uint32{start, start + 1, start + 2, start + 3, start + 4, start + 5}
	var allReservedSplits []uint32
	for i := 0; i < schema.SystemDescriptorCount()-schema.SystemConfigDescriptorCount(); i++ {
		allReservedSplits = append(allReservedSplits, reservedStart+uint32(i))
	}
	allSplits := append(allReservedSplits, allUserSplits...)

	testCases := []struct {
		values     []roachpb.KeyValue
		start, end roachpb.RKey
		// Use ints in the testcase definitions, more readable.
		splits []uint32
	}{
		// No data.
		{nil, roachpb.RKeyMin, roachpb.RKeyMax, nil},
		{nil, keys.MakeTablePrefix(start), roachpb.RKeyMax, nil},
		{nil, keys.MakeTablePrefix(start), keys.MakeTablePrefix(start + 10), nil},
		{nil, roachpb.RKeyMin, keys.MakeTablePrefix(start + 10), nil},

		// Reserved descriptors.
		{baseSql, roachpb.RKeyMin, roachpb.RKeyMax, allReservedSplits},
		{baseSql, keys.MakeTablePrefix(start), roachpb.RKeyMax, nil},
		{baseSql, keys.MakeTablePrefix(start), keys.MakeTablePrefix(start + 10), nil},
		{baseSql, roachpb.RKeyMin, keys.MakeTablePrefix(start + 10), allReservedSplits},
		{baseSql, keys.MakeTablePrefix(reservedStart), roachpb.RKeyMax, allReservedSplits[1:]},
		{baseSql, keys.MakeTablePrefix(reservedStart), keys.MakeTablePrefix(start + 10), allReservedSplits[1:]},
		{baseSql, roachpb.RKeyMin, keys.MakeTablePrefix(reservedStart + 2), allReservedSplits[:2]},
		{baseSql, roachpb.RKeyMin, keys.MakeTablePrefix(reservedStart + 10), allReservedSplits},
		{baseSql, keys.MakeTablePrefix(reservedStart), keys.MakeTablePrefix(reservedStart + 2), allReservedSplits[1:2]},
		{baseSql, testutils.MakeKey(keys.MakeTablePrefix(reservedStart), roachpb.RKey("foo")),
			testutils.MakeKey(keys.MakeTablePrefix(start+10), roachpb.RKey("foo")), allReservedSplits[1:]},

		// Reserved + User descriptors.
		{allSql, keys.MakeTablePrefix(start - 1), roachpb.RKeyMax, allUserSplits},
		{allSql, keys.MakeTablePrefix(start), roachpb.RKeyMax, allUserSplits[1:]},
		{allSql, keys.MakeTablePrefix(start), keys.MakeTablePrefix(start + 10), allUserSplits[1:]},
		{allSql, keys.MakeTablePrefix(start - 1), keys.MakeTablePrefix(start + 10), allUserSplits},
		{allSql, keys.MakeTablePrefix(start + 4), keys.MakeTablePrefix(start + 10), allUserSplits[5:]},
		{allSql, keys.MakeTablePrefix(start + 5), keys.MakeTablePrefix(start + 10), nil},
		{allSql, keys.MakeTablePrefix(start + 6), keys.MakeTablePrefix(start + 10), nil},
		{allSql, testutils.MakeKey(keys.MakeTablePrefix(start), roachpb.RKey("foo")),
			keys.MakeTablePrefix(start + 10), allUserSplits[1:]},
		{allSql, testutils.MakeKey(keys.MakeTablePrefix(start), roachpb.RKey("foo")),
			keys.MakeTablePrefix(start + 5), allUserSplits[1:5]},
		{allSql, testutils.MakeKey(keys.MakeTablePrefix(start), roachpb.RKey("foo")),
			testutils.MakeKey(keys.MakeTablePrefix(start+5), roachpb.RKey("bar")), allUserSplits[1:5]},
		{allSql, testutils.MakeKey(keys.MakeTablePrefix(start), roachpb.RKey("foo")),
			testutils.MakeKey(keys.MakeTablePrefix(start), roachpb.RKey("morefoo")), nil},
		{allSql, roachpb.RKeyMin, roachpb.RKeyMax, allSplits},
		{allSql, keys.MakeTablePrefix(reservedStart + 1), roachpb.RKeyMax, allSplits[2:]},
		{allSql, keys.MakeTablePrefix(reservedStart), keys.MakeTablePrefix(start + 10), allSplits[1:]},
		{allSql, roachpb.RKeyMin, keys.MakeTablePrefix(start + 2), allSplits[:6]},
		{allSql, testutils.MakeKey(keys.MakeTablePrefix(reservedStart), roachpb.RKey("foo")),
			testutils.MakeKey(keys.MakeTablePrefix(start+5), roachpb.RKey("foo")), allSplits[1:9]},
	}

	cfg := config.SystemConfig{}
	for tcNum, tc := range testCases {
		cfg.Values = tc.values
		splits := cfg.ComputeSplitKeys(tc.start, tc.end)
		if len(splits) == 0 && len(tc.splits) == 0 {
			continue
		}

		// Convert ints to actual keys.
		expected := []roachpb.RKey{}
		for _, s := range tc.splits {
			expected = append(expected, keys.MakeRowSentinelKey(keys.MakeTablePrefix(s)))
		}
		if !reflect.DeepEqual(splits, expected) {
			t.Errorf("#%d: bad splits:\ngot: %v\nexpected: %v", tcNum, splits, expected)
		}
	}
}