Ejemplo n.º 1
0
// process synchronously invokes admin split for each proposed split key.
func (sq *splitQueue) process(
	ctx context.Context, now hlc.Timestamp, r *Replica, sysCfg config.SystemConfig,
) error {
	// First handle case of splitting due to zone config maps.
	desc := r.Desc()
	splitKeys := sysCfg.ComputeSplitKeys(desc.StartKey, desc.EndKey)
	if len(splitKeys) > 0 {
		log.Infof(ctx, "splitting at keys %v", splitKeys)
		for _, splitKey := range splitKeys {
			if err := sq.db.AdminSplit(ctx, splitKey.AsRawKey()); err != nil {
				return errors.Errorf("unable to split %s at key %q: %s", r, splitKey, err)
			}
		}
		return nil
	}

	// Next handle case of splitting due to size.
	zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey)
	if err != nil {
		return err
	}
	size := r.GetMVCCStats().Total()
	// FIXME: why is this implementation not the same as the one above?
	if float64(size)/float64(zone.RangeMaxBytes) > 1 {
		log.Infof(ctx, "splitting size=%d max=%d", size, zone.RangeMaxBytes)
		if _, pErr := client.SendWrappedWith(ctx, r, roachpb.Header{
			Timestamp: now,
		}, &roachpb.AdminSplitRequest{
			Span: roachpb.Span{Key: desc.StartKey.AsRawKey()},
		}); pErr != nil {
			return pErr.GoError()
		}
	}
	return nil
}
Ejemplo n.º 2
0
// GetZoneConfig returns the zone config for the object with 'id'.
func GetZoneConfig(cfg config.SystemConfig, id uint32) (config.ZoneConfig, bool, error) {
	// Look in the zones table.
	if zoneVal := cfg.GetValue(sqlbase.MakeZoneKey(sqlbase.ID(id))); zoneVal != nil {
		// We're done.
		zone, err := config.MigrateZoneConfig(zoneVal)
		return zone, true, err
	}

	// No zone config for this ID. We need to figure out if it's a database
	// or table. Lookup its descriptor.
	if descVal := cfg.GetValue(sqlbase.MakeDescMetadataKey(sqlbase.ID(id))); descVal != nil {
		// Determine whether this is a database or table.
		var desc sqlbase.Descriptor
		if err := descVal.GetProto(&desc); err != nil {
			return config.ZoneConfig{}, false, err
		}
		if tableDesc := desc.GetTable(); tableDesc != nil {
			// This is a table descriptor. Lookup its parent database zone config.
			return GetZoneConfig(cfg, uint32(tableDesc.ParentID))
		}
	}

	// Retrieve the default zone config, but only as long as that wasn't the ID
	// we were trying to retrieve (avoid infinite recursion).
	if id != keys.RootNamespaceID {
		return GetZoneConfig(cfg, keys.RootNamespaceID)
	}

	// No descriptor or not a table.
	return config.ZoneConfig{}, false, nil
}
Ejemplo n.º 3
0
// shouldQueue determines whether a replica should be queued for garbage
// collection, and if so, at what priority. Returns true for shouldQ
// in the event that the cumulative ages of GC'able bytes or extant
// intents exceed thresholds.
func (gcq *gcQueue) shouldQueue(
	ctx context.Context, now hlc.Timestamp, repl *Replica, sysCfg config.SystemConfig,
) (shouldQ bool, priority float64) {
	desc := repl.Desc()
	zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey)
	if err != nil {
		log.Errorf(ctx, "could not find zone config for range %s: %s", repl, err)
		return
	}

	ms := repl.GetMVCCStats()
	// GC score is the total GC'able bytes age normalized by 1 MB * the replica's TTL in seconds.
	gcScore := float64(ms.GCByteAge(now.WallTime)) / float64(zone.GC.TTLSeconds) / float64(gcByteCountNormalization)

	// Intent score. This computes the average age of outstanding intents
	// and normalizes.
	intentScore := ms.AvgIntentAge(now.WallTime) / float64(intentAgeNormalization.Nanoseconds()/1E9)

	// Compute priority.
	if gcScore >= considerThreshold {
		priority += gcScore
	}
	if intentScore >= considerThreshold {
		priority += intentScore
	}
	shouldQ = priority > 0
	return
}
Ejemplo n.º 4
0
// GetTableDesc returns the table descriptor for the table with 'id'.
// Returns nil if the descriptor is not present, or is present but is not a
// table.
func GetTableDesc(cfg config.SystemConfig, id sqlbase.ID) (*sqlbase.TableDescriptor, error) {
	if descVal := cfg.GetValue(sqlbase.MakeDescMetadataKey(id)); descVal != nil {
		desc := &sqlbase.Descriptor{}
		if err := descVal.GetProto(desc); err != nil {
			return nil, err
		}
		return desc.GetTable(), nil
	}
	return nil, nil
}
Ejemplo n.º 5
0
func (bq *baseQueue) requiresSplit(cfg config.SystemConfig, repl *Replica) bool {
	if bq.acceptsUnsplitRanges {
		return false
	}
	// If there's no store (as is the case in some narrow unit tests),
	// the "required" split will never come. In that case, pretend we
	// don't require the split.
	if store := repl.store; store == nil {
		return false
	}
	desc := repl.Desc()
	return cfg.NeedsSplit(desc.StartKey, desc.EndKey)
}
Ejemplo n.º 6
0
func isDeleted(tableID sqlbase.ID, cfg config.SystemConfig) bool {
	descKey := sqlbase.MakeDescMetadataKey(tableID)
	val := cfg.GetValue(descKey)
	if val == nil {
		return false
	}
	var descriptor sqlbase.Descriptor
	if err := val.GetProto(&descriptor); err != nil {
		panic("unable to unmarshal table descriptor")
	}
	table := descriptor.GetTable()
	return table.Dropped()
}
Ejemplo n.º 7
0
// process iterates through all keys in a replica's range, calling the garbage
// collector for each key and associated set of values. GC'd keys are batched
// into GC calls. Extant intents are resolved if intents are older than
// intentAgeThreshold. The transaction and abort cache records are also
// scanned and old entries evicted. During normal operation, both of these
// records are cleaned up when their respective transaction finishes, so the
// amount of work done here is expected to be small.
//
// Some care needs to be taken to avoid cyclic recreation of entries during GC:
// * a Push initiated due to an intent may recreate a transaction entry
// * resolving an intent may write a new abort cache entry
// * obtaining the transaction for a abort cache entry requires a Push
//
// The following order is taken below:
// 1) collect all intents with sufficiently old txn record
// 2) collect these intents' transactions
// 3) scan the transaction table, collecting abandoned or completed txns
// 4) push all of these transactions (possibly recreating entries)
// 5) resolve all intents (unless the txn is still PENDING), which will recreate
//    abort cache entries (but with the txn timestamp; i.e. likely gc'able)
// 6) scan the abort cache table for old entries
// 7) push these transactions (again, recreating txn entries).
// 8) send a GCRequest.
func (gcq *gcQueue) process(
	ctx context.Context, now hlc.Timestamp, repl *Replica, sysCfg config.SystemConfig,
) error {
	snap := repl.store.Engine().NewSnapshot()
	desc := repl.Desc()
	defer snap.Close()

	// Lookup the GC policy for the zone containing this key range.
	zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey)
	if err != nil {
		return errors.Errorf("could not find zone config for range %s: %s", repl, err)
	}

	gcKeys, info, err := RunGC(ctx, desc, snap, now, zone.GC,
		func(now hlc.Timestamp, txn *roachpb.Transaction, typ roachpb.PushTxnType) {
			pushTxn(ctx, gcq.store.DB(), now, txn, typ)
		},
		func(intents []roachpb.Intent, poison bool, wait bool) error {
			return repl.store.intentResolver.resolveIntents(ctx, intents, poison, wait)
		})

	if err != nil {
		return err
	}

	log.VEventf(ctx, 1, "completed with stats %+v", info)

	info.updateMetrics(gcq.store.metrics)

	var ba roachpb.BatchRequest
	var gcArgs roachpb.GCRequest
	// TODO(tschottdorf): This is one of these instances in which we want
	// to be more careful that the request ends up on the correct Replica,
	// and we might have to worry about mixing range-local and global keys
	// in a batch which might end up spanning Ranges by the time it executes.
	gcArgs.Key = desc.StartKey.AsRawKey()
	gcArgs.EndKey = desc.EndKey.AsRawKey()
	gcArgs.Keys = gcKeys
	gcArgs.Threshold = info.Threshold
	gcArgs.TxnSpanGCThreshold = info.TxnSpanGCThreshold

	// Technically not needed since we're talking directly to the Range.
	ba.RangeID = desc.RangeID
	ba.Timestamp = now
	ba.Add(&gcArgs)
	if _, pErr := repl.Send(ctx, ba); pErr != nil {
		log.ErrEvent(ctx, pErr.String())
		return pErr.GoError()
	}
	return nil
}
Ejemplo n.º 8
0
func expectDescriptorID(systemConfig config.SystemConfig, idKey roachpb.Key, id sqlbase.ID) error {
	idValue := systemConfig.GetValue(idKey)
	if idValue == nil {
		return errStaleMetadata
	}
	cachedID, err := idValue.GetInt()
	if err != nil {
		return err
	}
	if sqlbase.ID(cachedID) != id {
		return errStaleMetadata
	}
	return nil
}
Ejemplo n.º 9
0
func (rq *replicateQueue) shouldQueue(
	ctx context.Context, now hlc.Timestamp, repl *Replica, sysCfg config.SystemConfig,
) (shouldQ bool, priority float64) {
	if !repl.store.splitQueue.Disabled() && repl.needsSplitBySize() {
		// If the range exceeds the split threshold, let that finish first.
		// Ranges must fit in memory on both sender and receiver nodes while
		// being replicated. This supplements the check provided by
		// acceptsUnsplitRanges, which looks at zone config boundaries rather
		// than data size.
		//
		// This check is ignored if the split queue is disabled, since in that
		// case, the split will never come.
		return
	}

	// Find the zone config for this range.
	desc := repl.Desc()
	zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey)
	if err != nil {
		log.Error(ctx, err)
		return
	}

	action, priority := rq.allocator.ComputeAction(zone, desc)
	if action != AllocatorNoop {
		if log.V(2) {
			log.Infof(ctx, "%s repair needed (%s), enqueuing", repl, action)
		}
		return true, priority
	}
	// See if there is a rebalancing opportunity present.
	leaseStoreID := repl.store.StoreID()
	if lease, _ := repl.getLease(); lease != nil {
		leaseStoreID = lease.Replica.StoreID
	}
	target := rq.allocator.RebalanceTarget(
		zone.Constraints,
		desc.Replicas,
		leaseStoreID,
		desc.RangeID,
	)
	if log.V(2) {
		if target != nil {
			log.Infof(ctx, "%s rebalance target found, enqueuing", repl)
		} else {
			log.Infof(ctx, "%s no rebalance target found, not enqueuing", repl)
		}
	}
	return target != nil, 0
}
Ejemplo n.º 10
0
func expectDescriptor(
	systemConfig config.SystemConfig, idKey roachpb.Key, desc *sqlbase.Descriptor,
) error {
	descValue := systemConfig.GetValue(idKey)
	if descValue == nil {
		return errStaleMetadata
	}
	var cachedDesc sqlbase.Descriptor
	if err := descValue.GetProto(&cachedDesc); err != nil {
		return err
	}
	if !proto.Equal(&cachedDesc, desc) {
		return errStaleMetadata
	}
	return nil
}
Ejemplo n.º 11
0
// isRenamed tests if a descriptor is updated by gossip to the specified name
// and version.
func isRenamed(
	tableID sqlbase.ID,
	expectedName string,
	expectedVersion sqlbase.DescriptorVersion,
	cfg config.SystemConfig,
) bool {
	descKey := sqlbase.MakeDescMetadataKey(tableID)
	val := cfg.GetValue(descKey)
	if val == nil {
		return false
	}
	var descriptor sqlbase.Descriptor
	if err := val.GetProto(&descriptor); err != nil {
		panic("unable to unmarshal table descriptor")
	}
	table := descriptor.GetTable()
	return table.Name == expectedName && table.Version == expectedVersion
}
Ejemplo n.º 12
0
func TestGet(t *testing.T) {
	defer leaktest.AfterTest(t)()

	emptyKeys := []roachpb.KeyValue{}
	someKeys := []roachpb.KeyValue{
		plainKV("a", "vala"),
		plainKV("c", "valc"),
		plainKV("d", "vald"),
	}

	aVal := roachpb.MakeValueFromString("vala")
	bVal := roachpb.MakeValueFromString("valc")
	cVal := roachpb.MakeValueFromString("vald")

	testCases := []struct {
		values []roachpb.KeyValue
		key    string
		value  *roachpb.Value
	}{
		{emptyKeys, "a", nil},
		{emptyKeys, "b", nil},
		{emptyKeys, "c", nil},
		{emptyKeys, "d", nil},
		{emptyKeys, "e", nil},

		{someKeys, "", nil},
		{someKeys, "b", nil},
		{someKeys, "e", nil},
		{someKeys, "a0", nil},

		{someKeys, "a", &aVal},
		{someKeys, "c", &bVal},
		{someKeys, "d", &cVal},
	}

	cfg := config.SystemConfig{}
	for tcNum, tc := range testCases {
		cfg.Values = tc.values
		if val := cfg.GetValue([]byte(tc.key)); !proto.Equal(val, tc.value) {
			t.Errorf("#%d: expected=%s, found=%s", tcNum, tc.value, val)
		}
	}
}
Ejemplo n.º 13
0
func waitForConfigChange(t *testing.T, s *server.TestServer) config.SystemConfig {
	var foundDesc sqlbase.Descriptor
	var cfg config.SystemConfig
	testutils.SucceedsSoon(t, func() error {
		var ok bool
		if cfg, ok = s.Gossip().GetSystemConfig(); ok {
			if val := cfg.GetValue(configDescKey); val != nil {
				if err := val.GetProto(&foundDesc); err != nil {
					t.Fatal(err)
				}
				if id := foundDesc.GetDatabase().GetID(); id != configID {
					return errors.Errorf("expected database id %d; got %d", configID, id)
				}
				return nil
			}
		}
		return errors.Errorf("got nil system config")
	})
	return cfg
}
Ejemplo n.º 14
0
// process synchronously invokes admin split for each proposed split key.
func (sq *splitQueue) process(
	ctx context.Context, now hlc.Timestamp, r *Replica, sysCfg config.SystemConfig,
) error {
	// First handle case of splitting due to zone config maps.
	desc := r.Desc()
	splitKeys := sysCfg.ComputeSplitKeys(desc.StartKey, desc.EndKey)
	if len(splitKeys) > 0 {
		log.Infof(ctx, "splitting at keys %v", splitKeys)
		for _, splitKey := range splitKeys {
			if _, pErr := r.adminSplitWithDescriptor(
				ctx,
				roachpb.AdminSplitRequest{
					SplitKey: splitKey.AsRawKey(),
				},
				desc,
			); pErr != nil {
				return errors.Wrapf(pErr.GoError(), "unable to split %s at key %q", r, splitKey)
			}
		}
		return nil
	}

	// Next handle case of splitting due to size.
	zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey)
	if err != nil {
		return err
	}
	size := r.GetMVCCStats().Total()
	if float64(size)/float64(zone.RangeMaxBytes) > 1 {
		log.Infof(ctx, "splitting size=%d max=%d", size, zone.RangeMaxBytes)
		if _, pErr := r.adminSplitWithDescriptor(
			ctx,
			roachpb.AdminSplitRequest{},
			desc,
		); pErr != nil {
			return pErr.GoError()
		}
	}
	return nil
}
Ejemplo n.º 15
0
// shouldQueue determines whether a range should be queued for
// splitting. This is true if the range is intersected by a zone config
// prefix or if the range's size in bytes exceeds the limit for the zone.
func (sq *splitQueue) shouldQueue(
	ctx context.Context, now hlc.Timestamp, repl *Replica, sysCfg config.SystemConfig,
) (shouldQ bool, priority float64) {
	desc := repl.Desc()
	if len(sysCfg.ComputeSplitKeys(desc.StartKey, desc.EndKey)) > 0 {
		// Set priority to 1 in the event the range is split by zone configs.
		priority = 1
		shouldQ = true
	}

	// Add priority based on the size of range compared to the max
	// size for the zone it's in.
	zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey)
	if err != nil {
		log.Error(ctx, err)
		return
	}

	if ratio := float64(repl.GetMVCCStats().Total()) / float64(zone.RangeMaxBytes); ratio > 1 {
		priority += ratio
		shouldQ = true
	}
	return
}
Ejemplo n.º 16
0
func TestGetLargestID(t *testing.T) {
	defer leaktest.AfterTest(t)()
	testCases := []struct {
		values  []roachpb.KeyValue
		largest uint32
		maxID   uint32
		errStr  string
	}{
		// No data.
		{nil, 0, 0, "descriptor table not found"},

		// Some data, but not from the system span.
		{[]roachpb.KeyValue{plainKV("a", "b")}, 0, 0, "descriptor table not found"},

		// Some real data, but no descriptors.
		{[]roachpb.KeyValue{
			sqlKV(keys.NamespaceTableID, 1, 1),
			sqlKV(keys.NamespaceTableID, 1, 2),
			sqlKV(keys.UsersTableID, 1, 3),
		}, 0, 0, "descriptor table not found"},

		// Single correct descriptor entry.
		{[]roachpb.KeyValue{sqlKV(keys.DescriptorTableID, 1, 1)}, 1, 0, ""},

		// Surrounded by other data.
		{[]roachpb.KeyValue{
			sqlKV(keys.NamespaceTableID, 1, 20),
			sqlKV(keys.NamespaceTableID, 1, 30),
			sqlKV(keys.DescriptorTableID, 1, 8),
			sqlKV(keys.ZonesTableID, 1, 40),
		}, 8, 0, ""},

		// Descriptors with holes. Index ID does not matter.
		{[]roachpb.KeyValue{
			sqlKV(keys.DescriptorTableID, 1, 1),
			sqlKV(keys.DescriptorTableID, 2, 5),
			sqlKV(keys.DescriptorTableID, 3, 8),
			sqlKV(keys.DescriptorTableID, 4, 12),
		}, 12, 0, ""},

		// Real SQL layout.
		{sqlbase.MakeMetadataSchema().GetInitialValues(), keys.MaxSystemConfigDescID + 4, 0, ""},

		// Test non-zero max.
		{[]roachpb.KeyValue{
			sqlKV(keys.DescriptorTableID, 1, 1),
			sqlKV(keys.DescriptorTableID, 2, 5),
			sqlKV(keys.DescriptorTableID, 3, 8),
			sqlKV(keys.DescriptorTableID, 4, 12),
		}, 8, 8, ""},

		// Test non-zero max.
		{[]roachpb.KeyValue{
			sqlKV(keys.DescriptorTableID, 1, 1),
			sqlKV(keys.DescriptorTableID, 2, 5),
			sqlKV(keys.DescriptorTableID, 3, 8),
			sqlKV(keys.DescriptorTableID, 4, 12),
		}, 5, 7, ""},
	}

	cfg := config.SystemConfig{}
	for tcNum, tc := range testCases {
		cfg.Values = tc.values
		ret, err := cfg.GetLargestObjectID(tc.maxID)
		if !testutils.IsError(err, tc.errStr) {
			t.Errorf("#%d: expected err=%q, got %v", tcNum, tc.errStr, err)
			continue
		}
		if err != nil {
			continue
		}
		if ret != tc.largest {
			t.Errorf("#%d: expected largest=%d, got %d", tcNum, tc.largest, ret)
		}
	}
}
Ejemplo n.º 17
0
func (rq *replicateQueue) process(
	ctx context.Context, now hlc.Timestamp, repl *Replica, sysCfg config.SystemConfig,
) error {
	desc := repl.Desc()
	// Find the zone config for this range.
	zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey)
	if err != nil {
		return err
	}
	action, _ := rq.allocator.ComputeAction(zone, desc)

	// Avoid taking action if the range has too many dead replicas to make
	// quorum.
	deadReplicas := rq.allocator.storePool.deadReplicas(desc.RangeID, desc.Replicas)
	quorum := computeQuorum(len(desc.Replicas))
	liveReplicaCount := len(desc.Replicas) - len(deadReplicas)
	if liveReplicaCount < quorum {
		return errors.Errorf("range requires a replication change, but lacks a quorum of live nodes.")
	}

	switch action {
	case AllocatorAdd:
		log.Event(ctx, "adding a new replica")
		newStore, err := rq.allocator.AllocateTarget(
			zone.Constraints,
			desc.Replicas,
			desc.RangeID,
			true,
		)
		if err != nil {
			return err
		}
		newReplica := roachpb.ReplicaDescriptor{
			NodeID:  newStore.Node.NodeID,
			StoreID: newStore.StoreID,
		}

		log.VEventf(ctx, 1, "adding replica to %+v due to under-replication", newReplica)
		if err = repl.ChangeReplicas(ctx, roachpb.ADD_REPLICA, newReplica, desc); err != nil {
			return err
		}
	case AllocatorRemove:
		log.Event(ctx, "removing a replica")
		// We require the lease in order to process replicas, so
		// repl.store.StoreID() corresponds to the lease-holder's store ID.
		removeReplica, err := rq.allocator.RemoveTarget(desc.Replicas, repl.store.StoreID())
		if err != nil {
			return err
		}
		log.VEventf(ctx, 1, "removing replica %+v due to over-replication", removeReplica)
		if err = repl.ChangeReplicas(ctx, roachpb.REMOVE_REPLICA, removeReplica, desc); err != nil {
			return err
		}
		// Do not requeue if we removed ourselves.
		if removeReplica.StoreID == repl.store.StoreID() {
			return nil
		}
	case AllocatorRemoveDead:
		log.Event(ctx, "removing a dead replica")
		if len(deadReplicas) == 0 {
			if log.V(1) {
				log.Warningf(ctx, "Range of replica %s was identified as having dead replicas, but no dead replicas were found.", repl)
			}
			break
		}
		deadReplica := deadReplicas[0]
		log.VEventf(ctx, 1, "removing dead replica %+v from store", deadReplica)
		if err = repl.ChangeReplicas(ctx, roachpb.REMOVE_REPLICA, deadReplica, desc); err != nil {
			return err
		}
	case AllocatorNoop:
		log.Event(ctx, "considering a rebalance")
		// The Noop case will result if this replica was queued in order to
		// rebalance. Attempt to find a rebalancing target.
		//
		// We require the lease in order to process replicas, so
		// repl.store.StoreID() corresponds to the lease-holder's store ID.
		rebalanceStore := rq.allocator.RebalanceTarget(
			zone.Constraints,
			desc.Replicas,
			repl.store.StoreID(),
			desc.RangeID,
		)
		if rebalanceStore == nil {
			log.VEventf(ctx, 1, "no suitable rebalance target")
			// No action was necessary and no rebalance target was found. Return
			// without re-queuing this replica.
			return nil
		}
		rebalanceReplica := roachpb.ReplicaDescriptor{
			NodeID:  rebalanceStore.Node.NodeID,
			StoreID: rebalanceStore.StoreID,
		}
		log.VEventf(ctx, 1, "rebalancing to %+v", rebalanceReplica)
		if err = repl.ChangeReplicas(ctx, roachpb.ADD_REPLICA, rebalanceReplica, desc); err != nil {
			return err
		}
	}

	// Enqueue this replica again to see if there are more changes to be made.
	rq.MaybeAdd(repl, rq.clock.Now())
	return nil
}
Ejemplo n.º 18
0
func expectDeleted(systemConfig config.SystemConfig, key roachpb.Key) error {
	if systemConfig.GetValue(key) != nil {
		return errStaleMetadata
	}
	return nil
}
Ejemplo n.º 19
0
func (rq *replicateQueue) processOneChange(
	ctx context.Context, now hlc.Timestamp, repl *Replica, sysCfg config.SystemConfig,
) error {
	desc := repl.Desc()
	// Find the zone config for this range.
	zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey)
	if err != nil {
		return err
	}
	action, _ := rq.allocator.ComputeAction(zone, desc)

	// Avoid taking action if the range has too many dead replicas to make
	// quorum.
	deadReplicas := rq.allocator.storePool.deadReplicas(desc.RangeID, desc.Replicas)
	quorum := computeQuorum(len(desc.Replicas))
	liveReplicaCount := len(desc.Replicas) - len(deadReplicas)
	if liveReplicaCount < quorum {
		return errors.Errorf("range requires a replication change, but lacks a quorum of live nodes.")
	}

	switch action {
	case AllocatorAdd:
		log.Event(ctx, "adding a new replica")
		newStore, err := rq.allocator.AllocateTarget(
			zone.Constraints,
			desc.Replicas,
			desc.RangeID,
			true,
		)
		if err != nil {
			return err
		}
		newReplica := roachpb.ReplicaDescriptor{
			NodeID:  newStore.Node.NodeID,
			StoreID: newStore.StoreID,
		}

		log.VEventf(ctx, 1, "adding replica to %+v due to under-replication", newReplica)
		if err := rq.addReplica(ctx, repl, newReplica, desc); err != nil {
			return err
		}
	case AllocatorRemove:
		log.Event(ctx, "removing a replica")
		// If the lease holder (our local store) is an overfull store (in terms of
		// leases) allow transferring the lease away.
		leaseHolderStoreID := repl.store.StoreID()
		if rq.allocator.ShouldTransferLease(zone.Constraints, leaseHolderStoreID, desc.RangeID) {
			leaseHolderStoreID = 0
		}
		removeReplica, err := rq.allocator.RemoveTarget(
			zone.Constraints,
			desc.Replicas,
			leaseHolderStoreID,
		)
		if err != nil {
			return err
		}
		if removeReplica.StoreID == repl.store.StoreID() {
			// The local replica was selected as the removal target, but that replica
			// is the leaseholder, so transfer the lease instead. We don't check that
			// the current store has too many leases in this case under the
			// assumption that replica balance is a greater concern. Also note that
			// AllocatorRemove action takes preference over AllocatorNoop
			// (rebalancing) which is where lease transfer would otherwise occur. We
			// need to be able to transfer leases in AllocatorRemove in order to get
			// out of situations where this store is overfull and yet holds all the
			// leases.
			candidates := filterBehindReplicas(repl.RaftStatus(), desc.Replicas)
			target := rq.allocator.TransferLeaseTarget(
				zone.Constraints, candidates, repl.store.StoreID(), desc.RangeID,
				false /* checkTransferLeaseSource */)
			if target != (roachpb.ReplicaDescriptor{}) {
				log.VEventf(ctx, 1, "transferring lease to s%d", target.StoreID)
				if err := repl.AdminTransferLease(target.StoreID); err != nil {
					return errors.Wrapf(err, "%s: unable to transfer lease to s%d", repl, target.StoreID)
				}
				rq.lastLeaseTransfer.Store(timeutil.Now())
				// Do not requeue as we transferred our lease away.
				return nil
			}
		} else {
			log.VEventf(ctx, 1, "removing replica %+v due to over-replication", removeReplica)
			if err := rq.removeReplica(ctx, repl, removeReplica, desc); err != nil {
				return err
			}
		}
	case AllocatorRemoveDead:
		log.Event(ctx, "removing a dead replica")
		if len(deadReplicas) == 0 {
			if log.V(1) {
				log.Warningf(ctx, "Range of replica %s was identified as having dead replicas, but no dead replicas were found.", repl)
			}
			break
		}
		deadReplica := deadReplicas[0]
		log.VEventf(ctx, 1, "removing dead replica %+v from store", deadReplica)
		if err := repl.ChangeReplicas(ctx, roachpb.REMOVE_REPLICA, deadReplica, desc); err != nil {
			return err
		}
	case AllocatorNoop:
		// The Noop case will result if this replica was queued in order to
		// rebalance. Attempt to find a rebalancing target.
		log.Event(ctx, "considering a rebalance")

		if rq.canTransferLease() {
			// We require the lease in order to process replicas, so
			// repl.store.StoreID() corresponds to the lease-holder's store ID.
			candidates := filterBehindReplicas(repl.RaftStatus(), desc.Replicas)
			target := rq.allocator.TransferLeaseTarget(
				zone.Constraints, candidates, repl.store.StoreID(), desc.RangeID,
				true /* checkTransferLeaseSource */)
			if target != (roachpb.ReplicaDescriptor{}) {
				log.VEventf(ctx, 1, "transferring lease to s%d", target.StoreID)
				if err := repl.AdminTransferLease(target.StoreID); err != nil {
					return errors.Wrapf(err, "%s: unable to transfer lease to s%d", repl, target.StoreID)
				}
				rq.lastLeaseTransfer.Store(timeutil.Now())
				// Do not requeue as we transferred our lease away.
				return nil
			}
		}

		rebalanceStore, err := rq.allocator.RebalanceTarget(
			zone.Constraints,
			desc.Replicas,
			repl.store.StoreID(),
			desc.RangeID,
		)
		if err != nil {
			log.ErrEventf(ctx, "rebalance target failed %s", err)
			return nil
		}
		if rebalanceStore == nil {
			log.VEventf(ctx, 1, "no suitable rebalance target")
			// No action was necessary and no rebalance target was found. Return
			// without re-queuing this replica.
			return nil
		}
		rebalanceReplica := roachpb.ReplicaDescriptor{
			NodeID:  rebalanceStore.Node.NodeID,
			StoreID: rebalanceStore.StoreID,
		}
		log.VEventf(ctx, 1, "rebalancing to %+v", rebalanceReplica)
		if err := rq.addReplica(ctx, repl, rebalanceReplica, desc); err != nil {
			return err
		}
	}

	return nil
}
Ejemplo n.º 20
0
func (rq *replicateQueue) shouldQueue(
	ctx context.Context, now hlc.Timestamp, repl *Replica, sysCfg config.SystemConfig,
) (shouldQ bool, priority float64) {
	if !repl.store.splitQueue.Disabled() && repl.needsSplitBySize() {
		// If the range exceeds the split threshold, let that finish first.
		// Ranges must fit in memory on both sender and receiver nodes while
		// being replicated. This supplements the check provided by
		// acceptsUnsplitRanges, which looks at zone config boundaries rather
		// than data size.
		//
		// This check is ignored if the split queue is disabled, since in that
		// case, the split will never come.
		return
	}

	// Find the zone config for this range.
	desc := repl.Desc()
	zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey)
	if err != nil {
		log.Error(ctx, err)
		return
	}

	action, priority := rq.allocator.ComputeAction(zone, desc)
	if action != AllocatorNoop {
		if log.V(2) {
			log.Infof(ctx, "%s repair needed (%s), enqueuing", repl, action)
		}
		return true, priority
	}

	// If we hold the lease, check to see if we should transfer it.
	var leaseStoreID roachpb.StoreID
	if lease, _ := repl.getLease(); lease != nil && lease.Covers(now) {
		leaseStoreID = lease.Replica.StoreID
		if rq.canTransferLease() &&
			rq.allocator.ShouldTransferLease(zone.Constraints, leaseStoreID, desc.RangeID) {
			if log.V(2) {
				log.Infof(ctx, "%s lease transfer needed, enqueuing", repl)
			}
			return true, 0
		}
	}

	// Check for a rebalancing opportunity. Note that leaseStoreID will be 0 if
	// the range doesn't currently have a lease which will allow the current
	// replica to be considered a rebalancing source.
	target, err := rq.allocator.RebalanceTarget(
		zone.Constraints,
		desc.Replicas,
		leaseStoreID,
		desc.RangeID,
	)
	if err != nil {
		log.ErrEventf(ctx, "rebalance target failed: %s", err)
		return false, 0
	}
	if log.V(2) {
		if target != nil {
			log.Infof(ctx, "%s rebalance target found, enqueuing", repl)
		} else {
			log.Infof(ctx, "%s no rebalance target found, not enqueuing", repl)
		}
	}
	return target != nil, 0
}
Ejemplo n.º 21
0
func TestComputeSplits(t *testing.T) {
	defer leaktest.AfterTest(t)()

	const (
		start         = keys.MaxReservedDescID + 1
		reservedStart = keys.MaxSystemConfigDescID + 1
	)

	schema := sqlbase.MakeMetadataSchema()
	// Real system tables only.
	baseSql := schema.GetInitialValues()
	// Real system tables plus some user stuff.
	allSql := append(schema.GetInitialValues(),
		descriptor(start), descriptor(start+1), descriptor(start+5))
	sort.Sort(roachpb.KeyValueByKey(allSql))

	allUserSplits := []uint32{start, start + 1, start + 2, start + 3, start + 4, start + 5}
	var allReservedSplits []uint32
	for i := 0; i < schema.SystemDescriptorCount()-schema.SystemConfigDescriptorCount(); i++ {
		allReservedSplits = append(allReservedSplits, reservedStart+uint32(i))
	}
	allSplits := append(allReservedSplits, allUserSplits...)

	testCases := []struct {
		values     []roachpb.KeyValue
		start, end roachpb.RKey
		// Use ints in the testcase definitions, more readable.
		splits []uint32
	}{
		// No data.
		{nil, roachpb.RKeyMin, roachpb.RKeyMax, nil},
		{nil, keys.MakeTablePrefix(start), roachpb.RKeyMax, nil},
		{nil, keys.MakeTablePrefix(start), keys.MakeTablePrefix(start + 10), nil},
		{nil, roachpb.RKeyMin, keys.MakeTablePrefix(start + 10), nil},

		// Reserved descriptors.
		{baseSql, roachpb.RKeyMin, roachpb.RKeyMax, allReservedSplits},
		{baseSql, keys.MakeTablePrefix(start), roachpb.RKeyMax, nil},
		{baseSql, keys.MakeTablePrefix(start), keys.MakeTablePrefix(start + 10), nil},
		{baseSql, roachpb.RKeyMin, keys.MakeTablePrefix(start + 10), allReservedSplits},
		{baseSql, keys.MakeTablePrefix(reservedStart), roachpb.RKeyMax, allReservedSplits[1:]},
		{baseSql, keys.MakeTablePrefix(reservedStart), keys.MakeTablePrefix(start + 10), allReservedSplits[1:]},
		{baseSql, roachpb.RKeyMin, keys.MakeTablePrefix(reservedStart + 2), allReservedSplits[:2]},
		{baseSql, roachpb.RKeyMin, keys.MakeTablePrefix(reservedStart + 10), allReservedSplits},
		{baseSql, keys.MakeTablePrefix(reservedStart), keys.MakeTablePrefix(reservedStart + 2), allReservedSplits[1:2]},
		{baseSql, testutils.MakeKey(keys.MakeTablePrefix(reservedStart), roachpb.RKey("foo")),
			testutils.MakeKey(keys.MakeTablePrefix(start+10), roachpb.RKey("foo")), allReservedSplits[1:]},

		// Reserved + User descriptors.
		{allSql, keys.MakeTablePrefix(start - 1), roachpb.RKeyMax, allUserSplits},
		{allSql, keys.MakeTablePrefix(start), roachpb.RKeyMax, allUserSplits[1:]},
		{allSql, keys.MakeTablePrefix(start), keys.MakeTablePrefix(start + 10), allUserSplits[1:]},
		{allSql, keys.MakeTablePrefix(start - 1), keys.MakeTablePrefix(start + 10), allUserSplits},
		{allSql, keys.MakeTablePrefix(start + 4), keys.MakeTablePrefix(start + 10), allUserSplits[5:]},
		{allSql, keys.MakeTablePrefix(start + 5), keys.MakeTablePrefix(start + 10), nil},
		{allSql, keys.MakeTablePrefix(start + 6), keys.MakeTablePrefix(start + 10), nil},
		{allSql, testutils.MakeKey(keys.MakeTablePrefix(start), roachpb.RKey("foo")),
			keys.MakeTablePrefix(start + 10), allUserSplits[1:]},
		{allSql, testutils.MakeKey(keys.MakeTablePrefix(start), roachpb.RKey("foo")),
			keys.MakeTablePrefix(start + 5), allUserSplits[1:5]},
		{allSql, testutils.MakeKey(keys.MakeTablePrefix(start), roachpb.RKey("foo")),
			testutils.MakeKey(keys.MakeTablePrefix(start+5), roachpb.RKey("bar")), allUserSplits[1:5]},
		{allSql, testutils.MakeKey(keys.MakeTablePrefix(start), roachpb.RKey("foo")),
			testutils.MakeKey(keys.MakeTablePrefix(start), roachpb.RKey("morefoo")), nil},
		{allSql, roachpb.RKeyMin, roachpb.RKeyMax, allSplits},
		{allSql, keys.MakeTablePrefix(reservedStart + 1), roachpb.RKeyMax, allSplits[2:]},
		{allSql, keys.MakeTablePrefix(reservedStart), keys.MakeTablePrefix(start + 10), allSplits[1:]},
		{allSql, roachpb.RKeyMin, keys.MakeTablePrefix(start + 2), allSplits[:6]},
		{allSql, testutils.MakeKey(keys.MakeTablePrefix(reservedStart), roachpb.RKey("foo")),
			testutils.MakeKey(keys.MakeTablePrefix(start+5), roachpb.RKey("foo")), allSplits[1:9]},
	}

	cfg := config.SystemConfig{}
	for tcNum, tc := range testCases {
		cfg.Values = tc.values
		splits := cfg.ComputeSplitKeys(tc.start, tc.end)
		if len(splits) == 0 && len(tc.splits) == 0 {
			continue
		}

		// Convert ints to actual keys.
		expected := []roachpb.RKey{}
		for _, s := range tc.splits {
			expected = append(expected, keys.MakeRowSentinelKey(keys.MakeTablePrefix(s)))
		}
		if !reflect.DeepEqual(splits, expected) {
			t.Errorf("#%d: bad splits:\ngot: %v\nexpected: %v", tcNum, splits, expected)
		}
	}
}