// shouldQueue determines whether a replica should be queued for garbage // collection, and if so, at what priority. Returns true for shouldQ // in the event that the cumulative ages of GC'able bytes or extant // intents exceed thresholds. func (*gcQueue) shouldQueue(now hlc.Timestamp, repl *Replica, sysCfg config.SystemConfig) (shouldQ bool, priority float64) { desc := repl.Desc() zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey) if err != nil { log.Errorf(context.TODO(), "could not find zone config for range %s: %s", repl, err) return } ms := repl.GetMVCCStats() // GC score is the total GC'able bytes age normalized by 1 MB * the replica's TTL in seconds. gcScore := float64(ms.GCByteAge(now.WallTime)) / float64(zone.GC.TTLSeconds) / float64(gcByteCountNormalization) // Intent score. This computes the average age of outstanding intents // and normalizes. intentScore := ms.AvgIntentAge(now.WallTime) / float64(intentAgeNormalization.Nanoseconds()/1E9) // Compute priority. if gcScore >= considerThreshold { priority += gcScore } if intentScore >= considerThreshold { priority += intentScore } shouldQ = priority > 0 return }
func (rq *replicateQueue) shouldQueue(now roachpb.Timestamp, repl *Replica, sysCfg config.SystemConfig) (shouldQ bool, priority float64) { if repl.needsSplitBySize() { // If the range exceeds the split threshold, let that finish // first. Ranges must fit in memory on both sender and receiver // nodes while being replicated. This supplements the check // provided by acceptsUnsplitRanges, which looks at zone config // boundaries rather than data size. return } // Find the zone config for this range. desc := repl.Desc() zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey) if err != nil { log.Error(err) return } action, priority := rq.allocator.ComputeAction(*zone, desc) if action != AllocatorNoop { return true, priority } // See if there is a rebalancing opportunity present. shouldRebalance := rq.allocator.ShouldRebalance(repl.store.StoreID()) return shouldRebalance, 0 }
// GetZoneConfig returns the zone config for the object with 'id'. func GetZoneConfig(cfg config.SystemConfig, id uint32) (config.ZoneConfig, bool, error) { // Look in the zones table. if zoneVal := cfg.GetValue(sqlbase.MakeZoneKey(sqlbase.ID(id))); zoneVal != nil { var zone config.ZoneConfig // We're done. return zone, true, zoneVal.GetProto(&zone) } // No zone config for this ID. We need to figure out if it's a database // or table. Lookup its descriptor. if descVal := cfg.GetValue(sqlbase.MakeDescMetadataKey(sqlbase.ID(id))); descVal != nil { // Determine whether this is a database or table. var desc sqlbase.Descriptor if err := descVal.GetProto(&desc); err != nil { return config.ZoneConfig{}, false, err } if tableDesc := desc.GetTable(); tableDesc != nil { // This is a table descriptor. Lookup its parent database zone config. return GetZoneConfig(cfg, uint32(tableDesc.ParentID)) } } // Retrieve the default zone config, but only as long as that wasn't the ID // we were trying to retrieve (avoid infinite recursion). if id != keys.RootNamespaceID { return GetZoneConfig(cfg, keys.RootNamespaceID) } // No descriptor or not a table. return config.ZoneConfig{}, false, nil }
func (rq *replicateQueue) shouldQueue( now hlc.Timestamp, repl *Replica, sysCfg config.SystemConfig, ) (shouldQ bool, priority float64) { if !repl.store.splitQueue.Disabled() && repl.needsSplitBySize() { // If the range exceeds the split threshold, let that finish first. // Ranges must fit in memory on both sender and receiver nodes while // being replicated. This supplements the check provided by // acceptsUnsplitRanges, which looks at zone config boundaries rather // than data size. // // This check is ignored if the split queue is disabled, since in that // case, the split will never come. return } // Find the zone config for this range. desc := repl.Desc() zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey) if err != nil { log.Error(err) return } action, priority := rq.allocator.ComputeAction(*zone, desc) if action != AllocatorNoop { return true, priority } // See if there is a rebalancing opportunity present. shouldRebalance := rq.allocator.ShouldRebalance(repl.store.StoreID()) return shouldRebalance, 0 }
// shouldQueue determines whether a replica should be queued for garbage // collection, and if so, at what priority. Returns true for shouldQ // in the event that the cumulative ages of GC'able bytes or extant // intents exceed thresholds. func (gcq *gcQueue) shouldQueue(now roachpb.Timestamp, repl *Replica, sysCfg *config.SystemConfig) (shouldQ bool, priority float64) { desc := repl.Desc() zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey) if err != nil { log.Errorf("could not find GC policy for range %s: %s", repl, err) return } policy := zone.GC // GC score is the total GC'able bytes age normalized by 1 MB * the replica's TTL in seconds. gcScore := float64(repl.stats.GetGCBytesAge(now.WallTime)) / float64(policy.TTLSeconds) / float64(gcByteCountNormalization) // Intent score. This computes the average age of outstanding intents // and normalizes. intentScore := repl.stats.GetAvgIntentAge(now.WallTime) / float64(intentAgeNormalization.Nanoseconds()/1E9) // Compute priority. if gcScore > 1 { priority += gcScore } if intentScore > 1 { priority += intentScore } shouldQ = priority > 0 return }
// GetZoneConfig returns the zone config for the object with 'id'. func GetZoneConfig(cfg config.SystemConfig, id uint32) (*config.ZoneConfig, error) { // Look in the zones table. if zoneVal := cfg.GetValue(MakeZoneKey(ID(id))); zoneVal != nil { zone := &config.ZoneConfig{} if err := zoneVal.GetProto(zone); err != nil { return nil, err } // We're done. return zone, nil } // No zone config for this ID. We need to figure out if it's a database // or table. Lookup its descriptor. if descVal := cfg.GetValue(MakeDescMetadataKey(ID(id))); descVal != nil { // Determine whether this is a database or table. desc := &Descriptor{} if err := descVal.GetProto(desc); err != nil { return nil, err } if tableDesc := desc.GetTable(); tableDesc != nil { // This is a table descriptor. Lookup its parent database zone config. return GetZoneConfig(cfg, uint32(tableDesc.ParentID)) } } // No descriptor or not a table. This table/db could have been deleted, just // return the default config. return config.DefaultZoneConfig, nil }
// process synchronously invokes admin split for each proposed split key. func (sq *splitQueue) process(now roachpb.Timestamp, rng *Replica, sysCfg *config.SystemConfig) error { // First handle case of splitting due to zone config maps. desc := rng.Desc() splitKeys := sysCfg.ComputeSplitKeys(desc.StartKey, desc.EndKey) if len(splitKeys) > 0 { log.Infof("splitting %s at keys %v", rng, splitKeys) for _, splitKey := range splitKeys { if err := sq.db.AdminSplit(splitKey.AsRawKey()); err != nil { return util.Errorf("unable to split %s at key %q: %s", rng, splitKey, err) } } return nil } // Next handle case of splitting due to size. zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey) if err != nil { return err } // FIXME: why is this implementation not the same as the one above? if float64(rng.stats.GetSize())/float64(zone.RangeMaxBytes) > 1 { log.Infof("splitting %s size=%d max=%d", rng, rng.stats.GetSize(), zone.RangeMaxBytes) if _, pErr := client.SendWrapped(rng, rng.context(), &roachpb.AdminSplitRequest{ Span: roachpb.Span{Key: desc.StartKey.AsRawKey()}, }); pErr != nil { return pErr.GoError() } } return nil }
func (bq *baseQueue) requiresSplit(cfg config.SystemConfig, repl *Replica) bool { // If there's no store (as is the case in some narrow unit tests), or if // the store's split queue is disabled, the "required" split will never // come. In that case, pretend we don't require the split. if store := repl.store; store == nil || store.splitQueue.Disabled() { return false } desc := repl.Desc() return !bq.acceptsUnsplitRanges && cfg.NeedsSplit(desc.StartKey, desc.EndKey) }
// GetTableDesc returns the table descriptor for the table with 'id'. // Returns nil if the descriptor is not present, or is present but is not a // table. func GetTableDesc(cfg config.SystemConfig, id sqlbase.ID) (*sqlbase.TableDescriptor, error) { if descVal := cfg.GetValue(sqlbase.MakeDescMetadataKey(id)); descVal != nil { desc := &sqlbase.Descriptor{} if err := descVal.GetProto(desc); err != nil { return nil, err } return desc.GetTable(), nil } return nil, nil }
func isDeleted(tableID sqlbase.ID, cfg config.SystemConfig) bool { descKey := sqlbase.MakeDescMetadataKey(tableID) val := cfg.GetValue(descKey) if val == nil { return false } var descriptor sqlbase.Descriptor if err := val.GetProto(&descriptor); err != nil { panic("unable to unmarshal table descriptor") } table := descriptor.GetTable() return table.Deleted() }
func expectDescriptor(systemConfig config.SystemConfig, idKey roachpb.Key, desc *Descriptor) error { descValue := systemConfig.GetValue(idKey) if descValue == nil { return errStaleMetadata } var cachedDesc Descriptor if err := descValue.GetProto(&cachedDesc); err != nil { return err } if !proto.Equal(&cachedDesc, desc) { return errStaleMetadata } return nil }
func expectDescriptorID(systemConfig config.SystemConfig, idKey roachpb.Key, id ID) error { idValue := systemConfig.GetValue(idKey) if idValue == nil { return errStaleMetadata } cachedID, err := idValue.GetInt() if err != nil { return err } if ID(cachedID) != id { return errStaleMetadata } return nil }
// process iterates through all keys in a replica's range, calling the garbage // collector for each key and associated set of values. GC'd keys are batched // into GC calls. Extant intents are resolved if intents are older than // intentAgeThreshold. The transaction and abort cache records are also // scanned and old entries evicted. During normal operation, both of these // records are cleaned up when their respective transaction finishes, so the // amount of work done here is expected to be small. // // Some care needs to be taken to avoid cyclic recreation of entries during GC: // * a Push initiated due to an intent may recreate a transaction entry // * resolving an intent may write a new abort cache entry // * obtaining the transaction for a abort cache entry requires a Push // // The following order is taken below: // 1) collect all intents with sufficiently old txn record // 2) collect these intents' transactions // 3) scan the transaction table, collecting abandoned or completed txns // 4) push all of these transactions (possibly recreating entries) // 5) resolve all intents (unless the txn is still PENDING), which will recreate // abort cache entries (but with the txn timestamp; i.e. likely gc'able) // 6) scan the abort cache table for old entries // 7) push these transactions (again, recreating txn entries). // 8) send a GCRequest. func (gcq *gcQueue) process( ctx context.Context, now hlc.Timestamp, repl *Replica, sysCfg config.SystemConfig, ) error { snap := repl.store.Engine().NewSnapshot() desc := repl.Desc() defer snap.Close() // Lookup the GC policy for the zone containing this key range. 
zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey) if err != nil { return errors.Errorf("could not find zone config for range %s: %s", repl, err) } gcKeys, info, err := RunGC(ctx, desc, snap, now, zone.GC, func(now hlc.Timestamp, txn *roachpb.Transaction, typ roachpb.PushTxnType) { pushTxn(gcq.store.DB(), now, txn, typ) }, func(intents []roachpb.Intent, poison bool, wait bool) error { return repl.store.intentResolver.resolveIntents(ctx, intents, poison, wait) }) if err != nil { return err } gcq.eventLog.VInfof(true, "completed with stats %+v", info) var ba roachpb.BatchRequest var gcArgs roachpb.GCRequest // TODO(tschottdorf): This is one of these instances in which we want // to be more careful that the request ends up on the correct Replica, // and we might have to worry about mixing range-local and global keys // in a batch which might end up spanning Ranges by the time it executes. gcArgs.Key = desc.StartKey.AsRawKey() gcArgs.EndKey = desc.EndKey.AsRawKey() gcArgs.Keys = gcKeys gcArgs.Threshold = info.Threshold // Technically not needed since we're talking directly to the Range. ba.RangeID = desc.RangeID ba.Timestamp = now ba.Add(&gcArgs) if _, pErr := repl.Send(ctx, ba); pErr != nil { return pErr.GoError() } return nil }
func waitForConfigChange(t *testing.T, s *server.TestServer) (*config.SystemConfig, error) { var foundDesc sql.DatabaseDescriptor var cfg *config.SystemConfig return cfg, util.IsTrueWithin(func() bool { if cfg = s.Gossip().GetSystemConfig(); cfg != nil { if val := cfg.GetValue(configDescKey); val != nil { if err := val.GetProto(&foundDesc); err != nil { t.Fatal(err) } return foundDesc.ID == configID } } return false }, 10*time.Second) }
func TestGet(t *testing.T) { defer leaktest.AfterTest(t) emptyKeys := []proto.KeyValue{} someKeys := []proto.KeyValue{ plainKV("a", "vala"), plainKV("c", "valc"), plainKV("d", "vald"), } testCases := []struct { values []proto.KeyValue key string found bool value string }{ {emptyKeys, "a", false, ""}, {emptyKeys, "b", false, ""}, {emptyKeys, "c", false, ""}, {emptyKeys, "d", false, ""}, {emptyKeys, "e", false, ""}, {someKeys, "", false, ""}, {someKeys, "b", false, ""}, {someKeys, "e", false, ""}, {someKeys, "a0", false, ""}, {someKeys, "a", true, "vala"}, {someKeys, "c", true, "valc"}, {someKeys, "d", true, "vald"}, } cfg := config.SystemConfig{} for tcNum, tc := range testCases { cfg.Values = tc.values val, found := cfg.GetValue([]byte(tc.key)) if found != tc.found { t.Errorf("#%d: expected found=%t", tcNum, tc.found) continue } if string(val) != tc.value { t.Errorf("#%d: expected value=%s, found %s", tcNum, tc.value, string(val)) } } }
// isRenamed tests if a descriptor is updated by gossip to the specified name // and version. func isRenamed( tableID sqlbase.ID, expectedName string, expectedVersion sqlbase.DescriptorVersion, cfg config.SystemConfig, ) bool { descKey := sqlbase.MakeDescMetadataKey(tableID) val := cfg.GetValue(descKey) if val == nil { return false } var descriptor sqlbase.Descriptor if err := val.GetProto(&descriptor); err != nil { panic("unable to unmarshal table descriptor") } table := descriptor.GetTable() return table.Name == expectedName && table.Version == expectedVersion }
func waitForConfigChange(t *testing.T, s *testServer) *config.SystemConfig { var foundDesc sql.Descriptor var cfg *config.SystemConfig util.SucceedsSoon(t, func() error { if cfg = s.Gossip().GetSystemConfig(); cfg != nil { if val := cfg.GetValue(configDescKey); val != nil { if err := val.GetProto(&foundDesc); err != nil { t.Fatal(err) } if id := foundDesc.GetDatabase().GetID(); id != configID { return util.Errorf("expected database id %d; got %d", configID, id) } return nil } } return util.Errorf("got nil system config") }) return cfg }
func TestGet(t *testing.T) { defer leaktest.AfterTest(t) emptyKeys := []roachpb.KeyValue{} someKeys := []roachpb.KeyValue{ plainKV("a", "vala"), plainKV("c", "valc"), plainKV("d", "vald"), } aVal := roachpb.MakeValueFromString("vala") bVal := roachpb.MakeValueFromString("valc") cVal := roachpb.MakeValueFromString("vald") testCases := []struct { values []roachpb.KeyValue key string value *roachpb.Value }{ {emptyKeys, "a", nil}, {emptyKeys, "b", nil}, {emptyKeys, "c", nil}, {emptyKeys, "d", nil}, {emptyKeys, "e", nil}, {someKeys, "", nil}, {someKeys, "b", nil}, {someKeys, "e", nil}, {someKeys, "a0", nil}, {someKeys, "a", &aVal}, {someKeys, "c", &bVal}, {someKeys, "d", &cVal}, } cfg := config.SystemConfig{} for tcNum, tc := range testCases { cfg.Values = tc.values if val := cfg.GetValue([]byte(tc.key)); !proto.Equal(val, tc.value) { t.Errorf("#%d: expected=%s, found=%s", tcNum, tc.value, val) } } }
// GetZoneConfig returns the zone config for the object with 'id'. func GetZoneConfig(cfg *config.SystemConfig, id uint32) (*config.ZoneConfig, error) { // Look in the zones table. if val, ok := cfg.GetValue(MakeZoneKey(ID(id))); ok { zone := &config.ZoneConfig{} if err := proto.Unmarshal(val, zone); err != nil { return nil, err } // We're done. return zone, nil } // No zone config for this ID. We need to figure out if it's a database // or table. Lookup its descriptor. rawDesc, ok := cfg.GetValue(MakeDescMetadataKey(ID(id))) if !ok { // No descriptor. This table/db could have been deleted, // just return the default config. return config.DefaultZoneConfig, nil } // Determine whether this is a database or table. // TODO(marc): we need a better way of doing this. Options include: // - add a type field on the descriptor table // - separate descriptor tables for databases and tables // - prebuild list of databases and tables in the system config var dbDesc DatabaseDescriptor if err := proto.Unmarshal(rawDesc, &dbDesc); err == nil { // parses as a database: return default config. return config.DefaultZoneConfig, nil } var tableDesc TableDescriptor if err := proto.Unmarshal(rawDesc, &tableDesc); err != nil { // does not parse as a table either: this means an entry in the // descriptor table we're not familiar with. return nil, util.Errorf("descriptor for object ID %d is not a table or database", id) } // This is a table descriptor. Lookup its parent database zone config. return GetZoneConfig(cfg, uint32(tableDesc.ParentID)) }
// process synchronously invokes admin split for each proposed split key. func (sq *splitQueue) process( ctx context.Context, now hlc.Timestamp, rng *Replica, sysCfg config.SystemConfig, ) error { // First handle case of splitting due to zone config maps. desc := rng.Desc() splitKeys := sysCfg.ComputeSplitKeys(desc.StartKey, desc.EndKey) if len(splitKeys) > 0 { log.Infof("splitting %s at keys %v", rng, splitKeys) log.Trace(ctx, fmt.Sprintf("splitting at keys %v", splitKeys)) for _, splitKey := range splitKeys { if err := sq.db.AdminSplit(splitKey.AsRawKey()); err != nil { return errors.Errorf("unable to split %s at key %q: %s", rng, splitKey, err) } } return nil } // Next handle case of splitting due to size. zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey) if err != nil { return err } size := rng.GetMVCCStats().Total() // FIXME: why is this implementation not the same as the one above? if float64(size)/float64(zone.RangeMaxBytes) > 1 { log.Infof("splitting %s size=%d max=%d", rng, size, zone.RangeMaxBytes) log.Trace(ctx, fmt.Sprintf("splitting size=%d max=%d", size, zone.RangeMaxBytes)) if _, pErr := client.SendWrappedWith(rng, ctx, roachpb.Header{ Timestamp: now, }, &roachpb.AdminSplitRequest{ Span: roachpb.Span{Key: desc.StartKey.AsRawKey()}, }); pErr != nil { return pErr.GoError() } } return nil }
// shouldQueue determines whether a range should be queued for // splitting. This is true if the range is intersected by a zone config // prefix or if the range's size in bytes exceeds the limit for the zone. func (*splitQueue) shouldQueue(now roachpb.Timestamp, rng *Replica, sysCfg *config.SystemConfig) (shouldQ bool, priority float64) { desc := rng.Desc() if len(sysCfg.ComputeSplitKeys(desc.StartKey, desc.EndKey)) > 0 { // Set priority to 1 in the event the range is split by zone configs. priority = 1 shouldQ = true } // Add priority based on the size of range compared to the max // size for the zone it's in. zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey) if err != nil { log.Error(err) return } if ratio := float64(rng.stats.GetSize()) / float64(zone.RangeMaxBytes); ratio > 1 { priority += ratio shouldQ = true } return }
func (rq *replicateQueue) shouldQueue(now roachpb.Timestamp, repl *Replica, sysCfg config.SystemConfig) (shouldQ bool, priority float64) { desc := repl.Desc() if len(sysCfg.ComputeSplitKeys(desc.StartKey, desc.EndKey)) > 0 { // If the replica's range needs splitting, wait until done. return } // Find the zone config for this range. zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey) if err != nil { log.Error(err) return } action, priority := rq.allocator.ComputeAction(*zone, desc) if action != AllocatorNoop { return true, priority } // See if there is a rebalancing opportunity present. shouldRebalance := rq.allocator.ShouldRebalance(repl.store.StoreID()) return shouldRebalance, 0 }
func expectDeleted(systemConfig config.SystemConfig, key roachpb.Key) error { if systemConfig.GetValue(key) != nil { return errStaleMetadata } return nil }
// TestComputeSplits is a table-driven test of SystemConfig.ComputeSplitKeys.
// Each case supplies the config's key/value contents and a [start, end)
// key span, and lists the object IDs at which splits are expected. The
// IDs are converted to keys with
// keys.MakeNonColumnKey(keys.MakeTablePrefix(id)) before comparison.
// A case passes outright when both the computed and the expected split
// lists are empty.
func TestComputeSplits(t *testing.T) {
	defer leaktest.AfterTest(t)

	const (
		// First ID after the reserved descriptor IDs; used for the "user"
		// descriptors below.
		start = keys.MaxReservedDescID + 1
		// First ID after the system config descriptor IDs; used for the
		// "reserved" tables below.
		reservedStart = keys.MaxSystemConfigDescID + 1
	)

	schema := sql.MakeMetadataSchema()
	// Real SQL system tables only.
	baseSql := schema.GetInitialValues()
	// Real SQL system tables plus some user stuff.
	userSql := append(schema.GetInitialValues(),
		descriptor(start), descriptor(start+1), descriptor(start+5))
	// Real SQL system with reserved non-system tables.
	// NOTE: schema is mutated here, so every GetInitialValues call from
	// this point on includes the two extra tables.
	schema.AddTable(reservedStart+1, "CREATE TABLE system.test1 (i INT PRIMARY KEY)",
		privilege.List{privilege.ALL})
	schema.AddTable(reservedStart+2, "CREATE TABLE system.test2 (i INT PRIMARY KEY)",
		privilege.List{privilege.ALL})
	reservedSql := schema.GetInitialValues()
	// Real SQL system with reserved non-system and user database.
	allSql := append(schema.GetInitialValues(),
		descriptor(start), descriptor(start+1), descriptor(start+5))

	// Expected split IDs. Although user descriptors exist only at start,
	// start+1 and start+5, the expectations list every ID from start
	// through start+5, including the IDs that have no descriptor.
	allUserSplits := []uint32{start, start + 1, start + 2, start + 3, start + 4, start + 5}
	allReservedSplits := []uint32{reservedStart, reservedStart + 1, reservedStart + 2}
	// Reserved splits followed by user splits: indexes 0-2 are reserved,
	// indexes 3-8 are user IDs.
	allSplits := append(allReservedSplits, allUserSplits...)

	testCases := []struct {
		// Contents of the system config under test.
		values []roachpb.KeyValue
		// Key span passed to ComputeSplitKeys.
		start, end roachpb.RKey
		// Use ints in the testcase definitions, more readable.
		splits []uint32
	}{
		// No data.
		{nil, roachpb.RKeyMin, roachpb.RKeyMax, nil},
		{nil, keys.MakeTablePrefix(start), roachpb.RKeyMax, nil},
		{nil, keys.MakeTablePrefix(start), keys.MakeTablePrefix(start + 10), nil},
		{nil, roachpb.RKeyMin, keys.MakeTablePrefix(start + 10), nil},

		// No user data.
		{baseSql, roachpb.RKeyMin, roachpb.RKeyMax, allReservedSplits[:1]},
		{baseSql, keys.MakeTablePrefix(start), roachpb.RKeyMax, nil},
		{baseSql, keys.MakeTablePrefix(start), keys.MakeTablePrefix(start + 10), nil},
		{baseSql, roachpb.RKeyMin, keys.MakeTablePrefix(start + 10), allReservedSplits[:1]},

		// User descriptors.
		{userSql, keys.MakeTablePrefix(start - 1), roachpb.RKeyMax, allUserSplits},
		{userSql, keys.MakeTablePrefix(start), roachpb.RKeyMax, allUserSplits[1:]},
		{userSql, keys.MakeTablePrefix(start), keys.MakeTablePrefix(start + 10), allUserSplits[1:]},
		{userSql, keys.MakeTablePrefix(start - 1), keys.MakeTablePrefix(start + 10), allUserSplits},
		{userSql, keys.MakeTablePrefix(start + 4), keys.MakeTablePrefix(start + 10), allUserSplits[5:]},
		{userSql, keys.MakeTablePrefix(start + 5), keys.MakeTablePrefix(start + 10), nil},
		{userSql, keys.MakeTablePrefix(start + 6), keys.MakeTablePrefix(start + 10), nil},
		{userSql, keys.MakeKey(keys.MakeTablePrefix(start), roachpb.RKey("foo")), keys.MakeTablePrefix(start + 10), allUserSplits[1:]},
		{userSql, keys.MakeKey(keys.MakeTablePrefix(start), roachpb.RKey("foo")), keys.MakeTablePrefix(start + 5), allUserSplits[1:5]},
		{userSql, keys.MakeKey(keys.MakeTablePrefix(start), roachpb.RKey("foo")), keys.MakeKey(keys.MakeTablePrefix(start+5), roachpb.RKey("bar")), allUserSplits[1:5]},
		{userSql, keys.MakeKey(keys.MakeTablePrefix(start), roachpb.RKey("foo")), keys.MakeKey(keys.MakeTablePrefix(start), roachpb.RKey("morefoo")), nil},

		// Reserved descriptors.
		{reservedSql, roachpb.RKeyMin, roachpb.RKeyMax, allReservedSplits},
		{reservedSql, keys.MakeTablePrefix(reservedStart), roachpb.RKeyMax, allReservedSplits[1:]},
		{reservedSql, keys.MakeTablePrefix(start), roachpb.RKeyMax, nil},
		{reservedSql, keys.MakeTablePrefix(reservedStart), keys.MakeTablePrefix(start + 10), allReservedSplits[1:]},
		{reservedSql, roachpb.RKeyMin, keys.MakeTablePrefix(reservedStart + 2), allReservedSplits[:2]},
		{reservedSql, roachpb.RKeyMin, keys.MakeTablePrefix(reservedStart + 10), allReservedSplits},
		{reservedSql, keys.MakeTablePrefix(reservedStart), keys.MakeTablePrefix(reservedStart + 2), allReservedSplits[1:2]},
		{reservedSql, keys.MakeKey(keys.MakeTablePrefix(reservedStart), roachpb.RKey("foo")), keys.MakeKey(keys.MakeTablePrefix(start+10), roachpb.RKey("foo")), allReservedSplits[1:]},

		// Reserved/User mix.
		{allSql, roachpb.RKeyMin, roachpb.RKeyMax, allSplits},
		{allSql, keys.MakeTablePrefix(reservedStart + 1), roachpb.RKeyMax, allSplits[2:]},
		{allSql, keys.MakeTablePrefix(start), roachpb.RKeyMax, allSplits[4:]},
		{allSql, keys.MakeTablePrefix(reservedStart), keys.MakeTablePrefix(start + 10), allSplits[1:]},
		{allSql, roachpb.RKeyMin, keys.MakeTablePrefix(start + 2), allSplits[:5]},
		{allSql, keys.MakeKey(keys.MakeTablePrefix(reservedStart), roachpb.RKey("foo")), keys.MakeKey(keys.MakeTablePrefix(start+5), roachpb.RKey("foo")), allSplits[1:8]},
	}

	// A single config value is reused; only its Values change per case.
	cfg := config.SystemConfig{}
	for tcNum, tc := range testCases {
		cfg.Values = tc.values
		splits := cfg.ComputeSplitKeys(tc.start, tc.end)
		// Both empty counts as a match (whether nil or zero-length), which
		// reflect.DeepEqual below would not treat as equal.
		if len(splits) == 0 && len(tc.splits) == 0 {
			continue
		}

		// Convert ints to actual keys.
		expected := []roachpb.RKey{}
		for _, s := range tc.splits {
			expected = append(expected, keys.MakeNonColumnKey(keys.MakeTablePrefix(s)))
		}
		if !reflect.DeepEqual(splits, expected) {
			t.Errorf("#%d: bad splits:\ngot: %v\nexpected: %v", tcNum, splits, expected)
		}
	}
}
func TestGetLargestID(t *testing.T) { defer leaktest.AfterTest(t) testCases := []struct { values []roachpb.KeyValue largest uint32 maxID uint32 errStr string }{ // No data. {nil, 0, 0, "descriptor table not found"}, // Some data, but not from the system span. {[]roachpb.KeyValue{plainKV("a", "b")}, 0, 0, "descriptor table not found"}, // Some real data, but no descriptors. {[]roachpb.KeyValue{ sqlKV(keys.NamespaceTableID, 1, 1), sqlKV(keys.NamespaceTableID, 1, 2), sqlKV(keys.UsersTableID, 1, 3), }, 0, 0, "descriptor table not found"}, // Single correct descriptor entry. {[]roachpb.KeyValue{sqlKV(keys.DescriptorTableID, 1, 1)}, 1, 0, ""}, // Surrounded by other data. {[]roachpb.KeyValue{ sqlKV(keys.NamespaceTableID, 1, 20), sqlKV(keys.NamespaceTableID, 1, 30), sqlKV(keys.DescriptorTableID, 1, 8), sqlKV(keys.ZonesTableID, 1, 40), }, 8, 0, ""}, // Descriptors with holes. Index ID does not matter. {[]roachpb.KeyValue{ sqlKV(keys.DescriptorTableID, 1, 1), sqlKV(keys.DescriptorTableID, 2, 5), sqlKV(keys.DescriptorTableID, 3, 8), sqlKV(keys.DescriptorTableID, 4, 12), }, 12, 0, ""}, // Real SQL layout. {sql.MakeMetadataSchema().GetInitialValues(), keys.MaxSystemConfigDescID + 1, 0, ""}, // Test non-zero max. {[]roachpb.KeyValue{ sqlKV(keys.DescriptorTableID, 1, 1), sqlKV(keys.DescriptorTableID, 2, 5), sqlKV(keys.DescriptorTableID, 3, 8), sqlKV(keys.DescriptorTableID, 4, 12), }, 8, 8, ""}, // Test non-zero max. 
{[]roachpb.KeyValue{ sqlKV(keys.DescriptorTableID, 1, 1), sqlKV(keys.DescriptorTableID, 2, 5), sqlKV(keys.DescriptorTableID, 3, 8), sqlKV(keys.DescriptorTableID, 4, 12), }, 5, 7, ""}, } cfg := config.SystemConfig{} for tcNum, tc := range testCases { cfg.Values = tc.values ret, err := cfg.GetLargestObjectID(tc.maxID) if tc.errStr == "" { if err != nil { t.Errorf("#%d: error: %v", tcNum, err) continue } } else if !testutils.IsError(err, tc.errStr) { t.Errorf("#%d: expected err=%s, got %v", tcNum, tc.errStr, err) continue } if ret != tc.largest { t.Errorf("#%d: expected largest=%d, got %d", tcNum, tc.largest, ret) } } }
// process iterates through all keys in a replica's range, calling the garbage
// collector for each key and associated set of values. GC'd keys are batched
// into GC calls. Extant intents are resolved if intents are older than
// intentAgeThreshold.
//
// Outline:
//  1. Scan the range through an iterator over an engine snapshot,
//     grouping each key's metadata entry with its versioned values.
//  2. For each group, record intents older than the expiration for
//     resolution and ask the garbage collector which values may go.
//  3. Push the transactions of the recorded intents, one goroutine per
//     transaction, and wait for all of them.
//  4. Resolve the intents of every transaction that is no longer PENDING.
//  5. If there is anything to resolve or collect, send a single GCRequest
//     and record now as the replica's last verification timestamp.
//
// It returns an error if the zone config lookup fails, the iterator
// reports an error, or sending the GCRequest fails. Per-key decoding
// problems and a failure to store the verification timestamp are only
// logged.
func (gcq *gcQueue) process(now roachpb.Timestamp, repl *Replica, sysCfg *config.SystemConfig) error {
	snap := repl.rm.Engine().NewSnapshot()
	desc := repl.Desc()
	iter := newRangeDataIterator(desc, snap)
	// NOTE(review): deferred calls run last-in-first-out, so snap.Close()
	// runs before iter.Close() -- confirm that closing the snapshot ahead
	// of the iterator built on it is intended.
	defer iter.Close()
	defer snap.Close()

	// Lookup the GC policy for the zone containing this key range.
	zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey)
	if err != nil {
		return fmt.Errorf("could not find GC policy for range %s: %s", repl, err)
	}
	policy := zone.GC

	gcMeta := roachpb.NewGCMetadata(now.WallTime)
	gc := engine.NewGarbageCollector(now, *policy)

	// Compute intent expiration (intent age at which we attempt to resolve).
	intentExp := now
	intentExp.WallTime -= intentAgeThreshold.Nanoseconds()

	// TODO(tschottdorf): execution will use a leader-assigned local
	// timestamp to compute intent age. While this should be fine, could
	// consider adding a Now timestamp to GCRequest which would be used
	// instead.
	gcArgs := &roachpb.GCRequest{
		RequestHeader: roachpb.RequestHeader{
			RangeID: desc.RangeID,
		},
	}
	// mu guards oldestIntentNanos, which is updated both from the scan
	// below and from the pushTxn goroutines.
	var mu sync.Mutex
	var oldestIntentNanos int64 = math.MaxInt64
	// State of the key group currently being accumulated by the scan:
	// expBaseKey is the decoded base key, keys/vals hold its metadata entry
	// (index 0) followed by its values.
	var expBaseKey roachpb.Key
	var keys []roachpb.EncodedKey
	var vals [][]byte

	// Maps from txn ID to txn and intent key slice.
	txnMap := map[string]*roachpb.Transaction{}
	intentMap := map[string][]roachpb.Intent{}

	// updateOldestIntent atomically updates the oldest intent.
	// It only ever lowers oldestIntentNanos.
	updateOldestIntent := func(intentNanos int64) {
		mu.Lock()
		defer mu.Unlock()
		if intentNanos < oldestIntentNanos {
			oldestIntentNanos = intentNanos
		}
	}

	// processKeysAndValues is invoked with each key and its set of
	// values. Intents older than the intent age threshold are sent for
	// resolution and values after the MVCC metadata, and possible
	// intent, are sent for garbage collection.
	// It reads the keys/vals/expBaseKey state captured from the enclosing
	// function rather than taking arguments.
	processKeysAndValues := func() {
		// If there's more than a single value for the key, possibly send for GC.
		if len(keys) > 1 {
			meta := &engine.MVCCMetadata{}
			if err := proto.Unmarshal(vals[0], meta); err != nil {
				log.Errorf("unable to unmarshal MVCC metadata for key %q: %s", keys[0], err)
			} else {
				// In the event that there's an active intent, send for
				// intent resolution if older than the threshold.
				startIdx := 1
				if meta.Txn != nil {
					// Keep track of intent to resolve if older than the intent
					// expiration threshold.
					if meta.Timestamp.Less(intentExp) {
						id := string(meta.Txn.ID)
						txnMap[id] = meta.Txn
						intentMap[id] = append(intentMap[id], roachpb.Intent{Key: expBaseKey})
					} else {
						// Too young to resolve; it still counts towards the
						// oldest outstanding intent.
						updateOldestIntent(meta.Txn.OrigTimestamp.WallTime)
					}
					// With an active intent, GC ignores MVCC metadata & intent value.
					startIdx = 2
				}
				// See if any values may be GC'd.
				// A zero timestamp from Filter means nothing is collectable.
				if gcTS := gc.Filter(keys[startIdx:], vals[startIdx:]); !gcTS.Equal(roachpb.ZeroTimestamp) {
					// TODO(spencer): need to split the requests up into
					// multiple requests in the event that more than X keys
					// are added to the request.
					gcArgs.Keys = append(gcArgs.Keys, roachpb.GCRequest_GCKey{Key: expBaseKey, Timestamp: gcTS})
				}
			}
		}
	}

	// Iterate through the keys and values of this replica's range.
	for ; iter.Valid(); iter.Next() {
		baseKey, ts, isValue, err := engine.MVCCDecodeKey(iter.Key())
		if err != nil {
			log.Errorf("unable to decode MVCC key: %q: %v", iter.Key(), err)
			continue
		}
		if !isValue {
			// Moving to the next key (& values).
			// Flush the previous group before starting a new one.
			processKeysAndValues()
			expBaseKey = baseKey
			keys = []roachpb.EncodedKey{iter.Key()}
			vals = [][]byte{iter.Value()}
		} else {
			// A value must belong to the group currently being built.
			if !baseKey.Equal(expBaseKey) {
				log.Errorf("unexpectedly found a value for %q with ts=%s; expected key %q", baseKey, ts, expBaseKey)
				continue
			}
			keys = append(keys, iter.Key())
			vals = append(vals, iter.Value())
		}
	}
	if iter.Error() != nil {
		return iter.Error()
	}
	// Handle last collected set of keys/vals.
	processKeysAndValues()

	// Process push transactions in parallel.
	// NOTE(review): wg is handed to pushTxn, which presumably calls
	// wg.Done when finished -- verify, since wg.Wait below depends on it.
	var wg sync.WaitGroup
	for _, txn := range txnMap {
		wg.Add(1)
		go gcq.pushTxn(repl, now, txn, updateOldestIntent, &wg)
	}
	wg.Wait()

	// Resolve all intents.
	// Intents of transactions that are still PENDING are left in place.
	var intents []roachpb.Intent
	for id, txn := range txnMap {
		if txn.Status != roachpb.PENDING {
			for _, intent := range intentMap[id] {
				// intent is a copy of the map's element; the copy with Txn
				// filled in is what gets collected.
				intent.Txn = *txn
				intents = append(intents, intent)
			}
		}
	}

	// done stays true only if there was neither an intent to resolve nor a
	// key to collect.
	done := true
	if len(intents) > 0 {
		done = false
		repl.resolveIntents(repl.context(), intents)
	}

	// Set start and end keys.
	// The request spans from the first collected key to just past the last.
	if len(gcArgs.Keys) > 0 {
		done = false
		gcArgs.Key = gcArgs.Keys[0].Key
		gcArgs.EndKey = gcArgs.Keys[len(gcArgs.Keys)-1].Key.Next()
	}

	if done {
		return nil
	}

	// Send GC request through range.
	gcMeta.OldestIntentNanos = proto.Int64(oldestIntentNanos)
	gcArgs.GCMeta = *gcMeta
	if _, err := client.SendWrapped(repl, repl.context(), gcArgs); err != nil {
		return err
	}

	// Store current timestamp as last verification for this replica, as
	// we've just successfully scanned.
	// A failure here is logged but does not fail the GC run.
	if err := repl.SetLastVerificationTimestamp(now); err != nil {
		log.Errorf("failed to set last verification timestamp for replica %s: %s", repl, err)
	}

	return nil
}
// process iterates through all keys in a replica's range, calling the garbage // collector for each key and associated set of values. GC'd keys are batched // into GC calls. Extant intents are resolved if intents are older than // intentAgeThreshold. The transaction and sequence cache records are also // scanned and old entries evicted. During normal operation, both of these // records are cleaned up when their respective transaction finishes, so the // amount of work done here is expected to be small. // // Some care needs to be taken to avoid cyclic recreation of entries during GC: // * a Push initiated due to an intent may recreate a transaction entry // * resolving an intent may write a new sequence cache entry // * obtaining the transaction for a sequence cache entry requires a Push // // The following order is taken below: // 1) collect all intents with sufficiently old txn record // 2) collect these intents' transactions // 3) scan the transaction table, collecting abandoned or completed txns // 4) push all of these transactions (possibly recreating entries) // 5) resolve all intents (unless the txn is still PENDING), which will recreate // sequence cache entries (but with the txn timestamp; i.e. likely gc'able) // 6) scan the sequence table for old entries // 7) push these transactions (again, recreating txn entries). // 8) send a GCRequest. func (gcq *gcQueue) process(now roachpb.Timestamp, repl *Replica, sysCfg config.SystemConfig) error { snap := repl.store.Engine().NewSnapshot() desc := repl.Desc() iter := newReplicaDataIterator(desc, snap, true /* replicatedOnly */) defer iter.Close() defer snap.Close() // Lookup the GC policy for the zone containing this key range. zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey) if err != nil { return util.Errorf("could not find zone config for range %s: %s", repl, err) } gc := engine.NewGarbageCollector(now, zone.GC) // Compute intent expiration (intent age at which we attempt to resolve). 
intentExp := now intentExp.WallTime -= intentAgeThreshold.Nanoseconds() txnExp := now txnExp.WallTime -= txnCleanupThreshold.Nanoseconds() gcArgs := &roachpb.GCRequest{} // TODO(tschottdorf): This is one of these instances in which we want // to be more careful that the request ends up on the correct Replica, // and we might have to worry about mixing range-local and global keys // in a batch which might end up spanning Ranges by the time it executes. gcArgs.Key = desc.StartKey.AsRawKey() gcArgs.EndKey = desc.EndKey.AsRawKey() var expBaseKey roachpb.Key var keys []engine.MVCCKey var vals [][]byte // Maps from txn ID to txn and intent key slice. txnMap := map[uuid.UUID]*roachpb.Transaction{} intentSpanMap := map[uuid.UUID][]roachpb.Span{} // processKeysAndValues is invoked with each key and its set of // values. Intents older than the intent age threshold are sent for // resolution and values after the MVCC metadata, and possible // intent, are sent for garbage collection. var intentCount int processKeysAndValues := func() { // If there's more than a single value for the key, possibly send for GC. if len(keys) > 1 { meta := &engine.MVCCMetadata{} if err := proto.Unmarshal(vals[0], meta); err != nil { log.Errorf("unable to unmarshal MVCC metadata for key %q: %s", keys[0], err) } else { // In the event that there's an active intent, send for // intent resolution if older than the threshold. startIdx := 1 if meta.Txn != nil { // Keep track of intent to resolve if older than the intent // expiration threshold. if meta.Timestamp.Less(intentExp) { txnID := *meta.Txn.ID txn := &roachpb.Transaction{ TxnMeta: *meta.Txn, } txnMap[txnID] = txn intentCount++ intentSpanMap[txnID] = append(intentSpanMap[txnID], roachpb.Span{Key: expBaseKey}) } // With an active intent, GC ignores MVCC metadata & intent value. startIdx = 2 } // See if any values may be GC'd. 
if gcTS := gc.Filter(keys[startIdx:], vals[startIdx:]); !gcTS.Equal(roachpb.ZeroTimestamp) { // TODO(spencer): need to split the requests up into // multiple requests in the event that more than X keys // are added to the request. gcArgs.Keys = append(gcArgs.Keys, roachpb.GCRequest_GCKey{Key: expBaseKey, Timestamp: gcTS}) } } } } // Iterate through the keys and values of this replica's range. for ; iter.Valid(); iter.Next() { iterKey := iter.Key() if !iterKey.IsValue() || !iterKey.Key.Equal(expBaseKey) { // Moving to the next key (& values). processKeysAndValues() expBaseKey = iterKey.Key if !iterKey.IsValue() { keys = []engine.MVCCKey{iter.Key()} vals = [][]byte{iter.Value()} continue } // An implicit metadata. keys = []engine.MVCCKey{engine.MakeMVCCMetadataKey(iterKey.Key)} // A nil value for the encoded MVCCMetadata. This will unmarshal to an // empty MVCCMetadata which is sufficient for processKeysAndValues to // determine that there is no intent. vals = [][]byte{nil} } keys = append(keys, iter.Key()) vals = append(vals, iter.Value()) } if iter.Error() != nil { return iter.Error() } // Handle last collected set of keys/vals. processKeysAndValues() gcq.eventLog.Infof(true, "assembled %d transactions from %d old intents; found %d gc'able keys", len(txnMap), intentCount, len(gcArgs.Keys)) txnKeys, err := gcq.processTransactionTable(repl, txnMap, txnExp) if err != nil { return err } // From now on, all newly added keys are range-local. // TODO(tschottdorf): Might need to use two requests at some point since we // hard-coded the full non-local key range in the header, but that does // not take into account the range-local keys. It will be OK as long as // we send directly to the Replica, though. gcArgs.Keys = append(gcArgs.Keys, txnKeys...) // Process push transactions in parallel. 
var wg sync.WaitGroup gcq.eventLog.Infof(true, "pushing %d txns", len(txnMap)) for _, txn := range txnMap { if txn.Status != roachpb.PENDING { continue } wg.Add(1) go gcq.pushTxn(repl, now, txn, roachpb.PUSH_ABORT, &wg) } wg.Wait() // Resolve all intents. var intents []roachpb.Intent for txnID, txn := range txnMap { if txn.Status != roachpb.PENDING { for _, intent := range intentSpanMap[txnID] { intents = append(intents, roachpb.Intent{Span: intent, Status: txn.Status, Txn: txn.TxnMeta}) } } } gcq.eventLog.Infof(true, "resolving %d intents", len(intents)) if pErr := repl.store.intentResolver.resolveIntents(repl.context(), repl, intents, true /* wait */, false /* !poison */); pErr != nil { return pErr.GoError() } // Deal with any leftover sequence cache keys. There shouldn't be many of // them. leftoverSeqCacheKeys := gcq.processSequenceCache(repl, now, txnExp, txnMap) gcq.eventLog.Infof(true, "collected %d leftover sequence cache keys", len(leftoverSeqCacheKeys)) gcArgs.Keys = append(gcArgs.Keys, leftoverSeqCacheKeys...) gcq.eventLog.Infof(true, "sending gc request for %d keys", len(gcArgs.Keys)) var ba roachpb.BatchRequest // Technically not needed since we're talking directly to the Range. ba.RangeID = desc.RangeID ba.Timestamp = now ba.Add(gcArgs) if _, pErr := repl.Send(repl.context(), ba); pErr != nil { return pErr.GoError() } return nil }
// process iterates through all keys in a replica's range, calling the garbage // collector for each key and associated set of values. GC'd keys are batched // into GC calls. Extant intents are resolved if intents are older than // intentAgeThreshold. func (gcq *gcQueue) process(now roachpb.Timestamp, repl *Replica, sysCfg *config.SystemConfig) error { snap := repl.store.Engine().NewSnapshot() desc := repl.Desc() iter := newReplicaDataIterator(desc, snap) defer iter.Close() defer snap.Close() // Lookup the GC policy for the zone containing this key range. zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey) if err != nil { return fmt.Errorf("could not find GC policy for range %s: %s", repl, err) } policy := zone.GC gcMeta := roachpb.NewGCMetadata(now.WallTime) gc := engine.NewGarbageCollector(now, *policy) // Compute intent expiration (intent age at which we attempt to resolve). intentExp := now intentExp.WallTime -= intentAgeThreshold.Nanoseconds() txnExp := now txnExp.WallTime -= txnCleanupThreshold.Nanoseconds() gcArgs := &roachpb.GCRequest{} // TODO(tschottdorf): This is one of these instances in which we want // to be more careful that the request ends up on the correct Replica, // and we might have to worry about mixing range-local and global keys // in a batch which might end up spanning Ranges by the time it executes. gcArgs.Key = desc.StartKey.AsRawKey() gcArgs.EndKey = desc.EndKey.AsRawKey() var expBaseKey roachpb.Key var keys []engine.MVCCKey var vals [][]byte // Maps from txn ID to txn and intent key slice. txnMap := map[string]*roachpb.Transaction{} intentSpanMap := map[string][]roachpb.Span{} // processKeysAndValues is invoked with each key and its set of // values. Intents older than the intent age threshold are sent for // resolution and values after the MVCC metadata, and possible // intent, are sent for garbage collection. processKeysAndValues := func() { // If there's more than a single value for the key, possibly send for GC. 
if len(keys) > 1 { meta := &engine.MVCCMetadata{} if err := proto.Unmarshal(vals[0], meta); err != nil { log.Errorf("unable to unmarshal MVCC metadata for key %q: %s", keys[0], err) } else { // In the event that there's an active intent, send for // intent resolution if older than the threshold. startIdx := 1 if meta.Txn != nil { // Keep track of intent to resolve if older than the intent // expiration threshold. if meta.Timestamp.Less(intentExp) { id := string(meta.Txn.ID) txnMap[id] = meta.Txn intentSpanMap[id] = append(intentSpanMap[id], roachpb.Span{Key: expBaseKey}) } // With an active intent, GC ignores MVCC metadata & intent value. startIdx = 2 } // See if any values may be GC'd. if gcTS := gc.Filter(keys[startIdx:], vals[startIdx:]); !gcTS.Equal(roachpb.ZeroTimestamp) { // TODO(spencer): need to split the requests up into // multiple requests in the event that more than X keys // are added to the request. gcArgs.Keys = append(gcArgs.Keys, roachpb.GCRequest_GCKey{Key: expBaseKey, Timestamp: gcTS}) } } } } // Iterate through the keys and values of this replica's range. for ; iter.Valid(); iter.Next() { baseKey, ts, isValue, err := engine.MVCCDecodeKey(iter.Key()) if err != nil { log.Errorf("unable to decode MVCC key: %q: %v", iter.Key(), err) continue } if !isValue { // Moving to the next key (& values). processKeysAndValues() expBaseKey = baseKey keys = []engine.MVCCKey{iter.Key()} vals = [][]byte{iter.Value()} } else { if !baseKey.Equal(expBaseKey) { log.Errorf("unexpectedly found a value for %q with ts=%s; expected key %q", baseKey, ts, expBaseKey) continue } keys = append(keys, iter.Key()) vals = append(vals, iter.Value()) } } if iter.Error() != nil { return iter.Error() } // Handle last collected set of keys/vals. processKeysAndValues() txnKeys, err := processTransactionTable(repl, txnMap, txnExp) if err != nil { return err } // From now on, all newly added keys are range-local. 
// TODO(tschottdorf): Might need to use two requests at some point since we // hard-coded the full non-local key range in the header, but that does // not take into account the range-local keys. It will be OK as long as // we send directly to the Replica, though. gcArgs.Keys = append(gcArgs.Keys, txnKeys...) // Process push transactions in parallel. var wg sync.WaitGroup for _, txn := range txnMap { if txn.Status != roachpb.PENDING { continue } wg.Add(1) go pushTxn(repl, now, txn, roachpb.ABORT_TXN, &wg) } wg.Wait() // Resolve all intents. var intents []roachpb.Intent for id, txn := range txnMap { if txn.Status != roachpb.PENDING { for _, intent := range intentSpanMap[id] { intents = append(intents, roachpb.Intent{Span: intent, Txn: *txn}) } } } if err := repl.resolveIntents(repl.context(), intents, true /* wait */, false /* !poison */); err != nil { return err } // Deal with any leftover sequence cache keys. There shouldn't be many of // them. gcArgs.Keys = append(gcArgs.Keys, processSequenceCache(repl, now, txnExp, txnMap)...) // Send GC request through range. gcArgs.GCMeta = *gcMeta var ba roachpb.BatchRequest // Technically not needed since we're talking directly to the Range. ba.RangeID = desc.RangeID ba.Timestamp = now ba.Add(gcArgs) if _, pErr := repl.Send(repl.context(), ba); pErr != nil { return pErr.GoError() } // Store current timestamp as last verification for this replica, as // we've just successfully scanned. if err := repl.SetLastVerificationTimestamp(now); err != nil { log.Errorf("failed to set last verification timestamp for replica %s: %s", repl, err) } return nil }
func TestComputeSplits(t *testing.T) { defer leaktest.AfterTest(t)() const ( start = keys.MaxReservedDescID + 1 reservedStart = keys.MaxSystemConfigDescID + 1 ) schema := sqlbase.MakeMetadataSchema() // Real SQL system tables only. baseSql := schema.GetInitialValues() // Real SQL system tables plus some user stuff. userSql := append(schema.GetInitialValues(), descriptor(start), descriptor(start+1), descriptor(start+5)) // Real SQL system with reserved non-system tables. priv := sqlbase.NewDefaultPrivilegeDescriptor() desc1 := sql.CreateTableDescriptor(reservedStart+1, keys.SystemDatabaseID, "CREATE TABLE system.test1 (i INT PRIMARY KEY)", priv) schema.AddDescriptor(keys.SystemDatabaseID, &desc1) desc2 := sql.CreateTableDescriptor(reservedStart+2, keys.SystemDatabaseID, "CREATE TABLE system.test2 (i INT PRIMARY KEY)", priv) schema.AddDescriptor(keys.SystemDatabaseID, &desc2) reservedSql := schema.GetInitialValues() // Real SQL system with reserved non-system and user database. allSql := append(schema.GetInitialValues(), descriptor(start), descriptor(start+1), descriptor(start+5)) sort.Sort(roachpb.KeyValueByKey(allSql)) allUserSplits := []uint32{start, start + 1, start + 2, start + 3, start + 4, start + 5} var allReservedSplits []uint32 for i := 0; i < schema.SystemDescriptorCount()-schema.SystemConfigDescriptorCount(); i++ { allReservedSplits = append(allReservedSplits, reservedStart+uint32(i)) } allSplits := append(allReservedSplits, allUserSplits...) testCases := []struct { values []roachpb.KeyValue start, end roachpb.RKey // Use ints in the testcase definitions, more readable. splits []uint32 }{ // No data. {nil, roachpb.RKeyMin, roachpb.RKeyMax, nil}, {nil, keys.MakeTablePrefix(start), roachpb.RKeyMax, nil}, {nil, keys.MakeTablePrefix(start), keys.MakeTablePrefix(start + 10), nil}, {nil, roachpb.RKeyMin, keys.MakeTablePrefix(start + 10), nil}, // No user data. 
{baseSql, roachpb.RKeyMin, roachpb.RKeyMax, allReservedSplits}, {baseSql, keys.MakeTablePrefix(start), roachpb.RKeyMax, nil}, {baseSql, keys.MakeTablePrefix(start), keys.MakeTablePrefix(start + 10), nil}, {baseSql, roachpb.RKeyMin, keys.MakeTablePrefix(start + 10), allReservedSplits}, // User descriptors. {userSql, keys.MakeTablePrefix(start - 1), roachpb.RKeyMax, allUserSplits}, {userSql, keys.MakeTablePrefix(start), roachpb.RKeyMax, allUserSplits[1:]}, {userSql, keys.MakeTablePrefix(start), keys.MakeTablePrefix(start + 10), allUserSplits[1:]}, {userSql, keys.MakeTablePrefix(start - 1), keys.MakeTablePrefix(start + 10), allUserSplits}, {userSql, keys.MakeTablePrefix(start + 4), keys.MakeTablePrefix(start + 10), allUserSplits[5:]}, {userSql, keys.MakeTablePrefix(start + 5), keys.MakeTablePrefix(start + 10), nil}, {userSql, keys.MakeTablePrefix(start + 6), keys.MakeTablePrefix(start + 10), nil}, {userSql, testutils.MakeKey(keys.MakeTablePrefix(start), roachpb.RKey("foo")), keys.MakeTablePrefix(start + 10), allUserSplits[1:]}, {userSql, testutils.MakeKey(keys.MakeTablePrefix(start), roachpb.RKey("foo")), keys.MakeTablePrefix(start + 5), allUserSplits[1:5]}, {userSql, testutils.MakeKey(keys.MakeTablePrefix(start), roachpb.RKey("foo")), testutils.MakeKey(keys.MakeTablePrefix(start+5), roachpb.RKey("bar")), allUserSplits[1:5]}, {userSql, testutils.MakeKey(keys.MakeTablePrefix(start), roachpb.RKey("foo")), testutils.MakeKey(keys.MakeTablePrefix(start), roachpb.RKey("morefoo")), nil}, // Reserved descriptors. 
{reservedSql, roachpb.RKeyMin, roachpb.RKeyMax, allReservedSplits}, {reservedSql, keys.MakeTablePrefix(reservedStart), roachpb.RKeyMax, allReservedSplits[1:]}, {reservedSql, keys.MakeTablePrefix(start), roachpb.RKeyMax, nil}, {reservedSql, keys.MakeTablePrefix(reservedStart), keys.MakeTablePrefix(start + 10), allReservedSplits[1:]}, {reservedSql, roachpb.RKeyMin, keys.MakeTablePrefix(reservedStart + 2), allReservedSplits[:2]}, {reservedSql, roachpb.RKeyMin, keys.MakeTablePrefix(reservedStart + 10), allReservedSplits}, {reservedSql, keys.MakeTablePrefix(reservedStart), keys.MakeTablePrefix(reservedStart + 2), allReservedSplits[1:2]}, {reservedSql, testutils.MakeKey(keys.MakeTablePrefix(reservedStart), roachpb.RKey("foo")), testutils.MakeKey(keys.MakeTablePrefix(start+10), roachpb.RKey("foo")), allReservedSplits[1:]}, // Reserved/User mix. {allSql, roachpb.RKeyMin, roachpb.RKeyMax, allSplits}, {allSql, keys.MakeTablePrefix(reservedStart + 1), roachpb.RKeyMax, allSplits[2:]}, {allSql, keys.MakeTablePrefix(start), roachpb.RKeyMax, allUserSplits[1:]}, {allSql, keys.MakeTablePrefix(reservedStart), keys.MakeTablePrefix(start + 10), allSplits[1:]}, {allSql, roachpb.RKeyMin, keys.MakeTablePrefix(start + 2), allSplits[:6]}, {allSql, testutils.MakeKey(keys.MakeTablePrefix(reservedStart), roachpb.RKey("foo")), testutils.MakeKey(keys.MakeTablePrefix(start+5), roachpb.RKey("foo")), allSplits[1:9]}, } cfg := config.SystemConfig{} for tcNum, tc := range testCases { cfg.Values = tc.values splits := cfg.ComputeSplitKeys(tc.start, tc.end) if len(splits) == 0 && len(tc.splits) == 0 { continue } // Convert ints to actual keys. expected := []roachpb.RKey{} for _, s := range tc.splits { expected = append(expected, keys.MakeRowSentinelKey(keys.MakeTablePrefix(s))) } if !reflect.DeepEqual(splits, expected) { t.Errorf("#%d: bad splits:\ngot: %v\nexpected: %v", tcNum, splits, expected) } } }
func (rq *replicateQueue) process(now roachpb.Timestamp, repl *Replica, sysCfg config.SystemConfig) error { desc := repl.Desc() // Find the zone config for this range. zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey) if err != nil { return err } action, _ := rq.allocator.ComputeAction(*zone, desc) // Avoid taking action if the range has too many dead replicas to make // quorum. deadReplicas := rq.allocator.storePool.deadReplicas(desc.Replicas) quorum := computeQuorum(len(desc.Replicas)) liveReplicaCount := len(desc.Replicas) - len(deadReplicas) if liveReplicaCount < quorum { return util.Errorf("range requires a replication change, but lacks a quorum of live nodes.") } switch action { case AllocatorAdd: newStore, err := rq.allocator.AllocateTarget(zone.ReplicaAttrs[0], desc.Replicas, true, nil) if err != nil { return err } newReplica := roachpb.ReplicaDescriptor{ NodeID: newStore.Node.NodeID, StoreID: newStore.StoreID, } if err = repl.ChangeReplicas(roachpb.ADD_REPLICA, newReplica, desc); err != nil { return err } case AllocatorRemove: removeReplica, err := rq.allocator.RemoveTarget(desc.Replicas) if err != nil { return err } if err = repl.ChangeReplicas(roachpb.REMOVE_REPLICA, removeReplica, desc); err != nil { return err } // Do not requeue if we removed ourselves. if removeReplica.StoreID == repl.store.StoreID() { return nil } case AllocatorRemoveDead: if len(deadReplicas) == 0 { if log.V(1) { log.Warningf("Range of replica %s was identified as having dead replicas, but no dead replicas were found.", repl) } break } if err = repl.ChangeReplicas(roachpb.REMOVE_REPLICA, deadReplicas[0], desc); err != nil { return err } case AllocatorNoop: // The Noop case will result if this replica was queued in order to // rebalance. Attempt to find a rebalancing target. rebalanceStore := rq.allocator.RebalanceTarget(repl.store.StoreID(), zone.ReplicaAttrs[0], desc.Replicas) if rebalanceStore == nil { // No action was necessary and no rebalance target was found. 
Return // without re-queuing this replica. return nil } rebalanceReplica := roachpb.ReplicaDescriptor{ NodeID: rebalanceStore.Node.NodeID, StoreID: rebalanceStore.StoreID, } if err = repl.ChangeReplicas(roachpb.ADD_REPLICA, rebalanceReplica, desc); err != nil { return err } } // Enqueue this replica again to see if there are more changes to be made. rq.MaybeAdd(repl, rq.clock.Now()) return nil }