// Does a topo lookup for a single shard, and returns the tablet record of the master tablet. func resolveDestinationShardMaster(ctx context.Context, keyspace, shard string, wr *wrangler.Wrangler) (*topo.TabletInfo, error) { var ti *topo.TabletInfo shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout) si, err := topo.GetShard(shortCtx, wr.TopoServer(), keyspace, shard) cancel() if err != nil { return ti, fmt.Errorf("unable to resolve destination shard %v/%v", keyspace, shard) } if si.MasterAlias.IsZero() { return ti, fmt.Errorf("no master in destination shard %v/%v", keyspace, shard) } wr.Logger().Infof("Found target master alias %v in shard %v/%v", si.MasterAlias, keyspace, shard) shortCtx, cancel = context.WithTimeout(ctx, *remoteActionsTimeout) ti, err = topo.GetTablet(shortCtx, wr.TopoServer(), si.MasterAlias) cancel() if err != nil { return ti, fmt.Errorf("unable to get master tablet from alias %v in shard %v/%v", si.MasterAlias, keyspace, shard) } return ti, nil }
// changeCallback is run after every action that might // have changed something in the tablet record. func (agent *ActionAgent) changeCallback(ctx context.Context, oldTablet, newTablet *topo.Tablet) error { span := trace.NewSpanFromContext(ctx) span.StartLocal("ActionAgent.changeCallback") defer span.Finish() allowQuery := newTablet.IsRunningQueryService() // Read the shard to get SourceShards / TabletControlMap if // we're going to use it. var shardInfo *topo.ShardInfo var tabletControl *pbt.Shard_TabletControl var blacklistedTables []string var err error if allowQuery { shardInfo, err = topo.GetShard(ctx, agent.TopoServer, newTablet.Keyspace, newTablet.Shard) if err != nil { log.Errorf("Cannot read shard for this tablet %v, might have inaccurate SourceShards and TabletControls: %v", newTablet.Alias, err) } else { if newTablet.Type == topo.TYPE_MASTER { allowQuery = len(shardInfo.SourceShards) == 0 } if tc := shardInfo.GetTabletControl(topo.TabletTypeToProto(newTablet.Type)); tc != nil { if topo.InCellList(newTablet.Alias.Cell, tc.Cells) { if tc.DisableQueryService { allowQuery = false } blacklistedTables = tc.BlacklistedTables tabletControl = tc } } } } // Read the keyspace on masters to get ShardingColumnType, // for binlog replication, only if source shards are set. var keyspaceInfo *topo.KeyspaceInfo if newTablet.Type == topo.TYPE_MASTER && shardInfo != nil && len(shardInfo.SourceShards) > 0 { keyspaceInfo, err = agent.TopoServer.GetKeyspace(ctx, newTablet.Keyspace) if err != nil { log.Errorf("Cannot read keyspace for this tablet %v: %v", newTablet.Alias, err) keyspaceInfo = nil } } if allowQuery { // There are a few transitions when we need to restart the query service: switch { // If either InitMaster or InitSlave was called, because those calls // (or a prior call to ResetReplication) may have silently broken the // rowcache invalidator by executing RESET MASTER. // Note that we don't care about fixing it after ResetReplication itself // since that call breaks everything on purpose, and we don't expect // anything to start working until either InitMaster or InitSlave. case agent.initReplication: agent.initReplication = false agent.disallowQueries() // Transitioning from replica to master, so clients that were already // connected don't keep on using the master as replica or rdonly. case newTablet.Type == topo.TYPE_MASTER && oldTablet.Type != topo.TYPE_MASTER: agent.disallowQueries() // Having different parameters for the query service. // It needs to stop and restart with the new parameters. // That includes: // - changing KeyRange // - changing the BlacklistedTables list case (newTablet.KeyRange != oldTablet.KeyRange), !reflect.DeepEqual(blacklistedTables, agent.BlacklistedTables()): agent.disallowQueries() } if err := agent.allowQueries(newTablet, blacklistedTables); err != nil { log.Errorf("Cannot start query service: %v", err) } } else { agent.disallowQueries() } // save the tabletControl we've been using, so the background // healthcheck makes the same decisions as we've been making. agent.setTabletControl(tabletControl) // update stream needs to be started or stopped too if agent.DBConfigs != nil { if topo.IsRunningUpdateStream(newTablet.Type) { binlog.EnableUpdateStreamService(agent.DBConfigs.App.DbName, agent.MysqlDaemon) } else { binlog.DisableUpdateStreamService() } } statsType.Set(string(newTablet.Type)) statsKeyspace.Set(newTablet.Keyspace) statsShard.Set(newTablet.Shard) statsKeyRangeStart.Set(string(newTablet.KeyRange.Start.Hex())) statsKeyRangeEnd.Set(string(newTablet.KeyRange.End.Hex())) // See if we need to start or stop any binlog player if agent.BinlogPlayerMap != nil { if newTablet.Type == topo.TYPE_MASTER { agent.BinlogPlayerMap.RefreshMap(agent.batchCtx, newTablet, keyspaceInfo, shardInfo) } else { agent.BinlogPlayerMap.StopAllPlayersAndReset() } } return nil }
// changeCallback is run after every action that might // have changed something in the tablet record. func (agent *ActionAgent) changeCallback(ctx context.Context, oldTablet, newTablet *topo.Tablet) error { span := trace.NewSpanFromContext(ctx) span.StartLocal("ActionAgent.changeCallback") defer span.Finish() allowQuery := newTablet.IsRunningQueryService() // Read the shard to get SourceShards / TabletControlMap if // we're going to use it. var shardInfo *topo.ShardInfo var tabletControl *topo.TabletControl var blacklistedTables []string var err error if allowQuery { shardInfo, err = topo.GetShard(ctx, agent.TopoServer, newTablet.Keyspace, newTablet.Shard) if err != nil { log.Errorf("Cannot read shard for this tablet %v, might have inaccurate SourceShards and TabletControls: %v", newTablet.Alias, err) } else { if newTablet.Type == topo.TYPE_MASTER { allowQuery = len(shardInfo.SourceShards) == 0 } if tc, ok := shardInfo.TabletControlMap[newTablet.Type]; ok { if topo.InCellList(newTablet.Alias.Cell, tc.Cells) { if tc.DisableQueryService { allowQuery = false } blacklistedTables = tc.BlacklistedTables tabletControl = tc } } } } // Read the keyspace on masters to get ShardingColumnType, // for binlog replication, only if source shards are set. var keyspaceInfo *topo.KeyspaceInfo if newTablet.Type == topo.TYPE_MASTER && shardInfo != nil && len(shardInfo.SourceShards) > 0 { keyspaceInfo, err = agent.TopoServer.GetKeyspace(ctx, newTablet.Keyspace) if err != nil { log.Errorf("Cannot read keyspace for this tablet %v: %v", newTablet.Alias, err) keyspaceInfo = nil } } if allowQuery { // There are a few transitions when we're // going to need to restart the query service: // - transitioning from replica to master, so clients // that were already connected don't keep on using // the master as replica or rdonly. // - having different parameters for the query // service. It needs to stop and restart with the // new parameters. That includes: // - changing KeyRange // - changing the BlacklistedTables list if (newTablet.Type == topo.TYPE_MASTER && oldTablet.Type != topo.TYPE_MASTER) || (newTablet.KeyRange != oldTablet.KeyRange) || !reflect.DeepEqual(blacklistedTables, agent.BlacklistedTables()) { agent.disallowQueries() } if err := agent.allowQueries(newTablet, blacklistedTables); err != nil { log.Errorf("Cannot start query service: %v", err) } } else { agent.disallowQueries() } // save the tabletControl we've been using, so the background // healthcheck makes the same decisions as we've been making. agent.setTabletControl(tabletControl) // update stream needs to be started or stopped too if agent.DBConfigs != nil { if topo.IsRunningUpdateStream(newTablet.Type) { binlog.EnableUpdateStreamService(agent.DBConfigs.App.DbName, agent.MysqlDaemon) } else { binlog.DisableUpdateStreamService() } } statsType.Set(string(newTablet.Type)) statsKeyspace.Set(newTablet.Keyspace) statsShard.Set(newTablet.Shard) statsKeyRangeStart.Set(string(newTablet.KeyRange.Start.Hex())) statsKeyRangeEnd.Set(string(newTablet.KeyRange.End.Hex())) // See if we need to start or stop any binlog player if agent.BinlogPlayerMap != nil { if newTablet.Type == topo.TYPE_MASTER { agent.BinlogPlayerMap.RefreshMap(agent.batchCtx, newTablet, keyspaceInfo, shardInfo) } else { agent.BinlogPlayerMap.StopAllPlayersAndReset() } } return nil }
// TabletExternallyReparented updates all topo records so the current // tablet is the new master for this shard. // Should be called under RPCWrapLock. func (agent *ActionAgent) TabletExternallyReparented(ctx context.Context, externalID string) error { startTime := time.Now() // If there is a finalize step running, wait for it to finish or time out // before checking the global shard record again. if agent.finalizeReparentCtx != nil { select { case <-agent.finalizeReparentCtx.Done(): agent.finalizeReparentCtx = nil case <-ctx.Done(): return ctx.Err() } } tablet := agent.Tablet() // Check the global shard record. si, err := topo.GetShard(ctx, agent.TopoServer, tablet.Keyspace, tablet.Shard) if err != nil { log.Warningf("fastTabletExternallyReparented: failed to read global shard record for %v/%v: %v", tablet.Keyspace, tablet.Shard, err) return err } if si.MasterAlias == tablet.Alias { // We may get called on the current master even when nothing has changed. // If the global shard record is already updated, it means we successfully // finished a previous reparent to this tablet. return nil } // Create a reusable Reparent event with available info. ev := &events.Reparent{ ShardInfo: *si, NewMaster: *tablet.Tablet, OldMaster: topo.Tablet{Alias: si.MasterAlias, Type: topo.TYPE_MASTER}, ExternalID: externalID, } defer func() { if err != nil { event.DispatchUpdate(ev, "failed: "+err.Error()) } }() event.DispatchUpdate(ev, "starting external from tablet (fast)") // Execute state change to master by force-updating only the local copy of the // tablet record. The actual record in topo will be updated later. log.Infof("fastTabletExternallyReparented: executing change callback for state change to MASTER") oldTablet := *tablet.Tablet newTablet := oldTablet newTablet.Type = topo.TYPE_MASTER newTablet.Health = nil agent.setTablet(topo.NewTabletInfo(&newTablet, -1)) if err := agent.updateState(ctx, &oldTablet, "fastTabletExternallyReparented"); err != nil { return fmt.Errorf("fastTabletExternallyReparented: failed to change tablet state to MASTER: %v", err) } // Directly write the new master endpoint in the serving graph. // We will do a true rebuild in the background soon, but in the meantime, // this will be enough for clients to re-resolve the new master. event.DispatchUpdate(ev, "writing new master endpoint") log.Infof("fastTabletExternallyReparented: writing new master endpoint to serving graph") ep, err := tablet.EndPoint() if err != nil { return fmt.Errorf("fastTabletExternallyReparented: failed to generate EndPoint for tablet %v: %v", tablet.Alias, err) } err = topo.UpdateEndPoints(ctx, agent.TopoServer, tablet.Alias.Cell, si.Keyspace(), si.ShardName(), topo.TYPE_MASTER, &topo.EndPoints{Entries: []topo.EndPoint{*ep}}, -1) if err != nil { return fmt.Errorf("fastTabletExternallyReparented: failed to update master endpoint: %v", err) } externalReparentStats.Record("NewMasterVisible", startTime) // Start the finalize stage with a background context, but connect the trace. bgCtx, cancel := context.WithTimeout(agent.batchCtx, *finalizeReparentTimeout) bgCtx = trace.CopySpan(bgCtx, ctx) agent.finalizeReparentCtx = bgCtx go func() { err := agent.finalizeTabletExternallyReparented(bgCtx, si, ev) cancel() if err != nil { log.Warningf("finalizeTabletExternallyReparented error: %v", err) event.DispatchUpdate(ev, "failed: "+err.Error()) return } externalReparentStats.Record("FullRebuild", startTime) }() return nil }
// CheckShard verifies the Shard operations work correctly func CheckShard(ctx context.Context, t *testing.T, ts topo.Server) { if err := ts.CreateKeyspace(ctx, "test_keyspace", &topo.Keyspace{}); err != nil { t.Fatalf("CreateKeyspace: %v", err) } if err := topo.CreateShard(ctx, ts, "test_keyspace", "b0-c0"); err != nil { t.Fatalf("CreateShard: %v", err) } if err := topo.CreateShard(ctx, ts, "test_keyspace", "b0-c0"); err != topo.ErrNodeExists { t.Errorf("CreateShard called second time, got: %v", err) } // Delete shard and see if we can re-create it. if err := ts.DeleteShard(ctx, "test_keyspace", "b0-c0"); err != nil { t.Fatalf("DeleteShard: %v", err) } if err := topo.CreateShard(ctx, ts, "test_keyspace", "b0-c0"); err != nil { t.Fatalf("CreateShard: %v", err) } // Delete ALL shards. if err := ts.DeleteKeyspaceShards(ctx, "test_keyspace"); err != nil { t.Fatalf("DeleteKeyspaceShards: %v", err) } if err := topo.CreateShard(ctx, ts, "test_keyspace", "b0-c0"); err != nil { t.Fatalf("CreateShard: %v", err) } if _, err := topo.GetShard(ctx, ts, "test_keyspace", "666"); err != topo.ErrNoNode { t.Errorf("GetShard(666): %v", err) } shardInfo, err := topo.GetShard(ctx, ts, "test_keyspace", "b0-c0") if err != nil { t.Errorf("GetShard: %v", err) } if want := newKeyRange("b0-c0"); shardInfo.KeyRange != want { t.Errorf("shardInfo.KeyRange: want %v, got %v", want, shardInfo.KeyRange) } master := topo.TabletAlias{Cell: "ny", Uid: 1} shardInfo.MasterAlias = master shardInfo.KeyRange = newKeyRange("b0-c0") shardInfo.ServedTypesMap = map[topo.TabletType]*topo.ShardServedType{ topo.TYPE_MASTER: &topo.ShardServedType{}, topo.TYPE_REPLICA: &topo.ShardServedType{Cells: []string{"c1"}}, topo.TYPE_RDONLY: &topo.ShardServedType{}, } shardInfo.SourceShards = []topo.SourceShard{ topo.SourceShard{ Uid: 1, Keyspace: "source_ks", Shard: "b8-c0", KeyRange: newKeyRange("b8-c0"), Tables: []string{"table1", "table2"}, }, } shardInfo.TabletControlMap = map[topo.TabletType]*topo.TabletControl{ topo.TYPE_MASTER: &topo.TabletControl{ Cells: []string{"c1", "c2"}, BlacklistedTables: []string{"black1", "black2"}, }, topo.TYPE_REPLICA: &topo.TabletControl{ DisableQueryService: true, }, } if err := topo.UpdateShard(ctx, ts, shardInfo); err != nil { t.Errorf("UpdateShard: %v", err) } other := topo.TabletAlias{Cell: "ny", Uid: 82873} _, err = topo.UpdateShardFields(ctx, ts, "test_keyspace", "b0-c0", func(shard *topo.Shard) error { shard.MasterAlias = other return nil }) if err != nil { t.Fatalf("UpdateShardFields error: %v", err) } si, err := topo.GetShard(ctx, ts, "test_keyspace", "b0-c0") if err != nil { t.Fatalf("GetShard: %v", err) } if si.MasterAlias != other { t.Fatalf("shard.MasterAlias = %v, want %v", si.MasterAlias, other) } _, err = topo.UpdateShardFields(ctx, ts, "test_keyspace", "b0-c0", func(shard *topo.Shard) error { shard.MasterAlias = master return nil }) if err != nil { t.Fatalf("UpdateShardFields error: %v", err) } updatedShardInfo, err := topo.GetShard(ctx, ts, "test_keyspace", "b0-c0") if err != nil { t.Fatalf("GetShard: %v", err) } if eq, err := shardEqual(shardInfo.Shard, updatedShardInfo.Shard); err != nil { t.Errorf("cannot compare shards: %v", err) } else if !eq { t.Errorf("put and got shards are not identical:\n%#v\n%#v", shardInfo.Shard, updatedShardInfo.Shard) } // test GetShardNames shards, err := ts.GetShardNames(ctx, "test_keyspace") if err != nil { t.Errorf("GetShardNames: %v", err) } if len(shards) != 1 || shards[0] != "b0-c0" { t.Errorf(`GetShardNames: want [ "b0-c0" ], got %v`, shards) } if _, err := ts.GetShardNames(ctx, "test_keyspace666"); err != topo.ErrNoNode { t.Errorf("GetShardNames(666): %v", err) } // test ValidateShard if err := ts.ValidateShard(ctx, "test_keyspace", "b0-c0"); err != nil { t.Errorf("ValidateShard(test_keyspace, b0-c0) failed: %v", err) } }