func (client *fakeTabletManagerClient) GetSchema(ctx context.Context, tablet *topodatapb.Tablet, tables, excludeTables []string, includeViews bool) (*tabletmanagerdatapb.SchemaDefinition, error) { result, ok := client.schemaDefinitions[topoproto.TabletDbName(tablet)] if !ok { return nil, fmt.Errorf("unknown database: %s", topoproto.TabletDbName(tablet)) } return result, nil }
// loadBlacklistRules loads and builds the blacklist query rules func (agent *ActionAgent) loadBlacklistRules(tablet *topodatapb.Tablet, blacklistedTables []string) (err error) { blacklistRules := tabletserver.NewQueryRules() if len(blacklistedTables) > 0 { // tables, first resolve wildcards tables, err := mysqlctl.ResolveTables(agent.MysqlDaemon, topoproto.TabletDbName(tablet), blacklistedTables) if err != nil { return err } // Verify that at least one table matches the wildcards, so // that we don't add a rule to blacklist all tables if len(tables) > 0 { log.Infof("Blacklisting tables %v", strings.Join(tables, ", ")) qr := tabletserver.NewQueryRule("enforce blacklisted tables", "blacklisted_table", tabletserver.QRFailRetry) for _, t := range tables { qr.AddTableCond(t) } blacklistRules.Add(qr) } } loadRuleErr := agent.QueryServiceControl.SetQueryRules(blacklistQueryRules, blacklistRules) if loadRuleErr != nil { log.Warningf("Fail to load query rule set %s: %s", blacklistQueryRules, loadRuleErr) } return nil }
// findDestinationMasters finds for each destination shard the current master. func (scw *SplitCloneWorker) findDestinationMasters(ctx context.Context) error { scw.setState(WorkerStateFindTargets) // Make sure we find a master for each destination shard and log it. scw.wr.Logger().Infof("Finding a MASTER tablet for each destination shard...") for _, si := range scw.destinationShards { waitCtx, waitCancel := context.WithTimeout(ctx, *waitForHealthyTabletsTimeout) defer waitCancel() if err := scw.tsc.WaitForTablets(waitCtx, scw.cell, si.Keyspace(), si.ShardName(), []topodatapb.TabletType{topodatapb.TabletType_MASTER}); err != nil { return fmt.Errorf("cannot find MASTER tablet for destination shard for %v/%v (in cell: %v): %v", si.Keyspace(), si.ShardName(), scw.cell, err) } masters := scw.tsc.GetHealthyTabletStats(si.Keyspace(), si.ShardName(), topodatapb.TabletType_MASTER) if len(masters) == 0 { return fmt.Errorf("cannot find MASTER tablet for destination shard for %v/%v (in cell: %v) in HealthCheck: empty TabletStats list", si.Keyspace(), si.ShardName(), scw.cell) } master := masters[0] // Get the MySQL database name of the tablet. keyspaceAndShard := topoproto.KeyspaceShardString(si.Keyspace(), si.ShardName()) scw.destinationDbNames[keyspaceAndShard] = topoproto.TabletDbName(master.Tablet) // TODO(mberlin): Verify on the destination master that the // _vt.blp_checkpoint table has the latest schema. scw.wr.Logger().Infof("Using tablet %v as destination master for %v/%v", topoproto.TabletAliasString(master.Tablet.Alias), si.Keyspace(), si.ShardName()) } scw.wr.Logger().Infof("NOTE: The used master of a destination shard might change over the course of the copy e.g. due to a reparent. The HealthCheck module will track and log master changes and any error message will always refer the actually used master address.") return nil }
func (agent *ActionAgent) allowQueries(tablet *pbt.Tablet, blacklistedTables []string) error { // if the query service is already running, we're not starting it again if agent.QueryServiceControl.IsServing() { return nil } // only for real instances if agent.DBConfigs != nil { // Update our DB config to match the info we have in the tablet if agent.DBConfigs.App.DbName == "" { agent.DBConfigs.App.DbName = topoproto.TabletDbName(tablet) } agent.DBConfigs.App.Keyspace = tablet.Keyspace agent.DBConfigs.App.Shard = tablet.Shard if tablet.Type != pbt.TabletType_MASTER { agent.DBConfigs.App.EnableInvalidator = true } else { agent.DBConfigs.App.EnableInvalidator = false } } err := agent.loadKeyspaceAndBlacklistRules(tablet, blacklistedTables) if err != nil { return err } return agent.QueryServiceControl.StartService(&pb.Target{ Keyspace: tablet.Keyspace, Shard: tablet.Shard, TabletType: tablet.Type, }, agent.DBConfigs, agent.SchemaOverrides, agent.MysqlDaemon) }
// PreflightSchema will try out the schema change // Should be called under RPCWrapLockAction. func (agent *ActionAgent) PreflightSchema(ctx context.Context, change string) (*tmutils.SchemaChangeResult, error) { // get the db name from the tablet dbName := topoproto.TabletDbName(agent.Tablet()) // and preflight the change return agent.MysqlDaemon.PreflightSchemaChange(dbName, change) }
// ExecuteFetchAsDba is part of the tmclient.TabletManagerClient interface. func (client *Client) ExecuteFetchAsDba(ctx context.Context, tablet *topodatapb.Tablet, usePool bool, query []byte, maxRows int, disableBinlogs, reloadSchema bool) (*querypb.QueryResult, error) { var c tabletmanagerservicepb.TabletManagerClient var err error if usePool { c, err = client.dialPool(tablet) if err != nil { return nil, err } } else { var cc *grpc.ClientConn cc, c, err = client.dial(tablet) if err != nil { return nil, err } defer cc.Close() } response, err := c.ExecuteFetchAsDba(ctx, &tabletmanagerdatapb.ExecuteFetchAsDbaRequest{ Query: query, DbName: topoproto.TabletDbName(tablet), MaxRows: uint64(maxRows), DisableBinlogs: disableBinlogs, ReloadSchema: reloadSchema, }) if err != nil { return nil, err } return response.Result, nil }
// loadKeyspaceAndBlacklistRules does what the name suggests: // 1. load and build keyrange query rules // 2. load and build blacklist query rules func (agent *ActionAgent) loadKeyspaceAndBlacklistRules(tablet *pbt.Tablet, blacklistedTables []string) (err error) { // Keyrange rules keyrangeRules := tabletserver.NewQueryRules() if key.KeyRangeIsPartial(tablet.KeyRange) { log.Infof("Restricting to keyrange: %v", tablet.KeyRange) dmlPlans := []struct { planID planbuilder.PlanType onAbsent bool }{ {planbuilder.PlanInsertPK, true}, {planbuilder.PlanInsertSubquery, true}, {planbuilder.PlanPassDML, false}, {planbuilder.PlanDMLPK, false}, {planbuilder.PlanDMLSubquery, false}, {planbuilder.PlanUpsertPK, false}, } for _, plan := range dmlPlans { qr := tabletserver.NewQueryRule( fmt.Sprintf("enforce keyspace_id range for %v", plan.planID), fmt.Sprintf("keyspace_id_not_in_range_%v", plan.planID), tabletserver.QRFail, ) qr.AddPlanCond(plan.planID) err := qr.AddBindVarCond("keyspace_id", plan.onAbsent, true, tabletserver.QRNotIn, tablet.KeyRange) if err != nil { return fmt.Errorf("Unable to add keyspace rule: %v", err) } keyrangeRules.Add(qr) } } // Blacklisted tables blacklistRules := tabletserver.NewQueryRules() if len(blacklistedTables) > 0 { // tables, first resolve wildcards tables, err := mysqlctl.ResolveTables(agent.MysqlDaemon, topoproto.TabletDbName(tablet), blacklistedTables) if err != nil { return err } log.Infof("Blacklisting tables %v", strings.Join(tables, ", ")) qr := tabletserver.NewQueryRule("enforce blacklisted tables", "blacklisted_table", tabletserver.QRFailRetry) for _, t := range tables { qr.AddTableCond(t) } blacklistRules.Add(qr) } // Push all three sets of QueryRules to TabletServerRpcService loadRuleErr := agent.QueryServiceControl.SetQueryRules(keyrangeQueryRules, keyrangeRules) if loadRuleErr != nil { log.Warningf("Fail to load query rule set %s: %s", keyrangeQueryRules, loadRuleErr) } loadRuleErr = agent.QueryServiceControl.SetQueryRules(blacklistQueryRules, blacklistRules) if loadRuleErr != nil { log.Warningf("Fail to load query rule set %s: %s", blacklistQueryRules, loadRuleErr) } return nil }
// PreflightSchema will try out the schema changes in "changes". func (agent *ActionAgent) PreflightSchema(ctx context.Context, changes []string) ([]*tabletmanagerdatapb.SchemaChangeResult, error) { if err := agent.lock(ctx); err != nil { return nil, err } defer agent.unlock() // get the db name from the tablet dbName := topoproto.TabletDbName(agent.Tablet()) // and preflight the change return agent.MysqlDaemon.PreflightSchemaChange(dbName, changes) }
// ApplySchema will apply a schema change // Should be called under RPCWrapLockAction. func (agent *ActionAgent) ApplySchema(ctx context.Context, change *tmutils.SchemaChange) (*tmutils.SchemaChangeResult, error) { // get the db name from the tablet dbName := topoproto.TabletDbName(agent.Tablet()) // apply the change scr, err := agent.MysqlDaemon.ApplySchemaChange(dbName, change) if err != nil { return nil, err } // and if it worked, reload the schema agent.ReloadSchema(ctx) return scr, nil }
// ExecuteFetchAsDba is part of the tmclient.TabletManagerClient interface. func (client *Client) ExecuteFetchAsDba(ctx context.Context, tablet *topodatapb.Tablet, query []byte, maxRows int, disableBinlogs, reloadSchema bool) (*querypb.QueryResult, error) { c, err := client.dialPool(tablet) if err != nil { return nil, err } response, err := c.ExecuteFetchAsDba(ctx, &tabletmanagerdatapb.ExecuteFetchAsDbaRequest{ Query: query, DbName: topoproto.TabletDbName(tablet), MaxRows: uint64(maxRows), DisableBinlogs: disableBinlogs, ReloadSchema: reloadSchema, }) if err != nil { return nil, err } return response.Result, nil }
// RefreshMap reads the right data from topo.Server and makes sure // we're playing the right logs. func (blm *BinlogPlayerMap) RefreshMap(ctx context.Context, tablet *pb.Tablet, keyspaceInfo *topo.KeyspaceInfo, shardInfo *topo.ShardInfo) { log.Infof("Refreshing map of binlog players") if shardInfo == nil { log.Warningf("Could not read shardInfo, not changing anything") return } if len(shardInfo.SourceShards) > 0 && keyspaceInfo == nil { log.Warningf("Could not read keyspaceInfo, not changing anything") return } blm.mu.Lock() if blm.dbConfig.DbName == "" { blm.dbConfig.DbName = topoproto.TabletDbName(tablet) } // get the existing sources and build a map of sources to remove toRemove := make(map[uint32]bool) hadPlayers := false for source := range blm.players { toRemove[source] = true hadPlayers = true } // for each source, add it if not there, and delete from toRemove for _, sourceShard := range shardInfo.SourceShards { blm.addPlayer(ctx, tablet.Alias.Cell, keyspaceInfo.ShardingColumnType, tablet.KeyRange, sourceShard, topoproto.TabletDbName(tablet)) delete(toRemove, sourceShard.Uid) } hasPlayers := len(shardInfo.SourceShards) > 0 // remove all entries from toRemove for source := range toRemove { blm.players[source].Stop() delete(blm.players, source) } blm.mu.Unlock() if hadPlayers && !hasPlayers { // We're done streaming, so turn off special playback settings. blm.mysqld.DisableBinlogPlayback() } }
// findTargets phase: // - find one rdonly in the source shard // - mark it as 'worker' pointing back to us // - get the aliases of all the targets func (vscw *VerticalSplitCloneWorker) findTargets(ctx context.Context) error { vscw.setState(WorkerStateFindTargets) // find an appropriate tablet in the source shard var err error vscw.sourceAlias, err = FindWorkerTablet(ctx, vscw.wr, vscw.cleaner, nil /* tsc */, vscw.cell, vscw.sourceKeyspace, "0", vscw.minHealthyRdonlyTablets) if err != nil { return fmt.Errorf("FindWorkerTablet() failed for %v/%v/0: %v", vscw.cell, vscw.sourceKeyspace, err) } vscw.wr.Logger().Infof("Using tablet %v as the source", topoproto.TabletAliasString(vscw.sourceAlias)) // get the tablet info for it shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout) ti, err := vscw.wr.TopoServer().GetTablet(shortCtx, vscw.sourceAlias) cancel() if err != nil { return fmt.Errorf("cannot read tablet %v: %v", topoproto.TabletAliasString(vscw.sourceAlias), err) } vscw.sourceTablet = ti.Tablet // stop replication on it shortCtx, cancel = context.WithTimeout(ctx, *remoteActionsTimeout) err = vscw.wr.TabletManagerClient().StopSlave(shortCtx, vscw.sourceTablet) cancel() if err != nil { return fmt.Errorf("cannot stop replication on tablet %v", topoproto.TabletAliasString(vscw.sourceAlias)) } wrangler.RecordStartSlaveAction(vscw.cleaner, vscw.sourceTablet) // Initialize healthcheck and add destination shards to it. vscw.healthCheck = discovery.NewHealthCheck(*remoteActionsTimeout, *healthcheckRetryDelay, *healthCheckTimeout) vscw.tsc = discovery.NewTabletStatsCache(vscw.healthCheck, vscw.cell) watcher := discovery.NewShardReplicationWatcher(vscw.wr.TopoServer(), vscw.healthCheck, vscw.cell, vscw.destinationKeyspace, vscw.destinationShard, *healthCheckTopologyRefresh, discovery.DefaultTopoReadConcurrency) vscw.destinationShardWatchers = append(vscw.destinationShardWatchers, watcher) // Make sure we find a master for each destination shard and log it. vscw.wr.Logger().Infof("Finding a MASTER tablet for each destination shard...") waitCtx, waitCancel := context.WithTimeout(ctx, *waitForHealthyTabletsTimeout) defer waitCancel() if err := vscw.tsc.WaitForTablets(waitCtx, vscw.cell, vscw.destinationKeyspace, vscw.destinationShard, []topodatapb.TabletType{topodatapb.TabletType_MASTER}); err != nil { return fmt.Errorf("cannot find MASTER tablet for destination shard for %v/%v (in cell: %v): %v", vscw.destinationKeyspace, vscw.destinationShard, vscw.cell, err) } masters := vscw.tsc.GetHealthyTabletStats(vscw.destinationKeyspace, vscw.destinationShard, topodatapb.TabletType_MASTER) if len(masters) == 0 { return fmt.Errorf("cannot find MASTER tablet for destination shard for %v/%v (in cell: %v) in HealthCheck: empty TabletStats list", vscw.destinationKeyspace, vscw.destinationShard, vscw.cell) } master := masters[0] // Get the MySQL database name of the tablet. keyspaceAndShard := topoproto.KeyspaceShardString(vscw.destinationKeyspace, vscw.destinationShard) vscw.destinationDbNames[keyspaceAndShard] = topoproto.TabletDbName(master.Tablet) // TODO(mberlin): Verify on the destination master that the // _vt.blp_checkpoint table has the latest schema. vscw.wr.Logger().Infof("Using tablet %v as destination master for %v/%v", topoproto.TabletAliasString(master.Tablet.Alias), vscw.destinationKeyspace, vscw.destinationShard) vscw.wr.Logger().Infof("NOTE: The used master of a destination shard might change over the course of the copy e.g. due to a reparent. The HealthCheck module will track and log master changes and any error message will always refer the actually used master address.") return nil }
// Start validates and updates the topology records for the tablet, and performs // the initial state change callback to start tablet services. func (agent *ActionAgent) Start(ctx context.Context, mysqlPort, vtPort, gRPCPort int32) error { var err error if _, err = agent.readTablet(ctx); err != nil { return err } // find our hostname as fully qualified, and IP hostname := *tabletHostname if hostname == "" { hostname, err = netutil.FullyQualifiedHostname() if err != nil { return err } } ipAddrs, err := net.LookupHost(hostname) if err != nil { return err } ipAddr := ipAddrs[0] // Update bind addr for mysql and query service in the tablet node. f := func(tablet *pb.Tablet) error { tablet.Hostname = hostname tablet.Ip = ipAddr if tablet.PortMap == nil { tablet.PortMap = make(map[string]int32) } if mysqlPort != 0 { // only overwrite mysql port if we know it, otherwise // leave it as is. tablet.PortMap["mysql"] = mysqlPort } if vtPort != 0 { tablet.PortMap["vt"] = vtPort } else { delete(tablet.PortMap, "vt") } delete(tablet.PortMap, "vts") if gRPCPort != 0 { tablet.PortMap["grpc"] = gRPCPort } else { delete(tablet.PortMap, "grpc") } return nil } if err := agent.TopoServer.UpdateTabletFields(ctx, agent.Tablet().Alias, f); err != nil { return err } // Reread to get the changes we just made tablet, err := agent.readTablet(ctx) if err != nil { return err } if err = agent.verifyTopology(ctx); err != nil { return err } if err = agent.verifyServingAddrs(ctx); err != nil { return err } // initialize tablet server if !agent.DBConfigs.IsZero() { // Only for real instances // Update our DB config to match the info we have in the tablet if agent.DBConfigs.App.DbName == "" { agent.DBConfigs.App.DbName = topoproto.TabletDbName(tablet.Tablet) } agent.DBConfigs.App.Keyspace = tablet.Keyspace agent.DBConfigs.App.Shard = tablet.Shard } if err := agent.QueryServiceControl.InitDBConfig(pbq.Target{ Keyspace: tablet.Keyspace, Shard: tablet.Shard, TabletType: tablet.Type, }, agent.DBConfigs, agent.SchemaOverrides, agent.MysqlDaemon); err != nil { return fmt.Errorf("failed to InitDBConfig: %v", err) } // and update our state oldTablet := &pb.Tablet{} if err = agent.updateState(ctx, oldTablet, "Start"); err != nil { log.Warningf("Initial updateState failed, will need a state change before running properly: %v", err) } return nil }
func (agent *ActionAgent) restoreDataLocked(ctx context.Context, logger logutil.Logger, deleteBeforeRestore bool) error { // change type to RESTORE (using UpdateTabletFields so it's // always authorized) var originalType topodatapb.TabletType if _, err := agent.TopoServer.UpdateTabletFields(ctx, agent.TabletAlias, func(tablet *topodatapb.Tablet) error { originalType = tablet.Type tablet.Type = topodatapb.TabletType_RESTORE return nil }); err != nil { return fmt.Errorf("Cannot change type to RESTORE: %v", err) } // let's update our internal state (stop query service and other things) if err := agent.refreshTablet(ctx, "restore from backup"); err != nil { return fmt.Errorf("failed to update state before restore: %v", err) } // Try to restore. Depending on the reason for failure, we may be ok. // If we're not ok, return an error and the agent will log.Fatalf, // causing the process to be restarted and the restore retried. // Record local metadata values based on the original type. localMetadata := agent.getLocalMetadataValues(originalType) tablet := agent.Tablet() dir := fmt.Sprintf("%v/%v", tablet.Keyspace, tablet.Shard) pos, err := mysqlctl.Restore(ctx, agent.MysqlDaemon, dir, *restoreConcurrency, agent.hookExtraEnv(), localMetadata, logger, deleteBeforeRestore, topoproto.TabletDbName(tablet)) switch err { case nil: // Reconnect to master. if err := agent.startReplication(ctx, pos); err != nil { return err } case mysqlctl.ErrNoBackup: // No-op, starting with empty database. case mysqlctl.ErrExistingDB: // No-op, assuming we've just restarted. default: return fmt.Errorf("Can't restore backup: %v", err) } // Change type back to original type if we're ok to serve. if _, err := agent.TopoServer.UpdateTabletFields(ctx, tablet.Alias, func(tablet *topodatapb.Tablet) error { tablet.Type = originalType return nil }); err != nil { return fmt.Errorf("Cannot change type back to %v: %v", originalType, err) } // let's update our internal state (start query service and other things) if err := agent.refreshTablet(ctx, "after restore from backup"); err != nil { return fmt.Errorf("failed to update state after backup: %v", err) } return nil }
// GetSchema returns the schema. // Should be called under RPCWrap. func (agent *ActionAgent) GetSchema(ctx context.Context, tables, excludeTables []string, includeViews bool) (*tabletmanagerdatapb.SchemaDefinition, error) { return agent.MysqlDaemon.GetSchema(topoproto.TabletDbName(agent.Tablet()), tables, excludeTables, includeViews) }
// Start validates and updates the topology records for the tablet, and performs // the initial state change callback to start tablet services. // If initUpdateStream is set, update stream service will also be registered. func (agent *ActionAgent) Start(ctx context.Context, mysqlPort, vtPort, gRPCPort int32, initUpdateStream bool) error { // find our hostname as fully qualified, and IP hostname := *tabletHostname if hostname == "" { var err error hostname, err = netutil.FullyQualifiedHostname() if err != nil { return err } } ipAddrs, err := net.LookupHost(hostname) if err != nil { return err } ipAddr := ipAddrs[0] // Update bind addr for mysql and query service in the tablet node. f := func(tablet *topodatapb.Tablet) error { tablet.Hostname = hostname tablet.Ip = ipAddr if tablet.PortMap == nil { tablet.PortMap = make(map[string]int32) } if mysqlPort != 0 { // only overwrite mysql port if we know it, otherwise // leave it as is. tablet.PortMap["mysql"] = mysqlPort } if vtPort != 0 { tablet.PortMap["vt"] = vtPort } else { delete(tablet.PortMap, "vt") } delete(tablet.PortMap, "vts") if gRPCPort != 0 { tablet.PortMap["grpc"] = gRPCPort } else { delete(tablet.PortMap, "grpc") } // Save the original tablet for ownership tests later. agent.initialTablet = tablet return nil } if _, err := agent.TopoServer.UpdateTabletFields(ctx, agent.TabletAlias, f); err != nil { return err } // Verify the topology is correct. agent.verifyTopology(ctx) // Get and fix the dbname if necessary, only for real instances. if !agent.DBConfigs.IsZero() { dbname := topoproto.TabletDbName(agent.initialTablet) // Update our DB config to match the info we have in the tablet if agent.DBConfigs.App.DbName == "" { agent.DBConfigs.App.DbName = dbname } if agent.DBConfigs.Filtered.DbName == "" { agent.DBConfigs.Filtered.DbName = dbname } } // create and register the RPC services from UpdateStream // (it needs the dbname, so it has to be delayed up to here, // but it has to be before updateState below that may use it) if initUpdateStream { us := binlog.NewUpdateStream(agent.MysqlDaemon, agent.DBConfigs.App.DbName) agent.UpdateStream = us servenv.OnRun(func() { us.RegisterService() }) } servenv.OnTerm(func() { // Disable UpdateStream (if any) upon entering lameduck. // We do this regardless of initUpdateStream, since agent.UpdateStream // may have been set from elsewhere. if agent.UpdateStream != nil { agent.UpdateStream.Disable() } }) // initialize tablet server if err := agent.QueryServiceControl.InitDBConfig(querypb.Target{ Keyspace: agent.initialTablet.Keyspace, Shard: agent.initialTablet.Shard, TabletType: agent.initialTablet.Type, }, agent.DBConfigs, agent.MysqlDaemon); err != nil { return fmt.Errorf("failed to InitDBConfig: %v", err) } // export a few static variables if agent.exportStats { statsKeyspace := stats.NewString("TabletKeyspace") statsShard := stats.NewString("TabletShard") statsKeyRangeStart := stats.NewString("TabletKeyRangeStart") statsKeyRangeEnd := stats.NewString("TabletKeyRangeEnd") statsKeyspace.Set(agent.initialTablet.Keyspace) statsShard.Set(agent.initialTablet.Shard) if key.KeyRangeIsPartial(agent.initialTablet.KeyRange) { statsKeyRangeStart.Set(hex.EncodeToString(agent.initialTablet.KeyRange.Start)) statsKeyRangeEnd.Set(hex.EncodeToString(agent.initialTablet.KeyRange.End)) } } // Initialize the current tablet to match our current running // state: Has most field filled in, but type is UNKNOWN. // Subsequents calls to updateState or refreshTablet // will then work as expected. startingTablet := proto.Clone(agent.initialTablet).(*topodatapb.Tablet) startingTablet.Type = topodatapb.TabletType_UNKNOWN agent.setTablet(startingTablet) // run a background task to rebuild the SrvKeyspace in our cell/keyspace // if it doesn't exist yet go agent.maybeRebuildKeyspace(agent.initialTablet.Alias.Cell, agent.initialTablet.Keyspace) return nil }
// Start validates and updates the topology records for the tablet, and performs // the initial state change callback to start tablet services. // If initUpdateStream is set, update stream service will also be registered. func (agent *ActionAgent) Start(ctx context.Context, mysqlPort, vtPort, gRPCPort int32, initUpdateStream bool) error { var err error if _, err = agent.updateTabletFromTopo(ctx); err != nil { return err } // find our hostname as fully qualified, and IP hostname := *tabletHostname if hostname == "" { hostname, err = netutil.FullyQualifiedHostname() if err != nil { return err } } ipAddrs, err := net.LookupHost(hostname) if err != nil { return err } ipAddr := ipAddrs[0] // Update bind addr for mysql and query service in the tablet node. f := func(tablet *pb.Tablet) error { tablet.Hostname = hostname tablet.Ip = ipAddr if tablet.PortMap == nil { tablet.PortMap = make(map[string]int32) } if mysqlPort != 0 { // only overwrite mysql port if we know it, otherwise // leave it as is. tablet.PortMap["mysql"] = mysqlPort } if vtPort != 0 { tablet.PortMap["vt"] = vtPort } else { delete(tablet.PortMap, "vt") } delete(tablet.PortMap, "vts") if gRPCPort != 0 { tablet.PortMap["grpc"] = gRPCPort } else { delete(tablet.PortMap, "grpc") } return nil } if err := agent.TopoServer.UpdateTabletFields(ctx, agent.Tablet().Alias, f); err != nil { return err } // Reread to get the changes we just made tablet, err := agent.updateTabletFromTopo(ctx) if err != nil { return err } if err = agent.verifyTopology(ctx); err != nil { return err } if err = agent.verifyServingAddrs(ctx); err != nil { return err } // get and fix the dbname if necessary if !agent.DBConfigs.IsZero() { // Only for real instances // Update our DB config to match the info we have in the tablet if agent.DBConfigs.App.DbName == "" { agent.DBConfigs.App.DbName = topoproto.TabletDbName(tablet.Tablet) } agent.DBConfigs.App.Keyspace = tablet.Keyspace agent.DBConfigs.App.Shard = tablet.Shard } // create and register the RPC services from UpdateStream // (it needs the dbname, so it has to be delayed up to here, // but it has to be before updateState below that may use it) if initUpdateStream { us := binlog.NewUpdateStream(agent.MysqlDaemon, agent.DBConfigs.App.DbName) us.RegisterService() agent.UpdateStream = us } // initialize tablet server if err := agent.QueryServiceControl.InitDBConfig(pbq.Target{ Keyspace: tablet.Keyspace, Shard: tablet.Shard, TabletType: tablet.Type, }, agent.DBConfigs, agent.SchemaOverrides, agent.MysqlDaemon); err != nil { return fmt.Errorf("failed to InitDBConfig: %v", err) } // export a few static variables if agent.exportStats { statsKeyspace := stats.NewString("TabletKeyspace") statsShard := stats.NewString("TabletShard") statsKeyRangeStart := stats.NewString("TabletKeyRangeStart") statsKeyRangeEnd := stats.NewString("TabletKeyRangeEnd") statsKeyspace.Set(tablet.Keyspace) statsShard.Set(tablet.Shard) if key.KeyRangeIsPartial(tablet.KeyRange) { statsKeyRangeStart.Set(hex.EncodeToString(tablet.KeyRange.Start)) statsKeyRangeEnd.Set(hex.EncodeToString(tablet.KeyRange.End)) } } // initialize the key range query rule if err := agent.initializeKeyRangeRule(ctx, tablet.Keyspace, tablet.KeyRange); err != nil { return err } // and update our state oldTablet := &pb.Tablet{} if err = agent.updateState(ctx, oldTablet, "Start"); err != nil { log.Warningf("Initial updateState failed, will need a state change before running properly: %v", err) } return nil }
// DbName is usually implied by keyspace. Having the shard information in the // database name complicates mysql replication. func (ti *TabletInfo) DbName() string { return topoproto.TabletDbName(ti.Tablet) }
// generateChunks returns an array of chunks to use for splitting up a table // into multiple data chunks. It only works for tables with a primary key // whose first column is a numeric type. func generateChunks(ctx context.Context, wr *wrangler.Wrangler, tablet *topodatapb.Tablet, td *tabletmanagerdatapb.TableDefinition, minTableSizeForSplit uint64, chunkCount int) ([]chunk, error) { if len(td.PrimaryKeyColumns) == 0 { // No explicit primary key. Cannot chunk the rows then. return singleCompleteChunk, nil } if td.DataLength < minTableSizeForSplit { // Table is too small to split up. return singleCompleteChunk, nil } if chunkCount == 1 { return singleCompleteChunk, nil } // Get the MIN and MAX of the leading column of the primary key. query := fmt.Sprintf("SELECT MIN(%v), MAX(%v) FROM %v.%v", td.PrimaryKeyColumns[0], td.PrimaryKeyColumns[0], topoproto.TabletDbName(tablet), td.Name) shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout) qr, err := wr.TabletManagerClient().ExecuteFetchAsApp(shortCtx, tablet, true, []byte(query), 1) cancel() if err != nil { return nil, fmt.Errorf("Cannot determine MIN and MAX of the first primary key column. ExecuteFetchAsApp: %v", err) } if len(qr.Rows) != 1 { return nil, fmt.Errorf("Cannot determine MIN and MAX of the first primary key column. Zero rows were returned for the following query: %v", query) } result := sqltypes.Proto3ToResult(qr) min := result.Rows[0][0].ToNative() max := result.Rows[0][1].ToNative() if min == nil || max == nil { wr.Logger().Infof("Not splitting table %v into multiple chunks, min or max is NULL: %v", td.Name, qr.Rows[0]) return singleCompleteChunk, nil } // TODO(mberlin): Write a unit test for this part of the function. chunks := make([]chunk, chunkCount) switch min := min.(type) { case int64: max := max.(int64) interval := (max - min) / int64(chunkCount) if interval == 0 { wr.Logger().Infof("Not splitting table %v into multiple chunks, interval=0: %v to %v", td.Name, min, max) return singleCompleteChunk, nil } start := min for i := 0; i < chunkCount; i++ { end := start + interval chunk, err := toChunk(start, end) if err != nil { return nil, err } chunks[i] = chunk start = end } case uint64: max := max.(uint64) interval := (max - min) / uint64(chunkCount) if interval == 0 { wr.Logger().Infof("Not splitting table %v into multiple chunks, interval=0: %v to %v", td.Name, min, max) return singleCompleteChunk, nil } start := min for i := 0; i < chunkCount; i++ { end := start + interval chunk, err := toChunk(start, end) if err != nil { return nil, err } chunks[i] = chunk start = end } case float64: max := max.(float64) interval := (max - min) / float64(chunkCount) if interval == 0 { wr.Logger().Infof("Not splitting table %v into multiple chunks, interval=0: %v to %v", td.Name, min, max) return singleCompleteChunk, nil } start := min for i := 0; i < chunkCount; i++ { end := start + interval chunk, err := toChunk(start, end) if err != nil { return nil, err } chunks[i] = chunk start = end } default: wr.Logger().Infof("Not splitting table %v into multiple chunks, primary key not numeric.", td.Name) return singleCompleteChunk, nil } // Clear out the MIN and MAX on the first and last chunk respectively // because other shards might have smaller or higher values than the one we // looked at. chunks[0].start = sqltypes.NULL chunks[chunkCount-1].end = sqltypes.NULL return chunks, nil }
func (wr *Wrangler) initShardMasterLocked(ctx context.Context, ev *events.Reparent, keyspace, shard string, masterElectTabletAlias *topodatapb.TabletAlias, force bool, waitSlaveTimeout time.Duration) error { shardInfo, err := wr.ts.GetShard(ctx, keyspace, shard) if err != nil { return err } ev.ShardInfo = *shardInfo event.DispatchUpdate(ev, "reading tablet map") tabletMap, err := wr.ts.GetTabletMapForShard(ctx, keyspace, shard) if err != nil { return err } // Check the master elect is in tabletMap masterElectTabletInfo, ok := tabletMap[*masterElectTabletAlias] if !ok { return fmt.Errorf("master-elect tablet %v is not in the shard", topoproto.TabletAliasString(masterElectTabletAlias)) } ev.NewMaster = *masterElectTabletInfo.Tablet // Check the master is the only master is the shard, or -force was used. _, masterTabletMap := topotools.SortedTabletMap(tabletMap) if !topoproto.TabletAliasEqual(shardInfo.MasterAlias, masterElectTabletAlias) { if !force { return fmt.Errorf("master-elect tablet %v is not the shard master, use -force to proceed anyway", topoproto.TabletAliasString(masterElectTabletAlias)) } wr.logger.Warningf("master-elect tablet %v is not the shard master, proceeding anyway as -force was used", topoproto.TabletAliasString(masterElectTabletAlias)) } if _, ok := masterTabletMap[*masterElectTabletAlias]; !ok { if !force { return fmt.Errorf("master-elect tablet %v is not a master in the shard, use -force to proceed anyway", topoproto.TabletAliasString(masterElectTabletAlias)) } wr.logger.Warningf("master-elect tablet %v is not a master in the shard, proceeding anyway as -force was used", topoproto.TabletAliasString(masterElectTabletAlias)) } haveOtherMaster := false for alias := range masterTabletMap { if !topoproto.TabletAliasEqual(&alias, masterElectTabletAlias) { haveOtherMaster = true } } if haveOtherMaster { if !force { return fmt.Errorf("master-elect tablet %v is not the only master in the shard, use -force to proceed anyway", topoproto.TabletAliasString(masterElectTabletAlias)) } wr.logger.Warningf("master-elect tablet %v is not the only master in the shard, proceeding anyway as -force was used", topoproto.TabletAliasString(masterElectTabletAlias)) } // First phase: reset replication on all tablets. If anyone fails, // we stop. It is probably because it is unreachable, and may leave // an unstable database process in the mix, with a database daemon // at a wrong replication spot. event.DispatchUpdate(ev, "resetting replication on all tablets") wg := sync.WaitGroup{} rec := concurrency.AllErrorRecorder{} for alias, tabletInfo := range tabletMap { wg.Add(1) go func(alias topodatapb.TabletAlias, tabletInfo *topo.TabletInfo) { defer wg.Done() wr.logger.Infof("resetting replication on tablet %v", topoproto.TabletAliasString(&alias)) if err := wr.TabletManagerClient().ResetReplication(ctx, tabletInfo); err != nil { rec.RecordError(fmt.Errorf("Tablet %v ResetReplication failed (either fix it, or Scrap it): %v", topoproto.TabletAliasString(&alias), err)) } }(alias, tabletInfo) } wg.Wait() if err := rec.Error(); err != nil { return err } // Tell the new master to break its slaves, return its replication // position wr.logger.Infof("initializing master on %v", topoproto.TabletAliasString(masterElectTabletAlias)) event.DispatchUpdate(ev, "initializing master") rp, err := wr.TabletManagerClient().InitMaster(ctx, masterElectTabletInfo) if err != nil { return err } // Now tell the new master to insert the reparent_journal row, // and tell everybody else to become a slave of the new master, // and wait for the row in the reparent_journal table. // We start all these in parallel, to handle the semi-sync // case: for the master to be able to commit its row in the // reparent_journal table, it needs connected slaves. event.DispatchUpdate(ev, "reparenting all tablets") now := time.Now().UnixNano() wgMaster := sync.WaitGroup{} wgSlaves := sync.WaitGroup{} var masterErr error for alias, tabletInfo := range tabletMap { if topoproto.TabletAliasEqual(&alias, masterElectTabletAlias) { wgMaster.Add(1) go func(alias topodatapb.TabletAlias, tabletInfo *topo.TabletInfo) { defer wgMaster.Done() wr.logger.Infof("populating reparent journal on new master %v", topoproto.TabletAliasString(&alias)) masterErr = wr.TabletManagerClient().PopulateReparentJournal(ctx, tabletInfo, now, initShardMasterOperation, &alias, rp) }(alias, tabletInfo) } else { wgSlaves.Add(1) go func(alias topodatapb.TabletAlias, tabletInfo *topo.TabletInfo) { defer wgSlaves.Done() wr.logger.Infof("initializing slave %v", topoproto.TabletAliasString(&alias)) if err := wr.TabletManagerClient().InitSlave(ctx, tabletInfo, masterElectTabletAlias, rp, now); err != nil { rec.RecordError(fmt.Errorf("Tablet %v InitSlave failed: %v", topoproto.TabletAliasString(&alias), err)) } }(alias, tabletInfo) } } // After the master is done, we can update the shard record // (note with semi-sync, it also means at least one slave is done) wgMaster.Wait() if masterErr != nil { wgSlaves.Wait() return fmt.Errorf("failed to PopulateReparentJournal on master: %v", masterErr) } if !topoproto.TabletAliasEqual(shardInfo.MasterAlias, masterElectTabletAlias) { if _, err := wr.ts.UpdateShardFields(ctx, keyspace, shard, func(s *topodatapb.Shard) error { s.MasterAlias = masterElectTabletAlias return nil }); err != nil { wgSlaves.Wait() return fmt.Errorf("failed to update shard master record: %v", err) } } // Wait for the slaves to complete. If some of them fail, we // don't want to rebuild the shard serving graph (the failure // will most likely be a timeout, and our context will be // expired, so the rebuild will fail anyway) wgSlaves.Wait() if err := rec.Error(); err != nil { return err } // Create database if necessary on the master. Slaves will get it too through // replication. Since the user called InitShardMaster, they've told us to // assume that whatever data is on all the slaves is what they intended. // If the database doesn't exist, it means the user intends for these tablets // to begin serving with no data (i.e. first time initialization). createDB := fmt.Sprintf("CREATE DATABASE IF NOT EXISTS `%s`", topoproto.TabletDbName(masterElectTabletInfo.Tablet)) if _, err := wr.TabletManagerClient().ExecuteFetchAsDba(ctx, masterElectTabletInfo, createDB, 1, false, true); err != nil { return fmt.Errorf("failed to create database: %v", err) } // Then we rebuild the entire serving graph for the shard, // to account for all changes. event.DispatchUpdate(ev, "rebuilding shard graph") _, err = wr.RebuildShardGraph(ctx, keyspace, shard, nil) return err }
// generateChunks returns an array of chunks to use for splitting up a table // into multiple data chunks. It only works for tables with a primary key // whose first column is a numeric type. func generateChunks(ctx context.Context, wr *wrangler.Wrangler, tablet *topodatapb.Tablet, td *tabletmanagerdatapb.TableDefinition, chunkCount, minRowsPerChunk int) ([]chunk, error) { if len(td.PrimaryKeyColumns) == 0 { // No explicit primary key. Cannot chunk the rows then. wr.Logger().Infof("table=%v: Not splitting the table into multiple chunks because it has no primary key columns. This will reduce the performance of the clone.", td.Name) return singleCompleteChunk, nil } if td.RowCount < 2*uint64(minRowsPerChunk) { // The automatic adjustment of "chunkCount" based on "minRowsPerChunk" // below would set "chunkCount" to less than 2 i.e. 1 or 0 chunks. // In practice in this case there should be exactly one chunk. // Return early in this case and notice the user about this. wr.Logger().Infof("table=%v: Not splitting the table into multiple chunks because it has only %d rows.", td.Name, td.RowCount) return singleCompleteChunk, nil } if chunkCount == 1 { return singleCompleteChunk, nil } // Get the MIN and MAX of the leading column of the primary key. query := fmt.Sprintf("SELECT MIN(%v), MAX(%v) FROM %v.%v", escape(td.PrimaryKeyColumns[0]), escape(td.PrimaryKeyColumns[0]), escape(topoproto.TabletDbName(tablet)), escape(td.Name)) shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout) qr, err := wr.TabletManagerClient().ExecuteFetchAsApp(shortCtx, tablet, true, []byte(query), 1) cancel() if err != nil { return nil, fmt.Errorf("Cannot determine MIN and MAX of the first primary key column. ExecuteFetchAsApp: %v", err) } if len(qr.Rows) != 1 { return nil, fmt.Errorf("Cannot determine MIN and MAX of the first primary key column. Zero rows were returned for the following query: %v", query) } result := sqltypes.Proto3ToResult(qr) min := result.Rows[0][0].ToNative() max := result.Rows[0][1].ToNative() if min == nil || max == nil { wr.Logger().Infof("table=%v: Not splitting the table into multiple chunks, min or max is NULL: %v", td.Name, qr.Rows[0]) return singleCompleteChunk, nil } // Determine the average number of rows per chunk for the given chunkCount. avgRowsPerChunk := td.RowCount / uint64(chunkCount) if avgRowsPerChunk < uint64(minRowsPerChunk) { // Reduce the chunkCount to fulfill minRowsPerChunk. newChunkCount := td.RowCount / uint64(minRowsPerChunk) wr.Logger().Infof("table=%v: Reducing the number of chunks from the default %d to %d to make sure that each chunk has at least %d rows.", td.Name, chunkCount, newChunkCount, minRowsPerChunk) chunkCount = int(newChunkCount) } // TODO(mberlin): Write a unit test for this part of the function. var interval interface{} chunks := make([]chunk, chunkCount) switch min := min.(type) { case int64: max := max.(int64) interval = (max - min) / int64(chunkCount) if interval == 0 { wr.Logger().Infof("table=%v: Not splitting the table into multiple chunks, interval=0: %v to %v", td.Name, min, max) return singleCompleteChunk, nil } case uint64: max := max.(uint64) interval = (max - min) / uint64(chunkCount) if interval == 0 { wr.Logger().Infof("table=%v: Not splitting the table into multiple chunks, interval=0: %v to %v", td.Name, min, max) return singleCompleteChunk, nil } case float64: max := max.(float64) interval = (max - min) / float64(chunkCount) if interval == 0 { wr.Logger().Infof("table=%v: Not splitting the table into multiple chunks, interval=0: %v to %v", td.Name, min, max) return singleCompleteChunk, nil } default: wr.Logger().Infof("table=%v: Not splitting the table into multiple chunks, primary key not numeric.", td.Name) return singleCompleteChunk, nil } // Create chunks. start := min for i := 0; i < chunkCount; i++ { end := add(start, interval) chunk, err := toChunk(start, end, i+1, chunkCount) if err != nil { return nil, err } chunks[i] = chunk start = end } // Clear out the MIN and MAX on the first and last chunk respectively // because other shards might have smaller or higher values than the one we // looked at. chunks[0].start = sqltypes.NULL chunks[chunkCount-1].end = sqltypes.NULL return chunks, nil }
// Start validates and updates the topology records for the tablet, and performs // the initial state change callback to start tablet services. // If initUpdateStream is set, update stream service will also be registered. func (agent *ActionAgent) Start(ctx context.Context, mysqlPort, vtPort, gRPCPort int32, initUpdateStream bool) error { var err error if _, err = agent.updateTabletFromTopo(ctx); err != nil { return err } // find our hostname as fully qualified, and IP hostname := *tabletHostname if hostname == "" { hostname, err = netutil.FullyQualifiedHostname() if err != nil { return err } } ipAddrs, err := net.LookupHost(hostname) if err != nil { return err } ipAddr := ipAddrs[0] // Update bind addr for mysql and query service in the tablet node. f := func(tablet *topodatapb.Tablet) error { tablet.Hostname = hostname tablet.Ip = ipAddr if tablet.PortMap == nil { tablet.PortMap = make(map[string]int32) } if mysqlPort != 0 { // only overwrite mysql port if we know it, otherwise // leave it as is. tablet.PortMap["mysql"] = mysqlPort } if vtPort != 0 { tablet.PortMap["vt"] = vtPort } else { delete(tablet.PortMap, "vt") } delete(tablet.PortMap, "vts") if gRPCPort != 0 { tablet.PortMap["grpc"] = gRPCPort } else { delete(tablet.PortMap, "grpc") } return nil } if _, err := agent.TopoServer.UpdateTabletFields(ctx, agent.Tablet().Alias, f); err != nil { return err } // Reread to get the changes we just made tablet, err := agent.updateTabletFromTopo(ctx) if err != nil { return err } // Save the original tablet record as it is now (at startup). agent.initialTablet = proto.Clone(tablet).(*topodatapb.Tablet) if err = agent.verifyTopology(ctx); err != nil { return err } // get and fix the dbname if necessary if !agent.DBConfigs.IsZero() { // Only for real instances // Update our DB config to match the info we have in the tablet if agent.DBConfigs.App.DbName == "" { agent.DBConfigs.App.DbName = topoproto.TabletDbName(tablet) } if agent.DBConfigs.Filtered.DbName == "" { agent.DBConfigs.Filtered.DbName = topoproto.TabletDbName(tablet) } agent.DBConfigs.App.Keyspace = tablet.Keyspace agent.DBConfigs.App.Shard = tablet.Shard } // create and register the RPC services from UpdateStream // (it needs the dbname, so it has to be delayed up to here, // but it has to be before updateState below that may use it) if initUpdateStream { us := binlog.NewUpdateStream(agent.MysqlDaemon, agent.DBConfigs.App.DbName) agent.UpdateStream = us servenv.OnRun(func() { us.RegisterService() }) } servenv.OnTerm(func() { // Disable UpdateStream (if any) upon entering lameduck. // We do this regardless of initUpdateStream, since agent.UpdateStream // may have been set from elsewhere. if agent.UpdateStream != nil { agent.UpdateStream.Disable() } }) // initialize tablet server if err := agent.QueryServiceControl.InitDBConfig(querypb.Target{ Keyspace: tablet.Keyspace, Shard: tablet.Shard, TabletType: tablet.Type, }, agent.DBConfigs, agent.MysqlDaemon); err != nil { return fmt.Errorf("failed to InitDBConfig: %v", err) } // export a few static variables if agent.exportStats { statsKeyspace := stats.NewString("TabletKeyspace") statsShard := stats.NewString("TabletShard") statsKeyRangeStart := stats.NewString("TabletKeyRangeStart") statsKeyRangeEnd := stats.NewString("TabletKeyRangeEnd") statsKeyspace.Set(tablet.Keyspace) statsShard.Set(tablet.Shard) if key.KeyRangeIsPartial(tablet.KeyRange) { statsKeyRangeStart.Set(hex.EncodeToString(tablet.KeyRange.Start)) statsKeyRangeEnd.Set(hex.EncodeToString(tablet.KeyRange.End)) } } // initialize the key range query rule if err := agent.initializeKeyRangeRule(ctx, tablet.Keyspace, tablet.KeyRange); err != nil { return err } // update our state oldTablet := &topodatapb.Tablet{} agent.updateState(ctx, oldTablet, "Start") // run a background task to rebuild the SrvKeyspace in our cell/keyspace // if it doesn't exist yet go agent.maybeRebuildKeyspace(tablet.Alias.Cell, tablet.Keyspace) return nil }