func main() { defer exit.Recover() flag.Parse() servenv.Init() if initFakeZK != nil { initFakeZK() } ts := topo.GetServer() defer topo.CloseServers() resilientSrvTopoServer = vtgate.NewResilientSrvTopoServer(ts, "ResilientSrvTopoServer") healthCheck = discovery.NewHealthCheck(*connTimeoutTotal, *healthCheckRetryDelay, *healthCheckTimeout, "" /* statsSuffix */) tabletTypes := make([]topodatapb.TabletType, 0, 1) if len(*tabletTypesToWait) != 0 { for _, ttStr := range strings.Split(*tabletTypesToWait, ",") { tt, err := topoproto.ParseTabletType(ttStr) if err != nil { log.Errorf("unknown tablet type: %v", ttStr) continue } tabletTypes = append(tabletTypes, tt) } } vtg := vtgate.Init(context.Background(), healthCheck, ts, resilientSrvTopoServer, *cell, *retryDelay, *retryCount, *connTimeoutTotal, *connTimeoutPerConn, *connLife, tabletTypes, *maxInFlight, *testGateway) servenv.OnRun(func() { addStatusParts(vtg) }) servenv.RunDefault() }
// startHealthWatchers launches the topology watchers and health checking to monitor // all tablets on the shard. Function should be called before the start of the schema // swap process. func (shardSwap *shardSchemaSwap) startHealthWatchers() error { shardSwap.tabletHealthCheck = discovery.NewHealthCheck( *vtctl.HealthCheckTopologyRefresh, *vtctl.HealthcheckRetryDelay, *vtctl.HealthCheckTimeout) shardSwap.tabletHealthCheck.SetListener(shardSwap, true /* sendDownEvents */) topoServer := shardSwap.parent.topoServer cellList, err := topoServer.GetKnownCells(shardSwap.parent.ctx) if err != nil { return err } for _, cell := range cellList { watcher := discovery.NewShardReplicationWatcher( topoServer, shardSwap.tabletHealthCheck, cell, shardSwap.parent.keyspace, shardSwap.shardName, *vtctl.HealthCheckTimeout, discovery.DefaultTopoReadConcurrency) shardSwap.tabletWatchers = append(shardSwap.tabletWatchers, watcher) } for _, watcher := range shardSwap.tabletWatchers { if err := watcher.WaitForInitialTopology(); err != nil { return err } } shardSwap.tabletHealthCheck.WaitForInitialStatsUpdates() return nil }
func main() { defer exit.Recover() flag.Parse() servenv.Init() ts := topo.GetServer() defer topo.CloseServers() resilientSrvTopoServer = vtgate.NewResilientSrvTopoServer(ts, "ResilientSrvTopoServer") healthCheck = discovery.NewHealthCheck(*healthCheckConnTimeout, *healthCheckRetryDelay, *healthCheckTimeout) healthCheck.RegisterStats() tabletTypes := make([]topodatapb.TabletType, 0, 1) if len(*tabletTypesToWait) != 0 { for _, ttStr := range strings.Split(*tabletTypesToWait, ",") { tt, err := topoproto.ParseTabletType(ttStr) if err != nil { log.Errorf("unknown tablet type: %v", ttStr) continue } tabletTypes = append(tabletTypes, tt) } } l2vtg := l2vtgate.Init(healthCheck, ts, resilientSrvTopoServer, *cell, *retryCount, tabletTypes) servenv.OnRun(func() { addStatusParts(l2vtg) }) servenv.RunDefault() }
func newRealtimeStats(ts topo.Server) (*realtimeStats, error) { hc := discovery.NewHealthCheck(*vtctl.HealthCheckTimeout, *vtctl.HealthcheckRetryDelay, *vtctl.HealthCheckTimeout) tabletStatsCache := &tabletStatsCache{ statuses: make(map[string]map[string]*discovery.TabletStats), } hc.SetListener(tabletStatsCache) r := &realtimeStats{ healthCheck: hc, tabletStats: tabletStatsCache, } // Get the list of all tablets from all cells and monitor the topology for added or removed tablets with a CellTabletsWatcher. cells, err := ts.GetKnownCells(context.Background()) if err != nil { return r, fmt.Errorf("error when getting cells: %v", err) } var watchers []*discovery.TopologyWatcher for _, cell := range cells { watcher := discovery.NewCellTabletsWatcher(ts, hc, cell, *vtctl.HealthCheckTopologyRefresh, discovery.DefaultTopoReadConcurrency) watchers = append(watchers, watcher) } r.cellWatchers = watchers return r, nil }
// FindHealthyRdonlyEndPoint returns a random healthy endpoint. // Since we don't want to use them all, we require at least // minHealthyEndPoints servers to be healthy. // May block up to -wait_for_healthy_rdonly_endpoints_timeout. func FindHealthyRdonlyEndPoint(ctx context.Context, wr *wrangler.Wrangler, cell, keyspace, shard string) (*topodatapb.TabletAlias, error) { busywaitCtx, busywaitCancel := context.WithTimeout(ctx, *WaitForHealthyEndPointsTimeout) defer busywaitCancel() // create a discovery healthcheck, wait for it to have one rdonly // endpoints at this point healthCheck := discovery.NewHealthCheck(*remoteActionsTimeout, *healthcheckRetryDelay, *healthCheckTimeout, "" /* statsSuffix */) watcher := discovery.NewShardReplicationWatcher(wr.TopoServer(), healthCheck, cell, keyspace, shard, *healthCheckTopologyRefresh, 5 /*topoReadConcurrency*/) defer watcher.Stop() defer healthCheck.Close() if err := discovery.WaitForEndPoints(ctx, healthCheck, cell, keyspace, shard, []topodatapb.TabletType{topodatapb.TabletType_RDONLY}); err != nil { return nil, fmt.Errorf("error waiting for rdonly endpoints for (%v,%v/%v): %v", cell, keyspace, shard, err) } var healthyEndpoints []*topodatapb.EndPoint for { select { case <-busywaitCtx.Done(): return nil, fmt.Errorf("Not enough endpoints to choose from in (%v,%v/%v), have %v healthy ones, need at least %v Context Error: %v", cell, keyspace, shard, len(healthyEndpoints), *minHealthyEndPoints, busywaitCtx.Err()) default: } addrs := healthCheck.GetEndPointStatsFromTarget(keyspace, shard, topodatapb.TabletType_RDONLY) healthyEndpoints = make([]*topodatapb.EndPoint, 0, len(addrs)) for _, addr := range addrs { // Note we do not check the 'Serving' flag here. // This is mainly to avoid the case where we run a // Diff between a source and destination, and the source // is not serving (disabled by TabletControl). // When we switch the tablet to 'worker', it will // go back to serving state. if addr.Stats == nil || addr.Stats.HealthError != "" || addr.Stats.SecondsBehindMaster > 30 { continue } healthyEndpoints = append(healthyEndpoints, addr.EndPoint) } if len(healthyEndpoints) >= *minHealthyEndPoints { break } deadlineForLog, _ := busywaitCtx.Deadline() wr.Logger().Infof("Waiting for enough endpoints to become available. available: %v required: %v Waiting up to %.1f more seconds.", len(healthyEndpoints), *minHealthyEndPoints, deadlineForLog.Sub(time.Now()).Seconds()) // Block for 1 second because 2 seconds is the -health_check_interval flag value in integration tests. timer := time.NewTimer(1 * time.Second) select { case <-busywaitCtx.Done(): timer.Stop() case <-timer.C: } } // random server in the list is what we want index := rand.Intn(len(healthyEndpoints)) return &topodatapb.TabletAlias{ Cell: cell, Uid: healthyEndpoints[index].Uid, }, nil }
func main() { defer exit.Recover() flag.Parse() servenv.Init() if initFakeZK != nil { initFakeZK() } ts := topo.GetServer() defer topo.CloseServers() var schema *planbuilder.Schema if *schemaFile != "" { var err error if schema, err = planbuilder.LoadFile(*schemaFile); err != nil { log.Error(err) exit.Return(1) } log.Infof("v3 is enabled: loaded schema from file: %v", *schemaFile) } else { ctx := context.Background() schemaJSON, err := ts.GetVSchema(ctx) if err != nil { log.Warningf("Skipping v3 initialization: GetVSchema failed: %v", err) goto startServer } schema, err = planbuilder.NewSchema([]byte(schemaJSON)) if err != nil { log.Warningf("Skipping v3 initialization: GetVSchema failed: %v", err) goto startServer } log.Infof("v3 is enabled: loaded schema from topo") } startServer: resilientSrvTopoServer = vtgate.NewResilientSrvTopoServer(ts, "ResilientSrvTopoServer") healthCheck = discovery.NewHealthCheck(*connTimeoutTotal, *healthCheckRetryDelay, *healthCheckTimeout, "" /* statsSuffix */) tabletTypes := make([]topodatapb.TabletType, 0, 1) if len(*tabletTypesToWait) != 0 { for _, ttStr := range strings.Split(*tabletTypesToWait, ",") { tt, err := topoproto.ParseTabletType(ttStr) if err != nil { log.Errorf("unknown tablet type: %v", ttStr) continue } tabletTypes = append(tabletTypes, tt) } } vtg := vtgate.Init(healthCheck, ts, resilientSrvTopoServer, schema, *cell, *retryDelay, *retryCount, *connTimeoutTotal, *connTimeoutPerConn, *connLife, tabletTypes, *maxInFlight, *testGateway) servenv.OnRun(func() { addStatusParts(vtg) }) servenv.RunDefault() }
func main() { defer exit.Recover() // flag parsing flags := dbconfigs.AppConfig | dbconfigs.DbaConfig | dbconfigs.FilteredConfig | dbconfigs.ReplConfig dbconfigs.RegisterFlags(flags) mysqlctl.RegisterFlags() flag.Parse() if len(flag.Args()) > 0 { flag.Usage() log.Errorf("vtcombo doesn't take any positional arguments") exit.Return(1) } // register topo server topo.RegisterServer("fakezk", zktopo.NewServer(fakezk.NewConn())) ts := topo.GetServerByName("fakezk") servenv.Init() // database configs mycnf, err := mysqlctl.NewMycnfFromFlags(0) if err != nil { log.Errorf("mycnf read failed: %v", err) exit.Return(1) } dbcfgs, err := dbconfigs.Init(mycnf.SocketFile, flags) if err != nil { log.Warning(err) } mysqld := mysqlctl.NewMysqld("Dba", "App", mycnf, &dbcfgs.Dba, &dbcfgs.App.ConnParams, &dbcfgs.Repl) // tablets configuration and init binlog.RegisterUpdateStreamService(mycnf) initTabletMap(ts, *topology, mysqld, dbcfgs, mycnf) // vtgate configuration and init resilientSrvTopoServer := vtgate.NewResilientSrvTopoServer(ts, "ResilientSrvTopoServer") healthCheck := discovery.NewHealthCheck(30*time.Second /*connTimeoutTotal*/, 1*time.Millisecond /*retryDelay*/) vtgate.Init(healthCheck, ts, resilientSrvTopoServer, nil /*schema*/, cell, 1*time.Millisecond /*retryDelay*/, 2 /*retryCount*/, 30*time.Second /*connTimeoutTotal*/, 10*time.Second /*connTimeoutPerConn*/, 365*24*time.Hour /*connLife*/, 0 /*maxInFlight*/, "" /*testGateway*/) servenv.OnTerm(func() { // FIXME(alainjobart) stop vtgate, all tablets // qsc.DisallowQueries() // agent.Stop() }) servenv.OnClose(func() { // We will still use the topo server during lameduck period // to update our state, so closing it in OnClose() topo.CloseServers() }) servenv.RunDefault() }
// TestGRPCDiscovery tests the discovery gateway with a gRPC // connection from the gateway to the fake tablet. func TestGRPCDiscovery(t *testing.T) { flag.Set("tablet_protocol", "grpc") flag.Set("gateway_implementation", "discoverygateway") // Fake services for the tablet, topo server. service, ts, cell := CreateFakeServers(t) // Tablet: listen on a random port. listener, err := net.Listen("tcp", ":0") if err != nil { t.Fatalf("Cannot listen: %v", err) } host := listener.Addr().(*net.TCPAddr).IP.String() port := listener.Addr().(*net.TCPAddr).Port defer listener.Close() // Tablet: create a gRPC server and listen on the port. server := grpc.NewServer() grpcqueryservice.Register(server, service) go server.Serve(listener) defer server.Stop() // VTGate: create the discovery healthcheck, and the gateway. // Wait for the right tablets to be present. hc := discovery.NewHealthCheck(30*time.Second, 10*time.Second, 2*time.Minute) dg := gateway.GetCreator()(hc, ts, ts, cell, 2) hc.AddTablet(&topodatapb.Tablet{ Alias: &topodatapb.TabletAlias{ Cell: cell, Uid: 43, }, Keyspace: tabletconntest.TestTarget.Keyspace, Shard: tabletconntest.TestTarget.Shard, Type: tabletconntest.TestTarget.TabletType, Hostname: host, PortMap: map[string]int32{ "grpc": int32(port), }, }, "test_tablet") err = gateway.WaitForTablets(dg, []topodatapb.TabletType{tabletconntest.TestTarget.TabletType}) if err != nil { t.Fatalf("WaitForTablets failed: %v", err) } defer dg.Close(context.Background()) // run the test suite. TestSuite(t, "discovery-grpc", dg, service) // run it again with vtgate combining Begin and Execute flag.Set("tablet_grpc_combine_begin_execute", "true") TestSuite(t, "discovery-grpc-combo", dg, service) }
// init phase: // - read the destination keyspace, make sure it has 'servedFrom' values func (scw *SplitCloneWorker) init(ctx context.Context) error { scw.setState(WorkerStateInit) // read the keyspace and validate it shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout) var err error scw.destinationKeyspaceInfo, err = scw.wr.TopoServer().GetKeyspace(shortCtx, scw.destinationKeyspace) cancel() if err != nil { return fmt.Errorf("cannot read (destination) keyspace %v: %v", scw.destinationKeyspace, err) } // Set source and destination shard infos. switch scw.cloneType { case horizontalResharding: if err := scw.initShardsForHorizontalResharding(ctx); err != nil { return err } case verticalSplit: if err := scw.initShardsForVerticalSplit(ctx); err != nil { return err } } if err := scw.sanityCheckShardInfos(); err != nil { return err } if scw.cloneType == horizontalResharding { if err := scw.loadVSchema(ctx); err != nil { return err } } // Initialize healthcheck and add destination shards to it. scw.healthCheck = discovery.NewHealthCheck(*remoteActionsTimeout, *healthcheckRetryDelay, *healthCheckTimeout) scw.tsc = discovery.NewTabletStatsCacheDoNotSetListener(scw.cell) // We set sendDownEvents=true because it's required by TabletStatsCache. scw.healthCheck.SetListener(scw, true /* sendDownEvents */) // Start watchers to get tablets added automatically to healthCheck. allShards := append(scw.sourceShards, scw.destinationShards...) for _, si := range allShards { watcher := discovery.NewShardReplicationWatcher(scw.wr.TopoServer(), scw.healthCheck, scw.cell, si.Keyspace(), si.ShardName(), *healthCheckTopologyRefresh, discovery.DefaultTopoReadConcurrency) scw.shardWatchers = append(scw.shardWatchers, watcher) } return nil }
func main() { defer exit.Recover() flag.Parse() servenv.Init() if initFakeZK != nil { initFakeZK() } ts := topo.GetServer() defer topo.CloseServers() var schema *planbuilder.Schema if *schemaFile != "" { var err error if schema, err = planbuilder.LoadFile(*schemaFile); err != nil { log.Error(err) exit.Return(1) } log.Infof("v3 is enabled: loaded schema from file: %v", *schemaFile) } else { ctx := context.Background() schemaJSON, err := ts.GetVSchema(ctx) if err != nil { log.Warningf("Skipping v3 initialization: GetVSchema failed: %v", err) goto startServer } schema, err = planbuilder.NewSchema([]byte(schemaJSON)) if err != nil { log.Warningf("Skipping v3 initialization: GetVSchema failed: %v", err) goto startServer } log.Infof("v3 is enabled: loaded schema from topo") } startServer: resilientSrvTopoServer = vtgate.NewResilientSrvTopoServer(ts, "ResilientSrvTopoServer") // For the initial phase vtgate is exposing // topoReader api. This will be subsumed by // vtgate once vtgate's client functions become active. topoReader = NewTopoReader(resilientSrvTopoServer) servenv.Register("toporeader", topoReader) healthCheck = discovery.NewHealthCheck(*connTimeoutTotal, *healthCheckRetryDelay) vtgate.Init(healthCheck, ts, resilientSrvTopoServer, schema, *cell, *retryDelay, *retryCount, *connTimeoutTotal, *connTimeoutPerConn, *connLife, *maxInFlight, *testGateway) servenv.RunDefault() }
func newBinlogPlayerController(ts topo.Server, vtClientFactory func() binlogplayer.VtClient, mysqld mysqlctl.MysqlDaemon, cell string, keyRange *topodatapb.KeyRange, sourceShard *topodatapb.Shard_SourceShard, dbName string) *BinlogPlayerController { blc := &BinlogPlayerController{ ts: ts, vtClientFactory: vtClientFactory, mysqld: mysqld, cell: cell, keyRange: keyRange, dbName: dbName, sourceShard: sourceShard, binlogPlayerStats: binlogplayer.NewStats(), healthCheck: discovery.NewHealthCheck(*binlogplayer.BinlogPlayerConnTimeout, *retryDelay, *healthCheckTimeout), } blc.shardReplicationWatcher = discovery.NewShardReplicationWatcher(ts, blc.healthCheck, cell, sourceShard.Keyspace, sourceShard.Shard, *healthCheckTopologyRefresh, 5) return blc }
func newClient(master *master, replica *replica) *client { t, err := throttler.NewThrottler("client", "TPS", 1, throttler.MaxRateModuleDisabled, 5 /* seconds */) if err != nil { log.Fatal(err) } healthCheck := discovery.NewHealthCheck(1*time.Minute, 5*time.Second, 1*time.Minute) c := &client{ master: master, healthCheck: healthCheck, throttler: t, stopChan: make(chan struct{}), } c.healthCheck.SetListener(c, false /* sendDownEvents */) c.healthCheck.AddTablet(replica.fakeTablet.Tablet, "name") return c }
func newBinlogPlayerController(ts topo.Server, vtClientFactory func() binlogplayer.VtClient, mysqld mysqlctl.MysqlDaemon, cell string, keyspaceIDType pb.KeyspaceIdType, keyRange *pb.KeyRange, sourceShard *pb.Shard_SourceShard, dbName string) *BinlogPlayerController { blc := &BinlogPlayerController{ ts: ts, vtClientFactory: vtClientFactory, mysqld: mysqld, cell: cell, keyspaceIDType: keyspaceIDType, keyRange: keyRange, dbName: dbName, sourceShard: sourceShard, binlogPlayerStats: binlogplayer.NewBinlogPlayerStats(), healthCheck: discovery.NewHealthCheck(*binlogplayer.BinlogPlayerConnTimeout, *retryDelay), initialEndpointFound: make(chan struct{}), } blc.healthCheck.SetListener(blc) blc.shardReplicationWatcher = discovery.NewShardReplicationWatcher(ts, blc.healthCheck, cell, sourceShard.Keyspace, sourceShard.Shard, *healthcheckTopologyRefresh, 5) return blc }
// FindHealthyRdonlyTablet returns a random healthy RDONLY tablet. // Since we don't want to use them all, we require at least // minHealthyRdonlyTablets servers to be healthy. // May block up to -wait_for_healthy_rdonly_tablets_timeout. func FindHealthyRdonlyTablet(ctx context.Context, wr *wrangler.Wrangler, healthCheck discovery.HealthCheck, cell, keyspace, shard string, minHealthyRdonlyTablets int) (*topodatapb.TabletAlias, error) { if healthCheck == nil { // No healthcheck instance provided. Create one. healthCheck = discovery.NewHealthCheck(*remoteActionsTimeout, *healthcheckRetryDelay, *healthCheckTimeout) watcher := discovery.NewShardReplicationWatcher(wr.TopoServer(), healthCheck, cell, keyspace, shard, *healthCheckTopologyRefresh, discovery.DefaultTopoReadConcurrency) defer watcher.Stop() defer healthCheck.Close() } healthyTablets, err := waitForHealthyRdonlyTablets(ctx, wr, healthCheck, cell, keyspace, shard, minHealthyRdonlyTablets, *waitForHealthyTabletsTimeout) if err != nil { return nil, err } // random server in the list is what we want index := rand.Intn(len(healthyTablets)) return healthyTablets[index].Tablet.Alias, nil }
// newBinlogPlayerController instantiates a new BinlogPlayerController. // Use Start() and Stop() to start and stop it. // Once stopped, you should call Close() to stop and free resources e.g. the // healthcheck instance. func newBinlogPlayerController(ts topo.Server, vtClientFactory func() binlogplayer.VtClient, mysqld mysqlctl.MysqlDaemon, cell string, keyRange *topodatapb.KeyRange, sourceShard *topodatapb.Shard_SourceShard, dbName string) *BinlogPlayerController { healthCheck := discovery.NewHealthCheck(*binlogplayer.BinlogPlayerConnTimeout, *healthcheckRetryDelay, *healthCheckTimeout) return &BinlogPlayerController{ ts: ts, vtClientFactory: vtClientFactory, mysqld: mysqld, cell: cell, keyRange: keyRange, dbName: dbName, sourceShard: sourceShard, binlogPlayerStats: binlogplayer.NewStats(), // Note: healthCheck and shardReplicationWatcher remain active independent // of whether the BinlogPlayerController is Start()'d or Stop()'d. // Use Close() after Stop() to finally close them and free their resources. healthCheck: healthCheck, shardReplicationWatcher: discovery.NewShardReplicationWatcher(ts, healthCheck, cell, sourceShard.Keyspace, sourceShard.Shard, *healthCheckTopologyRefresh, discovery.DefaultTopoReadConcurrency), } }
func TestGRPCDiscovery(t *testing.T) { flag.Set("tablet_protocol", "grpc") flag.Set("gateway_implementation", "discoverygateway") // Fake services for the tablet, topo server. service, ts, cell := CreateFakeServers(t) // Listen on a random port. listener, err := net.Listen("tcp", ":0") if err != nil { t.Fatalf("Cannot listen: %v", err) } host := listener.Addr().(*net.TCPAddr).IP.String() port := listener.Addr().(*net.TCPAddr).Port // Create a gRPC server and listen on the port. server := grpc.NewServer() grpcqueryservice.RegisterForTest(server, service) go server.Serve(listener) // Create the discovery healthcheck, and the gateway. // Wait for the right tablets to be present. hc := discovery.NewHealthCheck(30*time.Second, 10*time.Second, 2*time.Minute) hc.AddTablet(cell, "test_tablet", &topodatapb.Tablet{ Alias: &topodatapb.TabletAlias{ Cell: cell, }, Keyspace: tabletconntest.TestTarget.Keyspace, Shard: tabletconntest.TestTarget.Shard, Type: tabletconntest.TestTarget.TabletType, Hostname: host, PortMap: map[string]int32{ "grpc": int32(port), }, }) dg := gateway.GetCreator()(hc, ts, ts, cell, 2, []topodatapb.TabletType{tabletconntest.TestTarget.TabletType}) // and run the test suite. TestSuite(t, "discovery-grpc", dg, service) }
// findTargets phase: // - find one rdonly in the source shard // - mark it as 'worker' pointing back to us // - get the aliases of all the targets func (vscw *VerticalSplitCloneWorker) findTargets(ctx context.Context) error { vscw.setState(WorkerStateFindTargets) // find an appropriate tablet in the source shard var err error vscw.sourceAlias, err = FindWorkerTablet(ctx, vscw.wr, vscw.cleaner, nil /* tsc */, vscw.cell, vscw.sourceKeyspace, "0", vscw.minHealthyRdonlyTablets) if err != nil { return fmt.Errorf("FindWorkerTablet() failed for %v/%v/0: %v", vscw.cell, vscw.sourceKeyspace, err) } vscw.wr.Logger().Infof("Using tablet %v as the source", topoproto.TabletAliasString(vscw.sourceAlias)) // get the tablet info for it shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout) ti, err := vscw.wr.TopoServer().GetTablet(shortCtx, vscw.sourceAlias) cancel() if err != nil { return fmt.Errorf("cannot read tablet %v: %v", topoproto.TabletAliasString(vscw.sourceAlias), err) } vscw.sourceTablet = ti.Tablet // stop replication on it shortCtx, cancel = context.WithTimeout(ctx, *remoteActionsTimeout) err = vscw.wr.TabletManagerClient().StopSlave(shortCtx, vscw.sourceTablet) cancel() if err != nil { return fmt.Errorf("cannot stop replication on tablet %v", topoproto.TabletAliasString(vscw.sourceAlias)) } wrangler.RecordStartSlaveAction(vscw.cleaner, vscw.sourceTablet) // Initialize healthcheck and add destination shards to it. vscw.healthCheck = discovery.NewHealthCheck(*remoteActionsTimeout, *healthcheckRetryDelay, *healthCheckTimeout) vscw.tsc = discovery.NewTabletStatsCache(vscw.healthCheck, vscw.cell) watcher := discovery.NewShardReplicationWatcher(vscw.wr.TopoServer(), vscw.healthCheck, vscw.cell, vscw.destinationKeyspace, vscw.destinationShard, *healthCheckTopologyRefresh, discovery.DefaultTopoReadConcurrency) vscw.destinationShardWatchers = append(vscw.destinationShardWatchers, watcher) // Make sure we find a master for each destination shard and log it. vscw.wr.Logger().Infof("Finding a MASTER tablet for each destination shard...") waitCtx, waitCancel := context.WithTimeout(ctx, *waitForHealthyTabletsTimeout) defer waitCancel() if err := vscw.tsc.WaitForTablets(waitCtx, vscw.cell, vscw.destinationKeyspace, vscw.destinationShard, []topodatapb.TabletType{topodatapb.TabletType_MASTER}); err != nil { return fmt.Errorf("cannot find MASTER tablet for destination shard for %v/%v (in cell: %v): %v", vscw.destinationKeyspace, vscw.destinationShard, vscw.cell, err) } masters := vscw.tsc.GetHealthyTabletStats(vscw.destinationKeyspace, vscw.destinationShard, topodatapb.TabletType_MASTER) if len(masters) == 0 { return fmt.Errorf("cannot find MASTER tablet for destination shard for %v/%v (in cell: %v) in HealthCheck: empty TabletStats list", vscw.destinationKeyspace, vscw.destinationShard, vscw.cell) } master := masters[0] // Get the MySQL database name of the tablet. keyspaceAndShard := topoproto.KeyspaceShardString(vscw.destinationKeyspace, vscw.destinationShard) vscw.destinationDbNames[keyspaceAndShard] = topoproto.TabletDbName(master.Tablet) // TODO(mberlin): Verify on the destination master that the // _vt.blp_checkpoint table has the latest schema. vscw.wr.Logger().Infof("Using tablet %v as destination master for %v/%v", topoproto.TabletAliasString(master.Tablet.Alias), vscw.destinationKeyspace, vscw.destinationShard) vscw.wr.Logger().Infof("NOTE: The used master of a destination shard might change over the course of the copy e.g. due to a reparent. The HealthCheck module will track and log master changes and any error message will always refer the actually used master address.") return nil }
func (wr *Wrangler) waitForDrainInCell(ctx context.Context, cell, keyspace, shard string, servedType topodatapb.TabletType, retryDelay, healthCheckTopologyRefresh, healthcheckRetryDelay, healthCheckTimeout time.Duration) error { hc := discovery.NewHealthCheck(healthCheckTimeout /* connectTimeout */, healthcheckRetryDelay, healthCheckTimeout, cell) defer hc.Close() watcher := discovery.NewShardReplicationWatcher(wr.TopoServer(), hc, cell, keyspace, shard, healthCheckTopologyRefresh, 5 /* topoReadConcurrency */) defer watcher.Stop() if err := discovery.WaitForEndPoints(ctx, hc, cell, keyspace, shard, []topodatapb.TabletType{servedType}); err != nil { return fmt.Errorf("%v: error waiting for initial %v endpoints for %v/%v: %v", cell, servedType, keyspace, shard, err) } wr.Logger().Infof("%v: Waiting for %.1f seconds to make sure that the discovery module retrieves healthcheck information from all tablets.", cell, healthCheckTimeout.Seconds()) // Wait at least for -vtctl_healthcheck_timeout to elapse to make sure that we // see all healthy tablets. Otherwise, we might miss some tablets. // It's safe to wait not longer for this because we would only miss slow // tablets and vtgate would not serve from such tablets anyway. time.Sleep(healthCheckTimeout) // Now check the QPS rate of all tablets until the timeout expires. startTime := time.Now() for { healthyTabletsCount := 0 // map key: tablet uid drainedHealthyTablets := make(map[uint32]*discovery.EndPointStats) notDrainedHealtyTablets := make(map[uint32]*discovery.EndPointStats) addrs := hc.GetEndPointStatsFromTarget(keyspace, shard, servedType) healthyTabletsCount = 0 for _, addr := range addrs { // TODO(mberlin): Move this health check logic into a common function // because other code uses it as well e.g. go/vt/worker/topo_utils.go. if addr.Stats == nil || addr.Stats.HealthError != "" || addr.Stats.SecondsBehindMaster > 30 { // not healthy continue } healthyTabletsCount++ if addr.Stats.Qps == 0.0 { drainedHealthyTablets[addr.EndPoint.Uid] = addr } else { notDrainedHealtyTablets[addr.EndPoint.Uid] = addr } } if len(drainedHealthyTablets) == healthyTabletsCount { wr.Logger().Infof("%v: All %d healthy tablets were drained after %.1f seconds (not counting %.1f seconds for the initial wait).", cell, healthyTabletsCount, time.Now().Sub(startTime).Seconds(), healthCheckTimeout.Seconds()) break } // Continue waiting, sleep in between. deadlineString := "" if d, ok := ctx.Deadline(); ok { deadlineString = fmt.Sprintf(" up to %.1f more seconds", d.Sub(time.Now()).Seconds()) } wr.Logger().Infof("%v: Waiting%v for all healthy tablets to be drained (%d/%d done).", cell, deadlineString, len(drainedHealthyTablets), healthyTabletsCount) timer := time.NewTimer(retryDelay) select { case <-ctx.Done(): timer.Stop() var l []string for _, eps := range notDrainedHealtyTablets { l = append(l, formatEndpointStats(eps)) } return fmt.Errorf("%v: WaitForDrain failed for %v tablets in %v/%v. Only %d/%d tablets were drained. err: %v List of tablets which were not drained:\n%v", cell, servedType, keyspace, shard, len(drainedHealthyTablets), healthyTabletsCount, ctx.Err(), strings.Join(l, "\n")) case <-timer.C: } } return nil }
func main() { defer exit.Recover() // flag parsing flags := dbconfigs.AppConfig | dbconfigs.AllPrivsConfig | dbconfigs.DbaConfig | dbconfigs.FilteredConfig | dbconfigs.ReplConfig dbconfigs.RegisterFlags(flags) mysqlctl.RegisterFlags() flag.Parse() if len(flag.Args()) > 0 { flag.Usage() log.Errorf("vtcombo doesn't take any positional arguments") exit.Return(1) } // parse the input topology tpb := &vttestpb.VTTestTopology{} if err := proto.UnmarshalText(*protoTopo, tpb); err != nil { log.Errorf("cannot parse topology: %v", err) exit.Return(1) } // default cell to "test" if unspecified if len(tpb.Cells) == 0 { tpb.Cells = append(tpb.Cells, "test") } // set discoverygateway flag to default value flag.Set("cells_to_watch", strings.Join(tpb.Cells, ",")) // vtctld UI requires the cell flag flag.Set("cell", tpb.Cells[0]) flag.Set("enable_realtime_stats", "true") flag.Set("log_dir", "$VTDATAROOT/tmp") // create zk client config file config := path.Join(os.Getenv("VTDATAROOT"), "vt_0000000001/tmp/test-zk-client-conf.json") cellmap := make(map[string]string) for _, cell := range tpb.Cells { cellmap[cell] = "localhost" } b, err := json.Marshal(cellmap) if err != nil { log.Errorf("failed to marshal json: %v", err) } f, err := os.Create(config) if err != nil { log.Errorf("failed to create zk config file: %v", err) } defer f.Close() _, err = f.WriteString(string(b[:])) if err != nil { log.Errorf("failed to write to zk config file: %v", err) } os.Setenv("ZK_CLIENT_CONFIG", config) // register topo server zkconn := fakezk.NewConn() topo.RegisterServer("fakezk", zktopo.NewServer(zkconn)) ts = topo.GetServerByName("fakezk") servenv.Init() tabletserver.Init() // database configs mycnf, err := mysqlctl.NewMycnfFromFlags(0) if err != nil { log.Errorf("mycnf read failed: %v", err) exit.Return(1) } dbcfgs, err := dbconfigs.Init(mycnf.SocketFile, flags) if err != nil { log.Warning(err) } mysqld := mysqlctl.NewMysqld(mycnf, &dbcfgs.Dba, &dbcfgs.AllPrivs, &dbcfgs.App, &dbcfgs.Repl, true /* enablePublishStats */) servenv.OnClose(mysqld.Close) // tablets configuration and init if err := initTabletMap(ts, tpb, mysqld, dbcfgs, *schemaDir, mycnf); err != nil { log.Errorf("initTabletMapProto failed: %v", err) exit.Return(1) } // vtgate configuration and init resilientSrvTopoServer := vtgate.NewResilientSrvTopoServer(ts, "ResilientSrvTopoServer") healthCheck := discovery.NewHealthCheck(30*time.Second /*connTimeoutTotal*/, 1*time.Millisecond /*retryDelay*/, 1*time.Hour /*healthCheckTimeout*/) tabletTypesToWait := []topodatapb.TabletType{ topodatapb.TabletType_MASTER, topodatapb.TabletType_REPLICA, topodatapb.TabletType_RDONLY, } vtgate.Init(context.Background(), healthCheck, ts, resilientSrvTopoServer, tpb.Cells[0], 2 /*retryCount*/, tabletTypesToWait) // vtctld configuration and init vtctld.InitVtctld(ts) vtctld.HandleExplorer("zk", zktopo.NewZkExplorer(zkconn)) servenv.OnTerm(func() { // FIXME(alainjobart): stop vtgate }) servenv.OnClose(func() { // We will still use the topo server during lameduck period // to update our state, so closing it in OnClose() topo.CloseServers() }) servenv.RunDefault() }
func main() { defer exit.Recover() // flag parsing flags := dbconfigs.AppConfig | dbconfigs.DbaConfig | dbconfigs.FilteredConfig | dbconfigs.ReplConfig dbconfigs.RegisterFlags(flags) mysqlctl.RegisterFlags() flag.Parse() if len(flag.Args()) > 0 { flag.Usage() log.Errorf("vtcombo doesn't take any positional arguments") exit.Return(1) } // register topo server zkconn := fakezk.NewConn() topo.RegisterServer("fakezk", zktopo.NewServer(zkconn)) ts = topo.GetServerByName("fakezk") servenv.Init() tabletserver.Init() // database configs mycnf, err := mysqlctl.NewMycnfFromFlags(0) if err != nil { log.Errorf("mycnf read failed: %v", err) exit.Return(1) } dbcfgs, err := dbconfigs.Init(mycnf.SocketFile, flags) if err != nil { log.Warning(err) } mysqld := mysqlctl.NewMysqld("Dba", "App", mycnf, &dbcfgs.Dba, &dbcfgs.App.ConnParams, &dbcfgs.Repl) servenv.OnClose(mysqld.Close) // tablets configuration and init initTabletMap(ts, *topology, mysqld, dbcfgs, mycnf) // vschema var schema *planbuilder.Schema if *vschema != "" { schema, err = planbuilder.LoadFile(*vschema) if err != nil { log.Error(err) exit.Return(1) } log.Infof("v3 is enabled: loaded schema from file") } // vtgate configuration and init resilientSrvTopoServer := vtgate.NewResilientSrvTopoServer(ts, "ResilientSrvTopoServer") healthCheck := discovery.NewHealthCheck(30*time.Second /*connTimeoutTotal*/, 1*time.Millisecond /*retryDelay*/, 1*time.Minute /*healthCheckTimeout*/) tabletTypesToWait := []topodatapb.TabletType{ topodatapb.TabletType_MASTER, topodatapb.TabletType_REPLICA, topodatapb.TabletType_RDONLY, } vtgate.Init(healthCheck, ts, resilientSrvTopoServer, schema, cell, 1*time.Millisecond /*retryDelay*/, 2 /*retryCount*/, 30*time.Second /*connTimeoutTotal*/, 10*time.Second /*connTimeoutPerConn*/, 365*24*time.Hour /*connLife*/, tabletTypesToWait, 0 /*maxInFlight*/, "" /*testGateway*/) // vtctld configuration and init vtctld.InitVtctld(ts) vtctld.HandleExplorer("zk", zktopo.NewZkExplorer(zkconn)) servenv.OnTerm(func() { // FIXME(alainjobart): stop vtgate }) servenv.OnClose(func() { log.Infof("Total count of new connections to MySQL: %v", expvar.Get("mysql-new-connection-count")) // We will still use the topo server during lameduck period // to update our state, so closing it in OnClose() topo.CloseServers() }) servenv.RunDefault() }
// init phase: // - read the destination keyspace, make sure it has 'servedFrom' values func (scw *SplitCloneWorker) init(ctx context.Context) error { scw.setState(WorkerStateInit) var err error // read the keyspace and validate it shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout) scw.keyspaceInfo, err = scw.wr.TopoServer().GetKeyspace(shortCtx, scw.keyspace) cancel() if err != nil { return fmt.Errorf("cannot read keyspace %v: %v", scw.keyspace, err) } // find the OverlappingShards in the keyspace shortCtx, cancel = context.WithTimeout(ctx, *remoteActionsTimeout) osList, err := topotools.FindOverlappingShards(shortCtx, scw.wr.TopoServer(), scw.keyspace) cancel() if err != nil { return fmt.Errorf("cannot FindOverlappingShards in %v: %v", scw.keyspace, err) } // find the shard we mentioned in there, if any os := topotools.OverlappingShardsForShard(osList, scw.shard) if os == nil { return fmt.Errorf("the specified shard %v/%v is not in any overlapping shard", scw.keyspace, scw.shard) } scw.wr.Logger().Infof("Found overlapping shards: %+v\n", os) // one side should have served types, the other one none, // figure out wich is which, then double check them all if len(os.Left[0].ServedTypes) > 0 { scw.sourceShards = os.Left scw.destinationShards = os.Right } else { scw.sourceShards = os.Right scw.destinationShards = os.Left } // Verify that filtered replication is not already enabled. for _, si := range scw.destinationShards { if len(si.SourceShards) > 0 { return fmt.Errorf("destination shard %v/%v has filtered replication already enabled from a previous resharding (ShardInfo is set)."+ " This requires manual intervention e.g. use vtctl SourceShardDelete to remove it", si.Keyspace(), si.ShardName()) } } // validate all serving types servingTypes := []topodatapb.TabletType{topodatapb.TabletType_MASTER, topodatapb.TabletType_REPLICA, topodatapb.TabletType_RDONLY} for _, st := range servingTypes { for _, si := range scw.sourceShards { if si.GetServedType(st) == nil { return fmt.Errorf("source shard %v/%v is not serving type %v", si.Keyspace(), si.ShardName(), st) } } } for _, si := range scw.destinationShards { if len(si.ServedTypes) > 0 { return fmt.Errorf("destination shard %v/%v is serving some types", si.Keyspace(), si.ShardName()) } } // read the vschema if needed var keyspaceSchema *vindexes.KeyspaceSchema if *useV3ReshardingMode { kschema, err := scw.wr.TopoServer().GetVSchema(ctx, scw.keyspace) if err != nil { return fmt.Errorf("cannot load VSchema for keyspace %v: %v", scw.keyspace, err) } if kschema == nil { return fmt.Errorf("no VSchema for keyspace %v", scw.keyspace) } keyspaceSchema, err = vindexes.BuildKeyspaceSchema(kschema, scw.keyspace) if err != nil { return fmt.Errorf("cannot build vschema for keyspace %v: %v", scw.keyspace, err) } scw.keyspaceSchema = keyspaceSchema } // Initialize healthcheck and add destination shards to it. scw.healthCheck = discovery.NewHealthCheck(*remoteActionsTimeout, *healthcheckRetryDelay, *healthCheckTimeout) allShards := append(scw.sourceShards, scw.destinationShards...) for _, si := range allShards { watcher := discovery.NewShardReplicationWatcher(scw.wr.TopoServer(), scw.healthCheck, scw.cell, si.Keyspace(), si.ShardName(), *healthCheckTopologyRefresh, discovery.DefaultTopoReadConcurrency) scw.shardWatchers = append(scw.shardWatchers, watcher) } return nil }
// TestL2VTGateDiscovery tests the l2vtgate gateway with a gRPC // connection from the gateway to a l2vtgate in-process object. func TestL2VTGateDiscovery(t *testing.T) { flag.Set("tablet_protocol", "grpc") flag.Set("gateway_implementation", "discoverygateway") // Fake services for the tablet, topo server. service, ts, cell := CreateFakeServers(t) // Tablet: listen on a random port. listener, err := net.Listen("tcp", ":0") if err != nil { t.Fatalf("Cannot listen: %v", err) } host := listener.Addr().(*net.TCPAddr).IP.String() port := listener.Addr().(*net.TCPAddr).Port defer listener.Close() // Tablet: create a gRPC server and listen on the port. server := grpc.NewServer() grpcqueryservice.Register(server, service) go server.Serve(listener) defer server.Stop() // L2VTGate: Create the discovery healthcheck, and the gateway. // Wait for the right tablets to be present. hc := discovery.NewHealthCheck(30*time.Second, 10*time.Second, 2*time.Minute) l2vtgate := l2vtgate.Init(hc, ts, ts, "", cell, 2, nil) hc.AddTablet(&topodatapb.Tablet{ Alias: &topodatapb.TabletAlias{ Cell: cell, Uid: 44, }, Keyspace: tabletconntest.TestTarget.Keyspace, Shard: tabletconntest.TestTarget.Shard, Type: tabletconntest.TestTarget.TabletType, Hostname: host, PortMap: map[string]int32{ "grpc": int32(port), }, }, "test_tablet") ctx := context.Background() err = l2vtgate.Gateway().WaitForTablets(ctx, []topodatapb.TabletType{tabletconntest.TestTarget.TabletType}) if err != nil { t.Fatalf("WaitForAllServingTablets failed: %v", err) } // L2VTGate: listen on a random port. listener, err = net.Listen("tcp", ":0") if err != nil { t.Fatalf("Cannot listen: %v", err) } defer listener.Close() // L2VTGate: create a gRPC server and listen on the port. server = grpc.NewServer() grpcqueryservice.Register(server, l2vtgate) go server.Serve(listener) defer server.Stop() // VTGate: create the l2vtgate gateway flag.Set("gateway_implementation", "l2vtgategateway") flag.Set("l2vtgategateway_addrs", fmt.Sprintf("%v|%v|%v", listener.Addr().String(), tabletconntest.TestTarget.Keyspace, tabletconntest.TestTarget.Shard)) lg := gateway.GetCreator()(nil, ts, nil, "", 2) defer lg.Close(ctx) // and run the test suite. TestSuite(t, "l2vtgate-grpc", lg, service) }
func (wr *Wrangler) waitForDrainInCell(ctx context.Context, cell, keyspace, shard string, servedType topodatapb.TabletType, retryDelay, healthCheckTopologyRefresh, healthcheckRetryDelay, healthCheckTimeout time.Duration) error { hc := discovery.NewHealthCheck(healthCheckTimeout /* connectTimeout */, healthcheckRetryDelay, healthCheckTimeout) defer hc.Close() watcher := discovery.NewShardReplicationWatcher(wr.TopoServer(), hc, cell, keyspace, shard, healthCheckTopologyRefresh, discovery.DefaultTopoReadConcurrency) defer watcher.Stop() if err := discovery.WaitForTablets(ctx, hc, cell, keyspace, shard, []topodatapb.TabletType{servedType}); err != nil { return fmt.Errorf("%v: error waiting for initial %v tablets for %v/%v: %v", cell, servedType, keyspace, shard, err) } wr.Logger().Infof("%v: Waiting for %.1f seconds to make sure that the discovery module retrieves healthcheck information from all tablets.", cell, healthCheckTimeout.Seconds()) // Wait at least for -vtctl_healthcheck_timeout to elapse to make sure that we // see all healthy tablets. Otherwise, we might miss some tablets. // It's safe to wait not longer for this because we would only miss slow // tablets and vtgate would not serve from such tablets anyway. time.Sleep(healthCheckTimeout) // Now check the QPS rate of all tablets until the timeout expires. startTime := time.Now() for { // map key: tablet uid drainedHealthyTablets := make(map[uint32]*discovery.TabletStats) notDrainedHealtyTablets := make(map[uint32]*discovery.TabletStats) healthyTablets := discovery.RemoveUnhealthyTablets( hc.GetTabletStatsFromTarget(keyspace, shard, servedType)) for _, ts := range healthyTablets { if ts.Stats.Qps == 0.0 { drainedHealthyTablets[ts.Tablet.Alias.Uid] = ts } else { notDrainedHealtyTablets[ts.Tablet.Alias.Uid] = ts } } if len(drainedHealthyTablets) == len(healthyTablets) { wr.Logger().Infof("%v: All %d healthy tablets were drained after %.1f seconds (not counting %.1f seconds for the initial wait).", cell, len(healthyTablets), time.Now().Sub(startTime).Seconds(), healthCheckTimeout.Seconds()) break } // Continue waiting, sleep in between. deadlineString := "" if d, ok := ctx.Deadline(); ok { deadlineString = fmt.Sprintf(" up to %.1f more seconds", d.Sub(time.Now()).Seconds()) } wr.Logger().Infof("%v: Waiting%v for all healthy tablets to be drained (%d/%d done).", cell, deadlineString, len(drainedHealthyTablets), len(healthyTablets)) timer := time.NewTimer(retryDelay) select { case <-ctx.Done(): timer.Stop() var l []string for _, ts := range notDrainedHealtyTablets { l = append(l, formatTabletStats(ts)) } return fmt.Errorf("%v: WaitForDrain failed for %v tablets in %v/%v. Only %d/%d tablets were drained. err: %v List of tablets which were not drained: %v", cell, servedType, keyspace, shard, len(drainedHealthyTablets), len(healthyTablets), ctx.Err(), strings.Join(l, ";")) case <-timer.C: } } return nil }
func main() { defer exit.Recover() // flag parsing flags := dbconfigs.AppConfig | dbconfigs.DbaConfig | dbconfigs.FilteredConfig | dbconfigs.ReplConfig dbconfigs.RegisterFlags(flags) mysqlctl.RegisterFlags() flag.Parse() if len(flag.Args()) > 0 { flag.Usage() log.Errorf("vtcombo doesn't take any positional arguments") exit.Return(1) } // set discoverygateway flag to default value flag.Set("cells_to_watch", cell) // register topo server zkconn := fakezk.NewConn() topo.RegisterServer("fakezk", zktopo.NewServer(zkconn)) ts = topo.GetServerByName("fakezk") servenv.Init() tabletserver.Init() // database configs mycnf, err := mysqlctl.NewMycnfFromFlags(0) if err != nil { log.Errorf("mycnf read failed: %v", err) exit.Return(1) } dbcfgs, err := dbconfigs.Init(mycnf.SocketFile, flags) if err != nil { log.Warning(err) } mysqld := mysqlctl.NewMysqld("Dba", "App", mycnf, &dbcfgs.Dba, &dbcfgs.App.ConnParams, &dbcfgs.Repl) servenv.OnClose(mysqld.Close) // tablets configuration and init if err := initTabletMap(ts, *protoTopo, mysqld, dbcfgs, *schemaDir, mycnf); err != nil { log.Errorf("initTabletMapProto failed: %v", err) exit.Return(1) } // vtgate configuration and init resilientSrvTopoServer := vtgate.NewResilientSrvTopoServer(ts, "ResilientSrvTopoServer") healthCheck := discovery.NewHealthCheck(30*time.Second /*connTimeoutTotal*/, 1*time.Millisecond /*retryDelay*/, 1*time.Hour /*healthCheckTimeout*/) tabletTypesToWait := []topodatapb.TabletType{ topodatapb.TabletType_MASTER, topodatapb.TabletType_REPLICA, topodatapb.TabletType_RDONLY, } vtgate.Init(context.Background(), healthCheck, ts, resilientSrvTopoServer, cell, 2 /*retryCount*/, tabletTypesToWait) // vtctld configuration and init vtctld.InitVtctld(ts) vtctld.HandleExplorer("zk", zktopo.NewZkExplorer(zkconn)) servenv.OnTerm(func() { // FIXME(alainjobart): stop vtgate }) servenv.OnClose(func() { // We will still use the topo server during lameduck period // to update our state, so closing it in OnClose() topo.CloseServers() }) servenv.RunDefault() }
// findTargets phase: // - find one rdonly in the source shard // - mark it as 'worker' pointing back to us // - get the aliases of all the targets func (scw *LegacySplitCloneWorker) findTargets(ctx context.Context) error { scw.setState(WorkerStateFindTargets) var err error // find an appropriate tablet in the source shards scw.sourceAliases = make([]*topodatapb.TabletAlias, len(scw.sourceShards)) for i, si := range scw.sourceShards { scw.sourceAliases[i], err = FindWorkerTablet(ctx, scw.wr, scw.cleaner, scw.tsc, scw.cell, si.Keyspace(), si.ShardName(), scw.minHealthyRdonlyTablets) if err != nil { return fmt.Errorf("FindWorkerTablet() failed for %v/%v/%v: %v", scw.cell, si.Keyspace(), si.ShardName(), err) } scw.wr.Logger().Infof("Using tablet %v as source for %v/%v", topoproto.TabletAliasString(scw.sourceAliases[i]), si.Keyspace(), si.ShardName()) } // get the tablet info for them, and stop their replication scw.sourceTablets = make([]*topodatapb.Tablet, len(scw.sourceAliases)) for i, alias := range scw.sourceAliases { shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout) ti, err := scw.wr.TopoServer().GetTablet(shortCtx, alias) cancel() if err != nil { return fmt.Errorf("cannot read tablet %v: %v", topoproto.TabletAliasString(alias), err) } scw.sourceTablets[i] = ti.Tablet shortCtx, cancel = context.WithTimeout(ctx, *remoteActionsTimeout) err = scw.wr.TabletManagerClient().StopSlave(shortCtx, scw.sourceTablets[i]) cancel() if err != nil { return fmt.Errorf("cannot stop replication on tablet %v", topoproto.TabletAliasString(alias)) } wrangler.RecordStartSlaveAction(scw.cleaner, scw.sourceTablets[i]) } // Initialize healthcheck and add destination shards to it. scw.healthCheck = discovery.NewHealthCheck(*remoteActionsTimeout, *healthcheckRetryDelay, *healthCheckTimeout) scw.tsc = discovery.NewTabletStatsCache(scw.healthCheck, scw.cell) for _, si := range scw.destinationShards { watcher := discovery.NewShardReplicationWatcher(scw.wr.TopoServer(), scw.healthCheck, scw.cell, si.Keyspace(), si.ShardName(), *healthCheckTopologyRefresh, discovery.DefaultTopoReadConcurrency) scw.destinationShardWatchers = append(scw.destinationShardWatchers, watcher) } // Make sure we find a master for each destination shard and log it. scw.wr.Logger().Infof("Finding a MASTER tablet for each destination shard...") for _, si := range scw.destinationShards { waitCtx, waitCancel := context.WithTimeout(ctx, 10*time.Second) defer waitCancel() if err := scw.tsc.WaitForTablets(waitCtx, scw.cell, si.Keyspace(), si.ShardName(), []topodatapb.TabletType{topodatapb.TabletType_MASTER}); err != nil { return fmt.Errorf("cannot find MASTER tablet for destination shard for %v/%v: %v", si.Keyspace(), si.ShardName(), err) } masters := scw.tsc.GetHealthyTabletStats(si.Keyspace(), si.ShardName(), topodatapb.TabletType_MASTER) if len(masters) == 0 { return fmt.Errorf("cannot find MASTER tablet for destination shard for %v/%v in HealthCheck: empty TabletStats list", si.Keyspace(), si.ShardName()) } master := masters[0] // Get the MySQL database name of the tablet. shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout) ti, err := scw.wr.TopoServer().GetTablet(shortCtx, master.Tablet.Alias) cancel() if err != nil { return fmt.Errorf("cannot get the TabletInfo for destination master (%v) to find out its db name: %v", topoproto.TabletAliasString(master.Tablet.Alias), err) } keyspaceAndShard := topoproto.KeyspaceShardString(si.Keyspace(), si.ShardName()) scw.destinationDbNames[keyspaceAndShard] = ti.DbName() // TODO(mberlin): Verify on the destination master that the // _vt.blp_checkpoint table has the latest schema. scw.wr.Logger().Infof("Using tablet %v as destination master for %v/%v", topoproto.TabletAliasString(master.Tablet.Alias), si.Keyspace(), si.ShardName()) } scw.wr.Logger().Infof("NOTE: The used master of a destination shard might change over the course of the copy e.g. due to a reparent. The HealthCheck module will track and log master changes and any error message will always refer the actually used master address.") // Set up the throttler for each destination shard. for _, si := range scw.destinationShards { keyspaceAndShard := topoproto.KeyspaceShardString(si.Keyspace(), si.ShardName()) t, err := throttler.NewThrottler( keyspaceAndShard, "transactions", scw.destinationWriterCount, scw.maxTPS, throttler.ReplicationLagModuleDisabled) if err != nil { return fmt.Errorf("cannot instantiate throttler: %v", err) } scw.destinationThrottlers[keyspaceAndShard] = t } return nil }