// synchronizeReplication phase: // 1 - ask the subset slave to stop replication // 2 - sleep for 5 seconds // 3 - ask the superset slave to stop replication // Note this is not 100% correct, but good enough for now func (worker *SQLDiffWorker) synchronizeReplication(ctx context.Context) error { worker.SetState(WorkerStateSyncReplication) // stop replication on subset slave worker.wr.Logger().Infof("Stopping replication on subset slave %v", worker.subset.alias) subsetTablet, err := worker.wr.TopoServer().GetTablet(ctx, worker.subset.alias) if err != nil { return err } shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout) err = worker.wr.TabletManagerClient().StopSlave(shortCtx, subsetTablet) cancel() if err != nil { return fmt.Errorf("Cannot stop slave %v: %v", worker.subset.alias, err) } if err := checkDone(ctx); err != nil { return err } // change the cleaner actions from ChangeSlaveType(rdonly) // to StartSlave() + ChangeSlaveType(spare) wrangler.RecordStartSlaveAction(worker.cleaner, subsetTablet) action, err := wrangler.FindChangeSlaveTypeActionByTarget(worker.cleaner, worker.subset.alias) if err != nil { return fmt.Errorf("cannot find ChangeSlaveType action for %v: %v", worker.subset.alias, err) } action.TabletType = topo.TYPE_SPARE // sleep for a few seconds time.Sleep(5 * time.Second) if err := checkDone(ctx); err != nil { return err } // stop replication on superset slave worker.wr.Logger().Infof("Stopping replication on superset slave %v", worker.superset.alias) supersetTablet, err := worker.wr.TopoServer().GetTablet(ctx, worker.superset.alias) if err != nil { return err } shortCtx, cancel = context.WithTimeout(ctx, *remoteActionsTimeout) err = worker.wr.TabletManagerClient().StopSlave(shortCtx, supersetTablet) cancel() if err != nil { return fmt.Errorf("Cannot stop slave %v: %v", worker.superset.alias, err) } // change the cleaner actions from ChangeSlaveType(rdonly) // to StartSlave() + ChangeSlaveType(spare) wrangler.RecordStartSlaveAction(worker.cleaner, supersetTablet) action, err = wrangler.FindChangeSlaveTypeActionByTarget(worker.cleaner, worker.superset.alias) if err != nil { return fmt.Errorf("cannot find ChangeSlaveType action for %v: %v", worker.superset.alias, err) } action.TabletType = topo.TYPE_SPARE return nil }
// synchronizeReplication phase: // 1 - ask the subset slave to stop replication // 2 - sleep for 5 seconds // 3 - ask the superset slave to stop replication // Note this is not 100% correct, but good enough for now func (worker *SQLDiffWorker) synchronizeReplication() error { worker.setState(SQLDiffSynchronizeReplication) // stop replication on subset slave worker.wr.Logger().Infof("Stopping replication on subset slave %v", worker.subset.alias) subsetTablet, err := worker.wr.TopoServer().GetTablet(worker.subset.alias) if err != nil { return err } if err := worker.wr.ActionInitiator().StopSlave(subsetTablet, 30*time.Second); err != nil { return fmt.Errorf("Cannot stop slave %v: %v", worker.subset.alias, err) } if worker.CheckInterrupted() { return topo.ErrInterrupted } // change the cleaner actions from ChangeSlaveType(rdonly) // to StartSlave() + ChangeSlaveType(spare) wrangler.RecordStartSlaveAction(worker.cleaner, worker.subset.alias, 30*time.Second) action, err := wrangler.FindChangeSlaveTypeActionByTarget(worker.cleaner, worker.subset.alias) if err != nil { return fmt.Errorf("cannot find ChangeSlaveType action for %v: %v", worker.subset.alias, err) } action.TabletType = topo.TYPE_SPARE // sleep for a few seconds time.Sleep(5 * time.Second) if worker.CheckInterrupted() { return topo.ErrInterrupted } // stop replication on superset slave worker.wr.Logger().Infof("Stopping replication on superset slave %v", worker.superset.alias) supersetTablet, err := worker.wr.TopoServer().GetTablet(worker.superset.alias) if err != nil { return err } if err := worker.wr.ActionInitiator().StopSlave(supersetTablet, 30*time.Second); err != nil { return fmt.Errorf("Cannot stop slave %v: %v", worker.superset.alias, err) } // change the cleaner actions from ChangeSlaveType(rdonly) // to StartSlave() + ChangeSlaveType(spare) wrangler.RecordStartSlaveAction(worker.cleaner, worker.superset.alias, 30*time.Second) action, err = wrangler.FindChangeSlaveTypeActionByTarget(worker.cleaner, worker.superset.alias) if err != nil { return fmt.Errorf("cannot find ChangeSlaveType action for %v: %v", worker.superset.alias, err) } action.TabletType = topo.TYPE_SPARE return nil }
// findTargets phase: // - find one rdonly in the source shard // - mark it as 'worker' pointing back to us // - get the aliases of all the targets func (vscw *VerticalSplitCloneWorker) findTargets(ctx context.Context) error { vscw.setState(WorkerStateFindTargets) // find an appropriate endpoint in the source shard var err error vscw.sourceAlias, err = FindWorkerTablet(ctx, vscw.wr, vscw.cleaner, vscw.cell, vscw.sourceKeyspace, "0") if err != nil { return fmt.Errorf("FindWorkerTablet() failed for %v/%v/0: %v", vscw.cell, vscw.sourceKeyspace, err) } vscw.wr.Logger().Infof("Using tablet %v as the source", topo.TabletAliasString(vscw.sourceAlias)) // get the tablet info for it vscw.sourceTablet, err = vscw.wr.TopoServer().GetTablet(ctx, vscw.sourceAlias) if err != nil { return fmt.Errorf("cannot read tablet %v: %v", topo.TabletAliasString(vscw.sourceAlias), err) } // stop replication on it shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout) err = vscw.wr.TabletManagerClient().StopSlave(shortCtx, vscw.sourceTablet) cancel() if err != nil { return fmt.Errorf("cannot stop replication on tablet %v", topo.TabletAliasString(vscw.sourceAlias)) } wrangler.RecordStartSlaveAction(vscw.cleaner, vscw.sourceTablet) action, err := wrangler.FindChangeSlaveTypeActionByTarget(vscw.cleaner, vscw.sourceAlias) if err != nil { return fmt.Errorf("cannot find ChangeSlaveType action for %v: %v", topo.TabletAliasString(vscw.sourceAlias), err) } action.TabletType = pb.TabletType_SPARE return vscw.ResolveDestinationMasters(ctx) }
// findTargets phase: // - find one rdonly in the source shard // - mark it as 'checker' pointing back to us // - get the aliases of all the targets func (vscw *VerticalSplitCloneWorker) findTargets() error { vscw.setState(stateVSCFindTargets) // find an appropriate endpoint in the source shard var err error vscw.sourceAlias, err = findChecker(vscw.wr, vscw.cleaner, vscw.cell, vscw.sourceKeyspace, "0") if err != nil { return fmt.Errorf("cannot find checker for %v/%v/0: %v", vscw.cell, vscw.sourceKeyspace, err) } vscw.wr.Logger().Infof("Using tablet %v as the source", vscw.sourceAlias) // get the tablet info for it vscw.sourceTablet, err = vscw.wr.TopoServer().GetTablet(vscw.sourceAlias) if err != nil { return fmt.Errorf("cannot read tablet %v: %v", vscw.sourceTablet, err) } // stop replication on it if err := vscw.wr.TabletManagerClient().StopSlave(vscw.sourceTablet, 30*time.Second); err != nil { return fmt.Errorf("cannot stop replication on tablet %v", vscw.sourceAlias) } wrangler.RecordStartSlaveAction(vscw.cleaner, vscw.sourceTablet, 30*time.Second) action, err := wrangler.FindChangeSlaveTypeActionByTarget(vscw.cleaner, vscw.sourceAlias) if err != nil { return fmt.Errorf("cannot find ChangeSlaveType action for %v: %v", vscw.sourceAlias, err) } action.TabletType = topo.TYPE_SPARE // find all the targets in the destination keyspace / shard vscw.destinationAliases, err = topo.FindAllTabletAliasesInShard(vscw.wr.TopoServer(), vscw.destinationKeyspace, vscw.destinationShard) if err != nil { return fmt.Errorf("cannot find all target tablets in %v/%v: %v", vscw.destinationKeyspace, vscw.destinationShard, err) } vscw.wr.Logger().Infof("Found %v target aliases", len(vscw.destinationAliases)) // get the TabletInfo for all targets vscw.destinationTablets, err = topo.GetTabletMap(vscw.wr.TopoServer(), vscw.destinationAliases) if err != nil { return fmt.Errorf("cannot read all target tablets in %v/%v: %v", vscw.destinationKeyspace, vscw.destinationShard, err) } // find and validate the master for tabletAlias, ti := range vscw.destinationTablets { if ti.Type == topo.TYPE_MASTER { if vscw.destinationMasterAlias.IsZero() { vscw.destinationMasterAlias = tabletAlias } else { return fmt.Errorf("multiple masters in destination shard: %v and %v at least", vscw.destinationMasterAlias, tabletAlias) } } } if vscw.destinationMasterAlias.IsZero() { return fmt.Errorf("no master in destination shard") } return nil }
// findTargets phase: // - find one rdonly in the source shard // - mark it as 'worker' pointing back to us // - get the aliases of all the targets func (scw *SplitCloneWorker) findTargets(ctx context.Context) error { scw.setState(WorkerStateFindTargets) var err error // find an appropriate endpoint in the source shards scw.sourceAliases = make([]*pb.TabletAlias, len(scw.sourceShards)) for i, si := range scw.sourceShards { scw.sourceAliases[i], err = FindWorkerTablet(ctx, scw.wr, scw.cleaner, scw.cell, si.Keyspace(), si.ShardName()) if err != nil { return fmt.Errorf("FindWorkerTablet() failed for %v/%v/%v: %v", scw.cell, si.Keyspace(), si.ShardName(), err) } scw.wr.Logger().Infof("Using tablet %v as source for %v/%v", topoproto.TabletAliasString(scw.sourceAliases[i]), si.Keyspace(), si.ShardName()) } // get the tablet info for them, and stop their replication scw.sourceTablets = make([]*topo.TabletInfo, len(scw.sourceAliases)) for i, alias := range scw.sourceAliases { shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout) scw.sourceTablets[i], err = scw.wr.TopoServer().GetTablet(shortCtx, alias) cancel() if err != nil { return fmt.Errorf("cannot read tablet %v: %v", topoproto.TabletAliasString(alias), err) } shortCtx, cancel = context.WithTimeout(ctx, *remoteActionsTimeout) err := scw.wr.TabletManagerClient().StopSlave(shortCtx, scw.sourceTablets[i]) cancel() if err != nil { return fmt.Errorf("cannot stop replication on tablet %v", topoproto.TabletAliasString(alias)) } wrangler.RecordStartSlaveAction(scw.cleaner, scw.sourceTablets[i]) action, err := wrangler.FindChangeSlaveTypeActionByTarget(scw.cleaner, alias) if err != nil { return fmt.Errorf("cannot find ChangeSlaveType action for %v: %v", topoproto.TabletAliasString(alias), err) } action.TabletType = pb.TabletType_SPARE } return scw.ResolveDestinationMasters(ctx) }
// findOfflineSourceTablets phase: // - find one rdonly in the source shard // - mark it as 'worker' pointing back to us // - get the aliases of all the source tablets func (scw *SplitCloneWorker) findOfflineSourceTablets(ctx context.Context) error { scw.setState(WorkerStateFindTargets) // find an appropriate tablet in the source shards scw.offlineSourceAliases = make([]*topodatapb.TabletAlias, len(scw.sourceShards)) for i, si := range scw.sourceShards { var err error scw.offlineSourceAliases[i], err = FindWorkerTablet(ctx, scw.wr, scw.cleaner, scw.tsc, scw.cell, si.Keyspace(), si.ShardName(), scw.minHealthyRdonlyTablets) if err != nil { return fmt.Errorf("FindWorkerTablet() failed for %v/%v/%v: %v", scw.cell, si.Keyspace(), si.ShardName(), err) } scw.wr.Logger().Infof("Using tablet %v as source for %v/%v", topoproto.TabletAliasString(scw.offlineSourceAliases[i]), si.Keyspace(), si.ShardName()) } scw.setFormattedOfflineSources(scw.offlineSourceAliases) // get the tablet info for them, and stop their replication scw.sourceTablets = make([]*topodatapb.Tablet, len(scw.offlineSourceAliases)) for i, alias := range scw.offlineSourceAliases { shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout) ti, err := scw.wr.TopoServer().GetTablet(shortCtx, alias) cancel() if err != nil { return fmt.Errorf("cannot read tablet %v: %v", topoproto.TabletAliasString(alias), err) } scw.sourceTablets[i] = ti.Tablet shortCtx, cancel = context.WithTimeout(ctx, *remoteActionsTimeout) err = scw.wr.TabletManagerClient().StopSlave(shortCtx, scw.sourceTablets[i]) cancel() if err != nil { return fmt.Errorf("cannot stop replication on tablet %v", topoproto.TabletAliasString(alias)) } wrangler.RecordStartSlaveAction(scw.cleaner, scw.sourceTablets[i]) } return nil }
func (sdw *SplitDiffWorker) synchronizeReplication(ctx context.Context) error { sdw.SetState(WorkerStateSyncReplication) masterInfo, err := sdw.wr.TopoServer().GetTablet(ctx, sdw.shardInfo.MasterAlias) if err != nil { return fmt.Errorf("synchronizeReplication: cannot get Tablet record for master %v: %v", sdw.shardInfo.MasterAlias, err) } // 1 - stop the master binlog replication, get its current position sdw.wr.Logger().Infof("Stopping master binlog replication on %v", sdw.shardInfo.MasterAlias) shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout) blpPositionList, err := sdw.wr.TabletManagerClient().StopBlp(shortCtx, masterInfo) cancel() if err != nil { return fmt.Errorf("StopBlp for %v failed: %v", sdw.shardInfo.MasterAlias, err) } wrangler.RecordStartBlpAction(sdw.cleaner, masterInfo) // 2 - stop all the source tablets at a binlog position // higher than the destination master stopPositionList := blproto.BlpPositionList{ Entries: make([]blproto.BlpPosition, len(sdw.shardInfo.SourceShards)), } for i, ss := range sdw.shardInfo.SourceShards { // find where we should be stopping blpPos, err := blpPositionList.FindBlpPositionById(ss.Uid) if err != nil { return fmt.Errorf("no binlog position on the master for Uid %v", ss.Uid) } // read the tablet sourceTablet, err := sdw.wr.TopoServer().GetTablet(ctx, sdw.sourceAliases[i]) if err != nil { return err } // stop replication sdw.wr.Logger().Infof("Stopping slave[%v] %v at a minimum of %v", i, sdw.sourceAliases[i], blpPos.Position) shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout) stoppedAt, err := sdw.wr.TabletManagerClient().StopSlaveMinimum(shortCtx, sourceTablet, blpPos.Position, *remoteActionsTimeout) cancel() if err != nil { return fmt.Errorf("cannot stop slave %v at right binlog position %v: %v", sdw.sourceAliases[i], blpPos.Position, err) } stopPositionList.Entries[i].Uid = ss.Uid stopPositionList.Entries[i].Position = stoppedAt // change the cleaner actions from ChangeSlaveType(rdonly) // to StartSlave() + ChangeSlaveType(spare) wrangler.RecordStartSlaveAction(sdw.cleaner, sourceTablet) action, err := wrangler.FindChangeSlaveTypeActionByTarget(sdw.cleaner, sdw.sourceAliases[i]) if err != nil { return fmt.Errorf("cannot find ChangeSlaveType action for %v: %v", sdw.sourceAliases[i], err) } action.TabletType = pb.TabletType_SPARE } // 3 - ask the master of the destination shard to resume filtered // replication up to the new list of positions sdw.wr.Logger().Infof("Restarting master %v until it catches up to %v", sdw.shardInfo.MasterAlias, stopPositionList) shortCtx, cancel = context.WithTimeout(ctx, *remoteActionsTimeout) masterPos, err := sdw.wr.TabletManagerClient().RunBlpUntil(shortCtx, masterInfo, &stopPositionList, *remoteActionsTimeout) cancel() if err != nil { return fmt.Errorf("RunBlpUntil for %v until %v failed: %v", sdw.shardInfo.MasterAlias, stopPositionList, err) } // 4 - wait until the destination tablet is equal or passed // that master binlog position, and stop its replication. sdw.wr.Logger().Infof("Waiting for destination tablet %v to catch up to %v", sdw.destinationAlias, masterPos) destinationTablet, err := sdw.wr.TopoServer().GetTablet(ctx, sdw.destinationAlias) if err != nil { return err } shortCtx, cancel = context.WithTimeout(ctx, *remoteActionsTimeout) _, err = sdw.wr.TabletManagerClient().StopSlaveMinimum(shortCtx, destinationTablet, masterPos, *remoteActionsTimeout) cancel() if err != nil { return fmt.Errorf("StopSlaveMinimum for %v at %v failed: %v", sdw.destinationAlias, masterPos, err) } wrangler.RecordStartSlaveAction(sdw.cleaner, destinationTablet) action, err := wrangler.FindChangeSlaveTypeActionByTarget(sdw.cleaner, sdw.destinationAlias) if err != nil { return fmt.Errorf("cannot find ChangeSlaveType action for %v: %v", sdw.destinationAlias, err) } action.TabletType = pb.TabletType_SPARE // 5 - restart filtered replication on destination master sdw.wr.Logger().Infof("Restarting filtered replication on master %v", sdw.shardInfo.MasterAlias) shortCtx, cancel = context.WithTimeout(ctx, *remoteActionsTimeout) err = sdw.wr.TabletManagerClient().StartBlp(shortCtx, masterInfo) if err := sdw.cleaner.RemoveActionByName(wrangler.StartBlpActionName, topo.TabletAliasString(sdw.shardInfo.MasterAlias)); err != nil { sdw.wr.Logger().Warningf("Cannot find cleaning action %v/%v: %v", wrangler.StartBlpActionName, topo.TabletAliasString(sdw.shardInfo.MasterAlias), err) } cancel() if err != nil { return fmt.Errorf("StartBlp failed for %v: %v", sdw.shardInfo.MasterAlias, err) } return nil }
// findTargets phase: // - find one rdonly in the source shard // - mark it as 'worker' pointing back to us // - get the aliases of all the targets func (vscw *VerticalSplitCloneWorker) findTargets(ctx context.Context) error { vscw.setState(WorkerStateFindTargets) // find an appropriate tablet in the source shard var err error vscw.sourceAlias, err = FindWorkerTablet(ctx, vscw.wr, vscw.cleaner, nil /* tsc */, vscw.cell, vscw.sourceKeyspace, "0", vscw.minHealthyRdonlyTablets) if err != nil { return fmt.Errorf("FindWorkerTablet() failed for %v/%v/0: %v", vscw.cell, vscw.sourceKeyspace, err) } vscw.wr.Logger().Infof("Using tablet %v as the source", topoproto.TabletAliasString(vscw.sourceAlias)) // get the tablet info for it shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout) ti, err := vscw.wr.TopoServer().GetTablet(shortCtx, vscw.sourceAlias) cancel() if err != nil { return fmt.Errorf("cannot read tablet %v: %v", topoproto.TabletAliasString(vscw.sourceAlias), err) } vscw.sourceTablet = ti.Tablet // stop replication on it shortCtx, cancel = context.WithTimeout(ctx, *remoteActionsTimeout) err = vscw.wr.TabletManagerClient().StopSlave(shortCtx, vscw.sourceTablet) cancel() if err != nil { return fmt.Errorf("cannot stop replication on tablet %v", topoproto.TabletAliasString(vscw.sourceAlias)) } wrangler.RecordStartSlaveAction(vscw.cleaner, vscw.sourceTablet) // Initialize healthcheck and add destination shards to it. vscw.healthCheck = discovery.NewHealthCheck(*remoteActionsTimeout, *healthcheckRetryDelay, *healthCheckTimeout) vscw.tsc = discovery.NewTabletStatsCache(vscw.healthCheck, vscw.cell) watcher := discovery.NewShardReplicationWatcher(vscw.wr.TopoServer(), vscw.healthCheck, vscw.cell, vscw.destinationKeyspace, vscw.destinationShard, *healthCheckTopologyRefresh, discovery.DefaultTopoReadConcurrency) vscw.destinationShardWatchers = append(vscw.destinationShardWatchers, watcher) // Make sure we find a master for each destination shard and log it. vscw.wr.Logger().Infof("Finding a MASTER tablet for each destination shard...") waitCtx, waitCancel := context.WithTimeout(ctx, *waitForHealthyTabletsTimeout) defer waitCancel() if err := vscw.tsc.WaitForTablets(waitCtx, vscw.cell, vscw.destinationKeyspace, vscw.destinationShard, []topodatapb.TabletType{topodatapb.TabletType_MASTER}); err != nil { return fmt.Errorf("cannot find MASTER tablet for destination shard for %v/%v (in cell: %v): %v", vscw.destinationKeyspace, vscw.destinationShard, vscw.cell, err) } masters := vscw.tsc.GetHealthyTabletStats(vscw.destinationKeyspace, vscw.destinationShard, topodatapb.TabletType_MASTER) if len(masters) == 0 { return fmt.Errorf("cannot find MASTER tablet for destination shard for %v/%v (in cell: %v) in HealthCheck: empty TabletStats list", vscw.destinationKeyspace, vscw.destinationShard, vscw.cell) } master := masters[0] // Get the MySQL database name of the tablet. keyspaceAndShard := topoproto.KeyspaceShardString(vscw.destinationKeyspace, vscw.destinationShard) vscw.destinationDbNames[keyspaceAndShard] = topoproto.TabletDbName(master.Tablet) // TODO(mberlin): Verify on the destination master that the // _vt.blp_checkpoint table has the latest schema. vscw.wr.Logger().Infof("Using tablet %v as destination master for %v/%v", topoproto.TabletAliasString(master.Tablet.Alias), vscw.destinationKeyspace, vscw.destinationShard) vscw.wr.Logger().Infof("NOTE: The used master of a destination shard might change over the course of the copy e.g. due to a reparent. The HealthCheck module will track and log master changes and any error message will always refer the actually used master address.") return nil }
func (vsdw *VerticalSplitDiffWorker) synchronizeReplication() error { vsdw.setState(stateVSDSynchronizeReplication) // 1 - stop the master binlog replication, get its current position vsdw.wr.Logger().Infof("Stopping master binlog replication on %v", vsdw.shardInfo.MasterAlias) blpPositionList, err := vsdw.wr.ActionInitiator().StopBlp(vsdw.shardInfo.MasterAlias, 30*time.Second) if err != nil { return fmt.Errorf("StopBlp on master %v failed: %v", vsdw.shardInfo.MasterAlias, err) } wrangler.RecordStartBlpAction(vsdw.cleaner, vsdw.shardInfo.MasterAlias, 30*time.Second) // 2 - stop the source 'checker' at a binlog position // higher than the destination master stopPositionList := blproto.BlpPositionList{ Entries: make([]blproto.BlpPosition, 1), } ss := vsdw.shardInfo.SourceShards[0] // find where we should be stopping pos, err := blpPositionList.FindBlpPositionById(ss.Uid) if err != nil { return fmt.Errorf("no binlog position on the master for Uid %v", ss.Uid) } // stop replication vsdw.wr.Logger().Infof("Stopping slave %v at a minimum of %v", vsdw.sourceAlias, pos.Position) stoppedAt, err := vsdw.wr.ActionInitiator().StopSlaveMinimum(vsdw.sourceAlias, pos.Position, 30*time.Second) if err != nil { return fmt.Errorf("cannot stop slave %v at right binlog position %v: %v", vsdw.sourceAlias, pos.Position, err) } stopPositionList.Entries[0].Uid = ss.Uid stopPositionList.Entries[0].Position = stoppedAt.Position // change the cleaner actions from ChangeSlaveType(rdonly) // to StartSlave() + ChangeSlaveType(spare) wrangler.RecordStartSlaveAction(vsdw.cleaner, vsdw.sourceAlias, 30*time.Second) action, err := wrangler.FindChangeSlaveTypeActionByTarget(vsdw.cleaner, vsdw.sourceAlias) if err != nil { return fmt.Errorf("cannot find ChangeSlaveType action for %v: %v", vsdw.sourceAlias, err) } action.TabletType = topo.TYPE_SPARE // 3 - ask the master of the destination shard to resume filtered // replication up to the new list of positions vsdw.wr.Logger().Infof("Restarting master %v until it catches up to %v", vsdw.shardInfo.MasterAlias, stopPositionList) masterPos, err := vsdw.wr.ActionInitiator().RunBlpUntil(vsdw.shardInfo.MasterAlias, &stopPositionList, 30*time.Second) if err != nil { return fmt.Errorf("RunBlpUntil on %v until %v failed: %v", vsdw.shardInfo.MasterAlias, stopPositionList, err) } // 4 - wait until the destination checker is equal or passed // that master binlog position, and stop its replication. vsdw.wr.Logger().Infof("Waiting for destination checker %v to catch up to %v", vsdw.destinationAlias, masterPos) _, err = vsdw.wr.ActionInitiator().StopSlaveMinimum(vsdw.destinationAlias, masterPos, 30*time.Second) if err != nil { return fmt.Errorf("StopSlaveMinimum on %v at %v failed: %v", vsdw.destinationAlias, masterPos, err) } wrangler.RecordStartSlaveAction(vsdw.cleaner, vsdw.destinationAlias, 30*time.Second) action, err = wrangler.FindChangeSlaveTypeActionByTarget(vsdw.cleaner, vsdw.destinationAlias) if err != nil { return fmt.Errorf("cannot find ChangeSlaveType action for %v: %v", vsdw.destinationAlias, err) } action.TabletType = topo.TYPE_SPARE // 5 - restart filtered replication on destination master vsdw.wr.Logger().Infof("Restarting filtered replication on master %v", vsdw.shardInfo.MasterAlias) err = vsdw.wr.ActionInitiator().StartBlp(vsdw.shardInfo.MasterAlias, 30*time.Second) if err := vsdw.cleaner.RemoveActionByName(wrangler.StartBlpActionName, vsdw.shardInfo.MasterAlias.String()); err != nil { vsdw.wr.Logger().Warningf("Cannot find cleaning action %v/%v: %v", wrangler.StartBlpActionName, vsdw.shardInfo.MasterAlias.String(), err) } if err != nil { return fmt.Errorf("StartBlp on %v failed: %v", vsdw.shardInfo.MasterAlias, err) } return nil }
// findTargets phase: // - find one rdonly in the source shard // - mark it as 'worker' pointing back to us // - get the aliases of all the targets func (scw *LegacySplitCloneWorker) findTargets(ctx context.Context) error { scw.setState(WorkerStateFindTargets) var err error // find an appropriate tablet in the source shards scw.sourceAliases = make([]*topodatapb.TabletAlias, len(scw.sourceShards)) for i, si := range scw.sourceShards { scw.sourceAliases[i], err = FindWorkerTablet(ctx, scw.wr, scw.cleaner, scw.tsc, scw.cell, si.Keyspace(), si.ShardName(), scw.minHealthyRdonlyTablets) if err != nil { return fmt.Errorf("FindWorkerTablet() failed for %v/%v/%v: %v", scw.cell, si.Keyspace(), si.ShardName(), err) } scw.wr.Logger().Infof("Using tablet %v as source for %v/%v", topoproto.TabletAliasString(scw.sourceAliases[i]), si.Keyspace(), si.ShardName()) } // get the tablet info for them, and stop their replication scw.sourceTablets = make([]*topodatapb.Tablet, len(scw.sourceAliases)) for i, alias := range scw.sourceAliases { shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout) ti, err := scw.wr.TopoServer().GetTablet(shortCtx, alias) cancel() if err != nil { return fmt.Errorf("cannot read tablet %v: %v", topoproto.TabletAliasString(alias), err) } scw.sourceTablets[i] = ti.Tablet shortCtx, cancel = context.WithTimeout(ctx, *remoteActionsTimeout) err = scw.wr.TabletManagerClient().StopSlave(shortCtx, scw.sourceTablets[i]) cancel() if err != nil { return fmt.Errorf("cannot stop replication on tablet %v", topoproto.TabletAliasString(alias)) } wrangler.RecordStartSlaveAction(scw.cleaner, scw.sourceTablets[i]) } // Initialize healthcheck and add destination shards to it. scw.healthCheck = discovery.NewHealthCheck(*remoteActionsTimeout, *healthcheckRetryDelay, *healthCheckTimeout) scw.tsc = discovery.NewTabletStatsCache(scw.healthCheck, scw.cell) for _, si := range scw.destinationShards { watcher := discovery.NewShardReplicationWatcher(scw.wr.TopoServer(), scw.healthCheck, scw.cell, si.Keyspace(), si.ShardName(), *healthCheckTopologyRefresh, discovery.DefaultTopoReadConcurrency) scw.destinationShardWatchers = append(scw.destinationShardWatchers, watcher) } // Make sure we find a master for each destination shard and log it. scw.wr.Logger().Infof("Finding a MASTER tablet for each destination shard...") for _, si := range scw.destinationShards { waitCtx, waitCancel := context.WithTimeout(ctx, 10*time.Second) defer waitCancel() if err := scw.tsc.WaitForTablets(waitCtx, scw.cell, si.Keyspace(), si.ShardName(), []topodatapb.TabletType{topodatapb.TabletType_MASTER}); err != nil { return fmt.Errorf("cannot find MASTER tablet for destination shard for %v/%v: %v", si.Keyspace(), si.ShardName(), err) } masters := scw.tsc.GetHealthyTabletStats(si.Keyspace(), si.ShardName(), topodatapb.TabletType_MASTER) if len(masters) == 0 { return fmt.Errorf("cannot find MASTER tablet for destination shard for %v/%v in HealthCheck: empty TabletStats list", si.Keyspace(), si.ShardName()) } master := masters[0] // Get the MySQL database name of the tablet. shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout) ti, err := scw.wr.TopoServer().GetTablet(shortCtx, master.Tablet.Alias) cancel() if err != nil { return fmt.Errorf("cannot get the TabletInfo for destination master (%v) to find out its db name: %v", topoproto.TabletAliasString(master.Tablet.Alias), err) } keyspaceAndShard := topoproto.KeyspaceShardString(si.Keyspace(), si.ShardName()) scw.destinationDbNames[keyspaceAndShard] = ti.DbName() // TODO(mberlin): Verify on the destination master that the // _vt.blp_checkpoint table has the latest schema. scw.wr.Logger().Infof("Using tablet %v as destination master for %v/%v", topoproto.TabletAliasString(master.Tablet.Alias), si.Keyspace(), si.ShardName()) } scw.wr.Logger().Infof("NOTE: The used master of a destination shard might change over the course of the copy e.g. due to a reparent. The HealthCheck module will track and log master changes and any error message will always refer the actually used master address.") // Set up the throttler for each destination shard. for _, si := range scw.destinationShards { keyspaceAndShard := topoproto.KeyspaceShardString(si.Keyspace(), si.ShardName()) t, err := throttler.NewThrottler( keyspaceAndShard, "transactions", scw.destinationWriterCount, scw.maxTPS, throttler.ReplicationLagModuleDisabled) if err != nil { return fmt.Errorf("cannot instantiate throttler: %v", err) } scw.destinationThrottlers[keyspaceAndShard] = t } return nil }
func (vsdw *VerticalSplitDiffWorker) synchronizeReplication(ctx context.Context) error { vsdw.SetState(WorkerStateSyncReplication) shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout) masterInfo, err := vsdw.wr.TopoServer().GetTablet(shortCtx, vsdw.shardInfo.MasterAlias) cancel() if err != nil { return fmt.Errorf("synchronizeReplication: cannot get Tablet record for master %v: %v", topoproto.TabletAliasString(vsdw.shardInfo.MasterAlias), err) } // 1 - stop the master binlog replication, get its current position vsdw.wr.Logger().Infof("Stopping master binlog replication on %v", topoproto.TabletAliasString(vsdw.shardInfo.MasterAlias)) shortCtx, cancel = context.WithTimeout(ctx, *remoteActionsTimeout) blpPositionList, err := vsdw.wr.TabletManagerClient().StopBlp(shortCtx, masterInfo) cancel() if err != nil { return fmt.Errorf("StopBlp on master %v failed: %v", topoproto.TabletAliasString(vsdw.shardInfo.MasterAlias), err) } wrangler.RecordStartBlpAction(vsdw.cleaner, masterInfo) // 2 - stop the source tablet at a binlog position // higher than the destination master stopPositionList := make([]*tabletmanagerdatapb.BlpPosition, 1) ss := vsdw.shardInfo.SourceShards[0] // find where we should be stopping blpPos := tmutils.FindBlpPositionByID(blpPositionList, ss.Uid) if blpPos == nil { return fmt.Errorf("no binlog position on the master for Uid %v", ss.Uid) } // stop replication vsdw.wr.Logger().Infof("Stopping slave %v at a minimum of %v", topoproto.TabletAliasString(vsdw.sourceAlias), blpPos.Position) shortCtx, cancel = context.WithTimeout(ctx, *remoteActionsTimeout) sourceTablet, err := vsdw.wr.TopoServer().GetTablet(shortCtx, vsdw.sourceAlias) cancel() if err != nil { return err } shortCtx, cancel = context.WithTimeout(ctx, *remoteActionsTimeout) stoppedAt, err := vsdw.wr.TabletManagerClient().StopSlaveMinimum(shortCtx, sourceTablet, blpPos.Position, *remoteActionsTimeout) cancel() if err != nil { return fmt.Errorf("cannot stop slave %v at right binlog position %v: %v", topoproto.TabletAliasString(vsdw.sourceAlias), blpPos.Position, err) } stopPositionList[0] = &tabletmanagerdatapb.BlpPosition{ Uid: ss.Uid, Position: stoppedAt, } // change the cleaner actions from ChangeSlaveType(rdonly) // to StartSlave() + ChangeSlaveType(spare) wrangler.RecordStartSlaveAction(vsdw.cleaner, sourceTablet) action, err := wrangler.FindChangeSlaveTypeActionByTarget(vsdw.cleaner, vsdw.sourceAlias) if err != nil { return fmt.Errorf("cannot find ChangeSlaveType action for %v: %v", topoproto.TabletAliasString(vsdw.sourceAlias), err) } action.TabletType = topodatapb.TabletType_SPARE // 3 - ask the master of the destination shard to resume filtered // replication up to the new list of positions vsdw.wr.Logger().Infof("Restarting master %v until it catches up to %v", topoproto.TabletAliasString(vsdw.shardInfo.MasterAlias), stopPositionList) shortCtx, cancel = context.WithTimeout(ctx, *remoteActionsTimeout) masterPos, err := vsdw.wr.TabletManagerClient().RunBlpUntil(shortCtx, masterInfo, stopPositionList, *remoteActionsTimeout) cancel() if err != nil { return fmt.Errorf("RunBlpUntil on %v until %v failed: %v", topoproto.TabletAliasString(vsdw.shardInfo.MasterAlias), stopPositionList, err) } // 4 - wait until the destination tablet is equal or passed // that master binlog position, and stop its replication. vsdw.wr.Logger().Infof("Waiting for destination tablet %v to catch up to %v", topoproto.TabletAliasString(vsdw.destinationAlias), masterPos) shortCtx, cancel = context.WithTimeout(ctx, *remoteActionsTimeout) destinationTablet, err := vsdw.wr.TopoServer().GetTablet(shortCtx, vsdw.destinationAlias) cancel() if err != nil { return err } shortCtx, cancel = context.WithTimeout(ctx, *remoteActionsTimeout) _, err = vsdw.wr.TabletManagerClient().StopSlaveMinimum(shortCtx, destinationTablet, masterPos, *remoteActionsTimeout) cancel() if err != nil { return fmt.Errorf("StopSlaveMinimum on %v at %v failed: %v", topoproto.TabletAliasString(vsdw.destinationAlias), masterPos, err) } wrangler.RecordStartSlaveAction(vsdw.cleaner, destinationTablet) action, err = wrangler.FindChangeSlaveTypeActionByTarget(vsdw.cleaner, vsdw.destinationAlias) if err != nil { return fmt.Errorf("cannot find ChangeSlaveType action for %v: %v", topoproto.TabletAliasString(vsdw.destinationAlias), err) } action.TabletType = topodatapb.TabletType_SPARE // 5 - restart filtered replication on destination master vsdw.wr.Logger().Infof("Restarting filtered replication on master %v", topoproto.TabletAliasString(vsdw.shardInfo.MasterAlias)) shortCtx, cancel = context.WithTimeout(ctx, *remoteActionsTimeout) err = vsdw.wr.TabletManagerClient().StartBlp(shortCtx, masterInfo) if err := vsdw.cleaner.RemoveActionByName(wrangler.StartBlpActionName, topoproto.TabletAliasString(vsdw.shardInfo.MasterAlias)); err != nil { vsdw.wr.Logger().Warningf("Cannot find cleaning action %v/%v: %v", wrangler.StartBlpActionName, topoproto.TabletAliasString(vsdw.shardInfo.MasterAlias), err) } cancel() if err != nil { return fmt.Errorf("StartBlp on %v failed: %v", topoproto.TabletAliasString(vsdw.shardInfo.MasterAlias), err) } return nil }
// findTargets phase: // - find one rdonly in the source shard // - mark it as 'checker' pointing back to us // - get the aliases of all the targets func (scw *SplitCloneWorker) findTargets() error { scw.setState(stateSCFindTargets) var err error // find an appropriate endpoint in the source shards scw.sourceAliases = make([]topo.TabletAlias, len(scw.sourceShards)) for i, si := range scw.sourceShards { scw.sourceAliases[i], err = findChecker(scw.wr, scw.cleaner, scw.cell, si.Keyspace(), si.ShardName()) if err != nil { return fmt.Errorf("cannot find checker for %v/%v/%v: %v", scw.cell, si.Keyspace(), si.ShardName(), err) } scw.wr.Logger().Infof("Using tablet %v as source for %v/%v", scw.sourceAliases[i], si.Keyspace(), si.ShardName()) } // get the tablet info for them, and stop their replication scw.sourceTablets = make([]*topo.TabletInfo, len(scw.sourceAliases)) for i, alias := range scw.sourceAliases { scw.sourceTablets[i], err = scw.wr.TopoServer().GetTablet(alias) if err != nil { return fmt.Errorf("cannot read tablet %v: %v", alias, err) } if err := scw.wr.TabletManagerClient().StopSlave(scw.sourceTablets[i], 30*time.Second); err != nil { return fmt.Errorf("cannot stop replication on tablet %v", alias) } wrangler.RecordStartSlaveAction(scw.cleaner, scw.sourceTablets[i], 30*time.Second) action, err := wrangler.FindChangeSlaveTypeActionByTarget(scw.cleaner, alias) if err != nil { return fmt.Errorf("cannot find ChangeSlaveType action for %v: %v", alias, err) } action.TabletType = topo.TYPE_SPARE } // find all the targets in the destination shards scw.destinationAliases = make([][]topo.TabletAlias, len(scw.destinationShards)) scw.destinationTablets = make([]map[topo.TabletAlias]*topo.TabletInfo, len(scw.destinationShards)) scw.destinationMasterAliases = make([]topo.TabletAlias, len(scw.destinationShards)) for shardIndex, si := range scw.destinationShards { scw.destinationAliases[shardIndex], err = topo.FindAllTabletAliasesInShard(scw.wr.TopoServer(), si.Keyspace(), si.ShardName()) if err != nil { return fmt.Errorf("cannot find all target tablets in %v/%v: %v", si.Keyspace(), si.ShardName(), err) } scw.wr.Logger().Infof("Found %v target aliases in shard %v/%v", len(scw.destinationAliases[shardIndex]), si.Keyspace(), si.ShardName()) // get the TabletInfo for all targets scw.destinationTablets[shardIndex], err = topo.GetTabletMap(scw.wr.TopoServer(), scw.destinationAliases[shardIndex]) if err != nil { return fmt.Errorf("cannot read all target tablets in %v/%v: %v", si.Keyspace(), si.ShardName(), err) } // find and validate the master for tabletAlias, ti := range scw.destinationTablets[shardIndex] { if ti.Type == topo.TYPE_MASTER { if scw.destinationMasterAliases[shardIndex].IsZero() { scw.destinationMasterAliases[shardIndex] = tabletAlias } else { return fmt.Errorf("multiple masters in destination shard: %v and %v at least", scw.destinationMasterAliases[shardIndex], tabletAlias) } } } if scw.destinationMasterAliases[shardIndex].IsZero() { return fmt.Errorf("no master in destination shard") } } return nil }
func (sdw *SplitDiffWorker) synchronizeReplication() error { sdw.setState(stateSDSynchronizeReplication) // 1 - stop the master binlog replication, get its current position log.Infof("Stopping master binlog replication on %v", sdw.shardInfo.MasterAlias) blpPositionList, err := sdw.wr.ActionInitiator().StopBlp(sdw.shardInfo.MasterAlias, 30*time.Second) if err != nil { return err } wrangler.RecordStartBlpAction(sdw.cleaner, sdw.shardInfo.MasterAlias, 30*time.Second) // 2 - stop all the source 'checker' at a binlog position // higher than the destination master stopPositionList := myproto.BlpPositionList{ Entries: make([]myproto.BlpPosition, len(sdw.shardInfo.SourceShards)), } for i, ss := range sdw.shardInfo.SourceShards { // find where we should be stopping pos, err := blpPositionList.FindBlpPositionById(ss.Uid) if err != nil { return fmt.Errorf("No binlog position on the master for Uid %v", ss.Uid) } // stop replication log.Infof("Stopping slave[%v] %v at a minimum of %v", i, sdw.sourceAliases[i], pos.GroupId) stoppedAt, err := sdw.wr.ActionInitiator().StopSlaveMinimum(sdw.sourceAliases[i], pos.GroupId, 30*time.Second) if err != nil { return fmt.Errorf("Cannot stop slave %v at right binlog position %v: %v", sdw.sourceAliases[i], pos.GroupId, err) } stopPositionList.Entries[i].Uid = ss.Uid stopPositionList.Entries[i].GroupId = stoppedAt.MasterLogGroupId // change the cleaner actions from ChangeSlaveType(rdonly) // to StartSlave() + ChangeSlaveType(spare) wrangler.RecordStartSlaveAction(sdw.cleaner, sdw.sourceAliases[i], 30*time.Second) action, err := wrangler.FindChangeSlaveTypeActionByTarget(sdw.cleaner, sdw.sourceAliases[i]) if err != nil { return fmt.Errorf("cannot find ChangeSlaveType action for %v: %v", sdw.sourceAliases[i], err) } action.TabletType = topo.TYPE_SPARE } // 3 - ask the master of the destination shard to resume filtered // replication up to the new list of positions log.Infof("Restarting master %v until it catches up to %v", sdw.shardInfo.MasterAlias, stopPositionList) masterPos, err := sdw.wr.ActionInitiator().RunBlpUntil(sdw.shardInfo.MasterAlias, &stopPositionList, 30*time.Second) if err != nil { return err } // 4 - wait until the destination checker is equal or passed // that master binlog position, and stop its replication. log.Infof("Waiting for destination checker %v to catch up to %v", sdw.destinationAlias, masterPos.MasterLogGroupId) _, err = sdw.wr.ActionInitiator().StopSlaveMinimum(sdw.destinationAlias, masterPos.MasterLogGroupId, 30*time.Second) if err != nil { return err } wrangler.RecordStartSlaveAction(sdw.cleaner, sdw.destinationAlias, 30*time.Second) action, err := wrangler.FindChangeSlaveTypeActionByTarget(sdw.cleaner, sdw.destinationAlias) if err != nil { return fmt.Errorf("cannot find ChangeSlaveType action for %v: %v", sdw.destinationAlias, err) } action.TabletType = topo.TYPE_SPARE // 5 - restart filtered replication on destination master log.Infof("Restarting filtered replication on master %v", sdw.shardInfo.MasterAlias) err = sdw.wr.ActionInitiator().StartBlp(sdw.shardInfo.MasterAlias, 30*time.Second) if err := sdw.cleaner.RemoveActionByName(wrangler.StartBlpActionName, sdw.shardInfo.MasterAlias.String()); err != nil { log.Warningf("Cannot find cleaning action %v/%v: %v", wrangler.StartBlpActionName, sdw.shardInfo.MasterAlias.String(), err) } if err != nil { return err } return nil }