// RunUntil will run all the players until they reach the given position. // Holds the map lock during that exercise, shouldn't take long at all. func (blm *BinlogPlayerMap) RunUntil(ctx context.Context, blpPositionList []*tabletmanagerdatapb.BlpPosition, waitTimeout time.Duration) error { // lock and check state blm.mu.Lock() defer blm.mu.Unlock() if blm.state != BpmStateStopped { return fmt.Errorf("RunUntil: player not stopped: %v", blm.state) } log.Infof("Starting map of binlog players until position") // find the exact stop position for all players, to be sure // we're not doing anything wrong posMap := make(map[uint32]string) for _, bpc := range blm.players { blpPos := tmutils.FindBlpPositionByID(blpPositionList, bpc.sourceShard.Uid) if blpPos == nil { return fmt.Errorf("No binlog position passed in for player Uid %v", bpc.sourceShard.Uid) } posMap[bpc.sourceShard.Uid] = blpPos.Position } // start all the players giving them where to stop for _, bpc := range blm.players { if err := bpc.StartUntil(ctx, posMap[bpc.sourceShard.Uid]); err != nil { return err } } // wait for all players to be stopped, or timeout wg := sync.WaitGroup{} rec := concurrency.AllErrorRecorder{} for _, bpc := range blm.players { wg.Add(1) go func(bpc *BinlogPlayerController) { if err := bpc.WaitForStop(waitTimeout); err != nil { rec.RecordError(err) } wg.Done() }(bpc) } wg.Wait() return rec.Error() }
func (sdw *SplitDiffWorker) synchronizeReplication(ctx context.Context) error { sdw.SetState(WorkerStateSyncReplication) shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout) masterInfo, err := sdw.wr.TopoServer().GetTablet(shortCtx, sdw.shardInfo.MasterAlias) cancel() if err != nil { return fmt.Errorf("synchronizeReplication: cannot get Tablet record for master %v: %v", sdw.shardInfo.MasterAlias, err) } // 1 - stop the master binlog replication, get its current position sdw.wr.Logger().Infof("Stopping master binlog replication on %v", sdw.shardInfo.MasterAlias) shortCtx, cancel = context.WithTimeout(ctx, *remoteActionsTimeout) blpPositionList, err := sdw.wr.TabletManagerClient().StopBlp(shortCtx, masterInfo) cancel() if err != nil { return fmt.Errorf("StopBlp for %v failed: %v", sdw.shardInfo.MasterAlias, err) } wrangler.RecordStartBlpAction(sdw.cleaner, masterInfo) // 2 - stop all the source tablets at a binlog position // higher than the destination master stopPositionList := make([]*pbt.BlpPosition, len(sdw.shardInfo.SourceShards)) for i, ss := range sdw.shardInfo.SourceShards { // find where we should be stopping blpPos := tmutils.FindBlpPositionByID(blpPositionList, ss.Uid) if blpPos == nil { return fmt.Errorf("no binlog position on the master for Uid %v", ss.Uid) } // read the tablet shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout) sourceTablet, err := sdw.wr.TopoServer().GetTablet(shortCtx, sdw.sourceAliases[i]) cancel() if err != nil { return err } // stop replication sdw.wr.Logger().Infof("Stopping slave[%v] %v at a minimum of %v", i, sdw.sourceAliases[i], blpPos.Position) shortCtx, cancel = context.WithTimeout(ctx, *remoteActionsTimeout) stoppedAt, err := sdw.wr.TabletManagerClient().StopSlaveMinimum(shortCtx, sourceTablet, blpPos.Position, *remoteActionsTimeout) cancel() if err != nil { return fmt.Errorf("cannot stop slave %v at right binlog position %v: %v", sdw.sourceAliases[i], blpPos.Position, err) } stopPositionList[i] = &pbt.BlpPosition{ Uid: ss.Uid, Position: stoppedAt, } // change the cleaner actions from ChangeSlaveType(rdonly) // to StartSlave() + ChangeSlaveType(spare) wrangler.RecordStartSlaveAction(sdw.cleaner, sourceTablet) action, err := wrangler.FindChangeSlaveTypeActionByTarget(sdw.cleaner, sdw.sourceAliases[i]) if err != nil { return fmt.Errorf("cannot find ChangeSlaveType action for %v: %v", sdw.sourceAliases[i], err) } action.TabletType = pb.TabletType_SPARE } // 3 - ask the master of the destination shard to resume filtered // replication up to the new list of positions sdw.wr.Logger().Infof("Restarting master %v until it catches up to %v", sdw.shardInfo.MasterAlias, stopPositionList) shortCtx, cancel = context.WithTimeout(ctx, *remoteActionsTimeout) masterPos, err := sdw.wr.TabletManagerClient().RunBlpUntil(shortCtx, masterInfo, stopPositionList, *remoteActionsTimeout) cancel() if err != nil { return fmt.Errorf("RunBlpUntil for %v until %v failed: %v", sdw.shardInfo.MasterAlias, stopPositionList, err) } // 4 - wait until the destination tablet is equal or passed // that master binlog position, and stop its replication. sdw.wr.Logger().Infof("Waiting for destination tablet %v to catch up to %v", sdw.destinationAlias, masterPos) shortCtx, cancel = context.WithTimeout(ctx, *remoteActionsTimeout) destinationTablet, err := sdw.wr.TopoServer().GetTablet(shortCtx, sdw.destinationAlias) cancel() if err != nil { return err } shortCtx, cancel = context.WithTimeout(ctx, *remoteActionsTimeout) _, err = sdw.wr.TabletManagerClient().StopSlaveMinimum(shortCtx, destinationTablet, masterPos, *remoteActionsTimeout) cancel() if err != nil { return fmt.Errorf("StopSlaveMinimum for %v at %v failed: %v", sdw.destinationAlias, masterPos, err) } wrangler.RecordStartSlaveAction(sdw.cleaner, destinationTablet) action, err := wrangler.FindChangeSlaveTypeActionByTarget(sdw.cleaner, sdw.destinationAlias) if err != nil { return fmt.Errorf("cannot find ChangeSlaveType action for %v: %v", sdw.destinationAlias, err) } action.TabletType = pb.TabletType_SPARE // 5 - restart filtered replication on destination master sdw.wr.Logger().Infof("Restarting filtered replication on master %v", sdw.shardInfo.MasterAlias) shortCtx, cancel = context.WithTimeout(ctx, *remoteActionsTimeout) err = sdw.wr.TabletManagerClient().StartBlp(shortCtx, masterInfo) if err := sdw.cleaner.RemoveActionByName(wrangler.StartBlpActionName, topoproto.TabletAliasString(sdw.shardInfo.MasterAlias)); err != nil { sdw.wr.Logger().Warningf("Cannot find cleaning action %v/%v: %v", wrangler.StartBlpActionName, topoproto.TabletAliasString(sdw.shardInfo.MasterAlias), err) } cancel() if err != nil { return fmt.Errorf("StartBlp failed for %v: %v", sdw.shardInfo.MasterAlias, err) } return nil }