Exemplo n.º 1
0
// RunUntil will run all the players until they reach the given position.
// Holds the map lock during that exercise, shouldn't take long at all.
func (blm *BinlogPlayerMap) RunUntil(ctx context.Context, blpPositionList []*tabletmanagerdatapb.BlpPosition, waitTimeout time.Duration) error {
	// lock and check state
	blm.mu.Lock()
	defer blm.mu.Unlock()
	if blm.state != BpmStateStopped {
		return fmt.Errorf("RunUntil: player not stopped: %v", blm.state)
	}
	log.Infof("Starting map of binlog players until position")

	// find the exact stop position for all players, to be sure
	// we're not doing anything wrong
	posMap := make(map[uint32]string)
	for _, bpc := range blm.players {
		blpPos := tmutils.FindBlpPositionByID(blpPositionList, bpc.sourceShard.Uid)
		if blpPos == nil {
			return fmt.Errorf("No binlog position passed in for player Uid %v", bpc.sourceShard.Uid)
		}
		posMap[bpc.sourceShard.Uid] = blpPos.Position
	}

	// start all the players giving them where to stop
	for _, bpc := range blm.players {
		if err := bpc.StartUntil(ctx, posMap[bpc.sourceShard.Uid]); err != nil {
			return err
		}
	}

	// wait for all players to be stopped, or timeout
	wg := sync.WaitGroup{}
	rec := concurrency.AllErrorRecorder{}
	for _, bpc := range blm.players {
		wg.Add(1)
		go func(bpc *BinlogPlayerController) {
			if err := bpc.WaitForStop(waitTimeout); err != nil {
				rec.RecordError(err)
			}
			wg.Done()
		}(bpc)
	}
	wg.Wait()

	return rec.Error()
}
Exemplo n.º 2
0
func (sdw *SplitDiffWorker) synchronizeReplication(ctx context.Context) error {
	sdw.SetState(WorkerStateSyncReplication)

	shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout)
	masterInfo, err := sdw.wr.TopoServer().GetTablet(shortCtx, sdw.shardInfo.MasterAlias)
	cancel()
	if err != nil {
		return fmt.Errorf("synchronizeReplication: cannot get Tablet record for master %v: %v", sdw.shardInfo.MasterAlias, err)
	}

	// 1 - stop the master binlog replication, get its current position
	sdw.wr.Logger().Infof("Stopping master binlog replication on %v", sdw.shardInfo.MasterAlias)
	shortCtx, cancel = context.WithTimeout(ctx, *remoteActionsTimeout)
	blpPositionList, err := sdw.wr.TabletManagerClient().StopBlp(shortCtx, masterInfo)
	cancel()
	if err != nil {
		return fmt.Errorf("StopBlp for %v failed: %v", sdw.shardInfo.MasterAlias, err)
	}
	wrangler.RecordStartBlpAction(sdw.cleaner, masterInfo)

	// 2 - stop all the source tablets at a binlog position
	//     higher than the destination master
	stopPositionList := make([]*pbt.BlpPosition, len(sdw.shardInfo.SourceShards))
	for i, ss := range sdw.shardInfo.SourceShards {
		// find where we should be stopping
		blpPos := tmutils.FindBlpPositionByID(blpPositionList, ss.Uid)
		if blpPos == nil {
			return fmt.Errorf("no binlog position on the master for Uid %v", ss.Uid)
		}

		// read the tablet
		shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout)
		sourceTablet, err := sdw.wr.TopoServer().GetTablet(shortCtx, sdw.sourceAliases[i])
		cancel()
		if err != nil {
			return err
		}

		// stop replication
		sdw.wr.Logger().Infof("Stopping slave[%v] %v at a minimum of %v", i, sdw.sourceAliases[i], blpPos.Position)
		shortCtx, cancel = context.WithTimeout(ctx, *remoteActionsTimeout)
		stoppedAt, err := sdw.wr.TabletManagerClient().StopSlaveMinimum(shortCtx, sourceTablet, blpPos.Position, *remoteActionsTimeout)
		cancel()
		if err != nil {
			return fmt.Errorf("cannot stop slave %v at right binlog position %v: %v", sdw.sourceAliases[i], blpPos.Position, err)
		}
		stopPositionList[i] = &pbt.BlpPosition{
			Uid:      ss.Uid,
			Position: stoppedAt,
		}

		// change the cleaner actions from ChangeSlaveType(rdonly)
		// to StartSlave() + ChangeSlaveType(spare)
		wrangler.RecordStartSlaveAction(sdw.cleaner, sourceTablet)
		action, err := wrangler.FindChangeSlaveTypeActionByTarget(sdw.cleaner, sdw.sourceAliases[i])
		if err != nil {
			return fmt.Errorf("cannot find ChangeSlaveType action for %v: %v", sdw.sourceAliases[i], err)
		}
		action.TabletType = pb.TabletType_SPARE
	}

	// 3 - ask the master of the destination shard to resume filtered
	//     replication up to the new list of positions
	sdw.wr.Logger().Infof("Restarting master %v until it catches up to %v", sdw.shardInfo.MasterAlias, stopPositionList)
	shortCtx, cancel = context.WithTimeout(ctx, *remoteActionsTimeout)
	masterPos, err := sdw.wr.TabletManagerClient().RunBlpUntil(shortCtx, masterInfo, stopPositionList, *remoteActionsTimeout)
	cancel()
	if err != nil {
		return fmt.Errorf("RunBlpUntil for %v until %v failed: %v", sdw.shardInfo.MasterAlias, stopPositionList, err)
	}

	// 4 - wait until the destination tablet is equal or passed
	//     that master binlog position, and stop its replication.
	sdw.wr.Logger().Infof("Waiting for destination tablet %v to catch up to %v", sdw.destinationAlias, masterPos)
	shortCtx, cancel = context.WithTimeout(ctx, *remoteActionsTimeout)
	destinationTablet, err := sdw.wr.TopoServer().GetTablet(shortCtx, sdw.destinationAlias)
	cancel()
	if err != nil {
		return err
	}
	shortCtx, cancel = context.WithTimeout(ctx, *remoteActionsTimeout)
	_, err = sdw.wr.TabletManagerClient().StopSlaveMinimum(shortCtx, destinationTablet, masterPos, *remoteActionsTimeout)
	cancel()
	if err != nil {
		return fmt.Errorf("StopSlaveMinimum for %v at %v failed: %v", sdw.destinationAlias, masterPos, err)
	}
	wrangler.RecordStartSlaveAction(sdw.cleaner, destinationTablet)
	action, err := wrangler.FindChangeSlaveTypeActionByTarget(sdw.cleaner, sdw.destinationAlias)
	if err != nil {
		return fmt.Errorf("cannot find ChangeSlaveType action for %v: %v", sdw.destinationAlias, err)
	}
	action.TabletType = pb.TabletType_SPARE

	// 5 - restart filtered replication on destination master
	sdw.wr.Logger().Infof("Restarting filtered replication on master %v", sdw.shardInfo.MasterAlias)
	shortCtx, cancel = context.WithTimeout(ctx, *remoteActionsTimeout)
	err = sdw.wr.TabletManagerClient().StartBlp(shortCtx, masterInfo)
	if err := sdw.cleaner.RemoveActionByName(wrangler.StartBlpActionName, topoproto.TabletAliasString(sdw.shardInfo.MasterAlias)); err != nil {
		sdw.wr.Logger().Warningf("Cannot find cleaning action %v/%v: %v", wrangler.StartBlpActionName, topoproto.TabletAliasString(sdw.shardInfo.MasterAlias), err)
	}
	cancel()
	if err != nil {
		return fmt.Errorf("StartBlp failed for %v: %v", sdw.shardInfo.MasterAlias, err)
	}

	return nil
}