Exemple #1
0
func (sdw *SplitDiffWorker) synchronizeReplication(ctx context.Context) error {
	sdw.SetState(WorkerStateSyncReplication)

	masterInfo, err := sdw.wr.TopoServer().GetTablet(ctx, sdw.shardInfo.MasterAlias)
	if err != nil {
		return fmt.Errorf("synchronizeReplication: cannot get Tablet record for master %v: %v", sdw.shardInfo.MasterAlias, err)
	}

	// 1 - stop the master binlog replication, get its current position
	sdw.wr.Logger().Infof("Stopping master binlog replication on %v", sdw.shardInfo.MasterAlias)
	shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout)
	blpPositionList, err := sdw.wr.TabletManagerClient().StopBlp(shortCtx, masterInfo)
	cancel()
	if err != nil {
		return fmt.Errorf("StopBlp for %v failed: %v", sdw.shardInfo.MasterAlias, err)
	}
	wrangler.RecordStartBlpAction(sdw.cleaner, masterInfo)

	// 2 - stop all the source tablets at a binlog position
	//     higher than the destination master
	stopPositionList := blproto.BlpPositionList{
		Entries: make([]blproto.BlpPosition, len(sdw.shardInfo.SourceShards)),
	}
	for i, ss := range sdw.shardInfo.SourceShards {
		// find where we should be stopping
		blpPos, err := blpPositionList.FindBlpPositionById(ss.Uid)
		if err != nil {
			return fmt.Errorf("no binlog position on the master for Uid %v", ss.Uid)
		}

		// read the tablet
		sourceTablet, err := sdw.wr.TopoServer().GetTablet(ctx, sdw.sourceAliases[i])
		if err != nil {
			return err
		}

		// stop replication
		sdw.wr.Logger().Infof("Stopping slave[%v] %v at a minimum of %v", i, sdw.sourceAliases[i], blpPos.Position)
		shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout)
		stoppedAt, err := sdw.wr.TabletManagerClient().StopSlaveMinimum(shortCtx, sourceTablet, blpPos.Position, *remoteActionsTimeout)
		cancel()
		if err != nil {
			return fmt.Errorf("cannot stop slave %v at right binlog position %v: %v", sdw.sourceAliases[i], blpPos.Position, err)
		}
		stopPositionList.Entries[i].Uid = ss.Uid
		stopPositionList.Entries[i].Position = stoppedAt

		// change the cleaner actions from ChangeSlaveType(rdonly)
		// to StartSlave() + ChangeSlaveType(spare)
		wrangler.RecordStartSlaveAction(sdw.cleaner, sourceTablet)
		action, err := wrangler.FindChangeSlaveTypeActionByTarget(sdw.cleaner, sdw.sourceAliases[i])
		if err != nil {
			return fmt.Errorf("cannot find ChangeSlaveType action for %v: %v", sdw.sourceAliases[i], err)
		}
		action.TabletType = pb.TabletType_SPARE
	}

	// 3 - ask the master of the destination shard to resume filtered
	//     replication up to the new list of positions
	sdw.wr.Logger().Infof("Restarting master %v until it catches up to %v", sdw.shardInfo.MasterAlias, stopPositionList)
	shortCtx, cancel = context.WithTimeout(ctx, *remoteActionsTimeout)
	masterPos, err := sdw.wr.TabletManagerClient().RunBlpUntil(shortCtx, masterInfo, &stopPositionList, *remoteActionsTimeout)
	cancel()
	if err != nil {
		return fmt.Errorf("RunBlpUntil for %v until %v failed: %v", sdw.shardInfo.MasterAlias, stopPositionList, err)
	}

	// 4 - wait until the destination tablet is equal or passed
	//     that master binlog position, and stop its replication.
	sdw.wr.Logger().Infof("Waiting for destination tablet %v to catch up to %v", sdw.destinationAlias, masterPos)
	destinationTablet, err := sdw.wr.TopoServer().GetTablet(ctx, sdw.destinationAlias)
	if err != nil {
		return err
	}
	shortCtx, cancel = context.WithTimeout(ctx, *remoteActionsTimeout)
	_, err = sdw.wr.TabletManagerClient().StopSlaveMinimum(shortCtx, destinationTablet, masterPos, *remoteActionsTimeout)
	cancel()
	if err != nil {
		return fmt.Errorf("StopSlaveMinimum for %v at %v failed: %v", sdw.destinationAlias, masterPos, err)
	}
	wrangler.RecordStartSlaveAction(sdw.cleaner, destinationTablet)
	action, err := wrangler.FindChangeSlaveTypeActionByTarget(sdw.cleaner, sdw.destinationAlias)
	if err != nil {
		return fmt.Errorf("cannot find ChangeSlaveType action for %v: %v", sdw.destinationAlias, err)
	}
	action.TabletType = pb.TabletType_SPARE

	// 5 - restart filtered replication on destination master
	sdw.wr.Logger().Infof("Restarting filtered replication on master %v", sdw.shardInfo.MasterAlias)
	shortCtx, cancel = context.WithTimeout(ctx, *remoteActionsTimeout)
	err = sdw.wr.TabletManagerClient().StartBlp(shortCtx, masterInfo)
	if err := sdw.cleaner.RemoveActionByName(wrangler.StartBlpActionName, topo.TabletAliasString(sdw.shardInfo.MasterAlias)); err != nil {
		sdw.wr.Logger().Warningf("Cannot find cleaning action %v/%v: %v", wrangler.StartBlpActionName, topo.TabletAliasString(sdw.shardInfo.MasterAlias), err)
	}
	cancel()
	if err != nil {
		return fmt.Errorf("StartBlp failed for %v: %v", sdw.shardInfo.MasterAlias, err)
	}

	return nil
}
func (vsdw *VerticalSplitDiffWorker) synchronizeReplication(ctx context.Context) error {
	vsdw.SetState(WorkerStateSyncReplication)

	shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout)
	masterInfo, err := vsdw.wr.TopoServer().GetTablet(shortCtx, vsdw.shardInfo.MasterAlias)
	cancel()
	if err != nil {
		return fmt.Errorf("synchronizeReplication: cannot get Tablet record for master %v: %v", topoproto.TabletAliasString(vsdw.shardInfo.MasterAlias), err)
	}

	// 1 - stop the master binlog replication, get its current position
	vsdw.wr.Logger().Infof("Stopping master binlog replication on %v", topoproto.TabletAliasString(vsdw.shardInfo.MasterAlias))
	shortCtx, cancel = context.WithTimeout(ctx, *remoteActionsTimeout)
	blpPositionList, err := vsdw.wr.TabletManagerClient().StopBlp(shortCtx, masterInfo)
	cancel()
	if err != nil {
		return fmt.Errorf("StopBlp on master %v failed: %v", topoproto.TabletAliasString(vsdw.shardInfo.MasterAlias), err)
	}
	wrangler.RecordStartBlpAction(vsdw.cleaner, masterInfo)

	// 2 - stop the source tablet at a binlog position
	//     higher than the destination master
	stopPositionList := make([]*tabletmanagerdatapb.BlpPosition, 1)
	ss := vsdw.shardInfo.SourceShards[0]
	// find where we should be stopping
	blpPos := tmutils.FindBlpPositionByID(blpPositionList, ss.Uid)
	if blpPos == nil {
		return fmt.Errorf("no binlog position on the master for Uid %v", ss.Uid)
	}

	// stop replication
	vsdw.wr.Logger().Infof("Stopping slave %v at a minimum of %v", topoproto.TabletAliasString(vsdw.sourceAlias), blpPos.Position)
	shortCtx, cancel = context.WithTimeout(ctx, *remoteActionsTimeout)
	sourceTablet, err := vsdw.wr.TopoServer().GetTablet(shortCtx, vsdw.sourceAlias)
	cancel()
	if err != nil {
		return err
	}
	shortCtx, cancel = context.WithTimeout(ctx, *remoteActionsTimeout)
	stoppedAt, err := vsdw.wr.TabletManagerClient().StopSlaveMinimum(shortCtx, sourceTablet, blpPos.Position, *remoteActionsTimeout)
	cancel()
	if err != nil {
		return fmt.Errorf("cannot stop slave %v at right binlog position %v: %v", topoproto.TabletAliasString(vsdw.sourceAlias), blpPos.Position, err)
	}
	stopPositionList[0] = &tabletmanagerdatapb.BlpPosition{
		Uid:      ss.Uid,
		Position: stoppedAt,
	}

	// change the cleaner actions from ChangeSlaveType(rdonly)
	// to StartSlave() + ChangeSlaveType(spare)
	wrangler.RecordStartSlaveAction(vsdw.cleaner, sourceTablet)
	action, err := wrangler.FindChangeSlaveTypeActionByTarget(vsdw.cleaner, vsdw.sourceAlias)
	if err != nil {
		return fmt.Errorf("cannot find ChangeSlaveType action for %v: %v", topoproto.TabletAliasString(vsdw.sourceAlias), err)
	}
	action.TabletType = topodatapb.TabletType_SPARE

	// 3 - ask the master of the destination shard to resume filtered
	//     replication up to the new list of positions
	vsdw.wr.Logger().Infof("Restarting master %v until it catches up to %v", topoproto.TabletAliasString(vsdw.shardInfo.MasterAlias), stopPositionList)
	shortCtx, cancel = context.WithTimeout(ctx, *remoteActionsTimeout)
	masterPos, err := vsdw.wr.TabletManagerClient().RunBlpUntil(shortCtx, masterInfo, stopPositionList, *remoteActionsTimeout)
	cancel()
	if err != nil {
		return fmt.Errorf("RunBlpUntil on %v until %v failed: %v", topoproto.TabletAliasString(vsdw.shardInfo.MasterAlias), stopPositionList, err)
	}

	// 4 - wait until the destination tablet is equal or passed
	//     that master binlog position, and stop its replication.
	vsdw.wr.Logger().Infof("Waiting for destination tablet %v to catch up to %v", topoproto.TabletAliasString(vsdw.destinationAlias), masterPos)
	shortCtx, cancel = context.WithTimeout(ctx, *remoteActionsTimeout)
	destinationTablet, err := vsdw.wr.TopoServer().GetTablet(shortCtx, vsdw.destinationAlias)
	cancel()
	if err != nil {
		return err
	}
	shortCtx, cancel = context.WithTimeout(ctx, *remoteActionsTimeout)
	_, err = vsdw.wr.TabletManagerClient().StopSlaveMinimum(shortCtx, destinationTablet, masterPos, *remoteActionsTimeout)
	cancel()
	if err != nil {
		return fmt.Errorf("StopSlaveMinimum on %v at %v failed: %v", topoproto.TabletAliasString(vsdw.destinationAlias), masterPos, err)
	}
	wrangler.RecordStartSlaveAction(vsdw.cleaner, destinationTablet)
	action, err = wrangler.FindChangeSlaveTypeActionByTarget(vsdw.cleaner, vsdw.destinationAlias)
	if err != nil {
		return fmt.Errorf("cannot find ChangeSlaveType action for %v: %v", topoproto.TabletAliasString(vsdw.destinationAlias), err)
	}
	action.TabletType = topodatapb.TabletType_SPARE

	// 5 - restart filtered replication on destination master
	vsdw.wr.Logger().Infof("Restarting filtered replication on master %v", topoproto.TabletAliasString(vsdw.shardInfo.MasterAlias))
	shortCtx, cancel = context.WithTimeout(ctx, *remoteActionsTimeout)
	err = vsdw.wr.TabletManagerClient().StartBlp(shortCtx, masterInfo)
	if err := vsdw.cleaner.RemoveActionByName(wrangler.StartBlpActionName, topoproto.TabletAliasString(vsdw.shardInfo.MasterAlias)); err != nil {
		vsdw.wr.Logger().Warningf("Cannot find cleaning action %v/%v: %v", wrangler.StartBlpActionName, topoproto.TabletAliasString(vsdw.shardInfo.MasterAlias), err)
	}
	cancel()
	if err != nil {
		return fmt.Errorf("StartBlp on %v failed: %v", topoproto.TabletAliasString(vsdw.shardInfo.MasterAlias), err)
	}

	return nil
}
func (vsdw *VerticalSplitDiffWorker) synchronizeReplication() error {
	vsdw.setState(stateVSDSynchronizeReplication)

	// 1 - stop the master binlog replication, get its current position
	vsdw.wr.Logger().Infof("Stopping master binlog replication on %v", vsdw.shardInfo.MasterAlias)
	blpPositionList, err := vsdw.wr.ActionInitiator().StopBlp(vsdw.shardInfo.MasterAlias, 30*time.Second)
	if err != nil {
		return fmt.Errorf("StopBlp on master %v failed: %v", vsdw.shardInfo.MasterAlias, err)
	}
	wrangler.RecordStartBlpAction(vsdw.cleaner, vsdw.shardInfo.MasterAlias, 30*time.Second)

	// 2 - stop the source 'checker' at a binlog position
	//     higher than the destination master
	stopPositionList := blproto.BlpPositionList{
		Entries: make([]blproto.BlpPosition, 1),
	}
	ss := vsdw.shardInfo.SourceShards[0]
	// find where we should be stopping
	pos, err := blpPositionList.FindBlpPositionById(ss.Uid)
	if err != nil {
		return fmt.Errorf("no binlog position on the master for Uid %v", ss.Uid)
	}

	// stop replication
	vsdw.wr.Logger().Infof("Stopping slave %v at a minimum of %v", vsdw.sourceAlias, pos.Position)
	stoppedAt, err := vsdw.wr.ActionInitiator().StopSlaveMinimum(vsdw.sourceAlias, pos.Position, 30*time.Second)
	if err != nil {
		return fmt.Errorf("cannot stop slave %v at right binlog position %v: %v", vsdw.sourceAlias, pos.Position, err)
	}
	stopPositionList.Entries[0].Uid = ss.Uid
	stopPositionList.Entries[0].Position = stoppedAt.Position

	// change the cleaner actions from ChangeSlaveType(rdonly)
	// to StartSlave() + ChangeSlaveType(spare)
	wrangler.RecordStartSlaveAction(vsdw.cleaner, vsdw.sourceAlias, 30*time.Second)
	action, err := wrangler.FindChangeSlaveTypeActionByTarget(vsdw.cleaner, vsdw.sourceAlias)
	if err != nil {
		return fmt.Errorf("cannot find ChangeSlaveType action for %v: %v", vsdw.sourceAlias, err)
	}
	action.TabletType = topo.TYPE_SPARE

	// 3 - ask the master of the destination shard to resume filtered
	//     replication up to the new list of positions
	vsdw.wr.Logger().Infof("Restarting master %v until it catches up to %v", vsdw.shardInfo.MasterAlias, stopPositionList)
	masterPos, err := vsdw.wr.ActionInitiator().RunBlpUntil(vsdw.shardInfo.MasterAlias, &stopPositionList, 30*time.Second)
	if err != nil {
		return fmt.Errorf("RunBlpUntil on %v until %v failed: %v", vsdw.shardInfo.MasterAlias, stopPositionList, err)
	}

	// 4 - wait until the destination checker is equal or passed
	//     that master binlog position, and stop its replication.
	vsdw.wr.Logger().Infof("Waiting for destination checker %v to catch up to %v", vsdw.destinationAlias, masterPos)
	_, err = vsdw.wr.ActionInitiator().StopSlaveMinimum(vsdw.destinationAlias, masterPos, 30*time.Second)
	if err != nil {
		return fmt.Errorf("StopSlaveMinimum on %v at %v failed: %v", vsdw.destinationAlias, masterPos, err)
	}
	wrangler.RecordStartSlaveAction(vsdw.cleaner, vsdw.destinationAlias, 30*time.Second)
	action, err = wrangler.FindChangeSlaveTypeActionByTarget(vsdw.cleaner, vsdw.destinationAlias)
	if err != nil {
		return fmt.Errorf("cannot find ChangeSlaveType action for %v: %v", vsdw.destinationAlias, err)
	}
	action.TabletType = topo.TYPE_SPARE

	// 5 - restart filtered replication on destination master
	vsdw.wr.Logger().Infof("Restarting filtered replication on master %v", vsdw.shardInfo.MasterAlias)
	err = vsdw.wr.ActionInitiator().StartBlp(vsdw.shardInfo.MasterAlias, 30*time.Second)
	if err := vsdw.cleaner.RemoveActionByName(wrangler.StartBlpActionName, vsdw.shardInfo.MasterAlias.String()); err != nil {
		vsdw.wr.Logger().Warningf("Cannot find cleaning action %v/%v: %v", wrangler.StartBlpActionName, vsdw.shardInfo.MasterAlias.String(), err)
	}
	if err != nil {
		return fmt.Errorf("StartBlp on %v failed: %v", vsdw.shardInfo.MasterAlias, err)
	}

	return nil
}
Exemple #4
0
func (sdw *SplitDiffWorker) synchronizeReplication() error {
	sdw.setState(stateSDSynchronizeReplication)

	// 1 - stop the master binlog replication, get its current position
	log.Infof("Stopping master binlog replication on %v", sdw.shardInfo.MasterAlias)
	blpPositionList, err := sdw.wr.ActionInitiator().StopBlp(sdw.shardInfo.MasterAlias, 30*time.Second)
	if err != nil {
		return err
	}
	wrangler.RecordStartBlpAction(sdw.cleaner, sdw.shardInfo.MasterAlias, 30*time.Second)

	// 2 - stop all the source 'checker' at a binlog position
	//     higher than the destination master
	stopPositionList := myproto.BlpPositionList{
		Entries: make([]myproto.BlpPosition, len(sdw.shardInfo.SourceShards)),
	}
	for i, ss := range sdw.shardInfo.SourceShards {
		// find where we should be stopping
		pos, err := blpPositionList.FindBlpPositionById(ss.Uid)
		if err != nil {
			return fmt.Errorf("No binlog position on the master for Uid %v", ss.Uid)
		}

		// stop replication
		log.Infof("Stopping slave[%v] %v at a minimum of %v", i, sdw.sourceAliases[i], pos.GroupId)
		stoppedAt, err := sdw.wr.ActionInitiator().StopSlaveMinimum(sdw.sourceAliases[i], pos.GroupId, 30*time.Second)
		if err != nil {
			return fmt.Errorf("Cannot stop slave %v at right binlog position %v: %v", sdw.sourceAliases[i], pos.GroupId, err)
		}
		stopPositionList.Entries[i].Uid = ss.Uid
		stopPositionList.Entries[i].GroupId = stoppedAt.MasterLogGroupId

		// change the cleaner actions from ChangeSlaveType(rdonly)
		// to StartSlave() + ChangeSlaveType(spare)
		wrangler.RecordStartSlaveAction(sdw.cleaner, sdw.sourceAliases[i], 30*time.Second)
		action, err := wrangler.FindChangeSlaveTypeActionByTarget(sdw.cleaner, sdw.sourceAliases[i])
		if err != nil {
			return fmt.Errorf("cannot find ChangeSlaveType action for %v: %v", sdw.sourceAliases[i], err)
		}
		action.TabletType = topo.TYPE_SPARE
	}

	// 3 - ask the master of the destination shard to resume filtered
	//     replication up to the new list of positions
	log.Infof("Restarting master %v until it catches up to %v", sdw.shardInfo.MasterAlias, stopPositionList)
	masterPos, err := sdw.wr.ActionInitiator().RunBlpUntil(sdw.shardInfo.MasterAlias, &stopPositionList, 30*time.Second)
	if err != nil {
		return err
	}

	// 4 - wait until the destination checker is equal or passed
	//     that master binlog position, and stop its replication.
	log.Infof("Waiting for destination checker %v to catch up to %v", sdw.destinationAlias, masterPos.MasterLogGroupId)
	_, err = sdw.wr.ActionInitiator().StopSlaveMinimum(sdw.destinationAlias, masterPos.MasterLogGroupId, 30*time.Second)
	if err != nil {
		return err
	}
	wrangler.RecordStartSlaveAction(sdw.cleaner, sdw.destinationAlias, 30*time.Second)
	action, err := wrangler.FindChangeSlaveTypeActionByTarget(sdw.cleaner, sdw.destinationAlias)
	if err != nil {
		return fmt.Errorf("cannot find ChangeSlaveType action for %v: %v", sdw.destinationAlias, err)
	}
	action.TabletType = topo.TYPE_SPARE

	// 5 - restart filtered replication on destination master
	log.Infof("Restarting filtered replication on master %v", sdw.shardInfo.MasterAlias)
	err = sdw.wr.ActionInitiator().StartBlp(sdw.shardInfo.MasterAlias, 30*time.Second)
	if err := sdw.cleaner.RemoveActionByName(wrangler.StartBlpActionName, sdw.shardInfo.MasterAlias.String()); err != nil {
		log.Warningf("Cannot find cleaning action %v/%v: %v", wrangler.StartBlpActionName, sdw.shardInfo.MasterAlias.String(), err)
	}
	if err != nil {
		return err
	}

	return nil
}