func (sdw *SplitDiffWorker) synchronizeReplication(ctx context.Context) error { sdw.SetState(WorkerStateSyncReplication) masterInfo, err := sdw.wr.TopoServer().GetTablet(ctx, sdw.shardInfo.MasterAlias) if err != nil { return fmt.Errorf("synchronizeReplication: cannot get Tablet record for master %v: %v", sdw.shardInfo.MasterAlias, err) } // 1 - stop the master binlog replication, get its current position sdw.wr.Logger().Infof("Stopping master binlog replication on %v", sdw.shardInfo.MasterAlias) shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout) blpPositionList, err := sdw.wr.TabletManagerClient().StopBlp(shortCtx, masterInfo) cancel() if err != nil { return fmt.Errorf("StopBlp for %v failed: %v", sdw.shardInfo.MasterAlias, err) } wrangler.RecordStartBlpAction(sdw.cleaner, masterInfo) // 2 - stop all the source tablets at a binlog position // higher than the destination master stopPositionList := blproto.BlpPositionList{ Entries: make([]blproto.BlpPosition, len(sdw.shardInfo.SourceShards)), } for i, ss := range sdw.shardInfo.SourceShards { // find where we should be stopping blpPos, err := blpPositionList.FindBlpPositionById(ss.Uid) if err != nil { return fmt.Errorf("no binlog position on the master for Uid %v", ss.Uid) } // read the tablet sourceTablet, err := sdw.wr.TopoServer().GetTablet(ctx, sdw.sourceAliases[i]) if err != nil { return err } // stop replication sdw.wr.Logger().Infof("Stopping slave[%v] %v at a minimum of %v", i, sdw.sourceAliases[i], blpPos.Position) shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout) stoppedAt, err := sdw.wr.TabletManagerClient().StopSlaveMinimum(shortCtx, sourceTablet, blpPos.Position, *remoteActionsTimeout) cancel() if err != nil { return fmt.Errorf("cannot stop slave %v at right binlog position %v: %v", sdw.sourceAliases[i], blpPos.Position, err) } stopPositionList.Entries[i].Uid = ss.Uid stopPositionList.Entries[i].Position = stoppedAt // change the cleaner actions from ChangeSlaveType(rdonly) // to StartSlave() + ChangeSlaveType(spare) wrangler.RecordStartSlaveAction(sdw.cleaner, sourceTablet) action, err := wrangler.FindChangeSlaveTypeActionByTarget(sdw.cleaner, sdw.sourceAliases[i]) if err != nil { return fmt.Errorf("cannot find ChangeSlaveType action for %v: %v", sdw.sourceAliases[i], err) } action.TabletType = pb.TabletType_SPARE } // 3 - ask the master of the destination shard to resume filtered // replication up to the new list of positions sdw.wr.Logger().Infof("Restarting master %v until it catches up to %v", sdw.shardInfo.MasterAlias, stopPositionList) shortCtx, cancel = context.WithTimeout(ctx, *remoteActionsTimeout) masterPos, err := sdw.wr.TabletManagerClient().RunBlpUntil(shortCtx, masterInfo, &stopPositionList, *remoteActionsTimeout) cancel() if err != nil { return fmt.Errorf("RunBlpUntil for %v until %v failed: %v", sdw.shardInfo.MasterAlias, stopPositionList, err) } // 4 - wait until the destination tablet is equal or passed // that master binlog position, and stop its replication. sdw.wr.Logger().Infof("Waiting for destination tablet %v to catch up to %v", sdw.destinationAlias, masterPos) destinationTablet, err := sdw.wr.TopoServer().GetTablet(ctx, sdw.destinationAlias) if err != nil { return err } shortCtx, cancel = context.WithTimeout(ctx, *remoteActionsTimeout) _, err = sdw.wr.TabletManagerClient().StopSlaveMinimum(shortCtx, destinationTablet, masterPos, *remoteActionsTimeout) cancel() if err != nil { return fmt.Errorf("StopSlaveMinimum for %v at %v failed: %v", sdw.destinationAlias, masterPos, err) } wrangler.RecordStartSlaveAction(sdw.cleaner, destinationTablet) action, err := wrangler.FindChangeSlaveTypeActionByTarget(sdw.cleaner, sdw.destinationAlias) if err != nil { return fmt.Errorf("cannot find ChangeSlaveType action for %v: %v", sdw.destinationAlias, err) } action.TabletType = pb.TabletType_SPARE // 5 - restart filtered replication on destination master sdw.wr.Logger().Infof("Restarting filtered replication on master %v", sdw.shardInfo.MasterAlias) shortCtx, cancel = context.WithTimeout(ctx, *remoteActionsTimeout) err = sdw.wr.TabletManagerClient().StartBlp(shortCtx, masterInfo) if err := sdw.cleaner.RemoveActionByName(wrangler.StartBlpActionName, topo.TabletAliasString(sdw.shardInfo.MasterAlias)); err != nil { sdw.wr.Logger().Warningf("Cannot find cleaning action %v/%v: %v", wrangler.StartBlpActionName, topo.TabletAliasString(sdw.shardInfo.MasterAlias), err) } cancel() if err != nil { return fmt.Errorf("StartBlp failed for %v: %v", sdw.shardInfo.MasterAlias, err) } return nil }
func (vsdw *VerticalSplitDiffWorker) synchronizeReplication(ctx context.Context) error { vsdw.SetState(WorkerStateSyncReplication) shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout) masterInfo, err := vsdw.wr.TopoServer().GetTablet(shortCtx, vsdw.shardInfo.MasterAlias) cancel() if err != nil { return fmt.Errorf("synchronizeReplication: cannot get Tablet record for master %v: %v", topoproto.TabletAliasString(vsdw.shardInfo.MasterAlias), err) } // 1 - stop the master binlog replication, get its current position vsdw.wr.Logger().Infof("Stopping master binlog replication on %v", topoproto.TabletAliasString(vsdw.shardInfo.MasterAlias)) shortCtx, cancel = context.WithTimeout(ctx, *remoteActionsTimeout) blpPositionList, err := vsdw.wr.TabletManagerClient().StopBlp(shortCtx, masterInfo) cancel() if err != nil { return fmt.Errorf("StopBlp on master %v failed: %v", topoproto.TabletAliasString(vsdw.shardInfo.MasterAlias), err) } wrangler.RecordStartBlpAction(vsdw.cleaner, masterInfo) // 2 - stop the source tablet at a binlog position // higher than the destination master stopPositionList := make([]*tabletmanagerdatapb.BlpPosition, 1) ss := vsdw.shardInfo.SourceShards[0] // find where we should be stopping blpPos := tmutils.FindBlpPositionByID(blpPositionList, ss.Uid) if blpPos == nil { return fmt.Errorf("no binlog position on the master for Uid %v", ss.Uid) } // stop replication vsdw.wr.Logger().Infof("Stopping slave %v at a minimum of %v", topoproto.TabletAliasString(vsdw.sourceAlias), blpPos.Position) shortCtx, cancel = context.WithTimeout(ctx, *remoteActionsTimeout) sourceTablet, err := vsdw.wr.TopoServer().GetTablet(shortCtx, vsdw.sourceAlias) cancel() if err != nil { return err } shortCtx, cancel = context.WithTimeout(ctx, *remoteActionsTimeout) stoppedAt, err := vsdw.wr.TabletManagerClient().StopSlaveMinimum(shortCtx, sourceTablet, blpPos.Position, *remoteActionsTimeout) cancel() if err != nil { return fmt.Errorf("cannot stop slave %v at right binlog position %v: %v", topoproto.TabletAliasString(vsdw.sourceAlias), blpPos.Position, err) } stopPositionList[0] = &tabletmanagerdatapb.BlpPosition{ Uid: ss.Uid, Position: stoppedAt, } // change the cleaner actions from ChangeSlaveType(rdonly) // to StartSlave() + ChangeSlaveType(spare) wrangler.RecordStartSlaveAction(vsdw.cleaner, sourceTablet) action, err := wrangler.FindChangeSlaveTypeActionByTarget(vsdw.cleaner, vsdw.sourceAlias) if err != nil { return fmt.Errorf("cannot find ChangeSlaveType action for %v: %v", topoproto.TabletAliasString(vsdw.sourceAlias), err) } action.TabletType = topodatapb.TabletType_SPARE // 3 - ask the master of the destination shard to resume filtered // replication up to the new list of positions vsdw.wr.Logger().Infof("Restarting master %v until it catches up to %v", topoproto.TabletAliasString(vsdw.shardInfo.MasterAlias), stopPositionList) shortCtx, cancel = context.WithTimeout(ctx, *remoteActionsTimeout) masterPos, err := vsdw.wr.TabletManagerClient().RunBlpUntil(shortCtx, masterInfo, stopPositionList, *remoteActionsTimeout) cancel() if err != nil { return fmt.Errorf("RunBlpUntil on %v until %v failed: %v", topoproto.TabletAliasString(vsdw.shardInfo.MasterAlias), stopPositionList, err) } // 4 - wait until the destination tablet is equal or passed // that master binlog position, and stop its replication. vsdw.wr.Logger().Infof("Waiting for destination tablet %v to catch up to %v", topoproto.TabletAliasString(vsdw.destinationAlias), masterPos) shortCtx, cancel = context.WithTimeout(ctx, *remoteActionsTimeout) destinationTablet, err := vsdw.wr.TopoServer().GetTablet(shortCtx, vsdw.destinationAlias) cancel() if err != nil { return err } shortCtx, cancel = context.WithTimeout(ctx, *remoteActionsTimeout) _, err = vsdw.wr.TabletManagerClient().StopSlaveMinimum(shortCtx, destinationTablet, masterPos, *remoteActionsTimeout) cancel() if err != nil { return fmt.Errorf("StopSlaveMinimum on %v at %v failed: %v", topoproto.TabletAliasString(vsdw.destinationAlias), masterPos, err) } wrangler.RecordStartSlaveAction(vsdw.cleaner, destinationTablet) action, err = wrangler.FindChangeSlaveTypeActionByTarget(vsdw.cleaner, vsdw.destinationAlias) if err != nil { return fmt.Errorf("cannot find ChangeSlaveType action for %v: %v", topoproto.TabletAliasString(vsdw.destinationAlias), err) } action.TabletType = topodatapb.TabletType_SPARE // 5 - restart filtered replication on destination master vsdw.wr.Logger().Infof("Restarting filtered replication on master %v", topoproto.TabletAliasString(vsdw.shardInfo.MasterAlias)) shortCtx, cancel = context.WithTimeout(ctx, *remoteActionsTimeout) err = vsdw.wr.TabletManagerClient().StartBlp(shortCtx, masterInfo) if err := vsdw.cleaner.RemoveActionByName(wrangler.StartBlpActionName, topoproto.TabletAliasString(vsdw.shardInfo.MasterAlias)); err != nil { vsdw.wr.Logger().Warningf("Cannot find cleaning action %v/%v: %v", wrangler.StartBlpActionName, topoproto.TabletAliasString(vsdw.shardInfo.MasterAlias), err) } cancel() if err != nil { return fmt.Errorf("StartBlp on %v failed: %v", topoproto.TabletAliasString(vsdw.shardInfo.MasterAlias), err) } return nil }
func (vsdw *VerticalSplitDiffWorker) synchronizeReplication() error { vsdw.setState(stateVSDSynchronizeReplication) // 1 - stop the master binlog replication, get its current position vsdw.wr.Logger().Infof("Stopping master binlog replication on %v", vsdw.shardInfo.MasterAlias) blpPositionList, err := vsdw.wr.ActionInitiator().StopBlp(vsdw.shardInfo.MasterAlias, 30*time.Second) if err != nil { return fmt.Errorf("StopBlp on master %v failed: %v", vsdw.shardInfo.MasterAlias, err) } wrangler.RecordStartBlpAction(vsdw.cleaner, vsdw.shardInfo.MasterAlias, 30*time.Second) // 2 - stop the source 'checker' at a binlog position // higher than the destination master stopPositionList := blproto.BlpPositionList{ Entries: make([]blproto.BlpPosition, 1), } ss := vsdw.shardInfo.SourceShards[0] // find where we should be stopping pos, err := blpPositionList.FindBlpPositionById(ss.Uid) if err != nil { return fmt.Errorf("no binlog position on the master for Uid %v", ss.Uid) } // stop replication vsdw.wr.Logger().Infof("Stopping slave %v at a minimum of %v", vsdw.sourceAlias, pos.Position) stoppedAt, err := vsdw.wr.ActionInitiator().StopSlaveMinimum(vsdw.sourceAlias, pos.Position, 30*time.Second) if err != nil { return fmt.Errorf("cannot stop slave %v at right binlog position %v: %v", vsdw.sourceAlias, pos.Position, err) } stopPositionList.Entries[0].Uid = ss.Uid stopPositionList.Entries[0].Position = stoppedAt.Position // change the cleaner actions from ChangeSlaveType(rdonly) // to StartSlave() + ChangeSlaveType(spare) wrangler.RecordStartSlaveAction(vsdw.cleaner, vsdw.sourceAlias, 30*time.Second) action, err := wrangler.FindChangeSlaveTypeActionByTarget(vsdw.cleaner, vsdw.sourceAlias) if err != nil { return fmt.Errorf("cannot find ChangeSlaveType action for %v: %v", vsdw.sourceAlias, err) } action.TabletType = topo.TYPE_SPARE // 3 - ask the master of the destination shard to resume filtered // replication up to the new list of positions vsdw.wr.Logger().Infof("Restarting master %v until it catches up to %v", vsdw.shardInfo.MasterAlias, stopPositionList) masterPos, err := vsdw.wr.ActionInitiator().RunBlpUntil(vsdw.shardInfo.MasterAlias, &stopPositionList, 30*time.Second) if err != nil { return fmt.Errorf("RunBlpUntil on %v until %v failed: %v", vsdw.shardInfo.MasterAlias, stopPositionList, err) } // 4 - wait until the destination checker is equal or passed // that master binlog position, and stop its replication. vsdw.wr.Logger().Infof("Waiting for destination checker %v to catch up to %v", vsdw.destinationAlias, masterPos) _, err = vsdw.wr.ActionInitiator().StopSlaveMinimum(vsdw.destinationAlias, masterPos, 30*time.Second) if err != nil { return fmt.Errorf("StopSlaveMinimum on %v at %v failed: %v", vsdw.destinationAlias, masterPos, err) } wrangler.RecordStartSlaveAction(vsdw.cleaner, vsdw.destinationAlias, 30*time.Second) action, err = wrangler.FindChangeSlaveTypeActionByTarget(vsdw.cleaner, vsdw.destinationAlias) if err != nil { return fmt.Errorf("cannot find ChangeSlaveType action for %v: %v", vsdw.destinationAlias, err) } action.TabletType = topo.TYPE_SPARE // 5 - restart filtered replication on destination master vsdw.wr.Logger().Infof("Restarting filtered replication on master %v", vsdw.shardInfo.MasterAlias) err = vsdw.wr.ActionInitiator().StartBlp(vsdw.shardInfo.MasterAlias, 30*time.Second) if err := vsdw.cleaner.RemoveActionByName(wrangler.StartBlpActionName, vsdw.shardInfo.MasterAlias.String()); err != nil { vsdw.wr.Logger().Warningf("Cannot find cleaning action %v/%v: %v", wrangler.StartBlpActionName, vsdw.shardInfo.MasterAlias.String(), err) } if err != nil { return fmt.Errorf("StartBlp on %v failed: %v", vsdw.shardInfo.MasterAlias, err) } return nil }
func (sdw *SplitDiffWorker) synchronizeReplication() error { sdw.setState(stateSDSynchronizeReplication) // 1 - stop the master binlog replication, get its current position log.Infof("Stopping master binlog replication on %v", sdw.shardInfo.MasterAlias) blpPositionList, err := sdw.wr.ActionInitiator().StopBlp(sdw.shardInfo.MasterAlias, 30*time.Second) if err != nil { return err } wrangler.RecordStartBlpAction(sdw.cleaner, sdw.shardInfo.MasterAlias, 30*time.Second) // 2 - stop all the source 'checker' at a binlog position // higher than the destination master stopPositionList := myproto.BlpPositionList{ Entries: make([]myproto.BlpPosition, len(sdw.shardInfo.SourceShards)), } for i, ss := range sdw.shardInfo.SourceShards { // find where we should be stopping pos, err := blpPositionList.FindBlpPositionById(ss.Uid) if err != nil { return fmt.Errorf("No binlog position on the master for Uid %v", ss.Uid) } // stop replication log.Infof("Stopping slave[%v] %v at a minimum of %v", i, sdw.sourceAliases[i], pos.GroupId) stoppedAt, err := sdw.wr.ActionInitiator().StopSlaveMinimum(sdw.sourceAliases[i], pos.GroupId, 30*time.Second) if err != nil { return fmt.Errorf("Cannot stop slave %v at right binlog position %v: %v", sdw.sourceAliases[i], pos.GroupId, err) } stopPositionList.Entries[i].Uid = ss.Uid stopPositionList.Entries[i].GroupId = stoppedAt.MasterLogGroupId // change the cleaner actions from ChangeSlaveType(rdonly) // to StartSlave() + ChangeSlaveType(spare) wrangler.RecordStartSlaveAction(sdw.cleaner, sdw.sourceAliases[i], 30*time.Second) action, err := wrangler.FindChangeSlaveTypeActionByTarget(sdw.cleaner, sdw.sourceAliases[i]) if err != nil { return fmt.Errorf("cannot find ChangeSlaveType action for %v: %v", sdw.sourceAliases[i], err) } action.TabletType = topo.TYPE_SPARE } // 3 - ask the master of the destination shard to resume filtered // replication up to the new list of positions log.Infof("Restarting master %v until it catches up to %v", sdw.shardInfo.MasterAlias, stopPositionList) masterPos, err := sdw.wr.ActionInitiator().RunBlpUntil(sdw.shardInfo.MasterAlias, &stopPositionList, 30*time.Second) if err != nil { return err } // 4 - wait until the destination checker is equal or passed // that master binlog position, and stop its replication. log.Infof("Waiting for destination checker %v to catch up to %v", sdw.destinationAlias, masterPos.MasterLogGroupId) _, err = sdw.wr.ActionInitiator().StopSlaveMinimum(sdw.destinationAlias, masterPos.MasterLogGroupId, 30*time.Second) if err != nil { return err } wrangler.RecordStartSlaveAction(sdw.cleaner, sdw.destinationAlias, 30*time.Second) action, err := wrangler.FindChangeSlaveTypeActionByTarget(sdw.cleaner, sdw.destinationAlias) if err != nil { return fmt.Errorf("cannot find ChangeSlaveType action for %v: %v", sdw.destinationAlias, err) } action.TabletType = topo.TYPE_SPARE // 5 - restart filtered replication on destination master log.Infof("Restarting filtered replication on master %v", sdw.shardInfo.MasterAlias) err = sdw.wr.ActionInitiator().StartBlp(sdw.shardInfo.MasterAlias, 30*time.Second) if err := sdw.cleaner.RemoveActionByName(wrangler.StartBlpActionName, sdw.shardInfo.MasterAlias.String()); err != nil { log.Warningf("Cannot find cleaning action %v/%v: %v", wrangler.StartBlpActionName, sdw.shardInfo.MasterAlias.String(), err) } if err != nil { return err } return nil }