// synchronizeReplication phase: // 1 - ask the subset slave to stop replication // 2 - sleep for 5 seconds // 3 - ask the superset slave to stop replication // Note this is not 100% correct, but good enough for now func (worker *SQLDiffWorker) synchronizeReplication() error { worker.setState(SQLDiffSynchronizeReplication) // stop replication on subset slave worker.wr.Logger().Infof("Stopping replication on subset slave %v", worker.subset.alias) subsetTablet, err := worker.wr.TopoServer().GetTablet(worker.subset.alias) if err != nil { return err } ctx, cancel := context.WithTimeout(context.TODO(), 30*time.Second) if err := worker.wr.TabletManagerClient().StopSlave(ctx, subsetTablet); err != nil { return fmt.Errorf("Cannot stop slave %v: %v", worker.subset.alias, err) } cancel() if worker.CheckInterrupted() { return topo.ErrInterrupted } // change the cleaner actions from ChangeSlaveType(rdonly) // to StartSlave() + ChangeSlaveType(spare) wrangler.RecordStartSlaveAction(worker.cleaner, subsetTablet, 30*time.Second) action, err := wrangler.FindChangeSlaveTypeActionByTarget(worker.cleaner, worker.subset.alias) if err != nil { return fmt.Errorf("cannot find ChangeSlaveType action for %v: %v", worker.subset.alias, err) } action.TabletType = topo.TYPE_SPARE // sleep for a few seconds time.Sleep(5 * time.Second) if worker.CheckInterrupted() { return topo.ErrInterrupted } // stop replication on superset slave worker.wr.Logger().Infof("Stopping replication on superset slave %v", worker.superset.alias) supersetTablet, err := worker.wr.TopoServer().GetTablet(worker.superset.alias) if err != nil { return err } ctx, cancel = context.WithTimeout(context.TODO(), 30*time.Second) if err := worker.wr.TabletManagerClient().StopSlave(ctx, supersetTablet); err != nil { return fmt.Errorf("Cannot stop slave %v: %v", worker.superset.alias, err) } cancel() // change the cleaner actions from ChangeSlaveType(rdonly) // to StartSlave() + ChangeSlaveType(spare) wrangler.RecordStartSlaveAction(worker.cleaner, supersetTablet, 30*time.Second) action, err = wrangler.FindChangeSlaveTypeActionByTarget(worker.cleaner, worker.superset.alias) if err != nil { return fmt.Errorf("cannot find ChangeSlaveType action for %v: %v", worker.superset.alias, err) } action.TabletType = topo.TYPE_SPARE return nil }
// findTargets phase: // - find one rdonly in the source shard // - mark it as 'checker' pointing back to us // - get the aliases of all the targets func (vscw *VerticalSplitCloneWorker) findTargets() error { vscw.setState(stateVSCFindTargets) // find an appropriate endpoint in the source shard var err error vscw.sourceAlias, err = findChecker(vscw.wr, vscw.cleaner, vscw.cell, vscw.sourceKeyspace, "0") if err != nil { return fmt.Errorf("cannot find checker for %v/%v/0: %v", vscw.cell, vscw.sourceKeyspace, err) } vscw.wr.Logger().Infof("Using tablet %v as the source", vscw.sourceAlias) // get the tablet info for it vscw.sourceTablet, err = vscw.wr.TopoServer().GetTablet(vscw.sourceAlias) if err != nil { return fmt.Errorf("cannot read tablet %v: %v", vscw.sourceTablet, err) } // stop replication on it ctx, cancel := context.WithTimeout(context.TODO(), 30*time.Second) if err := vscw.wr.TabletManagerClient().StopSlave(ctx, vscw.sourceTablet); err != nil { return fmt.Errorf("cannot stop replication on tablet %v", vscw.sourceAlias) } cancel() wrangler.RecordStartSlaveAction(vscw.cleaner, vscw.sourceTablet, 30*time.Second) action, err := wrangler.FindChangeSlaveTypeActionByTarget(vscw.cleaner, vscw.sourceAlias) if err != nil { return fmt.Errorf("cannot find ChangeSlaveType action for %v: %v", vscw.sourceAlias, err) } action.TabletType = topo.TYPE_SPARE return vscw.findMasterTargets() }
// findTargets phase: // - find one rdonly in the source shard // - mark it as 'checker' pointing back to us // - get the aliases of all the targets func (scw *SplitCloneWorker) findTargets() error { scw.setState(stateSCFindTargets) var err error // find an appropriate endpoint in the source shards scw.sourceAliases = make([]topo.TabletAlias, len(scw.sourceShards)) for i, si := range scw.sourceShards { scw.sourceAliases[i], err = findChecker(scw.wr, scw.cleaner, scw.cell, si.Keyspace(), si.ShardName()) if err != nil { return fmt.Errorf("cannot find checker for %v/%v/%v: %v", scw.cell, si.Keyspace(), si.ShardName(), err) } scw.wr.Logger().Infof("Using tablet %v as source for %v/%v", scw.sourceAliases[i], si.Keyspace(), si.ShardName()) } // get the tablet info for them, and stop their replication scw.sourceTablets = make([]*topo.TabletInfo, len(scw.sourceAliases)) for i, alias := range scw.sourceAliases { scw.sourceTablets[i], err = scw.wr.TopoServer().GetTablet(alias) if err != nil { return fmt.Errorf("cannot read tablet %v: %v", alias, err) } ctx, cancel := context.WithTimeout(context.TODO(), 30*time.Second) if err := scw.wr.TabletManagerClient().StopSlave(ctx, scw.sourceTablets[i]); err != nil { return fmt.Errorf("cannot stop replication on tablet %v", alias) } cancel() wrangler.RecordStartSlaveAction(scw.cleaner, scw.sourceTablets[i], 30*time.Second) action, err := wrangler.FindChangeSlaveTypeActionByTarget(scw.cleaner, alias) if err != nil { return fmt.Errorf("cannot find ChangeSlaveType action for %v: %v", alias, err) } action.TabletType = topo.TYPE_SPARE } return scw.findMasterTargets() }
func (vsdw *VerticalSplitDiffWorker) synchronizeReplication() error { vsdw.setState(stateVSDSynchronizeReplication) masterInfo, err := vsdw.wr.TopoServer().GetTablet(vsdw.shardInfo.MasterAlias) if err != nil { return fmt.Errorf("synchronizeReplication: cannot get Tablet record for master %v: %v", vsdw.shardInfo.MasterAlias, err) } // 1 - stop the master binlog replication, get its current position vsdw.wr.Logger().Infof("Stopping master binlog replication on %v", vsdw.shardInfo.MasterAlias) ctx, cancel := context.WithTimeout(context.TODO(), 30*time.Second) blpPositionList, err := vsdw.wr.TabletManagerClient().StopBlp(ctx, masterInfo) if err != nil { return fmt.Errorf("StopBlp on master %v failed: %v", vsdw.shardInfo.MasterAlias, err) } cancel() wrangler.RecordStartBlpAction(vsdw.cleaner, masterInfo, 30*time.Second) // 2 - stop the source 'checker' at a binlog position // higher than the destination master stopPositionList := blproto.BlpPositionList{ Entries: make([]blproto.BlpPosition, 1), } ss := vsdw.shardInfo.SourceShards[0] // find where we should be stopping pos, err := blpPositionList.FindBlpPositionById(ss.Uid) if err != nil { return fmt.Errorf("no binlog position on the master for Uid %v", ss.Uid) } // stop replication vsdw.wr.Logger().Infof("Stopping slave %v at a minimum of %v", vsdw.sourceAlias, pos.Position) sourceTablet, err := vsdw.wr.TopoServer().GetTablet(vsdw.sourceAlias) if err != nil { return err } stoppedAt, err := vsdw.wr.TabletManagerClient().StopSlaveMinimum(context.TODO(), sourceTablet, pos.Position, 30*time.Second) if err != nil { return fmt.Errorf("cannot stop slave %v at right binlog position %v: %v", vsdw.sourceAlias, pos.Position, err) } stopPositionList.Entries[0].Uid = ss.Uid stopPositionList.Entries[0].Position = stoppedAt.Position // change the cleaner actions from ChangeSlaveType(rdonly) // to StartSlave() + ChangeSlaveType(spare) wrangler.RecordStartSlaveAction(vsdw.cleaner, sourceTablet, 30*time.Second) action, err := wrangler.FindChangeSlaveTypeActionByTarget(vsdw.cleaner, vsdw.sourceAlias) if err != nil { return fmt.Errorf("cannot find ChangeSlaveType action for %v: %v", vsdw.sourceAlias, err) } action.TabletType = topo.TYPE_SPARE // 3 - ask the master of the destination shard to resume filtered // replication up to the new list of positions vsdw.wr.Logger().Infof("Restarting master %v until it catches up to %v", vsdw.shardInfo.MasterAlias, stopPositionList) masterPos, err := vsdw.wr.TabletManagerClient().RunBlpUntil(context.TODO(), masterInfo, &stopPositionList, 30*time.Second) if err != nil { return fmt.Errorf("RunBlpUntil on %v until %v failed: %v", vsdw.shardInfo.MasterAlias, stopPositionList, err) } // 4 - wait until the destination checker is equal or passed // that master binlog position, and stop its replication. vsdw.wr.Logger().Infof("Waiting for destination checker %v to catch up to %v", vsdw.destinationAlias, masterPos) destinationTablet, err := vsdw.wr.TopoServer().GetTablet(vsdw.destinationAlias) if err != nil { return err } _, err = vsdw.wr.TabletManagerClient().StopSlaveMinimum(context.TODO(), destinationTablet, masterPos, 30*time.Second) if err != nil { return fmt.Errorf("StopSlaveMinimum on %v at %v failed: %v", vsdw.destinationAlias, masterPos, err) } wrangler.RecordStartSlaveAction(vsdw.cleaner, destinationTablet, 30*time.Second) action, err = wrangler.FindChangeSlaveTypeActionByTarget(vsdw.cleaner, vsdw.destinationAlias) if err != nil { return fmt.Errorf("cannot find ChangeSlaveType action for %v: %v", vsdw.destinationAlias, err) } action.TabletType = topo.TYPE_SPARE // 5 - restart filtered replication on destination master vsdw.wr.Logger().Infof("Restarting filtered replication on master %v", vsdw.shardInfo.MasterAlias) ctx, cancel = context.WithTimeout(context.TODO(), 30*time.Second) err = vsdw.wr.TabletManagerClient().StartBlp(ctx, masterInfo) if err := vsdw.cleaner.RemoveActionByName(wrangler.StartBlpActionName, vsdw.shardInfo.MasterAlias.String()); err != nil { vsdw.wr.Logger().Warningf("Cannot find cleaning action %v/%v: %v", wrangler.StartBlpActionName, vsdw.shardInfo.MasterAlias.String(), err) } cancel() if err != nil { return fmt.Errorf("StartBlp on %v failed: %v", vsdw.shardInfo.MasterAlias, err) } return nil }