func (wr *Wrangler) restartSlavesExternal(slaveTabletMap, masterTabletMap map[topo.TabletAlias]*topo.TabletInfo, masterElectTablet *topo.TabletInfo, scrapStragglers, continueOnUnexpectedMaster bool, acceptSuccessPercents int) error { recorder := concurrency.AllErrorRecorder{} wg := sync.WaitGroup{} swrd := actionnode.SlaveWasRestartedArgs{ Parent: masterElectTablet.Alias, ExpectedMasterAddr: masterElectTablet.GetMysqlAddr(), ExpectedMasterIpAddr: masterElectTablet.GetMysqlIpAddr(), ScrapStragglers: scrapStragglers, // Disabled for now // ContinueOnUnexpectedMaster: continueOnUnexpectedMaster, } // The following two blocks of actions are very likely to time // out for some tablets (one random guy is dead, the old // master is dead, ...). We execute them all in parallel until // we get to wr.actionTimeout(). After this, no other action // with a timeout is executed, so even if we got to the // timeout, we're still good. log.Infof("Making sure all tablets have the right master:") // do all the slaves for _, ti := range slaveTabletMap { wg.Add(1) go func(ti *topo.TabletInfo) { recorder.RecordError(wr.slaveWasRestarted(ti, &swrd)) wg.Done() }(ti) } // and do the old master and any straggler, if possible, but // do not record errors for these for _, ti := range masterTabletMap { wg.Add(1) go func(ti *topo.TabletInfo) { err := wr.slaveWasRestarted(ti, &swrd) if err != nil { // the old master can be annoying if left // around in the replication graph, so if we // can't restart it, we just scrap it. // We don't rebuild the Shard just yet though. log.Warningf("Old master %v is not restarting, scrapping it: %v", ti.Alias, err) if _, err := wr.Scrap(ti.Alias, true /*force*/, true /*skipRebuild*/); err != nil { log.Warningf("Failed to scrap old master %v: %v", ti.Alias, err) } } wg.Done() }(ti) } wg.Wait() if !recorder.HasErrors() { return nil } // report errors only above a threshold failurePercent := 100 * len(recorder.Errors) / (len(slaveTabletMap) + 1) if failurePercent < 100-acceptSuccessPercents { log.Warningf("Encountered %v%% failure, we keep going. Errors: %v", failurePercent, recorder.Error()) return nil } return recorder.Error() }