Example #1
0
func (wr *Wrangler) restartSlavesExternal(slaveTabletMap, masterTabletMap map[topo.TabletAlias]*topo.TabletInfo, masterElectTablet *topo.TabletInfo, scrapStragglers, continueOnUnexpectedMaster bool, acceptSuccessPercents int) error {
	recorder := concurrency.AllErrorRecorder{}
	wg := sync.WaitGroup{}

	swrd := actionnode.SlaveWasRestartedArgs{
		Parent:               masterElectTablet.Alias,
		ExpectedMasterAddr:   masterElectTablet.GetMysqlAddr(),
		ExpectedMasterIpAddr: masterElectTablet.GetMysqlIpAddr(),
		ScrapStragglers:      scrapStragglers,
		// Disabled for now
		// ContinueOnUnexpectedMaster: continueOnUnexpectedMaster,
	}

	// The following two blocks of actions are very likely to time
	// out for some tablets (one random guy is dead, the old
	// master is dead, ...). We execute them all in parallel until
	// we get to wr.actionTimeout(). After this, no other action
	// with a timeout is executed, so even if we got to the
	// timeout, we're still good.
	log.Infof("Making sure all tablets have the right master:")

	// do all the slaves
	for _, ti := range slaveTabletMap {
		wg.Add(1)
		go func(ti *topo.TabletInfo) {
			recorder.RecordError(wr.slaveWasRestarted(ti, &swrd))
			wg.Done()
		}(ti)
	}

	// and do the old master and any straggler, if possible, but
	// do not record errors for these
	for _, ti := range masterTabletMap {
		wg.Add(1)
		go func(ti *topo.TabletInfo) {
			err := wr.slaveWasRestarted(ti, &swrd)
			if err != nil {
				// the old master can be annoying if left
				// around in the replication graph, so if we
				// can't restart it, we just scrap it.
				// We don't rebuild the Shard just yet though.
				log.Warningf("Old master %v is not restarting, scrapping it: %v", ti.Alias, err)
				if _, err := wr.Scrap(ti.Alias, true /*force*/, true /*skipRebuild*/); err != nil {
					log.Warningf("Failed to scrap old master %v: %v", ti.Alias, err)
				}
			}
			wg.Done()
		}(ti)
	}
	wg.Wait()

	if !recorder.HasErrors() {
		return nil
	}

	// report errors only above a threshold
	failurePercent := 100 * len(recorder.Errors) / (len(slaveTabletMap) + 1)
	if failurePercent < 100-acceptSuccessPercents {
		log.Warningf("Encountered %v%% failure, we keep going. Errors: %v", failurePercent, recorder.Error())
		return nil
	}

	return recorder.Error()
}