// reparentShardExternal handles an external reparent. // // The ev parameter is an event struct prefilled with information that the // caller has on hand, which would be expensive for us to re-query. func (wr *Wrangler) reparentShardExternal(ev *events.Reparent, slaveTabletMap, masterTabletMap map[topo.TabletAlias]*topo.TabletInfo, masterElectTablet *topo.TabletInfo) error { ev.UpdateStatus("starting external") // we fix the new master in the replication graph ev.UpdateStatus("checking if new master was promoted") err := wr.slaveWasPromoted(masterElectTablet) if err != nil { // This suggests that the master-elect is dead. This is bad. return fmt.Errorf("slaveWasPromoted(%v) failed: %v", masterElectTablet, err) } // Once the slave is promoted, remove it from our maps delete(slaveTabletMap, masterElectTablet.Alias) delete(masterTabletMap, masterElectTablet.Alias) // Re-read the master elect tablet, as its mysql port may have been updated ev.UpdateStatus("re-reading new master record") masterElectTablet, err = wr.TopoServer().GetTablet(masterElectTablet.Alias) if err != nil { return fmt.Errorf("cannot re-read the master record, something is seriously wrong: %v", err) } // then fix all the slaves, including the old master ev.UpdateStatus("restarting slaves") wr.restartSlavesExternal(slaveTabletMap, masterTabletMap, masterElectTablet) return nil }
// reparentShardGraceful executes a graceful reparent.
// The ev parameter is an event struct prefilled with information that the
// caller has on hand, which would be expensive for us to re-query.
//
// The sequence is: validate the replication graph, demote the current
// master, verify slave consistency against the demoted master's
// position, promote the master-elect, restart the remaining slaves,
// scrap the old master, and rebuild the shard serving graph.
//
// slaveTabletMap and masterTabletMap index the shard's tablets by
// alias; slaveTabletMap is mutated here (the promoted master-elect is
// removed from it). masterTabletMap must contain exactly one entry:
// the current master. leaveMasterReadOnly is forwarded to
// finishReparent — presumably it keeps the new master read-only;
// confirm against that helper.
func (wr *Wrangler) reparentShardGraceful(ev *events.Reparent, si *topo.ShardInfo, slaveTabletMap, masterTabletMap map[topo.TabletAlias]*topo.TabletInfo, masterElectTablet *topo.TabletInfo, leaveMasterReadOnly bool) (err error) {
	ev.UpdateStatus("starting graceful")

	// Report the failure on the event stream no matter which step
	// below returns an error (err is the named return value).
	defer func() {
		if err != nil {
			ev.UpdateStatus("failed: " + err.Error())
		}
	}()

	// Validate a bunch of assumptions we make about the replication graph.
	if len(masterTabletMap) != 1 {
		aliases := make([]string, 0, len(masterTabletMap))
		for _, v := range masterTabletMap {
			aliases = append(aliases, v.String())
		}
		return fmt.Errorf("I have 0 or multiple masters / scrapped tablets in this shard replication graph, please scrap the non-master ones: %v", strings.Join(aliases, " "))
	}

	// The map has exactly one entry (checked above); this loop just
	// extracts it.
	var masterTablet *topo.TabletInfo
	for _, v := range masterTabletMap {
		masterTablet = v
	}

	if masterTablet.Parent.Uid != topo.NO_TABLET {
		return fmt.Errorf("master tablet should not have a ParentUid: %v %v", masterTablet.Parent.Uid, masterTablet.Alias)
	}

	if masterTablet.Type != topo.TYPE_MASTER {
		return fmt.Errorf("master tablet should not be type: %v %v", masterTablet.Type, masterTablet.Alias)
	}

	// Reparenting to the current master is only allowed through the
	// forced (brutal) path.
	if masterTablet.Alias.Uid == masterElectTablet.Alias.Uid {
		return fmt.Errorf("master tablet should not match master elect - this must be forced: %v", masterTablet.Alias)
	}

	if _, ok := slaveTabletMap[masterElectTablet.Alias]; !ok {
		return fmt.Errorf("master elect tablet not in replication graph %v %v/%v %v", masterElectTablet.Alias, masterTablet.Keyspace, masterTablet.Shard, mapKeys(slaveTabletMap))
	}

	if err := wr.ValidateShard(masterTablet.Keyspace, masterTablet.Shard, true); err != nil {
		return fmt.Errorf("ValidateShard verification failed: %v, if the master is dead, run: vtctl ScrapTablet -force %v", err, masterTablet.Alias)
	}

	// Make sure all tablets have the right parent and reasonable positions.
	ev.UpdateStatus("checking slave replication positions")
	err = wr.checkSlaveReplication(slaveTabletMap, masterTablet.Alias.Uid)
	if err != nil {
		return err
	}

	// Check the master-elect is fit for duty - call out for hardware checks.
	ev.UpdateStatus("checking that new master is ready to serve")
	err = wr.checkMasterElect(masterElectTablet)
	if err != nil {
		return err
	}

	// Demote first so no new writes land while we compare slave
	// positions against masterPosition below.
	ev.UpdateStatus("demoting old master")
	masterPosition, err := wr.demoteMaster(masterTablet)
	if err != nil {
		// FIXME(msolomon) This suggests that the master is dead and we
		// need to take steps. We could either pop a prompt, or make
		// retrying the action painless.
		return fmt.Errorf("demote master failed: %v, if the master is dead, run: vtctl -force ScrapTablet %v", err, masterTablet.Alias)
	}

	ev.UpdateStatus("checking slave consistency")
	log.Infof("check slaves %v/%v", masterTablet.Keyspace, masterTablet.Shard)
	restartableSlaveTabletMap := restartableTabletMap(slaveTabletMap)
	err = wr.checkSlaveConsistency(restartableSlaveTabletMap, masterPosition)
	if err != nil {
		return fmt.Errorf("check slave consistency failed %v, demoted master is still read only, run: vtctl SetReadWrite %v", err, masterTablet.Alias)
	}

	ev.UpdateStatus("promoting new master")
	rsd, err := wr.promoteSlave(masterElectTablet)
	if err != nil {
		// FIXME(msolomon) This suggests that the master-elect is dead.
		// We need to classify certain errors as temporary and retry.
		return fmt.Errorf("promote slave failed: %v, demoted master is still read only: vtctl SetReadWrite %v", err, masterTablet.Alias)
	}

	// Once the slave is promoted, remove it from our map
	delete(slaveTabletMap, masterElectTablet.Alias)

	// restartSlaves' error is held until the end so the old master is
	// still scrapped and the serving graph still rebuilt.
	ev.UpdateStatus("restarting slaves")
	majorityRestart, restartSlaveErr := wr.restartSlaves(slaveTabletMap, rsd)

	// For now, scrap the old master regardless of how many
	// slaves restarted.
	//
	// FIXME(msolomon) We could reintroduce it and reparent it and use
	// it as new replica.
	ev.UpdateStatus("scrapping old master")
	log.Infof("scrap demoted master %v", masterTablet.Alias)
	scrapActionPath, scrapErr := wr.ai.Scrap(masterTablet.Alias)
	if scrapErr == nil {
		scrapErr = wr.WaitForCompletion(scrapActionPath)
	}
	if scrapErr != nil {
		// The sub action is non-critical, so just warn.
		log.Warningf("scrap demoted master failed: %v", scrapErr)
	}

	ev.UpdateStatus("rebuilding shard serving graph")
	err = wr.finishReparent(si, masterElectTablet, majorityRestart, leaveMasterReadOnly)
	if err != nil {
		return err
	}

	// Status is "finished" even when some slaves failed to restart:
	// the reparent itself succeeded, and the deferred handler above
	// must not overwrite this with "failed".
	ev.UpdateStatus("finished")

	if restartSlaveErr != nil {
		// This is more of a warning at this point.
		return restartSlaveErr
	}

	return nil
}
// reparentShardBrutal executes a brutal reparent.
//
// Assume the master is dead and not coming back. Just push your way
// forward. Force means we are reparenting to the same master
// (assuming the data has been externally synched).
//
// The ev parameter is an event struct prefilled with information that the
// caller has on hand, which would be expensive for us to re-query.
//
// slaveTabletMap and masterTabletMap index the shard's tablets by
// alias; both are mutated here (the promoted master-elect is removed
// from each). Unlike the graceful path, the old master is never
// demoted — any tablets left in masterTabletMap are scrapped locally.
// leaveMasterReadOnly is forwarded to finishReparent — presumably it
// keeps the new master read-only; confirm against that helper.
func (wr *Wrangler) reparentShardBrutal(ev *events.Reparent, si *topo.ShardInfo, slaveTabletMap, masterTabletMap map[topo.TabletAlias]*topo.TabletInfo, masterElectTablet *topo.TabletInfo, leaveMasterReadOnly, force bool) (err error) {
	ev.UpdateStatus("starting brutal")

	// Report the failure on the event stream no matter which step
	// below returns an error (err is the named return value).
	defer func() {
		if err != nil {
			ev.UpdateStatus("failed: " + err.Error())
		}
	}()

	log.Infof("Skipping ValidateShard - not a graceful situation")

	// With force, the master-elect may legitimately be absent from the
	// slave map (it is the old master itself).
	if _, ok := slaveTabletMap[masterElectTablet.Alias]; !ok && !force {
		return fmt.Errorf("master elect tablet not in replication graph %v %v/%v %v", masterElectTablet.Alias, si.Keyspace(), si.ShardName(), mapKeys(slaveTabletMap))
	}

	// Check the master-elect and slaves are in good shape when the action
	// has not been forced.
	if !force {
		// Make sure all tablets have the right parent and reasonable positions.
		ev.UpdateStatus("checking slave replication positions")
		if err := wr.checkSlaveReplication(slaveTabletMap, topo.NO_TABLET); err != nil {
			return err
		}

		// Check the master-elect is fit for duty - call out for hardware checks.
		ev.UpdateStatus("checking that new master is ready to serve")
		if err := wr.checkMasterElect(masterElectTablet); err != nil {
			return err
		}

		// nil position: the old master is assumed dead, so there is no
		// master position to compare the slaves against.
		ev.UpdateStatus("checking slave consistency")
		log.Infof("check slaves %v/%v", masterElectTablet.Keyspace, masterElectTablet.Shard)
		restartableSlaveTabletMap := restartableTabletMap(slaveTabletMap)
		err = wr.checkSlaveConsistency(restartableSlaveTabletMap, nil)
		if err != nil {
			return err
		}
	} else {
		ev.UpdateStatus("stopping slave replication")
		log.Infof("forcing reparent to same master %v", masterElectTablet.Alias)
		err := wr.breakReplication(slaveTabletMap, masterElectTablet)
		if err != nil {
			return err
		}
	}

	ev.UpdateStatus("promoting new master")
	rsd, err := wr.promoteSlave(masterElectTablet)
	if err != nil {
		// FIXME(msolomon) This suggests that the master-elect is dead.
		// We need to classify certain errors as temporary and retry.
		return fmt.Errorf("promote slave failed: %v %v", err, masterElectTablet.Alias)
	}

	// Once the slave is promoted, remove it from our maps
	delete(slaveTabletMap, masterElectTablet.Alias)
	delete(masterTabletMap, masterElectTablet.Alias)

	// restartSlaves' error is held until the end so the dead masters
	// are still scrapped and the serving graph still rebuilt.
	ev.UpdateStatus("restarting slaves")
	majorityRestart, restartSlaveErr := wr.restartSlaves(slaveTabletMap, rsd)

	if !force {
		for _, failedMaster := range masterTabletMap {
			ev.UpdateStatus("scrapping old master")
			log.Infof("scrap dead master %v", failedMaster.Alias)
			// The master is dead so execute the action locally instead of
			// enqueing the scrap action for an arbitrary amount of time.
			if scrapErr := topotools.Scrap(wr.ts, failedMaster.Alias, false); scrapErr != nil {
				// Non-critical: warn and keep going.
				log.Warningf("scrapping failed master failed: %v", scrapErr)
			}
		}
	}

	ev.UpdateStatus("rebuilding shard serving graph")
	err = wr.finishReparent(si, masterElectTablet, majorityRestart, leaveMasterReadOnly)
	if err != nil {
		return err
	}

	// Status is "finished" even when some slaves failed to restart:
	// the reparent itself succeeded, and the deferred handler above
	// must not overwrite this with "failed".
	ev.UpdateStatus("finished")

	if restartSlaveErr != nil {
		// This is more of a warning at this point.
		return restartSlaveErr
	}

	return nil
}