// tabletExternallyReparentedLocked marks this tablet as the new shard master
// after an external reparent, with the shard lock already held: it fixes the
// replication graph, notifies the remaining slaves (including the old
// master), updates the shard record and rebuilds the serving graph.
func tabletExternallyReparentedLocked(ts topo.Server, tablet *topo.TabletInfo, actionTimeout, lockTimeout time.Duration, interrupted chan struct{}) (err error) {
	// read the shard, make sure again the master is not already good.
	// critical read, we want up to date info (and the shard is locked).
	shardInfo, err := ts.GetShardCritical(tablet.Keyspace, tablet.Shard)
	if err != nil {
		return err
	}
	if shardInfo.MasterAlias == tablet.Alias {
		return fmt.Errorf("this tablet is already the master")
	}

	// Read the tablets, make sure the master elect is known to the shard
	// (it's this tablet, so it better be!).
	// Note we will keep going with a partial tablet map, which usually
	// happens when a cell is not reachable. After these checks, the
	// guarantees we'll have are:
	// - global cell is reachable (we just locked and read the shard)
	// - the local cell that contains the new master is reachable
	//   (as we're going to check the new master is in the list)
	// That should be enough.
	tabletMap, err := topo.GetTabletMapForShard(ts, tablet.Keyspace, tablet.Shard)
	switch err {
	case nil:
		// keep going
	case topo.ErrPartialResult:
		log.Warningf("Got topo.ErrPartialResult from GetTabletMapForShard, may need to re-init some tablets")
	default:
		return err
	}
	masterElectTablet, ok := tabletMap[tablet.Alias]
	if !ok {
		return fmt.Errorf("this master-elect tablet %v not found in replication graph %v/%v %v", tablet.Alias, tablet.Keyspace, tablet.Shard, topotools.MapKeys(tabletMap))
	}

	// Create reusable Reparent event with available info
	ev := &events.Reparent{
		ShardInfo: *shardInfo,
		NewMaster: *tablet.Tablet,
	}
	if oldMasterTablet, ok := tabletMap[shardInfo.MasterAlias]; ok {
		ev.OldMaster = *oldMasterTablet.Tablet
	}
	defer func() {
		if err != nil {
			event.DispatchUpdate(ev, "failed: "+err.Error())
		}
	}()

	// sort the tablets, and handle them
	slaveTabletMap, masterTabletMap := topotools.SortedTabletMap(tabletMap)
	event.DispatchUpdate(ev, "starting external from tablet")

	// we fix the new master in the replication graph
	event.DispatchUpdate(ev, "mark ourself as new master")
	err = updateReplicationGraphForPromotedSlave(ts, tablet)
	if err != nil {
		// This suggests we can't talk to topo server. This is bad.
		return fmt.Errorf("updateReplicationGraphForPromotedSlave failed: %v", err)
	}

	// Once this tablet is promoted, remove it from our maps
	delete(slaveTabletMap, tablet.Alias)
	delete(masterTabletMap, tablet.Alias)

	// Then fix all the slaves, including the old master. This
	// last step is very likely to time out for some tablets (one
	// random guy is dead, the old master is dead, ...). We
	// execute them all in parallel until we get to
	// actionTimeout. After this, no other action with a
	// timeout is executed, so even if we got to the timeout,
	// we're still good.
	event.DispatchUpdate(ev, "restarting slaves")
	logger := logutil.NewConsoleLogger()
	ai := initiator.NewActionInitiator(ts)
	topotools.RestartSlavesExternal(ts, logger, slaveTabletMap, masterTabletMap, masterElectTablet.Alias, func(ti *topo.TabletInfo, swrd *actionnode.SlaveWasRestartedArgs) error {
		return ai.RpcSlaveWasRestarted(ti, swrd, actionTimeout)
	})

	// Compute the list of Cells we need to rebuild: old master and
	// all other cells if reparenting to another cell.
	cells := []string{shardInfo.MasterAlias.Cell}
	if shardInfo.MasterAlias.Cell != tablet.Alias.Cell {
		cells = nil
	}

	// now update the master record in the shard object
	event.DispatchUpdate(ev, "updating shard record")
	log.Infof("Updating Shard's MasterAlias record")
	shardInfo.MasterAlias = tablet.Alias
	if err = topo.UpdateShard(ts, shardInfo); err != nil {
		return err
	}

	// and rebuild the shard serving graph
	event.DispatchUpdate(ev, "rebuilding shard serving graph")
	log.Infof("Rebuilding shard serving graph data")
	if err = topotools.RebuildShard(logger, ts, tablet.Keyspace, tablet.Shard, cells, lockTimeout, interrupted); err != nil {
		return err
	}

	event.DispatchUpdate(ev, "finished")
	return nil
}
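
// shardExternallyReparentedLocked is the wrangler-side handler for an
// external reparent, called with the shard already locked: it checks that
// the master-elect is in the replication graph, runs reparentShardExternal
// to fix the tablets, then updates the shard record and rebuilds the
// serving graph.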
func (wr *Wrangler) shardExternallyReparentedLocked(keyspace, shard string, masterElectTabletAlias topo.TabletAlias) (err error) {
	// read the shard, make sure the master is not already good.
	shardInfo, err := wr.ts.GetShard(keyspace, shard)
	if err != nil {
		return err
	}
	if shardInfo.MasterAlias == masterElectTabletAlias {
		return fmt.Errorf("master-elect tablet %v is already master", masterElectTabletAlias)
	}

	// Read the tablets, make sure the master elect is known to us.
	// Note we will keep going with a partial tablet map, which usually
	// happens when a cell is not reachable. After these checks, the
	// guarantees we'll have are:
	// - global cell is reachable (we just locked and read the shard)
	// - the local cell that contains the new master is reachable
	//   (as we're going to check the new master is in the list)
	// That should be enough.
	tabletMap, err := topo.GetTabletMapForShard(wr.ts, keyspace, shard)
	switch err {
	case nil:
		// keep going
	case topo.ErrPartialResult:
		wr.logger.Warningf("Got topo.ErrPartialResult from GetTabletMapForShard, may need to re-init some tablets")
	default:
		return err
	}
	masterElectTablet, ok := tabletMap[masterElectTabletAlias]
	if !ok {
		return fmt.Errorf("master-elect tablet %v not found in replication graph %v/%v %v", masterElectTabletAlias, keyspace, shard, topotools.MapKeys(tabletMap))
	}

	// Create reusable Reparent event with available info
	ev := &events.Reparent{
		ShardInfo: *shardInfo,
		NewMaster: *masterElectTablet.Tablet,
	}
	if oldMasterTablet, ok := tabletMap[shardInfo.MasterAlias]; ok {
		ev.OldMaster = *oldMasterTablet.Tablet
	}
	defer func() {
		if err != nil {
			event.DispatchUpdate(ev, "failed: "+err.Error())
		}
	}()

	// sort the tablets, and handle them
	slaveTabletMap, masterTabletMap := topotools.SortedTabletMap(tabletMap)
	err = wr.reparentShardExternal(ev, slaveTabletMap, masterTabletMap, masterElectTablet)
	if err != nil {
		wr.logger.Infof("Skipping shard rebuild with failed reparent")
		return err
	}

	// Compute the list of Cells we need to rebuild: old master and
	// all other cells if reparenting to another cell.
	cells := []string{shardInfo.MasterAlias.Cell}
	if shardInfo.MasterAlias.Cell != masterElectTabletAlias.Cell {
		cells = nil
	}

	// now update the master record in the shard object
	event.DispatchUpdate(ev, "updating shard record")
	wr.logger.Infof("Updating Shard's MasterAlias record")
	shardInfo.MasterAlias = masterElectTabletAlias
	if err = topo.UpdateShard(wr.ts, shardInfo); err != nil {
		return err
	}

	// and rebuild the shard serving graph
	event.DispatchUpdate(ev, "rebuilding shard serving graph")
	wr.logger.Infof("Rebuilding shard serving graph data")
	if _, err = topotools.RebuildShard(wr.logger, wr.ts, masterElectTablet.Keyspace, masterElectTablet.Shard, cells, wr.lockTimeout, interrupted); err != nil {
		return err
	}

	event.DispatchUpdate(ev, "finished")
	return nil
}

// reparentShardGraceful executes a graceful reparent.
// The ev parameter is an event struct prefilled with information that the
// caller has on hand, which would be expensive for us to re-query.
func (wr *Wrangler) reparentShardGraceful(ev *events.Reparent, si *topo.ShardInfo, slaveTabletMap, masterTabletMap map[topo.TabletAlias]*topo.TabletInfo, masterElectTablet *topo.TabletInfo, leaveMasterReadOnly bool) (err error) {
	event.DispatchUpdate(ev, "starting graceful")
	defer func() {
		if err != nil {
			event.DispatchUpdate(ev, "failed: "+err.Error())
		}
	}()

	// Validate a bunch of assumptions we make about the replication graph.
	if len(masterTabletMap) != 1 {
		aliases := make([]string, 0, len(masterTabletMap))
		for _, v := range masterTabletMap {
			aliases = append(aliases, v.String())
		}
		return fmt.Errorf("I have 0 or multiple masters / scrapped tablets in this shard replication graph, please scrap the non-master ones: %v", strings.Join(aliases, " "))
	}
	var masterTablet *topo.TabletInfo
	for _, v := range masterTabletMap {
		masterTablet = v
	}

	if masterTablet.Parent.Uid != topo.NO_TABLET {
		return fmt.Errorf("master tablet should not have a ParentUid: %v %v", masterTablet.Parent.Uid, masterTablet.Alias)
	}

	if masterTablet.Type != topo.TYPE_MASTER {
		return fmt.Errorf("master tablet should not be type: %v %v", masterTablet.Type, masterTablet.Alias)
	}

	if masterTablet.Alias.Uid == masterElectTablet.Alias.Uid {
		return fmt.Errorf("master tablet should not match master elect - this must be forced: %v", masterTablet.Alias)
	}

	if _, ok := slaveTabletMap[masterElectTablet.Alias]; !ok {
		return fmt.Errorf("master elect tablet not in replication graph %v %v/%v %v", masterElectTablet.Alias, masterTablet.Keyspace, masterTablet.Shard, topotools.MapKeys(slaveTabletMap))
	}

	if err := wr.ValidateShard(masterTablet.Keyspace, masterTablet.Shard, true); err != nil {
		return fmt.Errorf("ValidateShard verification failed: %v, if the master is dead, run: vtctl ScrapTablet -force %v", err, masterTablet.Alias)
	}

	// Make sure all tablets have the right parent and reasonable positions.
	event.DispatchUpdate(ev, "checking slave replication positions")
	err = wr.checkSlaveReplication(slaveTabletMap, masterTablet.Alias.Uid)
	if err != nil {
		return err
	}

	// Check the master-elect is fit for duty - call out for hardware checks.
	event.DispatchUpdate(ev, "checking that new master is ready to serve")
	err = wr.checkMasterElect(masterElectTablet)
	if err != nil {
		return err
	}

	event.DispatchUpdate(ev, "demoting old master")
	masterPosition, err := wr.demoteMaster(masterTablet)
	if err != nil {
		// FIXME(msolomon) This suggests that the master is dead and we
		// need to take steps. We could either pop a prompt, or make
		// retrying the action painless.
		return fmt.Errorf("demote master failed: %v, if the master is dead, run: vtctl -force ScrapTablet %v", err, masterTablet.Alias)
	}

	event.DispatchUpdate(ev, "checking slave consistency")
	wr.logger.Infof("check slaves %v/%v", masterTablet.Keyspace, masterTablet.Shard)
	restartableSlaveTabletMap := wr.restartableTabletMap(slaveTabletMap)
	err = wr.checkSlaveConsistency(restartableSlaveTabletMap, masterPosition)
	if err != nil {
		return fmt.Errorf("check slave consistency failed %v, demoted master is still read only, run: vtctl SetReadWrite %v", err, masterTablet.Alias)
	}

	event.DispatchUpdate(ev, "promoting new master")
	rsd, err := wr.promoteSlave(masterElectTablet)
	if err != nil {
		// FIXME(msolomon) This suggests that the master-elect is dead.
		// We need to classify certain errors as temporary and retry.
return fmt.Errorf("promote slave failed: %v, demoted master is still read only: vtctl SetReadWrite %v", err, masterTablet.Alias) } // Once the slave is promoted, remove it from our map delete(slaveTabletMap, masterElectTablet.Alias) event.DispatchUpdate(ev, "restarting slaves") majorityRestart, restartSlaveErr := wr.restartSlaves(slaveTabletMap, rsd) // For now, scrap the old master regardless of how many // slaves restarted. // // FIXME(msolomon) We could reintroduce it and reparent it and use // it as new replica. event.DispatchUpdate(ev, "scrapping old master") wr.logger.Infof("scrap demoted master %v", masterTablet.Alias) if scrapErr := wr.tmc.Scrap(masterTablet, wr.ActionTimeout()); scrapErr != nil { // The sub action is non-critical, so just warn. wr.logger.Warningf("scrap demoted master failed: %v", scrapErr) } event.DispatchUpdate(ev, "rebuilding shard serving graph") err = wr.finishReparent(si, masterElectTablet, majorityRestart, leaveMasterReadOnly) if err != nil { return err } event.DispatchUpdate(ev, "finished") if restartSlaveErr != nil { // This is more of a warning at this point. return restartSlaveErr } return nil }
func (wr *Wrangler) reparentShardLocked(keyspace, shard string, masterElectTabletAlias topo.TabletAlias, leaveMasterReadOnly, forceReparentToCurrentMaster bool) error {
	// critical read, we want up to date info (and the shard is locked).
	shardInfo, err := wr.ts.GetShardCritical(keyspace, shard)
	if err != nil {
		return err
	}

	tabletMap, err := topo.GetTabletMapForShard(wr.ts, keyspace, shard)
	if err != nil {
		return err
	}

	slaveTabletMap, masterTabletMap := topotools.SortedTabletMap(tabletMap)

	if shardInfo.MasterAlias == masterElectTabletAlias && !forceReparentToCurrentMaster {
		return fmt.Errorf("master-elect tablet %v is already master - specify -force to override", masterElectTabletAlias)
	}

	masterElectTablet, ok := tabletMap[masterElectTabletAlias]
	if !ok {
		return fmt.Errorf("master-elect tablet %v not found in replication graph %v/%v %v", masterElectTabletAlias, keyspace, shard, topotools.MapKeys(tabletMap))
	}

	// Create reusable Reparent event with available info
	ev := &events.Reparent{
		ShardInfo: *shardInfo,
		NewMaster: *masterElectTablet.Tablet,
	}
	if oldMasterTablet, ok := tabletMap[shardInfo.MasterAlias]; ok {
		ev.OldMaster = *oldMasterTablet.Tablet
	}

	if !shardInfo.MasterAlias.IsZero() && !forceReparentToCurrentMaster {
		err = wr.reparentShardGraceful(ev, shardInfo, slaveTabletMap, masterTabletMap, masterElectTablet, leaveMasterReadOnly)
	} else {
		err = wr.reparentShardBrutal(ev, shardInfo, slaveTabletMap, masterTabletMap, masterElectTablet, leaveMasterReadOnly, forceReparentToCurrentMaster)
	}

	if err == nil {
		// only log if it works, if it fails we'll show the error
		wr.Logger().Infof("reparentShard finished")
	}
	return err
}

// reparentShardBrutal executes a brutal reparent.
//
// Assume the master is dead and not coming back. Just push your way
// forward. Force means we are reparenting to the same master
// (assuming the data has been externally synched).
//
// The ev parameter is an event struct prefilled with information that the
// caller has on hand, which would be expensive for us to re-query.
func (wr *Wrangler) reparentShardBrutal(ev *events.Reparent, si *topo.ShardInfo, slaveTabletMap, masterTabletMap map[topo.TabletAlias]*topo.TabletInfo, masterElectTablet *topo.TabletInfo, leaveMasterReadOnly, force bool) (err error) {
	event.DispatchUpdate(ev, "starting brutal")
	defer func() {
		if err != nil {
			event.DispatchUpdate(ev, "failed: "+err.Error())
		}
	}()

	wr.logger.Infof("Skipping ValidateShard - not a graceful situation")

	if _, ok := slaveTabletMap[masterElectTablet.Alias]; !ok && !force {
		return fmt.Errorf("master elect tablet not in replication graph %v %v/%v %v", masterElectTablet.Alias, si.Keyspace(), si.ShardName(), topotools.MapKeys(slaveTabletMap))
	}

	// Check the master-elect and slaves are in good shape when the action
	// has not been forced.
	if !force {
		// Make sure all tablets have the right parent and reasonable positions.
		event.DispatchUpdate(ev, "checking slave replication positions")
		if err := wr.checkSlaveReplication(slaveTabletMap, topo.NO_TABLET); err != nil {
			return err
		}

		// Check the master-elect is fit for duty - call out for hardware checks.
		event.DispatchUpdate(ev, "checking that new master is ready to serve")
		if err := wr.checkMasterElect(masterElectTablet); err != nil {
			return err
		}

		event.DispatchUpdate(ev, "checking slave consistency")
		wr.logger.Infof("check slaves %v/%v", masterElectTablet.Keyspace, masterElectTablet.Shard)
		restartableSlaveTabletMap := wr.restartableTabletMap(slaveTabletMap)
		err = wr.checkSlaveConsistency(restartableSlaveTabletMap, myproto.ReplicationPosition{})
		if err != nil {
			return err
		}
	} else {
		event.DispatchUpdate(ev, "stopping slave replication")
		wr.logger.Infof("forcing reparent to same master %v", masterElectTablet.Alias)
		err := wr.breakReplication(slaveTabletMap, masterElectTablet)
		if err != nil {
			return err
		}
	}

	event.DispatchUpdate(ev, "promoting new master")
	rsd, err := wr.promoteSlave(masterElectTablet)
	if err != nil {
		// FIXME(msolomon) This suggests that the master-elect is dead.
		// We need to classify certain errors as temporary and retry.
		return fmt.Errorf("promote slave failed: %v %v", err, masterElectTablet.Alias)
	}

	// Once the slave is promoted, remove it from our maps
	delete(slaveTabletMap, masterElectTablet.Alias)
	delete(masterTabletMap, masterElectTablet.Alias)

	event.DispatchUpdate(ev, "restarting slaves")
	majorityRestart, restartSlaveErr := wr.restartSlaves(slaveTabletMap, rsd)

	if !force {
		for _, failedMaster := range masterTabletMap {
			event.DispatchUpdate(ev, "scrapping old master")
			wr.logger.Infof("scrap dead master %v", failedMaster.Alias)
			// The master is dead, so execute the scrap action locally
			// instead of enqueuing it for an arbitrary amount of time.
			if scrapErr := topotools.Scrap(wr.ts, failedMaster.Alias, false); scrapErr != nil {
				wr.logger.Warningf("scrapping failed master failed: %v", scrapErr)
			}
		}
	}

	event.DispatchUpdate(ev, "rebuilding shard serving graph")
	err = wr.finishReparent(si, masterElectTablet, majorityRestart, leaveMasterReadOnly)
	if err != nil {
		return err
	}

	event.DispatchUpdate(ev, "finished")

	if restartSlaveErr != nil {
		// This is more of a warning at this point.
		return restartSlaveErr
	}

	return nil
}

// checkSlaveConsistency checks all the tablets to see if we can proceed with
// reparenting. masterPosition is supplied from the demoted master if we are
// doing this gracefully.
func (wr *Wrangler) checkSlaveConsistency(tabletMap map[uint32]*topo.TabletInfo, masterPosition myproto.ReplicationPosition) error {
	wr.logger.Infof("checkSlaveConsistency %v %#v", topotools.MapKeys(tabletMap), masterPosition)

	// FIXME(msolomon) Something still feels clumsy here and I can't put my finger on it.
	calls := make(chan *rpcContext, len(tabletMap))
	f := func(ti *topo.TabletInfo) {
		ctx := &rpcContext{tablet: ti}
		defer func() {
			calls <- ctx
		}()

		if !masterPosition.IsZero() {
			// If the master position is known, do our best to wait for replication to catch up.
			status, err := wr.tmc.WaitSlavePosition(ti, masterPosition, wr.ActionTimeout())
			if err != nil {
				ctx.err = err
				return
			}
			ctx.status = status
		} else {
			// If the master is down, just get the slave status.
			status, err := wr.tmc.SlaveStatus(ti, wr.ActionTimeout())
			if err != nil {
				ctx.err = err
				return
			}
			ctx.status = status
		}
	}

	for _, tablet := range tabletMap {
		// Pass loop variable explicitly so we don't have a concurrency issue.
		go f(tablet)
	}

	// map positions to tablets
	positionMap := make(map[string][]uint32)
	for i := 0; i < len(tabletMap); i++ {
		ctx := <-calls
		mapKey := "unavailable-tablet-error"
		if ctx.err == nil {
			mapKey = ctx.status.Position.String()
		}
		if _, ok := positionMap[mapKey]; !ok {
			positionMap[mapKey] = make([]uint32, 0, 32)
		}
		positionMap[mapKey] = append(positionMap[mapKey], ctx.tablet.Alias.Uid)
	}

	if len(positionMap) == 1 {
		// great, everyone agrees
		// masterPosition is zero if demotion failed
		if !masterPosition.IsZero() {
			demotedMapKey := masterPosition.String()
			if _, ok := positionMap[demotedMapKey]; !ok {
				for slaveMapKey := range positionMap {
					return fmt.Errorf("slave position doesn't match demoted master: %v != %v", demotedMapKey, slaveMapKey)
				}
			}
		}
	} else {
		// FIXME(msolomon) in the event of a crash, do you pick the replica
		// that is furthest along or do you promote the majority? data loss
		// vs availability sounds like you pick the latest group and reclone.
		items := make([]string, 0, 32)
		for slaveMapKey, uids := range positionMap {
			tabletPaths := make([]string, len(uids))
			for i, uid := range uids {
				tabletPaths[i] = tabletMap[uid].Alias.String()
			}
			items = append(items, fmt.Sprintf(" %v\n %v", slaveMapKey, strings.Join(tabletPaths, "\n ")))
		}
		sort.Strings(items)
		return fmt.Errorf("inconsistent slaves, mark some offline with vtctl ScrapTablet\n%v", strings.Join(items, "\n"))
	}
	return nil
}