// Scrap a tablet. If force is used, we write to topo.Server // directly and don't remote-execute the command. // // If we scrap the master for a shard, we will clear its record // from the Shard object (only if that was the right master) func (wr *Wrangler) Scrap(tabletAlias topo.TabletAlias, force, skipRebuild bool) error { // load the tablet, see if we'll need to rebuild ti, err := wr.ts.GetTablet(tabletAlias) if err != nil { return err } rebuildRequired := ti.Tablet.IsInServingGraph() wasMaster := ti.Type == topo.TYPE_MASTER if force { err = topotools.Scrap(wr.ts, ti.Alias, force) } else { err = wr.tmc.Scrap(wr.ctx, ti) } if err != nil { return err } if !rebuildRequired { wr.Logger().Infof("Rebuild not required") return nil } if skipRebuild { wr.Logger().Warningf("Rebuild required, but skipping it") return nil } // update the Shard object if the master was scrapped if wasMaster { actionNode := actionnode.UpdateShard() lockPath, err := wr.lockShard(ti.Keyspace, ti.Shard, actionNode) if err != nil { return err } // read the shard with the lock si, err := wr.ts.GetShard(ti.Keyspace, ti.Shard) if err != nil { return wr.unlockShard(ti.Keyspace, ti.Shard, actionNode, lockPath, err) } // update it if the right alias is there if si.MasterAlias == tabletAlias { si.MasterAlias = topo.TabletAlias{} // write it back if err := topo.UpdateShard(context.TODO(), wr.ts, si); err != nil { return wr.unlockShard(ti.Keyspace, ti.Shard, actionNode, lockPath, err) } } else { wr.Logger().Warningf("Scrapping master %v from shard %v/%v but master in Shard object was %v", tabletAlias, ti.Keyspace, ti.Shard, si.MasterAlias) } // and unlock if err := wr.unlockShard(ti.Keyspace, ti.Shard, actionNode, lockPath, err); err != nil { return err } } // and rebuild the original shard / keyspace _, err = wr.RebuildShardGraph(ti.Keyspace, ti.Shard, []string{ti.Alias.Cell}) return err }
// Scrap scraps the live running tablet // Should be called under RpcWrapLockAction. func (agent *ActionAgent) Scrap(ctx context.Context) error { return topotools.Scrap(agent.TopoServer, agent.TabletAlias, false) }
// Operate on restore tablet. // Check that the SnapshotManifest is valid and the master has not changed. // Shutdown mysqld. // Load the snapshot from source tablet. // Restart mysqld and replication. // Put tablet into the replication graph as a spare. // Should be called under RpcWrapLockAction. func (agent *ActionAgent) Restore(ctx context.Context, args *actionnode.RestoreArgs, logger logutil.Logger) error { // read our current tablet, verify its state tablet, err := agent.TopoServer.GetTablet(agent.TabletAlias) if err != nil { return err } if args.WasReserved { if tablet.Type != topo.TYPE_RESTORE { return fmt.Errorf("expected restore type, not %v", tablet.Type) } } else { if tablet.Type != topo.TYPE_IDLE { return fmt.Errorf("expected idle type, not %v", tablet.Type) } } // read the source tablet, compute args.SrcFilePath if default sourceTablet, err := agent.TopoServer.GetTablet(args.SrcTabletAlias) if err != nil { return err } if strings.ToLower(args.SrcFilePath) == "default" { args.SrcFilePath = path.Join(mysqlctl.SnapshotURLPath, mysqlctl.SnapshotManifestFile) } // read the parent tablet, verify its state parentTablet, err := agent.TopoServer.GetTablet(args.ParentAlias) if err != nil { return err } if parentTablet.Type != topo.TYPE_MASTER && parentTablet.Type != topo.TYPE_SNAPSHOT_SOURCE { return fmt.Errorf("restore expected master or snapshot_source parent: %v %v", parentTablet.Type, args.ParentAlias) } // read & unpack the manifest sm := new(mysqlctl.SnapshotManifest) if err := fetchAndParseJsonFile(sourceTablet.Addr(), args.SrcFilePath, sm); err != nil { return err } if !args.WasReserved { if err := agent.changeTypeToRestore(ctx, tablet, sourceTablet, parentTablet.Alias, sourceTablet.KeyRange); err != nil { return err } } // create the loggers: tee to console and source l := logutil.NewTeeLogger(logutil.NewConsoleLogger(), logger) // do the work if err := agent.Mysqld.RestoreFromSnapshot(l, sm, args.FetchConcurrency, args.FetchRetryCount, args.DontWaitForSlaveStart, agent.hookExtraEnv()); err != nil { log.Errorf("RestoreFromSnapshot failed (%v), scrapping", err) if err := topotools.Scrap(agent.TopoServer, agent.TabletAlias, false); err != nil { log.Errorf("Failed to Scrap after failed RestoreFromSnapshot: %v", err) } return err } // reload the schema agent.ReloadSchema(ctx) // change to TYPE_SPARE, we're done! return topotools.ChangeType(agent.TopoServer, agent.TabletAlias, topo.TYPE_SPARE, nil, true) }
// reparentShardBrutal executes a brutal reparent. // // Assume the master is dead and not coming back. Just push your way // forward. Force means we are reparenting to the same master // (assuming the data has been externally synched). // // The ev parameter is an event struct prefilled with information that the // caller has on hand, which would be expensive for us to re-query. func (wr *Wrangler) reparentShardBrutal(ev *events.Reparent, si *topo.ShardInfo, slaveTabletMap, masterTabletMap map[topo.TabletAlias]*topo.TabletInfo, masterElectTablet *topo.TabletInfo, leaveMasterReadOnly, force bool) (err error) { event.DispatchUpdate(ev, "starting brutal") defer func() { if err != nil { event.DispatchUpdate(ev, "failed: "+err.Error()) } }() wr.logger.Infof("Skipping ValidateShard - not a graceful situation") if _, ok := slaveTabletMap[masterElectTablet.Alias]; !ok && !force { return fmt.Errorf("master elect tablet not in replication graph %v %v/%v %v", masterElectTablet.Alias, si.Keyspace(), si.ShardName(), topotools.MapKeys(slaveTabletMap)) } // Check the master-elect and slaves are in good shape when the action // has not been forced. if !force { // Make sure all tablets have the right parent and reasonable positions. event.DispatchUpdate(ev, "checking slave replication positions") if err := wr.checkSlaveReplication(slaveTabletMap, topo.NO_TABLET); err != nil { return err } // Check the master-elect is fit for duty - call out for hardware checks. event.DispatchUpdate(ev, "checking that new master is ready to serve") if err := wr.checkMasterElect(masterElectTablet); err != nil { return err } event.DispatchUpdate(ev, "checking slave consistency") wr.logger.Infof("check slaves %v/%v", masterElectTablet.Keyspace, masterElectTablet.Shard) restartableSlaveTabletMap := wr.restartableTabletMap(slaveTabletMap) err = wr.checkSlaveConsistency(restartableSlaveTabletMap, myproto.ReplicationPosition{}) if err != nil { return err } } else { event.DispatchUpdate(ev, "stopping slave replication") wr.logger.Infof("forcing reparent to same master %v", masterElectTablet.Alias) err := wr.breakReplication(slaveTabletMap, masterElectTablet) if err != nil { return err } } event.DispatchUpdate(ev, "promoting new master") rsd, err := wr.promoteSlave(masterElectTablet) if err != nil { // FIXME(msolomon) This suggests that the master-elect is dead. // We need to classify certain errors as temporary and retry. return fmt.Errorf("promote slave failed: %v %v", err, masterElectTablet.Alias) } // Once the slave is promoted, remove it from our maps delete(slaveTabletMap, masterElectTablet.Alias) delete(masterTabletMap, masterElectTablet.Alias) event.DispatchUpdate(ev, "restarting slaves") majorityRestart, restartSlaveErr := wr.restartSlaves(slaveTabletMap, rsd) if !force { for _, failedMaster := range masterTabletMap { event.DispatchUpdate(ev, "scrapping old master") wr.logger.Infof("scrap dead master %v", failedMaster.Alias) // The master is dead so execute the action locally instead of // enqueing the scrap action for an arbitrary amount of time. if scrapErr := topotools.Scrap(wr.ts, failedMaster.Alias, false); scrapErr != nil { wr.logger.Warningf("scrapping failed master failed: %v", scrapErr) } } } event.DispatchUpdate(ev, "rebuilding shard serving graph") err = wr.finishReparent(si, masterElectTablet, majorityRestart, leaveMasterReadOnly) if err != nil { return err } event.DispatchUpdate(ev, "finished") if restartSlaveErr != nil { // This is more of a warning at this point. return restartSlaveErr } return nil }