func (ta *TabletActor) multiRestore(actionNode *actionnode.ActionNode) (err error) { args := actionNode.Args.(*actionnode.MultiRestoreArgs) // read our current tablet, verify its state // we only support restoring to the master or spare replicas tablet, err := ta.ts.GetTablet(ta.tabletAlias) if err != nil { return err } if tablet.Type != topo.TYPE_MASTER && tablet.Type != topo.TYPE_SPARE && tablet.Type != topo.TYPE_REPLICA && tablet.Type != topo.TYPE_RDONLY { return fmt.Errorf("expected master, spare replica or rdonly type, not %v: %v", tablet.Type, ta.tabletAlias) } // get source tablets addresses sourceAddrs := make([]*url.URL, len(args.SrcTabletAliases)) keyRanges := make([]key.KeyRange, len(args.SrcTabletAliases)) for i, alias := range args.SrcTabletAliases { t, e := ta.ts.GetTablet(alias) if e != nil { return e } sourceAddrs[i] = &url.URL{Host: t.GetAddr(), Path: "/" + t.DbName()} keyRanges[i], e = key.KeyRangesOverlap(tablet.KeyRange, t.KeyRange) if e != nil { return e } } // change type to restore, no change to replication graph originalType := tablet.Type tablet.Type = topo.TYPE_RESTORE err = topo.UpdateTablet(ta.ts, tablet) if err != nil { return err } // run the action, scrap if it fails if err := ta.mysqld.MultiRestore(tablet.DbName(), keyRanges, sourceAddrs, args.Concurrency, args.FetchConcurrency, args.InsertTableConcurrency, args.FetchRetryCount, args.Strategy); err != nil { if e := actionnode.Scrap(ta.ts, ta.tabletAlias, false); e != nil { log.Errorf("Failed to Scrap after failed RestoreFromMultiSnapshot: %v", e) } return err } // restore type back tablet.Type = originalType return topo.UpdateTablet(ta.ts, tablet) }
// Operate on restore tablet. // Check that the SnapshotManifest is valid and the master has not changed. // Shutdown mysqld. // Load the snapshot from source tablet. // Restart mysqld and replication. // Put tablet into the replication graph as a spare. func (ta *TabletActor) restore(actionNode *actionnode.ActionNode) error { args := actionNode.Args.(*actionnode.RestoreArgs) // read our current tablet, verify its state tablet, err := ta.ts.GetTablet(ta.tabletAlias) if err != nil { return err } if args.WasReserved { if tablet.Type != topo.TYPE_RESTORE { return fmt.Errorf("expected restore type, not %v: %v", tablet.Type, ta.tabletAlias) } } else { if tablet.Type != topo.TYPE_IDLE { return fmt.Errorf("expected idle type, not %v: %v", tablet.Type, ta.tabletAlias) } } // read the source tablet, compute args.SrcFilePath if default sourceTablet, err := ta.ts.GetTablet(args.SrcTabletAlias) if err != nil { return err } if strings.ToLower(args.SrcFilePath) == "default" { args.SrcFilePath = path.Join(mysqlctl.SnapshotURLPath, mysqlctl.SnapshotManifestFile) } // read the parent tablet, verify its state parentTablet, err := ta.ts.GetTablet(args.ParentAlias) if err != nil { return err } if parentTablet.Type != topo.TYPE_MASTER && parentTablet.Type != topo.TYPE_SNAPSHOT_SOURCE { return fmt.Errorf("restore expected master or snapshot_source parent: %v %v", parentTablet.Type, args.ParentAlias) } // read & unpack the manifest sm := new(mysqlctl.SnapshotManifest) if err := fetchAndParseJsonFile(sourceTablet.GetAddr(), args.SrcFilePath, sm); err != nil { return err } if !args.WasReserved { if err := ta.changeTypeToRestore(tablet, sourceTablet, parentTablet.Alias, sourceTablet.KeyRange); err != nil { return err } } // do the work if err := ta.mysqld.RestoreFromSnapshot(sm, args.FetchConcurrency, args.FetchRetryCount, args.DontWaitForSlaveStart, ta.hookExtraEnv()); err != nil { log.Errorf("RestoreFromSnapshot failed (%v), scrapping", err) if err := actionnode.Scrap(ta.ts, 
ta.tabletAlias, false); err != nil { log.Errorf("Failed to Scrap after failed RestoreFromSnapshot: %v", err) } return err } // change to TYPE_SPARE, we're done! return actionnode.ChangeType(ta.ts, ta.tabletAlias, topo.TYPE_SPARE, nil, true) }
// Scrap a tablet. If force is used, we write to topo.Server // directly and don't remote-execute the command. // // If we scrap the master for a shard, we will clear its record // from the Shard object (only if that was the right master) func (wr *Wrangler) Scrap(tabletAlias topo.TabletAlias, force, skipRebuild bool) (actionPath string, err error) { // load the tablet, see if we'll need to rebuild ti, err := wr.ts.GetTablet(tabletAlias) if err != nil { return "", err } rebuildRequired := ti.Tablet.IsInServingGraph() wasMaster := ti.Type == topo.TYPE_MASTER if force { err = actionnode.Scrap(wr.ts, ti.Alias, force) } else { actionPath, err = wr.ai.Scrap(ti.Alias) } if err != nil { return "", err } if !rebuildRequired { log.Infof("Rebuild not required") return } if skipRebuild { log.Warningf("Rebuild required, but skipping it") return } // wait for the remote Scrap if necessary if actionPath != "" { err = wr.ai.WaitForCompletion(actionPath, wr.actionTimeout()) if err != nil { return "", err } } // update the Shard object if the master was scrapped if wasMaster { actionNode := actionnode.UpdateShard() lockPath, err := wr.lockShard(ti.Keyspace, ti.Shard, actionNode) if err != nil { return "", err } // read the shard with the lock si, err := wr.ts.GetShard(ti.Keyspace, ti.Shard) if err != nil { return "", wr.unlockShard(ti.Keyspace, ti.Shard, actionNode, lockPath, err) } // update it if the right alias is there if si.MasterAlias == tabletAlias { si.MasterAlias = topo.TabletAlias{} // write it back if err := wr.ts.UpdateShard(si); err != nil { return "", wr.unlockShard(ti.Keyspace, ti.Shard, actionNode, lockPath, err) } } else { log.Warningf("Scrapping master %v from shard %v/%v but master in Shard object was %v", tabletAlias, ti.Keyspace, ti.Shard, si.MasterAlias) } // and unlock if err := wr.unlockShard(ti.Keyspace, ti.Shard, actionNode, lockPath, err); err != nil { return "", err } } // and rebuild the original shard / keyspace return "", 
wr.RebuildShardGraph(ti.Keyspace, ti.Shard, []string{ti.Alias.Cell}) }
// scrap scraps this actor's own tablet through actionnode.Scrap, without the
// force flag, and returns whatever error that call reports.
func (ta *TabletActor) scrap() error {
	const force = false
	return actionnode.Scrap(ta.ts, ta.tabletAlias, force)
}
// Assume the master is dead and not coming back. Just push your way // forward. Force means we are reparenting to the same master // (assuming the data has been externally synched). func (wr *Wrangler) reparentShardBrutal(si *topo.ShardInfo, slaveTabletMap, masterTabletMap map[topo.TabletAlias]*topo.TabletInfo, masterElectTablet *topo.TabletInfo, leaveMasterReadOnly, force bool) error { log.Infof("Skipping ValidateShard - not a graceful situation") if _, ok := slaveTabletMap[masterElectTablet.Alias]; !ok && !force { return fmt.Errorf("master elect tablet not in replication graph %v %v/%v %v", masterElectTablet.Alias, si.Keyspace(), si.ShardName(), mapKeys(slaveTabletMap)) } // Check the master-elect and slaves are in good shape when the action // has not been forced. if !force { // Make sure all tablets have the right parent and reasonable positions. if err := wr.checkSlaveReplication(slaveTabletMap, topo.NO_TABLET); err != nil { return err } // Check the master-elect is fit for duty - call out for hardware checks. if err := wr.checkMasterElect(masterElectTablet); err != nil { return err } log.Infof("check slaves %v/%v", masterElectTablet.Keyspace, masterElectTablet.Shard) restartableSlaveTabletMap := restartableTabletMap(slaveTabletMap) err := wr.checkSlaveConsistency(restartableSlaveTabletMap, nil) if err != nil { return err } } else { log.Infof("forcing reparent to same master %v", masterElectTablet.Alias) err := wr.breakReplication(slaveTabletMap, masterElectTablet) if err != nil { return err } } rsd, err := wr.promoteSlave(masterElectTablet) if err != nil { // FIXME(msolomon) This suggests that the master-elect is dead. // We need to classify certain errors as temporary and retry. 
return fmt.Errorf("promote slave failed: %v %v", err, masterElectTablet.Alias) } // Once the slave is promoted, remove it from our maps delete(slaveTabletMap, masterElectTablet.Alias) delete(masterTabletMap, masterElectTablet.Alias) majorityRestart, restartSlaveErr := wr.restartSlaves(slaveTabletMap, rsd) if !force { for _, failedMaster := range masterTabletMap { log.Infof("scrap dead master %v", failedMaster.Alias) // The master is dead so execute the action locally instead of // enqueing the scrap action for an arbitrary amount of time. if scrapErr := actionnode.Scrap(wr.ts, failedMaster.Alias, false); scrapErr != nil { log.Warningf("scrapping failed master failed: %v", scrapErr) } } } err = wr.finishReparent(si, masterElectTablet, majorityRestart, leaveMasterReadOnly) if err != nil { return err } if restartSlaveErr != nil { // This is more of a warning at this point. return restartSlaveErr } return nil }