// SnapshotSourceEnd restores the state of the server after a // Snapshot(server_mode =true) // Should be called under RpcWrapLockAction. func (agent *ActionAgent) SnapshotSourceEnd(ctx context.Context, args *actionnode.SnapshotSourceEndArgs) error { tablet, err := agent.TopoServer.GetTablet(agent.TabletAlias) if err != nil { return err } if tablet.Type != topo.TYPE_SNAPSHOT_SOURCE { return fmt.Errorf("expected snapshot_source type, not %v", tablet.Type) } if err := agent.Mysqld.SnapshotSourceEnd(args.SlaveStartRequired, args.ReadOnly, true, agent.hookExtraEnv()); err != nil { log.Errorf("SnapshotSourceEnd failed, leaving tablet type alone: %v", err) return err } // change the type back if args.OriginalType == topo.TYPE_MASTER { // force the master update tablet.Tablet.Type = topo.TYPE_MASTER err = topo.UpdateTablet(ctx, agent.TopoServer, tablet) } else { err = topotools.ChangeType(agent.TopoServer, tablet.Alias, args.OriginalType, make(map[string]string), true /*runHooks*/) } return err }
// ChangeTypeNoRebuild changes a tablet's type, and returns whether // there's a shard that should be rebuilt, along with its cell, // keyspace, and shard. If force is true, it will bypass the RPC action // system and make the data change directly, and not run the remote // hooks. // // Note we don't update the master record in the Shard here, as we // can't ChangeType from and out of master anyway. func (wr *Wrangler) ChangeTypeNoRebuild(tabletAlias topo.TabletAlias, tabletType topo.TabletType, force bool) (rebuildRequired bool, cell, keyspace, shard string, err error) { // Load tablet to find keyspace and shard assignment. // Don't load after the ChangeType which might have unassigned // the tablet. ti, err := wr.ts.GetTablet(tabletAlias) if err != nil { return false, "", "", "", err } if force { if err := topotools.ChangeType(wr.ts, tabletAlias, tabletType, nil, false); err != nil { return false, "", "", "", err } } else { if err := wr.tmc.ChangeType(wr.ctx, ti, tabletType); err != nil { return false, "", "", "", err } } if !ti.Tablet.IsInServingGraph() { // re-read the tablet, see if we become serving ti, err = wr.ts.GetTablet(tabletAlias) if err != nil { return false, "", "", "", err } if !ti.Tablet.IsInServingGraph() { return false, "", "", "", nil } } return true, ti.Alias.Cell, ti.Keyspace, ti.Shard, nil }
// terminateHealthChecks is called when we enter lame duck mode. // We will clean up our state, and shut down query service. // We only do something if we are in targetTabletType state, and then // we just go to spare. func (agent *ActionAgent) terminateHealthChecks(targetTabletType topo.TabletType) { agent.actionMutex.Lock() defer agent.actionMutex.Unlock() log.Info("agent.terminateHealthChecks is starting") // read the current tablet record tablet := agent.Tablet() if tablet.Type != targetTabletType { log.Infof("Tablet in state %v, not changing it", tablet.Type) return } // Change the Type to spare, update the health. Note we pass in a map // that's not nil, meaning we will clear it. if err := topotools.ChangeType(agent.TopoServer, tablet.Alias, topo.TYPE_SPARE, make(map[string]string), true /*runHooks*/); err != nil { log.Infof("Error updating tablet record: %v", err) return } // Rebuild the serving graph in our cell, only if we're dealing with // a serving type if err := agent.rebuildShardIfNeeded(tablet, targetTabletType); err != nil { log.Warningf("rebuildShardIfNeeded failed (will still run post action callbacks, serving graph might be out of date): %v", err) } // We've already rebuilt the shard, which is the only reason we registered // ourself as OnTermSync (synchronous). The rest can be done asynchronously. go func() { // Run the post action callbacks (let them shutdown the query service) if err := agent.refreshTablet(context.TODO(), "terminatehealthcheck"); err != nil { log.Warningf("refreshTablet failed: %v", err) } }() }
// Operate on restore tablet. // Check that the SnapshotManifest is valid and the master has not changed. // Shutdown mysqld. // Load the snapshot from source tablet. // Restart mysqld and replication. // Put tablet into the replication graph as a spare. // Should be called under RpcWrapLockAction. func (agent *ActionAgent) Restore(ctx context.Context, args *actionnode.RestoreArgs, logger logutil.Logger) error { // read our current tablet, verify its state tablet, err := agent.TopoServer.GetTablet(agent.TabletAlias) if err != nil { return err } if args.WasReserved { if tablet.Type != topo.TYPE_RESTORE { return fmt.Errorf("expected restore type, not %v", tablet.Type) } } else { if tablet.Type != topo.TYPE_IDLE { return fmt.Errorf("expected idle type, not %v", tablet.Type) } } // read the source tablet, compute args.SrcFilePath if default sourceTablet, err := agent.TopoServer.GetTablet(args.SrcTabletAlias) if err != nil { return err } if strings.ToLower(args.SrcFilePath) == "default" { args.SrcFilePath = path.Join(mysqlctl.SnapshotURLPath, mysqlctl.SnapshotManifestFile) } // read the parent tablet, verify its state parentTablet, err := agent.TopoServer.GetTablet(args.ParentAlias) if err != nil { return err } if parentTablet.Type != topo.TYPE_MASTER && parentTablet.Type != topo.TYPE_SNAPSHOT_SOURCE { return fmt.Errorf("restore expected master or snapshot_source parent: %v %v", parentTablet.Type, args.ParentAlias) } // read & unpack the manifest sm := new(mysqlctl.SnapshotManifest) if err := fetchAndParseJsonFile(sourceTablet.Addr(), args.SrcFilePath, sm); err != nil { return err } if !args.WasReserved { if err := agent.changeTypeToRestore(ctx, tablet, sourceTablet, parentTablet.Alias, sourceTablet.KeyRange); err != nil { return err } } // create the loggers: tee to console and source l := logutil.NewTeeLogger(logutil.NewConsoleLogger(), logger) // do the work if err := agent.Mysqld.RestoreFromSnapshot(l, sm, args.FetchConcurrency, args.FetchRetryCount, 
args.DontWaitForSlaveStart, agent.hookExtraEnv()); err != nil { log.Errorf("RestoreFromSnapshot failed (%v), scrapping", err) if err := topotools.Scrap(agent.TopoServer, agent.TabletAlias, false); err != nil { log.Errorf("Failed to Scrap after failed RestoreFromSnapshot: %v", err) } return err } // reload the schema agent.ReloadSchema(ctx) // change to TYPE_SPARE, we're done! return topotools.ChangeType(agent.TopoServer, agent.TabletAlias, topo.TYPE_SPARE, nil, true) }
// Snapshot takes a db snapshot // Should be called under RpcWrapLockAction. func (agent *ActionAgent) Snapshot(ctx context.Context, args *actionnode.SnapshotArgs, logger logutil.Logger) (*actionnode.SnapshotReply, error) { // update our type to TYPE_BACKUP tablet, err := agent.TopoServer.GetTablet(agent.TabletAlias) if err != nil { return nil, err } originalType := tablet.Type // ForceMasterSnapshot: Normally a master is not a viable tablet // to snapshot. However, there are degenerate cases where you need // to override this, for instance the initial clone of a new master. if tablet.Type == topo.TYPE_MASTER && args.ForceMasterSnapshot { // In this case, we don't bother recomputing the serving graph. // All queries will have to fail anyway. log.Infof("force change type master -> backup") // There is a legitimate reason to force in the case of a single // master. tablet.Tablet.Type = topo.TYPE_BACKUP err = topo.UpdateTablet(ctx, agent.TopoServer, tablet) } else { err = topotools.ChangeType(agent.TopoServer, tablet.Alias, topo.TYPE_BACKUP, make(map[string]string), true /*runHooks*/) } if err != nil { return nil, err } // let's update our internal state (stop query service and other things) if err := agent.refreshTablet(ctx, "snapshotStart"); err != nil { return nil, fmt.Errorf("failed to update state before snaphost: %v", err) } // create the loggers: tee to console and source l := logutil.NewTeeLogger(logutil.NewConsoleLogger(), logger) // now we can run the backup filename, slaveStartRequired, readOnly, returnErr := agent.Mysqld.CreateSnapshot(l, tablet.DbName(), tablet.Addr(), false, args.Concurrency, args.ServerMode, agent.hookExtraEnv()) // and change our type to the appropriate value newType := originalType if returnErr != nil { log.Errorf("snapshot failed, restoring tablet type back to %v: %v", newType, returnErr) } else { if args.ServerMode { log.Infof("server mode specified, switching tablet to snapshot_source mode") newType = topo.TYPE_SNAPSHOT_SOURCE } 
else { log.Infof("change type back after snapshot: %v", newType) } } if tablet.Parent.Uid == topo.NO_TABLET && args.ForceMasterSnapshot && newType != topo.TYPE_SNAPSHOT_SOURCE { log.Infof("force change type backup -> master: %v", tablet.Alias) tablet.Tablet.Type = topo.TYPE_MASTER err = topo.UpdateTablet(ctx, agent.TopoServer, tablet) } else { err = topotools.ChangeType(agent.TopoServer, tablet.Alias, newType, nil, true /*runHooks*/) } if err != nil { // failure in changing the topology type is probably worse, // so returning that (we logged the snapshot error anyway) returnErr = err } // if anything failed, don't return anything if returnErr != nil { return nil, returnErr } // it all worked, return the required information sr := &actionnode.SnapshotReply{ ManifestPath: filename, SlaveStartRequired: slaveStartRequired, ReadOnly: readOnly, } if tablet.Parent.Uid == topo.NO_TABLET { // If this is a master, this will be the new parent. // FIXME(msolomon) this doesn't work in hierarchical replication. sr.ParentAlias = tablet.Alias } else { sr.ParentAlias = tablet.Parent } return sr, nil }
// ChangeType changes the tablet type // Should be called under RpcWrapLockAction. func (agent *ActionAgent) ChangeType(ctx context.Context, tabletType topo.TabletType) error { return topotools.ChangeType(agent.TopoServer, agent.TabletAlias, tabletType, nil, true /*runHooks*/) }
// runHealthCheck takes the action mutex, runs the health check,
// and if we need to change our state, do it.
// If we are the master, we don't change our type, healthy or not.
// If we are not the master, we change to spare if not healthy,
// or to the passed in targetTabletType if healthy.
//
// Note we only update the topo record if we need to, that is if our type or
// health details changed.
func (agent *ActionAgent) runHealthCheck(targetTabletType topo.TabletType) {
	agent.actionMutex.Lock()
	defer agent.actionMutex.Unlock()

	// Read the current tablet record and tablet control under the state
	// mutex (agent.mutex guards the underscore-prefixed fields).
	agent.mutex.Lock()
	tablet := agent._tablet
	tabletControl := agent._tabletControl
	agent.mutex.Unlock()

	// Figure out if we should be running the query service: only for a
	// serving target type, with no binlog players, and not disabled by
	// tablet control.
	shouldQueryServiceBeRunning := false
	var blacklistedTables []string
	if topo.IsRunningQueryService(targetTabletType) && agent.BinlogPlayerMap.size() == 0 {
		shouldQueryServiceBeRunning = true
		if tabletControl != nil {
			blacklistedTables = tabletControl.BlacklistedTables
			if tabletControl.DisableQueryService {
				shouldQueryServiceBeRunning = false
			}
		}
	}

	// Run the health check. A master is checked as a master regardless
	// of the target type.
	// NOTE: the result variable `health` shadows the `health` package
	// for the rest of this function.
	typeForHealthCheck := targetTabletType
	if tablet.Type == topo.TYPE_MASTER {
		typeForHealthCheck = topo.TYPE_MASTER
	}
	health, err := health.Run(typeForHealthCheck, shouldQueryServiceBeRunning)

	// Figure out if we should be running QueryService. If we should,
	// and we aren't, try to start it (even if we're not healthy,
	// the reason we might not be healthy is the query service not
	// running!).
	if shouldQueryServiceBeRunning {
		if err == nil {
			// we remember this new possible error
			err = agent.allowQueries(tablet.Tablet, blacklistedTables)
		} else {
			// we ignore the error: the health-check error takes priority
			agent.allowQueries(tablet.Tablet, blacklistedTables)
		}
	}

	// Save the health record in the agent history.
	record := &HealthRecord{
		Error:  err,
		Result: health,
		Time:   time.Now(),
	}
	agent.History.Add(record)

	// Try to figure out the mysql port if we don't have it yet.
	if _, ok := tablet.Portmap["mysql"]; !ok {
		// We don't know the port, try to get it from mysqld.
		// Note the inner `err :=` deliberately shadows the health-check
		// error: a port probe failure must not affect _healthy below.
		mysqlPort, err := agent.Mysqld.GetMysqlPort()
		if err != nil {
			// Don't log if we're already in a waiting-for-mysql state.
			agent.mutex.Lock()
			if !agent._waitingForMysql {
				log.Warningf("Can't get mysql port, won't populate Tablet record in topology (will retry silently at healthcheck interval %v): %v", *healthCheckInterval, err)
				agent._waitingForMysql = true
			}
			agent.mutex.Unlock()
		} else {
			log.Infof("Updating tablet mysql port to %v", mysqlPort)
			if err := agent.TopoServer.UpdateTabletFields(tablet.Alias, func(tablet *topo.Tablet) error {
				tablet.Portmap["mysql"] = mysqlPort
				return nil
			}); err != nil {
				log.Infof("Error updating mysql port in tablet record: %v", err)
				return
			}
			// Save the port so we don't update it again next time
			// we do the health check.
			agent.mutex.Lock()
			agent._tablet.Portmap["mysql"] = mysqlPort
			agent._waitingForMysql = false
			agent.mutex.Unlock()
		}
	}

	// Remember our health status (nil means healthy).
	agent.mutex.Lock()
	agent._healthy = err
	agent.mutex.Unlock()

	// Update our topo.Server state, start with no change.
	newTabletType := tablet.Type
	if err != nil {
		// The tablet is not healthy, let's see what we need to do.
		if tablet.Type != targetTabletType {
			if tablet.Type != topo.TYPE_SPARE {
				// We only log if we're not in spare,
				// as the spare state is normal for a
				// failed health check.
				log.Infof("Tablet not healthy and in state %v, not changing it: %v", tablet.Type, err)
			}
			return
		}
		// Note that if the query service is running, we may need to
		// stop it. The post-action callback will do it, and it will be
		// done after we change our state, so it's the right order, let
		// it do it.
		log.Infof("Tablet not healthy, converting it from %v to spare: %v", targetTabletType, err)
		newTabletType = topo.TYPE_SPARE
	} else {
		// We are healthy, maybe with health, see if we need
		// to update the record. We only change from spare to
		// our target type.
		if tablet.Type == topo.TYPE_SPARE {
			newTabletType = targetTabletType
		}
		if tablet.Type == newTabletType && tablet.IsHealthEqual(health) {
			// No change in health, not logging anything,
			// and we're done.
			return
		}

		// We need to update our state.
		log.Infof("Updating tablet record as healthy type %v -> %v with health details %v -> %v", tablet.Type, newTabletType, tablet.Health, health)
		agent.lastHealthMapCount.Set(int64(len(health)))
	}

	// Change the Type, update the health. Note we pass in a map
	// that's not nil, meaning if it's empty, we will clear it.
	if err := topotools.ChangeType(agent.TopoServer, tablet.Alias, newTabletType, health, true /*runHooks*/); err != nil {
		log.Infof("Error updating tablet record: %v", err)
		return
	}

	// Rebuild the serving graph in our cell, only if we're dealing with
	// a serving type.
	if err := agent.rebuildShardIfNeeded(tablet, targetTabletType); err != nil {
		log.Warningf("rebuildShardIfNeeded failed (will still run post action callbacks, serving graph might be out of date): %v", err)
	}

	// Run the post action callbacks, not much we can do with returned
	// error.
	if err := agent.refreshTablet(context.TODO(), "healthcheck"); err != nil {
		log.Warningf("refreshTablet failed: %v", err)
	}
}