// Backup takes a db backup and sends it to the BackupStorage func (agent *ActionAgent) Backup(ctx context.Context, concurrency int, logger logutil.Logger) error { if err := agent.lock(ctx); err != nil { return err } defer agent.unlock() // update our type to BACKUP tablet, err := agent.TopoServer.GetTablet(ctx, agent.TabletAlias) if err != nil { return err } if tablet.Type == topodatapb.TabletType_MASTER { return fmt.Errorf("type MASTER cannot take backup, if you really need to do this, restart vttablet in replica mode") } originalType := tablet.Type if _, err := topotools.ChangeType(ctx, agent.TopoServer, tablet.Alias, topodatapb.TabletType_BACKUP); err != nil { return err } // let's update our internal state (stop query service and other things) if err := agent.refreshTablet(ctx, "before backup"); err != nil { return err } // create the loggers: tee to console and source l := logutil.NewTeeLogger(logutil.NewConsoleLogger(), logger) // now we can run the backup dir := fmt.Sprintf("%v/%v", tablet.Keyspace, tablet.Shard) name := fmt.Sprintf("%v.%v", time.Now().UTC().Format("2006-01-02.150405"), topoproto.TabletAliasString(tablet.Alias)) returnErr := mysqlctl.Backup(ctx, agent.MysqlDaemon, l, dir, name, concurrency, agent.hookExtraEnv()) // change our type back to the original value _, err = topotools.ChangeType(ctx, agent.TopoServer, tablet.Alias, originalType) if err != nil { // failure in changing the topology type is probably worse, // so returning that (we logged the snapshot error anyway) if returnErr != nil { l.Errorf("mysql backup command returned error: %v", returnErr) } returnErr = err } // let's update our internal state (start query service and other things) if err := agent.refreshTablet(ctx, "after backup"); err != nil { return err } // and re-run health check to be sure to capture any replication delay agent.runHealthCheckLocked() return returnErr }
// terminateHealthChecks is called when we enter lame duck mode. // We will clean up our state, and shut down query service. // We only do something if we are in targetTabletType state, and then // we just go to spare. func (agent *ActionAgent) terminateHealthChecks(targetTabletType topo.TabletType) { agent.actionMutex.Lock() defer agent.actionMutex.Unlock() log.Info("agent.terminateHealthChecks is starting") // read the current tablet record tablet := agent.Tablet() if tablet.Type != targetTabletType { log.Infof("Tablet in state %v, not changing it", tablet.Type) return } // Change the Type to spare, update the health. Note we pass in a map // that's not nil, meaning we will clear it. if err := topotools.ChangeType(agent.TopoServer, tablet.Alias, topo.TYPE_SPARE, make(map[string]string), true /*runHooks*/); err != nil { log.Infof("Error updating tablet record: %v", err) return } // Rebuild the serving graph in our cell, only if we're dealing with // a serving type if err := agent.rebuildShardIfNeeded(tablet, targetTabletType); err != nil { log.Warningf("rebuildShardIfNeeded failed (will still run post action callbacks, serving graph might be out of date): %v", err) } // Run the post action callbacks (let them shutdown the query service) if err := agent.refreshTablet("terminatehealthcheck"); err != nil { log.Warningf("refreshTablet failed: %v", err) } }
// ChangeTypeNoRebuild changes a tablet's type, and returns whether // there's a shard that should be rebuilt, along with its cell, // keyspace, and shard. If force is true, it will bypass the RPC action // system and make the data change directly, and not run the remote // hooks. // // Note we don't update the master record in the Shard here, as we // can't ChangeType from and out of master anyway. func (wr *Wrangler) ChangeTypeNoRebuild(ctx context.Context, tabletAlias topo.TabletAlias, tabletType topo.TabletType, force bool) (rebuildRequired bool, cell, keyspace, shard string, err error) { // Load tablet to find keyspace and shard assignment. // Don't load after the ChangeType which might have unassigned // the tablet. ti, err := wr.ts.GetTablet(ctx, tabletAlias) if err != nil { return false, "", "", "", err } if force { if err := topotools.ChangeType(ctx, wr.ts, tabletAlias, tabletType, nil); err != nil { return false, "", "", "", err } } else { if err := wr.tmc.ChangeType(ctx, ti, tabletType); err != nil { return false, "", "", "", err } } if !ti.IsInServingGraph() { // re-read the tablet, see if we become serving ti, err = wr.ts.GetTablet(ctx, tabletAlias) if err != nil { return false, "", "", "", err } if !ti.IsInServingGraph() { return false, "", "", "", nil } } return true, ti.Alias.Cell, ti.Keyspace, ti.Shard, nil }
// PromoteSlaveWhenCaughtUp waits for this slave to be caught up on // replication up to the provided point, and then makes the slave the // shard master. func (agent *ActionAgent) PromoteSlaveWhenCaughtUp(ctx context.Context, position string) (string, error) { pos, err := replication.DecodePosition(position) if err != nil { return "", err } if err := agent.MysqlDaemon.WaitMasterPos(ctx, pos); err != nil { return "", err } pos, err = agent.MysqlDaemon.PromoteSlave(agent.hookExtraEnv()) if err != nil { return "", err } // If using semi-sync, we need to enable it before going read-write. if *enableSemiSync { if err := agent.enableSemiSync(true); err != nil { return "", err } } if err := agent.MysqlDaemon.SetReadOnly(false); err != nil { return "", err } if _, err := topotools.ChangeType(ctx, agent.TopoServer, agent.TabletAlias, topodatapb.TabletType_MASTER); err != nil { return "", err } return replication.EncodePosition(pos), nil }
// PromoteSlave makes the current tablet the master func (agent *ActionAgent) PromoteSlave(ctx context.Context) (string, error) { if err := agent.lock(ctx); err != nil { return "", err } defer agent.unlock() pos, err := agent.MysqlDaemon.PromoteSlave(agent.hookExtraEnv()) if err != nil { return "", err } // If using semi-sync, we need to enable it before going read-write. if *enableSemiSync { if err := agent.enableSemiSync(true); err != nil { return "", err } } // Set the server read-write if err := agent.MysqlDaemon.SetReadOnly(false); err != nil { return "", err } if _, err := topotools.ChangeType(ctx, agent.TopoServer, agent.TabletAlias, topodatapb.TabletType_MASTER); err != nil { return "", err } if err := agent.refreshTablet(ctx, "PromoteSlave"); err != nil { return "", err } return replication.EncodePosition(pos), nil }
// PromoteSlaveWhenCaughtUp waits for this slave to be caught up on // replication up to the provided point, and then makes the slave the // shard master. func (agent *ActionAgent) PromoteSlaveWhenCaughtUp(ctx context.Context, position string) (string, error) { pos, err := replication.DecodePosition(position) if err != nil { return "", err } // TODO(alainjobart) change the flavor API to take the context directly // For now, extract the timeout from the context, or wait forever var waitTimeout time.Duration if deadline, ok := ctx.Deadline(); ok { waitTimeout = deadline.Sub(time.Now()) if waitTimeout <= 0 { waitTimeout = time.Millisecond } } if err := agent.MysqlDaemon.WaitMasterPos(pos, waitTimeout); err != nil { return "", err } pos, err = agent.MysqlDaemon.PromoteSlave(agent.hookExtraEnv()) if err != nil { return "", err } if err := agent.MysqlDaemon.SetReadOnly(false); err != nil { return "", err } if _, err := topotools.ChangeType(ctx, agent.TopoServer, agent.TabletAlias, topodatapb.TabletType_MASTER, topotools.ClearHealthMap); err != nil { return "", err } return replication.EncodePosition(pos), nil }
// SnapshotSourceEnd restores the state of the server after a // Snapshot(server_mode =true) // Should be called under RpcWrapLockAction. func (agent *ActionAgent) SnapshotSourceEnd(args *actionnode.SnapshotSourceEndArgs) error { tablet, err := agent.TopoServer.GetTablet(agent.TabletAlias) if err != nil { return err } if tablet.Type != topo.TYPE_SNAPSHOT_SOURCE { return fmt.Errorf("expected snapshot_source type, not %v", tablet.Type) } if err := agent.Mysqld.SnapshotSourceEnd(args.SlaveStartRequired, args.ReadOnly, true, agent.hookExtraEnv()); err != nil { log.Errorf("SnapshotSourceEnd failed, leaving tablet type alone: %v", err) return err } // change the type back if args.OriginalType == topo.TYPE_MASTER { // force the master update tablet.Tablet.Type = topo.TYPE_MASTER err = topo.UpdateTablet(agent.TopoServer, tablet) } else { err = topotools.ChangeType(agent.TopoServer, tablet.Alias, args.OriginalType, make(map[string]string), true /*runHooks*/) } return err }
// terminateHealthChecks is called when we enter lame duck mode. // We will clean up our state, and shut down query service. // We only do something if we are in targetTabletType state, and then // we just go to spare. func (agent *ActionAgent) terminateHealthChecks(targetTabletType pbt.TabletType) { agent.actionMutex.Lock() defer agent.actionMutex.Unlock() log.Info("agent.terminateHealthChecks is starting") // read the current tablet record tablet := agent.Tablet() if tablet.Type != targetTabletType { log.Infof("Tablet in state %v, not changing it", tablet.Type) return } // Change the Type to spare, update the health. Note we pass in a map // that's not nil, meaning we will clear it. if err := topotools.ChangeType(agent.batchCtx, agent.TopoServer, tablet.Alias, pbt.TabletType_SPARE, make(map[string]string)); err != nil { log.Infof("Error updating tablet record: %v", err) return } // Update the serving graph in our cell, only if we're dealing with // a serving type if err := agent.updateServingGraph(tablet, targetTabletType); err != nil { log.Warningf("updateServingGraph failed (will still run post action callbacks, serving graph might be out of date): %v", err) } // We've already rebuilt the shard, which is the only reason we registered // ourself as OnTermSync (synchronous). The rest can be done asynchronously. go func() { // Run the post action callbacks (let them shutdown the query service) if err := agent.refreshTablet(agent.batchCtx, "terminatehealthcheck"); err != nil { log.Warningf("refreshTablet failed: %v", err) } }() }
// ChangeType changes the tablet type // Should be called under RPCWrapLockAction. func (agent *ActionAgent) ChangeType(ctx context.Context, tabletType topodatapb.TabletType) error { _, err := topotools.ChangeType(ctx, agent.TopoServer, agent.TabletAlias, tabletType) if err != nil { return err } agent.runHealthCheckProtected() return nil }
// Backup takes a db backup and sends it to the BackupStorage // Should be called under RPCWrapLockAction. func (agent *ActionAgent) Backup(ctx context.Context, concurrency int, logger logutil.Logger) error { // update our type to BACKUP tablet, err := agent.TopoServer.GetTablet(ctx, agent.TabletAlias) if err != nil { return err } if tablet.Type == topodatapb.TabletType_MASTER { return fmt.Errorf("type MASTER cannot take backup, if you really need to do this, restart vttablet in replica mode") } originalType := tablet.Type if _, err := topotools.ChangeType(ctx, agent.TopoServer, tablet.Alias, topodatapb.TabletType_BACKUP, make(map[string]string)); err != nil { return err } // let's update our internal state (stop query service and other things) if err := agent.refreshTablet(ctx, "backup"); err != nil { return fmt.Errorf("failed to update state before backup: %v", err) } // create the loggers: tee to console and source l := logutil.NewTeeLogger(logutil.NewConsoleLogger(), logger) // now we can run the backup dir := fmt.Sprintf("%v/%v", tablet.Keyspace, tablet.Shard) name := fmt.Sprintf("%v.%v", time.Now().UTC().Format("2006-01-02.150405"), topoproto.TabletAliasString(tablet.Alias)) returnErr := mysqlctl.Backup(ctx, agent.MysqlDaemon, l, dir, name, concurrency, agent.hookExtraEnv()) // and change our type back to the appropriate value: // - if healthcheck is enabled, go to spare // - if not, go back to original type if agent.IsRunningHealthCheck() { originalType = topodatapb.TabletType_SPARE } _, err = topotools.ChangeType(ctx, agent.TopoServer, tablet.Alias, originalType, nil) if err != nil { // failure in changing the topology type is probably worse, // so returning that (we logged the snapshot error anyway) if returnErr != nil { l.Errorf("mysql backup command returned error: %v", returnErr) } returnErr = err } return returnErr }
// SlaveWasPromoted promotes a slave to master, no questions asked. func (agent *ActionAgent) SlaveWasPromoted(ctx context.Context) error { if err := agent.lock(ctx); err != nil { return err } defer agent.unlock() if _, err := topotools.ChangeType(ctx, agent.TopoServer, agent.TabletAlias, topodatapb.TabletType_MASTER); err != nil { return err } if err := agent.refreshTablet(ctx, "SlaveWasPromoted"); err != nil { return err } return nil }
// ChangeTypeNoRebuild changes a tablet's type, and returns whether // there's a shard that should be rebuilt, along with its cell, // keyspace, and shard. If force is true, it will bypass the vtaction // system and make the data change directly, and not run the remote // hooks. // // Note we don't update the master record in the Shard here, as we // can't ChangeType from and out of master anyway. func (wr *Wrangler) ChangeTypeNoRebuild(tabletAlias topo.TabletAlias, tabletType topo.TabletType, force bool) (rebuildRequired bool, cell, keyspace, shard string, err error) { // Load tablet to find keyspace and shard assignment. // Don't load after the ChangeType which might have unassigned // the tablet. ti, err := wr.ts.GetTablet(tabletAlias) if err != nil { return false, "", "", "", err } if force { if err := topotools.ChangeType(wr.ts, tabletAlias, tabletType, nil, false); err != nil { return false, "", "", "", err } } else { if wr.UseRPCs { if err := wr.ai.RpcChangeType(ti, tabletType, wr.ActionTimeout()); err != nil { return false, "", "", "", err } } else { // the remote action will run the hooks actionPath, err := wr.ai.ChangeType(tabletAlias, tabletType) if err != nil { return false, "", "", "", err } // You don't have a choice - you must wait for // completion before rebuilding. if err := wr.WaitForCompletion(actionPath); err != nil { return false, "", "", "", err } } } if !ti.Tablet.IsInServingGraph() { // re-read the tablet, see if we become serving ti, err = wr.ts.GetTablet(tabletAlias) if err != nil { return false, "", "", "", err } if !ti.Tablet.IsInServingGraph() { return false, "", "", "", nil } } return true, ti.Alias.Cell, ti.Keyspace, ti.Shard, nil }
// PromoteSlave makes the current tablet the master func (agent *ActionAgent) PromoteSlave(ctx context.Context) (string, error) { pos, err := agent.MysqlDaemon.PromoteSlave(agent.hookExtraEnv()) if err != nil { return "", err } // Set the server read-write if err := agent.MysqlDaemon.SetReadOnly(false); err != nil { return "", err } if _, err := topotools.ChangeType(ctx, agent.TopoServer, agent.TabletAlias, topodatapb.TabletType_MASTER, topotools.ClearHealthMap); err != nil { return "", err } return replication.EncodePosition(pos), nil }
// ChangeType changes the tablet type func (agent *ActionAgent) ChangeType(ctx context.Context, tabletType topodatapb.TabletType) error { if err := agent.lock(ctx); err != nil { return err } defer agent.unlock() // change our type in the topology _, err := topotools.ChangeType(ctx, agent.TopoServer, agent.TabletAlias, tabletType) if err != nil { return err } // let's update our internal state (stop query service and other things) if err := agent.refreshTablet(ctx, "ChangeType"); err != nil { return err } // and re-run health check agent.runHealthCheckLocked() return nil }
// RunHealthCheck takes the action mutex, runs the health check,
// and if we need to change our state, do it.
// If we are the master, we don't change our type, healthy or not.
// If we are not the master, we change to spare if not healthy,
// or to the passed in targetTabletType if healthy.
//
// Note we only update the topo record if we need to, that is if our type or
// health details changed.
//
// lockTimeout is passed to the shard lock and shard rebuild calls that run
// when targetTabletType is in the serving graph. Every failure is logged and
// ends the run early; nothing is returned to the caller.
func (agent *ActionAgent) RunHealthCheck(targetTabletType topo.TabletType, lockTimeout time.Duration) {
	agent.actionMutex.Lock()
	defer agent.actionMutex.Unlock()

	// read the current tablet record
	// (agent.mutex is only held while reading the field)
	agent.mutex.Lock()
	tablet := agent._tablet
	agent.mutex.Unlock()

	// run the health check
	// (a master is always checked as a master, whatever the target type)
	typeForHealthCheck := targetTabletType
	if tablet.Type == topo.TYPE_MASTER {
		typeForHealthCheck = topo.TYPE_MASTER
	}
	// The local variable shadows the health package from here on.
	health, err := health.Run(typeForHealthCheck)

	// start with no change
	newTabletType := tablet.Type
	if err != nil {
		// Unhealthy: only a tablet currently in the target type is
		// demoted to spare, any other type is left alone.
		if tablet.Type != targetTabletType {
			log.Infof("Tablet not healthy and in state %v, not changing it: %v", tablet.Type, err)
			return
		}
		log.Infof("Tablet not healthy, converting it from %v to spare: %v", targetTabletType, err)
		newTabletType = topo.TYPE_SPARE
	} else {
		// We are healthy, maybe with health, see if we need
		// to update the record. We only change from spare to
		// our target type.
		if tablet.Type == topo.TYPE_SPARE {
			newTabletType = targetTabletType
		}
		if tablet.Type == newTabletType && tablet.IsHealthEqual(health) {
			// no change in health, not logging anything,
			// and we're done
			return
		}

		// we need to update our state
		log.Infof("Updating tablet record as healthy type %v -> %v with health details %v -> %v", tablet.Type, newTabletType, tablet.Health, health)
	}

	// Change the Type, update the health. Note we pass in a map
	// that's not nil, meaning if it's empty, we will clear it.
	if err := topotools.ChangeType(agent.TopoServer, tablet.Alias, newTabletType, health, true /*runHooks*/); err != nil {
		log.Infof("Error updating tablet record: %v", err)
		return
	}

	// Rebuild the serving graph in our cell, only if we're dealing with
	// a serving type
	if topo.IsInServingGraph(targetTabletType) {
		// TODO: interrupted may need to be a global one closed when we exit
		interrupted := make(chan struct{})

		if *topotools.UseSrvShardLocks {
			// no need to take the shard lock in this case
			if err := topotools.RebuildShard(agent.TopoServer, tablet.Keyspace, tablet.Shard, topotools.RebuildShardOptions{Cells: []string{tablet.Alias.Cell}, IgnorePartialResult: true}, lockTimeout, interrupted); err != nil {
				log.Warningf("topotools.RebuildShard returned an error: %v", err)
				return
			}
		} else {
			// Take the shard lock, rebuild, then unlock. The rebuild
			// result is handed to UnlockShard.
			actionNode := actionnode.RebuildShard()
			lockPath, err := actionNode.LockShard(agent.TopoServer, tablet.Keyspace, tablet.Shard, lockTimeout, interrupted)
			if err != nil {
				log.Warningf("Cannot lock shard for rebuild: %v", err)
				return
			}
			err = topotools.RebuildShard(agent.TopoServer, tablet.Keyspace, tablet.Shard, topotools.RebuildShardOptions{Cells: []string{tablet.Alias.Cell}, IgnorePartialResult: true}, lockTimeout, interrupted)
			// NOTE(review): err here is whatever UnlockShard returns after
			// being given the rebuild error, so the warning below may be
			// reporting a rebuild failure rather than an unlock failure -
			// confirm against UnlockShard.
			err = actionNode.UnlockShard(agent.TopoServer, tablet.Keyspace, tablet.Shard, lockPath, err)
			if err != nil {
				log.Warningf("UnlockShard returned an error: %v", err)
				return
			}
		}
	}

	// run the post action callbacks
	agent.afterAction("healthcheck", false /* reloadSchema */)
}
// Operate on restore tablet. // Check that the SnapshotManifest is valid and the master has not changed. // Shutdown mysqld. // Load the snapshot from source tablet. // Restart mysqld and replication. // Put tablet into the replication graph as a spare. // Should be called under RpcWrapLockAction. func (agent *ActionAgent) Restore(args *actionnode.RestoreArgs, logger logutil.Logger) error { // read our current tablet, verify its state tablet, err := agent.TopoServer.GetTablet(agent.TabletAlias) if err != nil { return err } if args.WasReserved { if tablet.Type != topo.TYPE_RESTORE { return fmt.Errorf("expected restore type, not %v", tablet.Type) } } else { if tablet.Type != topo.TYPE_IDLE { return fmt.Errorf("expected idle type, not %v", tablet.Type) } } // read the source tablet, compute args.SrcFilePath if default sourceTablet, err := agent.TopoServer.GetTablet(args.SrcTabletAlias) if err != nil { return err } if strings.ToLower(args.SrcFilePath) == "default" { args.SrcFilePath = path.Join(mysqlctl.SnapshotURLPath, mysqlctl.SnapshotManifestFile) } // read the parent tablet, verify its state parentTablet, err := agent.TopoServer.GetTablet(args.ParentAlias) if err != nil { return err } if parentTablet.Type != topo.TYPE_MASTER && parentTablet.Type != topo.TYPE_SNAPSHOT_SOURCE { return fmt.Errorf("restore expected master or snapshot_source parent: %v %v", parentTablet.Type, args.ParentAlias) } // read & unpack the manifest sm := new(mysqlctl.SnapshotManifest) if err := fetchAndParseJsonFile(sourceTablet.Addr(), args.SrcFilePath, sm); err != nil { return err } if !args.WasReserved { if err := agent.changeTypeToRestore(tablet, sourceTablet, parentTablet.Alias, sourceTablet.KeyRange); err != nil { return err } } // create the loggers: tee to console and source l := logutil.NewTeeLogger(logutil.NewConsoleLogger(), logger) // do the work if err := agent.Mysqld.RestoreFromSnapshot(l, sm, args.FetchConcurrency, args.FetchRetryCount, args.DontWaitForSlaveStart, 
agent.hookExtraEnv()); err != nil { log.Errorf("RestoreFromSnapshot failed (%v), scrapping", err) if err := topotools.Scrap(agent.TopoServer, agent.TabletAlias, false); err != nil { log.Errorf("Failed to Scrap after failed RestoreFromSnapshot: %v", err) } return err } // reload the schema agent.ReloadSchema() // change to TYPE_SPARE, we're done! return topotools.ChangeType(agent.TopoServer, agent.TabletAlias, topo.TYPE_SPARE, nil, true) }
// Snapshot takes a db snapshot // Should be called under RpcWrapLockAction. func (agent *ActionAgent) Snapshot(args *actionnode.SnapshotArgs, logger logutil.Logger) (*actionnode.SnapshotReply, error) { // update our type to TYPE_BACKUP tablet, err := agent.TopoServer.GetTablet(agent.TabletAlias) if err != nil { return nil, err } originalType := tablet.Type // ForceMasterSnapshot: Normally a master is not a viable tablet // to snapshot. However, there are degenerate cases where you need // to override this, for instance the initial clone of a new master. if tablet.Type == topo.TYPE_MASTER && args.ForceMasterSnapshot { // In this case, we don't bother recomputing the serving graph. // All queries will have to fail anyway. log.Infof("force change type master -> backup") // There is a legitimate reason to force in the case of a single // master. tablet.Tablet.Type = topo.TYPE_BACKUP err = topo.UpdateTablet(agent.TopoServer, tablet) } else { err = topotools.ChangeType(agent.TopoServer, tablet.Alias, topo.TYPE_BACKUP, make(map[string]string), true /*runHooks*/) } if err != nil { return nil, err } // let's update our internal state (stop query service and other things) if err := agent.refreshTablet("snapshotStart"); err != nil { return nil, fmt.Errorf("failed to update state before snaphost: %v", err) } // create the loggers: tee to console and source l := logutil.NewTeeLogger(logutil.NewConsoleLogger(), logger) // now we can run the backup filename, slaveStartRequired, readOnly, returnErr := agent.Mysqld.CreateSnapshot(l, tablet.DbName(), tablet.Addr(), false, args.Concurrency, args.ServerMode, agent.hookExtraEnv()) // and change our type to the appropriate value newType := originalType if returnErr != nil { log.Errorf("snapshot failed, restoring tablet type back to %v: %v", newType, returnErr) } else { if args.ServerMode { log.Infof("server mode specified, switching tablet to snapshot_source mode") newType = topo.TYPE_SNAPSHOT_SOURCE } else { log.Infof("change type 
back after snapshot: %v", newType) } } if tablet.Parent.Uid == topo.NO_TABLET && args.ForceMasterSnapshot && newType != topo.TYPE_SNAPSHOT_SOURCE { log.Infof("force change type backup -> master: %v", tablet.Alias) tablet.Tablet.Type = topo.TYPE_MASTER err = topo.UpdateTablet(agent.TopoServer, tablet) } else { err = topotools.ChangeType(agent.TopoServer, tablet.Alias, newType, nil, true /*runHooks*/) } if err != nil { // failure in changing the topology type is probably worse, // so returning that (we logged the snapshot error anyway) returnErr = err } // if anything failed, don't return anything if returnErr != nil { return nil, returnErr } // it all worked, return the required information sr := &actionnode.SnapshotReply{ ManifestPath: filename, SlaveStartRequired: slaveStartRequired, ReadOnly: readOnly, } if tablet.Parent.Uid == topo.NO_TABLET { // If this is a master, this will be the new parent. // FIXME(msolomon) this doesn't work in hierarchical replication. sr.ParentAlias = tablet.Alias } else { sr.ParentAlias = tablet.Parent } return sr, nil }
// ChangeType is the RPC entry point for changing the tablet type. The type
// change runs inside RpcWrapLockAction as a TABLET_ACTION_CHANGE_TYPE
// action, with a nil health map and hooks enabled.
func (tm *TabletManager) ChangeType(context *rpcproto.Context, args *topo.TabletType, reply *rpc.UnusedResponse) error {
	// The work to run once the action wrapper lets us through.
	changeType := func() error {
		return topotools.ChangeType(tm.agent.TopoServer, tm.agent.TabletAlias, *args, nil, true /*runHooks*/)
	}

	return tm.agent.RpcWrapLockAction(context.RemoteAddr, actionnode.TABLET_ACTION_CHANGE_TYPE, args, reply, changeType)
}
// SlaveWasPromoted promotes a slave to master, no questions asked. // Should be called under RPCWrapLockAction. func (agent *ActionAgent) SlaveWasPromoted(ctx context.Context) error { _, err := topotools.ChangeType(ctx, agent.TopoServer, agent.TabletAlias, topodatapb.TabletType_MASTER, topotools.ClearHealthMap) return err }
// ChangeType changes the tablet type // Should be called under RPCWrapLockAction. func (agent *ActionAgent) ChangeType(ctx context.Context, tabletType topodatapb.TabletType) error { _, err := topotools.ChangeType(ctx, agent.TopoServer, agent.TabletAlias, tabletType, nil) return err }
// runHealthCheck takes the action mutex, runs the health check,
// and if we need to change our state, do it.
// If we are the master, we don't change our type, healthy or not.
// If we are not the master, we change to spare if not healthy,
// or to the passed in targetTabletType if healthy.
//
// Note we only update the topo record if we need to, that is if our type or
// health details changed.
//
// Along the way it also records the outcome in agent.History and, when the
// tablet record has no "mysql" entry in its Portmap, tries to fill it in.
// Every failure is logged; nothing is returned to the caller.
func (agent *ActionAgent) runHealthCheck(targetTabletType topo.TabletType) {
	agent.actionMutex.Lock()
	defer agent.actionMutex.Unlock()

	// read the current tablet record and blacklisted tables
	// (agent.mutex is only held while reading the fields)
	agent.mutex.Lock()
	tablet := agent._tablet
	blacklistedTables := agent._blacklistedTables
	agent.mutex.Unlock()

	// run the health check
	// (a master is always checked as a master, whatever the target type)
	typeForHealthCheck := targetTabletType
	if tablet.Type == topo.TYPE_MASTER {
		typeForHealthCheck = topo.TYPE_MASTER
	}
	// The local variable shadows the health package from here on.
	health, err := health.Run(typeForHealthCheck)

	// Figure out if we should be running QueryService. If we should,
	// and we aren't, and we're otherwise healthy, try to start it.
	// A failure of allowQueries is stored in err, so it counts as an
	// unhealthy result in everything below.
	if err == nil && topo.IsRunningQueryService(targetTabletType) && agent.BinlogPlayerMap.size() == 0 {
		err = agent.allowQueries(tablet.Tablet, blacklistedTables)
	}

	// save the health record
	record := &HealthRecord{
		Error:  err,
		Result: health,
		Time:   time.Now(),
	}
	agent.History.Add(record)

	// try to figure out the mysql port if we don't have it yet
	if _, ok := tablet.Portmap["mysql"]; !ok {
		// we don't know the port, try to get it from mysqld
		// (this err is local to the block: a port lookup failure does
		// not change the health outcome held in the outer err)
		mysqlPort, err := agent.Mysqld.GetMysqlPort()
		if err != nil {
			// Don't log if we're already in a waiting-for-mysql state.
			agent.mutex.Lock()
			if !agent._waitingForMysql {
				log.Warningf("Can't get mysql port, won't populate Tablet record in topology (will retry silently at healthcheck interval %v): %v", *healthCheckInterval, err)
				agent._waitingForMysql = true
			}
			agent.mutex.Unlock()
		} else {
			log.Infof("Updating tablet mysql port to %v", mysqlPort)
			// Failing to write the port to the topology ends the whole
			// health check run.
			if err := agent.TopoServer.UpdateTabletFields(tablet.Alias, func(tablet *topo.Tablet) error {
				tablet.Portmap["mysql"] = mysqlPort
				return nil
			}); err != nil {
				log.Infof("Error updating mysql port in tablet record: %v", err)
				return
			}

			// save the port so we don't update it again next time
			// we do the health check.
			agent.mutex.Lock()
			agent._tablet.Portmap["mysql"] = mysqlPort
			agent._waitingForMysql = false
			agent.mutex.Unlock()
		}
	}

	// Update our topo.Server state, start with no change
	newTabletType := tablet.Type
	if err != nil {
		// The tablet is not healthy, let's see what we need to do
		if tablet.Type != targetTabletType {
			if tablet.Type != topo.TYPE_SPARE {
				// we only log if we're not in spare,
				// as the spare state is normal for a
				// failed health check.
				log.Infof("Tablet not healthy and in state %v, not changing it: %v", tablet.Type, err)
			}
			return
		}

		// Note that if the query service is running, we may
		// need to stop it. The post-action callback will do
		// it, and it will be done after we change our state,
		// so it's the right order, let it do it.
		log.Infof("Tablet not healthy, converting it from %v to spare: %v", targetTabletType, err)
		newTabletType = topo.TYPE_SPARE
	} else {
		// We are healthy, maybe with health, see if we need
		// to update the record. We only change from spare to
		// our target type.
		if tablet.Type == topo.TYPE_SPARE {
			newTabletType = targetTabletType
		}
		if tablet.Type == newTabletType && tablet.IsHealthEqual(health) {
			// no change in health, not logging anything,
			// and we're done
			return
		}

		// we need to update our state
		log.Infof("Updating tablet record as healthy type %v -> %v with health details %v -> %v", tablet.Type, newTabletType, tablet.Health, health)
		agent.lastHealthMapCount.Set(int64(len(health)))
	}

	// Change the Type, update the health. Note we pass in a map
	// that's not nil, meaning if it's empty, we will clear it.
	if err := topotools.ChangeType(agent.TopoServer, tablet.Alias, newTabletType, health, true /*runHooks*/); err != nil {
		log.Infof("Error updating tablet record: %v", err)
		return
	}

	// Rebuild the serving graph in our cell, only if we're dealing with
	// a serving type
	if err := agent.rebuildShardIfNeeded(tablet, targetTabletType); err != nil {
		log.Warningf("rebuildShardIfNeeded failed (will still run post action callbacks, serving graph might be out of date): %v", err)
	}

	// run the post action callbacks, not much we can do with returned error
	if err := agent.refreshTablet("healthcheck"); err != nil {
		log.Warningf("refreshTablet failed: %v", err)
	}
}
// Operate on restore tablet. // Check that the SnapshotManifest is valid and the master has not changed. // Shutdown mysqld. // Load the snapshot from source tablet. // Restart mysqld and replication. // Put tablet into the replication graph as a spare. func (ta *TabletActor) restore(actionNode *actionnode.ActionNode) error { args := actionNode.Args.(*actionnode.RestoreArgs) // read our current tablet, verify its state tablet, err := ta.ts.GetTablet(ta.tabletAlias) if err != nil { return err } if args.WasReserved { if tablet.Type != topo.TYPE_RESTORE { return fmt.Errorf("expected restore type, not %v: %v", tablet.Type, ta.tabletAlias) } } else { if tablet.Type != topo.TYPE_IDLE { return fmt.Errorf("expected idle type, not %v: %v", tablet.Type, ta.tabletAlias) } } // read the source tablet, compute args.SrcFilePath if default sourceTablet, err := ta.ts.GetTablet(args.SrcTabletAlias) if err != nil { return err } if strings.ToLower(args.SrcFilePath) == "default" { args.SrcFilePath = path.Join(mysqlctl.SnapshotURLPath, mysqlctl.SnapshotManifestFile) } // read the parent tablet, verify its state parentTablet, err := ta.ts.GetTablet(args.ParentAlias) if err != nil { return err } if parentTablet.Type != topo.TYPE_MASTER && parentTablet.Type != topo.TYPE_SNAPSHOT_SOURCE { return fmt.Errorf("restore expected master or snapshot_source parent: %v %v", parentTablet.Type, args.ParentAlias) } // read & unpack the manifest sm := new(mysqlctl.SnapshotManifest) if err := fetchAndParseJsonFile(sourceTablet.Addr(), args.SrcFilePath, sm); err != nil { return err } if !args.WasReserved { if err := ta.changeTypeToRestore(tablet, sourceTablet, parentTablet.Alias, sourceTablet.KeyRange); err != nil { return err } } // do the work if err := ta.mysqld.RestoreFromSnapshot(sm, args.FetchConcurrency, args.FetchRetryCount, args.DontWaitForSlaveStart, ta.hookExtraEnv()); err != nil { log.Errorf("RestoreFromSnapshot failed (%v), scrapping", err) if err := topotools.Scrap(ta.ts, ta.tabletAlias, 
false); err != nil { log.Errorf("Failed to Scrap after failed RestoreFromSnapshot: %v", err) } return err } // change to TYPE_SPARE, we're done! return topotools.ChangeType(ta.ts, ta.tabletAlias, topo.TYPE_SPARE, nil, true) }
func (ta *TabletActor) changeType(actionNode *actionnode.ActionNode) error { dbType := actionNode.Args.(*topo.TabletType) return topotools.ChangeType(ta.ts, ta.tabletAlias, *dbType, nil, true /*runHooks*/) }
// ChangeType changes the tablet type // Should be called under RpcWrapLockAction. func (agent *ActionAgent) ChangeType(tabletType topo.TabletType) error { return topotools.ChangeType(agent.TopoServer, agent.TabletAlias, tabletType, nil, true /*runHooks*/) }
// runHealthCheck takes the action mutex, runs the health check, // and if we need to change our state, do it. // If we are the master, we don't change our type, healthy or not. // If we are not the master, we change to spare if not healthy, // or to the passed in targetTabletType if healthy. // // Note we only update the topo record if we need to, that is if our type or // health details changed. func (agent *ActionAgent) runHealthCheck(targetTabletType pbt.TabletType) { agent.actionMutex.Lock() defer agent.actionMutex.Unlock() // read the current tablet record and tablet control agent.mutex.Lock() tablet := agent._tablet tabletControl := agent._tabletControl agent.mutex.Unlock() // figure out if we should be running the query service shouldQueryServiceBeRunning := false var blacklistedTables []string if topo.IsRunningQueryService(targetTabletType) && !agent.BinlogPlayerMap.isRunningFilteredReplication() { shouldQueryServiceBeRunning = true if tabletControl != nil { blacklistedTables = tabletControl.BlacklistedTables if tabletControl.DisableQueryService { shouldQueryServiceBeRunning = false } } } // run the health check typeForHealthCheck := targetTabletType if tablet.Type == pbt.TabletType_MASTER { typeForHealthCheck = pbt.TabletType_MASTER } replicationDelay, err := agent.HealthReporter.Report(topo.IsSlaveType(typeForHealthCheck), shouldQueryServiceBeRunning) health := make(map[string]string) if err == nil { if replicationDelay > *unhealthyThreshold { err = fmt.Errorf("reported replication lag: %v higher than unhealthy threshold: %v", replicationDelay.Seconds(), unhealthyThreshold.Seconds()) } else if replicationDelay > *degradedThreshold { health[topo.ReplicationLag] = topo.ReplicationLagHigh } } agent.lastHealthMapCount.Set(int64(len(health))) // Figure out if we should be running QueryService, see if we are, // and reconcile. if err != nil { if tablet.Type != pbt.TabletType_WORKER { // We are not healthy and must shut down QueryService. 
// At the moment, the only exception to this are "worker" tablets which // still must serve queries e.g. as source tablet during a "SplitClone". shouldQueryServiceBeRunning = false } } isQueryServiceRunning := agent.QueryServiceControl.IsServing() if shouldQueryServiceBeRunning { if !isQueryServiceRunning { // send the type we want to be, not the type we are currentType := tablet.Type if tablet.Type == pbt.TabletType_SPARE { tablet.Type = targetTabletType } // we remember this new possible error err = agent.allowQueries(tablet.Tablet, blacklistedTables) // restore the current type tablet.Type = currentType } } else { if isQueryServiceRunning { // we are not healthy or should not be running the // query service, shut it down. agent.stopQueryService() } } // save the health record record := &HealthRecord{ Error: err, ReplicationDelay: replicationDelay, Time: time.Now(), } agent.History.Add(record) // try to figure out the mysql port if we don't have it yet if _, ok := tablet.PortMap["mysql"]; !ok { // we don't know the port, try to get it from mysqld mysqlPort, err := agent.MysqlDaemon.GetMysqlPort() if err != nil { // Don't log if we're already in a waiting-for-mysql state. agent.mutex.Lock() if !agent._waitingForMysql { log.Warningf("Can't get mysql port, won't populate Tablet record in topology (will retry silently at healthcheck interval %v): %v", *healthCheckInterval, err) agent._waitingForMysql = true } agent.mutex.Unlock() } else { log.Infof("Updating tablet mysql port to %v", mysqlPort) if err := agent.TopoServer.UpdateTabletFields(agent.batchCtx, tablet.Alias, func(tablet *pbt.Tablet) error { tablet.PortMap["mysql"] = int32(mysqlPort) return nil }); err != nil { log.Infof("Error updating mysql port in tablet record: %v", err) return } // save the port so we don't update it again next time // we do the health check. 
agent.mutex.Lock() agent._tablet.PortMap["mysql"] = int32(mysqlPort) agent._waitingForMysql = false agent.mutex.Unlock() } } // remember our health status agent.mutex.Lock() agent._healthy = err agent._healthyTime = time.Now() agent._replicationDelay = replicationDelay terTime := agent._tabletExternallyReparentedTime agent.mutex.Unlock() // send it to our observers // (the Target has already been updated when restarting the // query service earlier) // FIXME(alainjobart,liguo) add CpuUsage stats := &pb.RealtimeStats{ SecondsBehindMaster: uint32(replicationDelay.Seconds()), } stats.SecondsBehindMasterFilteredReplication, stats.BinlogPlayersCount = agent.BinlogPlayerMap.StatusSummary() if err != nil { stats.HealthError = err.Error() } defer func() { var ts int64 if !terTime.IsZero() { ts = terTime.Unix() } agent.QueryServiceControl.BroadcastHealth(ts, stats) }() // Update our topo.Server state, start with no change newTabletType := tablet.Type if err != nil { // The tablet is not healthy, let's see what we need to do if tablet.Type != targetTabletType { if tablet.Type != pbt.TabletType_SPARE { // we only log if we're not in spare, // as the spare state is normal for a // failed health check. log.Infof("Tablet not healthy and in state %v, not changing it: %v", tablet.Type, err) } return } // Note that if the query service is running, we may // need to stop it. The post-action callback will do // it, and it will be done after we change our state, // so it's the right order, let it do it. log.Infof("Tablet not healthy, converting it from %v to spare: %v", targetTabletType, err) newTabletType = pbt.TabletType_SPARE } else { // We are healthy, maybe with health, see if we need // to update the record. We only change from spare to // our target type. 
if tablet.Type == pbt.TabletType_SPARE { newTabletType = targetTabletType } if tablet.Type == newTabletType && topo.IsHealthEqual(health, tablet.HealthMap) { // no change in health, not logging anything, // and we're done return } // we need to update our state log.Infof("Updating tablet record as healthy type %v -> %v with health details %v -> %v", tablet.Type, newTabletType, tablet.HealthMap, health) } // Change the Type, update the health. Note we pass in a map // that's not nil, meaning if it's empty, we will clear it. if err := topotools.ChangeType(agent.batchCtx, agent.TopoServer, tablet.Alias, newTabletType, health); err != nil { log.Infof("Error updating tablet record: %v", err) return } tablet.HealthMap = health tablet.Type = newTabletType // Rebuild the serving graph in our cell, only if we're dealing with // a serving type if err := agent.updateServingGraph(tablet, targetTabletType); err != nil { log.Warningf("updateServingGraph failed (will still run post action callbacks, serving graph might be out of date): %v", err) } // run the post action callbacks, not much we can do with returned error if err := agent.refreshTablet(agent.batchCtx, "healthcheck"); err != nil { log.Warningf("refreshTablet failed: %v", err) } }
// ChangeType changes the tablet type // Should be called under RPCWrapLockAction. func (agent *ActionAgent) ChangeType(ctx context.Context, tabletType pb.TabletType) error { return topotools.ChangeType(ctx, agent.TopoServer, agent.TabletAlias, tabletType, nil) }
// runHealthCheck takes the action mutex, runs the health check, // and if we need to change our state, do it. // If we are the master, we don't change our type, healthy or not. // If we are not the master, we change to spare if not healthy, // or to the passed in targetTabletType if healthy. // // Note we only update the topo record if we need to, that is if our type or // health details changed. func (agent *ActionAgent) runHealthCheck(targetTabletType topo.TabletType, lockTimeout time.Duration) { agent.actionMutex.Lock() defer agent.actionMutex.Unlock() // read the current tablet record agent.mutex.Lock() tablet := agent._tablet agent.mutex.Unlock() // run the health check typeForHealthCheck := targetTabletType if tablet.Type == topo.TYPE_MASTER { typeForHealthCheck = topo.TYPE_MASTER } health, err := health.Run(typeForHealthCheck) // Figure out if we should be running QueryService. If we should, // and we aren't, and we're otherwise healthy, try to start it if err == nil && topo.IsRunningQueryService(targetTabletType) { err = agent.allowQueries(tablet.Tablet) } // save the health record record := &HealthRecord{ Error: err, Result: health, Time: time.Now(), } agent.History.Add(record) // Update our topo.Server state, start with no change newTabletType := tablet.Type if err != nil { // The tablet is not healthy, let's see what we need to do if tablet.Type != targetTabletType { if tablet.Type != topo.TYPE_SPARE { // we onyl log if we're not in spare, // as the spare state is normal for a // failed health check. log.Infof("Tablet not healthy and in state %v, not changing it: %v", tablet.Type, err) } return } // Note that if the query service is running, we may // need to stop it. The post-action callback will do // it, and it will be done after we change our state, // so it's the right order, let it do it. 
log.Infof("Tablet not healthy, converting it from %v to spare: %v", targetTabletType, err) newTabletType = topo.TYPE_SPARE } else { // We are healthy, maybe with health, see if we need // to update the record. We only change from spare to // our target type. if tablet.Type == topo.TYPE_SPARE { newTabletType = targetTabletType } if tablet.Type == newTabletType && tablet.IsHealthEqual(health) { // no change in health, not logging anything, // and we're done return } // we need to update our state log.Infof("Updating tablet record as healthy type %v -> %v with health details %v -> %v", tablet.Type, newTabletType, tablet.Health, health) agent.lastHealthMapCount.Set(int64(len(health))) } // Change the Type, update the health. Note we pass in a map // that's not nil, meaning if it's empty, we will clear it. if err := topotools.ChangeType(agent.TopoServer, tablet.Alias, newTabletType, health, true /*runHooks*/); err != nil { log.Infof("Error updating tablet record: %v", err) return } // Rebuild the serving graph in our cell, only if we're dealing with // a serving type if err := agent.rebuildShardIfNeeded(tablet, targetTabletType, lockTimeout); err != nil { log.Warningf("rebuildShardIfNeeded failed, not running post action callbacks: %v", err) return } // run the post action callbacks agent.afterAction("healthcheck", false /* reloadSchema */) }
// runHealthCheck takes the action mutex, runs the health check, // and if we need to change our state, do it. // If we are the master, we don't change our type, healthy or not. // If we are not the master, we change to spare if not healthy, // or to the passed in targetTabletType if healthy. // // Note we only update the topo record if we need to, that is if our type or // health details changed. func (agent *ActionAgent) runHealthCheck(targetTabletType topo.TabletType) { agent.actionMutex.Lock() defer agent.actionMutex.Unlock() // read the current tablet record and tablet control agent.mutex.Lock() tablet := agent._tablet tabletControl := agent._tabletControl agent.mutex.Unlock() // figure out if we should be running the query service shouldQueryServiceBeRunning := false var blacklistedTables []string if topo.IsRunningQueryService(targetTabletType) && agent.BinlogPlayerMap.size() == 0 { shouldQueryServiceBeRunning = true if tabletControl != nil { blacklistedTables = tabletControl.BlacklistedTables if tabletControl.DisableQueryService { shouldQueryServiceBeRunning = false } } } // run the health check typeForHealthCheck := targetTabletType if tablet.Type == topo.TYPE_MASTER { typeForHealthCheck = topo.TYPE_MASTER } replicationDelay, err := agent.HealthReporter.Report(topo.IsSlaveType(typeForHealthCheck), shouldQueryServiceBeRunning) health := make(map[string]string) if err == nil { if replicationDelay > *unhealthyThreshold { err = fmt.Errorf("reported replication lag: %v higher than unhealthy threshold: %v", replicationDelay.Seconds(), unhealthyThreshold.Seconds()) } else if replicationDelay > *degradedThreshold { health[topo.ReplicationLag] = topo.ReplicationLagHigh } } // Figure out if we should be running QueryService, see if we are, // and reconcile. 
if err != nil { // we are not healthy, we should not be running QueryService shouldQueryServiceBeRunning = false } isQueryServiceRunning := agent.QueryServiceControl.IsServing() if shouldQueryServiceBeRunning { if !isQueryServiceRunning { // we remember this new possible error err = agent.allowQueries(tablet.Tablet, blacklistedTables) } } else { if isQueryServiceRunning { // we are not healthy or should not be running the // query service, shut it down. agent.disallowQueries() } } // save the health record record := &HealthRecord{ Error: err, ReplicationDelay: replicationDelay, Time: time.Now(), } agent.History.Add(record) // try to figure out the mysql port if we don't have it yet if _, ok := tablet.Portmap["mysql"]; !ok { // we don't know the port, try to get it from mysqld mysqlPort, err := agent.MysqlDaemon.GetMysqlPort() if err != nil { // Don't log if we're already in a waiting-for-mysql state. agent.mutex.Lock() if !agent._waitingForMysql { log.Warningf("Can't get mysql port, won't populate Tablet record in topology (will retry silently at healthcheck interval %v): %v", *healthCheckInterval, err) agent._waitingForMysql = true } agent.mutex.Unlock() } else { log.Infof("Updating tablet mysql port to %v", mysqlPort) if err := agent.TopoServer.UpdateTabletFields(agent.batchCtx, tablet.Alias, func(tablet *topo.Tablet) error { tablet.Portmap["mysql"] = mysqlPort return nil }); err != nil { log.Infof("Error updating mysql port in tablet record: %v", err) return } // save the port so we don't update it again next time // we do the health check. 
agent.mutex.Lock() agent._tablet.Portmap["mysql"] = mysqlPort agent._waitingForMysql = false agent.mutex.Unlock() } } // remember our health status agent.mutex.Lock() agent._healthy = err agent._healthyTime = time.Now() agent._replicationDelay = replicationDelay agent.mutex.Unlock() // send it to our observers, after we've updated the tablet state // (Tablet is a pointer, and below we will alter the Tablet // record to be correct. hsr := &actionnode.HealthStreamReply{ Tablet: tablet.Tablet, BinlogPlayerMapSize: agent.BinlogPlayerMap.size(), ReplicationDelay: replicationDelay, } if err != nil { hsr.HealthError = err.Error() } defer agent.BroadcastHealthStreamReply(hsr) // Update our topo.Server state, start with no change newTabletType := tablet.Type if err != nil { // The tablet is not healthy, let's see what we need to do if tablet.Type != targetTabletType { if tablet.Type != topo.TYPE_SPARE { // we only log if we're not in spare, // as the spare state is normal for a // failed health check. log.Infof("Tablet not healthy and in state %v, not changing it: %v", tablet.Type, err) } return } // Note that if the query service is running, we may // need to stop it. The post-action callback will do // it, and it will be done after we change our state, // so it's the right order, let it do it. log.Infof("Tablet not healthy, converting it from %v to spare: %v", targetTabletType, err) newTabletType = topo.TYPE_SPARE } else { // We are healthy, maybe with health, see if we need // to update the record. We only change from spare to // our target type. 
if tablet.Type == topo.TYPE_SPARE { newTabletType = targetTabletType } if tablet.Type == newTabletType && tablet.IsHealthEqual(health) { // no change in health, not logging anything, // and we're done return } // we need to update our state log.Infof("Updating tablet record as healthy type %v -> %v with health details %v -> %v", tablet.Type, newTabletType, tablet.Health, health) agent.lastHealthMapCount.Set(int64(len(health))) } // Change the Type, update the health. Note we pass in a map // that's not nil, meaning if it's empty, we will clear it. if err := topotools.ChangeType(agent.batchCtx, agent.TopoServer, tablet.Alias, newTabletType, health); err != nil { log.Infof("Error updating tablet record: %v", err) return } tablet.Health = health tablet.Type = newTabletType // Rebuild the serving graph in our cell, only if we're dealing with // a serving type if err := agent.updateServingGraph(tablet, targetTabletType); err != nil { log.Warningf("updateServingGraph failed (will still run post action callbacks, serving graph might be out of date): %v", err) } // run the post action callbacks, not much we can do with returned error if err := agent.refreshTablet(agent.batchCtx, "healthcheck"); err != nil { log.Warningf("refreshTablet failed: %v", err) } }