Exemplo n.º 1
0
// Backup takes a db backup and sends it to the BackupStorage
func (agent *ActionAgent) Backup(ctx context.Context, concurrency int, logger logutil.Logger) error {
	if err := agent.lock(ctx); err != nil {
		return err
	}
	defer agent.unlock()

	// update our type to BACKUP
	tablet, err := agent.TopoServer.GetTablet(ctx, agent.TabletAlias)
	if err != nil {
		return err
	}
	if tablet.Type == topodatapb.TabletType_MASTER {
		return fmt.Errorf("type MASTER cannot take backup, if you really need to do this, restart vttablet in replica mode")
	}
	originalType := tablet.Type
	if _, err := topotools.ChangeType(ctx, agent.TopoServer, tablet.Alias, topodatapb.TabletType_BACKUP); err != nil {
		return err
	}

	// let's update our internal state (stop query service and other things)
	if err := agent.refreshTablet(ctx, "before backup"); err != nil {
		return err
	}

	// create the loggers: tee to console and source
	l := logutil.NewTeeLogger(logutil.NewConsoleLogger(), logger)

	// now we can run the backup
	dir := fmt.Sprintf("%v/%v", tablet.Keyspace, tablet.Shard)
	name := fmt.Sprintf("%v.%v", time.Now().UTC().Format("2006-01-02.150405"), topoproto.TabletAliasString(tablet.Alias))
	returnErr := mysqlctl.Backup(ctx, agent.MysqlDaemon, l, dir, name, concurrency, agent.hookExtraEnv())

	// change our type back to the original value
	_, err = topotools.ChangeType(ctx, agent.TopoServer, tablet.Alias, originalType)
	if err != nil {
		// failure in changing the topology type is probably worse,
		// so returning that (we logged the snapshot error anyway)
		if returnErr != nil {
			l.Errorf("mysql backup command returned error: %v", returnErr)
		}
		returnErr = err
	}

	// let's update our internal state (start query service and other things)
	if err := agent.refreshTablet(ctx, "after backup"); err != nil {
		return err
	}

	// and re-run health check to be sure to capture any replication delay
	agent.runHealthCheckLocked()

	return returnErr
}
Exemplo n.º 2
0
// terminateHealthChecks is called when we enter lame duck mode.
// We will clean up our state, and shut down query service.
// We only do something if we are in targetTabletType state, and then
// we just go to spare.
func (agent *ActionAgent) terminateHealthChecks(targetTabletType topo.TabletType) {
	agent.actionMutex.Lock()
	defer agent.actionMutex.Unlock()
	log.Info("agent.terminateHealthChecks is starting")

	// read the current tablet record
	tablet := agent.Tablet()
	if tablet.Type != targetTabletType {
		log.Infof("Tablet in state %v, not changing it", tablet.Type)
		return
	}

	// Change the Type to spare, update the health. Note we pass in a map
	// that's not nil, meaning we will clear it.
	if err := topotools.ChangeType(agent.TopoServer, tablet.Alias, topo.TYPE_SPARE, make(map[string]string), true /*runHooks*/); err != nil {
		log.Infof("Error updating tablet record: %v", err)
		return
	}

	// Rebuild the serving graph in our cell, only if we're dealing with
	// a serving type
	if err := agent.rebuildShardIfNeeded(tablet, targetTabletType); err != nil {
		log.Warningf("rebuildShardIfNeeded failed (will still run post action callbacks, serving graph might be out of date): %v", err)
	}

	// Run the post action callbacks (let them shutdown the query service)
	if err := agent.refreshTablet("terminatehealthcheck"); err != nil {
		log.Warningf("refreshTablet failed: %v", err)
	}
}
Exemplo n.º 3
0
// ChangeTypeNoRebuild changes a tablet's type, and returns whether
// there's a shard that should be rebuilt, along with its cell,
// keyspace, and shard. If force is true, it will bypass the RPC action
// system and make the data change directly, and not run the remote
// hooks.
//
// Note we don't update the master record in the Shard here, as we
// can't ChangeType from and out of master anyway.
func (wr *Wrangler) ChangeTypeNoRebuild(ctx context.Context, tabletAlias topo.TabletAlias, tabletType topo.TabletType, force bool) (rebuildRequired bool, cell, keyspace, shard string, err error) {
	// Load tablet to find keyspace and shard assignment.
	// Don't load after the ChangeType which might have unassigned
	// the tablet.
	ti, err := wr.ts.GetTablet(ctx, tabletAlias)
	if err != nil {
		return false, "", "", "", err
	}

	if force {
		if err := topotools.ChangeType(ctx, wr.ts, tabletAlias, tabletType, nil); err != nil {
			return false, "", "", "", err
		}
	} else {
		if err := wr.tmc.ChangeType(ctx, ti, tabletType); err != nil {
			return false, "", "", "", err
		}
	}

	if !ti.IsInServingGraph() {
		// re-read the tablet, see if we become serving
		ti, err = wr.ts.GetTablet(ctx, tabletAlias)
		if err != nil {
			return false, "", "", "", err
		}
		if !ti.IsInServingGraph() {
			return false, "", "", "", nil
		}
	}
	return true, ti.Alias.Cell, ti.Keyspace, ti.Shard, nil

}
Exemplo n.º 4
0
// PromoteSlaveWhenCaughtUp waits for this slave to be caught up on
// replication up to the provided point, and then makes the slave the
// shard master.
func (agent *ActionAgent) PromoteSlaveWhenCaughtUp(ctx context.Context, position string) (string, error) {
	pos, err := replication.DecodePosition(position)
	if err != nil {
		return "", err
	}

	if err := agent.MysqlDaemon.WaitMasterPos(ctx, pos); err != nil {
		return "", err
	}

	pos, err = agent.MysqlDaemon.PromoteSlave(agent.hookExtraEnv())
	if err != nil {
		return "", err
	}

	// If using semi-sync, we need to enable it before going read-write.
	if *enableSemiSync {
		if err := agent.enableSemiSync(true); err != nil {
			return "", err
		}
	}

	if err := agent.MysqlDaemon.SetReadOnly(false); err != nil {
		return "", err
	}

	if _, err := topotools.ChangeType(ctx, agent.TopoServer, agent.TabletAlias, topodatapb.TabletType_MASTER); err != nil {
		return "", err
	}

	return replication.EncodePosition(pos), nil
}
Exemplo n.º 5
0
// PromoteSlave makes the current tablet the master
func (agent *ActionAgent) PromoteSlave(ctx context.Context) (string, error) {
	if err := agent.lock(ctx); err != nil {
		return "", err
	}
	defer agent.unlock()

	pos, err := agent.MysqlDaemon.PromoteSlave(agent.hookExtraEnv())
	if err != nil {
		return "", err
	}

	// If using semi-sync, we need to enable it before going read-write.
	if *enableSemiSync {
		if err := agent.enableSemiSync(true); err != nil {
			return "", err
		}
	}

	// Set the server read-write
	if err := agent.MysqlDaemon.SetReadOnly(false); err != nil {
		return "", err
	}

	if _, err := topotools.ChangeType(ctx, agent.TopoServer, agent.TabletAlias, topodatapb.TabletType_MASTER); err != nil {
		return "", err
	}

	if err := agent.refreshTablet(ctx, "PromoteSlave"); err != nil {
		return "", err
	}

	return replication.EncodePosition(pos), nil
}
Exemplo n.º 6
0
// PromoteSlaveWhenCaughtUp waits for this slave to be caught up on
// replication up to the provided point, and then makes the slave the
// shard master.
func (agent *ActionAgent) PromoteSlaveWhenCaughtUp(ctx context.Context, position string) (string, error) {
	pos, err := replication.DecodePosition(position)
	if err != nil {
		return "", err
	}

	// TODO(alainjobart) change the flavor API to take the context directly
	// For now, extract the timeout from the context, or wait forever
	var waitTimeout time.Duration
	if deadline, ok := ctx.Deadline(); ok {
		waitTimeout = deadline.Sub(time.Now())
		if waitTimeout <= 0 {
			waitTimeout = time.Millisecond
		}
	}
	if err := agent.MysqlDaemon.WaitMasterPos(pos, waitTimeout); err != nil {
		return "", err
	}

	pos, err = agent.MysqlDaemon.PromoteSlave(agent.hookExtraEnv())
	if err != nil {
		return "", err
	}

	if err := agent.MysqlDaemon.SetReadOnly(false); err != nil {
		return "", err
	}

	if _, err := topotools.ChangeType(ctx, agent.TopoServer, agent.TabletAlias, topodatapb.TabletType_MASTER, topotools.ClearHealthMap); err != nil {
		return "", err
	}

	return replication.EncodePosition(pos), nil
}
Exemplo n.º 7
0
// SnapshotSourceEnd restores the state of the server after a
// Snapshot(server_mode =true)
// Should be called under RpcWrapLockAction.
func (agent *ActionAgent) SnapshotSourceEnd(args *actionnode.SnapshotSourceEndArgs) error {
	tablet, err := agent.TopoServer.GetTablet(agent.TabletAlias)
	if err != nil {
		return err
	}
	if tablet.Type != topo.TYPE_SNAPSHOT_SOURCE {
		return fmt.Errorf("expected snapshot_source type, not %v", tablet.Type)
	}

	if err := agent.Mysqld.SnapshotSourceEnd(args.SlaveStartRequired, args.ReadOnly, true, agent.hookExtraEnv()); err != nil {
		log.Errorf("SnapshotSourceEnd failed, leaving tablet type alone: %v", err)
		return err
	}

	// change the type back
	if args.OriginalType == topo.TYPE_MASTER {
		// force the master update
		tablet.Tablet.Type = topo.TYPE_MASTER
		err = topo.UpdateTablet(agent.TopoServer, tablet)
	} else {
		err = topotools.ChangeType(agent.TopoServer, tablet.Alias, args.OriginalType, make(map[string]string), true /*runHooks*/)
	}

	return err
}
Exemplo n.º 8
0
// terminateHealthChecks is called when we enter lame duck mode.
// We will clean up our state, and shut down query service.
// We only do something if we are in targetTabletType state, and then
// we just go to spare.
func (agent *ActionAgent) terminateHealthChecks(targetTabletType pbt.TabletType) {
	agent.actionMutex.Lock()
	defer agent.actionMutex.Unlock()
	log.Info("agent.terminateHealthChecks is starting")

	// read the current tablet record
	tablet := agent.Tablet()
	if tablet.Type != targetTabletType {
		log.Infof("Tablet in state %v, not changing it", tablet.Type)
		return
	}

	// Change the Type to spare, update the health. Note we pass in a map
	// that's not nil, meaning we will clear it.
	if err := topotools.ChangeType(agent.batchCtx, agent.TopoServer, tablet.Alias, pbt.TabletType_SPARE, make(map[string]string)); err != nil {
		log.Infof("Error updating tablet record: %v", err)
		return
	}

	// Update the serving graph in our cell, only if we're dealing with
	// a serving type
	if err := agent.updateServingGraph(tablet, targetTabletType); err != nil {
		log.Warningf("updateServingGraph failed (will still run post action callbacks, serving graph might be out of date): %v", err)
	}

	// We've already rebuilt the shard, which is the only reason we registered
	// ourself as OnTermSync (synchronous). The rest can be done asynchronously.
	go func() {
		// Run the post action callbacks (let them shutdown the query service)
		if err := agent.refreshTablet(agent.batchCtx, "terminatehealthcheck"); err != nil {
			log.Warningf("refreshTablet failed: %v", err)
		}
	}()
}
Exemplo n.º 9
0
// ChangeType changes the tablet type
// Should be called under RPCWrapLockAction.
func (agent *ActionAgent) ChangeType(ctx context.Context, tabletType topodatapb.TabletType) error {
	_, err := topotools.ChangeType(ctx, agent.TopoServer, agent.TabletAlias, tabletType)
	if err != nil {
		return err
	}
	agent.runHealthCheckProtected()
	return nil
}
Exemplo n.º 10
0
// Backup takes a db backup and sends it to the BackupStorage
// Should be called under RPCWrapLockAction.
func (agent *ActionAgent) Backup(ctx context.Context, concurrency int, logger logutil.Logger) error {
	// update our type to BACKUP
	tablet, err := agent.TopoServer.GetTablet(ctx, agent.TabletAlias)
	if err != nil {
		return err
	}
	if tablet.Type == topodatapb.TabletType_MASTER {
		return fmt.Errorf("type MASTER cannot take backup, if you really need to do this, restart vttablet in replica mode")
	}
	originalType := tablet.Type
	if _, err := topotools.ChangeType(ctx, agent.TopoServer, tablet.Alias, topodatapb.TabletType_BACKUP, make(map[string]string)); err != nil {
		return err
	}

	// let's update our internal state (stop query service and other things)
	if err := agent.refreshTablet(ctx, "backup"); err != nil {
		return fmt.Errorf("failed to update state before backup: %v", err)
	}

	// create the loggers: tee to console and source
	l := logutil.NewTeeLogger(logutil.NewConsoleLogger(), logger)

	// now we can run the backup
	dir := fmt.Sprintf("%v/%v", tablet.Keyspace, tablet.Shard)
	name := fmt.Sprintf("%v.%v", time.Now().UTC().Format("2006-01-02.150405"), topoproto.TabletAliasString(tablet.Alias))
	returnErr := mysqlctl.Backup(ctx, agent.MysqlDaemon, l, dir, name, concurrency, agent.hookExtraEnv())

	// and change our type back to the appropriate value:
	// - if healthcheck is enabled, go to spare
	// - if not, go back to original type
	if agent.IsRunningHealthCheck() {
		originalType = topodatapb.TabletType_SPARE
	}
	_, err = topotools.ChangeType(ctx, agent.TopoServer, tablet.Alias, originalType, nil)
	if err != nil {
		// failure in changing the topology type is probably worse,
		// so returning that (we logged the snapshot error anyway)
		if returnErr != nil {
			l.Errorf("mysql backup command returned error: %v", returnErr)
		}
		returnErr = err
	}

	return returnErr
}
Exemplo n.º 11
0
// SlaveWasPromoted promotes a slave to master, no questions asked.
func (agent *ActionAgent) SlaveWasPromoted(ctx context.Context) error {
	if err := agent.lock(ctx); err != nil {
		return err
	}
	defer agent.unlock()

	if _, err := topotools.ChangeType(ctx, agent.TopoServer, agent.TabletAlias, topodatapb.TabletType_MASTER); err != nil {
		return err
	}

	if err := agent.refreshTablet(ctx, "SlaveWasPromoted"); err != nil {
		return err
	}

	return nil
}
Exemplo n.º 12
0
// ChangeTypeNoRebuild changes a tablet's type, and returns whether
// there's a shard that should be rebuilt, along with its cell,
// keyspace, and shard. If force is true, it will bypass the vtaction
// system and make the data change directly, and not run the remote
// hooks.
//
// Note we don't update the master record in the Shard here, as we
// can't ChangeType from and out of master anyway.
func (wr *Wrangler) ChangeTypeNoRebuild(tabletAlias topo.TabletAlias, tabletType topo.TabletType, force bool) (rebuildRequired bool, cell, keyspace, shard string, err error) {
	// Load tablet to find keyspace and shard assignment.
	// Don't load after the ChangeType which might have unassigned
	// the tablet.
	ti, err := wr.ts.GetTablet(tabletAlias)
	if err != nil {
		return false, "", "", "", err
	}

	if force {
		if err := topotools.ChangeType(wr.ts, tabletAlias, tabletType, nil, false); err != nil {
			return false, "", "", "", err
		}
	} else {
		if wr.UseRPCs {
			if err := wr.ai.RpcChangeType(ti, tabletType, wr.ActionTimeout()); err != nil {
				return false, "", "", "", err
			}

		} else {
			// the remote action will run the hooks
			actionPath, err := wr.ai.ChangeType(tabletAlias, tabletType)
			if err != nil {
				return false, "", "", "", err
			}

			// You don't have a choice - you must wait for
			// completion before rebuilding.
			if err := wr.WaitForCompletion(actionPath); err != nil {
				return false, "", "", "", err
			}
		}
	}

	if !ti.Tablet.IsInServingGraph() {
		// re-read the tablet, see if we become serving
		ti, err = wr.ts.GetTablet(tabletAlias)
		if err != nil {
			return false, "", "", "", err
		}
		if !ti.Tablet.IsInServingGraph() {
			return false, "", "", "", nil
		}
	}
	return true, ti.Alias.Cell, ti.Keyspace, ti.Shard, nil

}
Exemplo n.º 13
0
// PromoteSlave makes the current tablet the master
func (agent *ActionAgent) PromoteSlave(ctx context.Context) (string, error) {
	pos, err := agent.MysqlDaemon.PromoteSlave(agent.hookExtraEnv())
	if err != nil {
		return "", err
	}

	// Set the server read-write
	if err := agent.MysqlDaemon.SetReadOnly(false); err != nil {
		return "", err
	}

	if _, err := topotools.ChangeType(ctx, agent.TopoServer, agent.TabletAlias, topodatapb.TabletType_MASTER, topotools.ClearHealthMap); err != nil {
		return "", err
	}

	return replication.EncodePosition(pos), nil
}
Exemplo n.º 14
0
// ChangeType changes the tablet type
func (agent *ActionAgent) ChangeType(ctx context.Context, tabletType topodatapb.TabletType) error {
	if err := agent.lock(ctx); err != nil {
		return err
	}
	defer agent.unlock()

	// change our type in the topology
	_, err := topotools.ChangeType(ctx, agent.TopoServer, agent.TabletAlias, tabletType)
	if err != nil {
		return err
	}

	// let's update our internal state (stop query service and other things)
	if err := agent.refreshTablet(ctx, "ChangeType"); err != nil {
		return err
	}

	// and re-run health check
	agent.runHealthCheckLocked()
	return nil
}
Exemplo n.º 15
0
// RunHealthCheck takes the action mutex, runs the health check,
// and if we need to change our state, do it.
// If we are the master, we don't change our type, healthy or not.
// If we are not the master, we change to spare if not healthy,
// or to the passed in targetTabletType if healthy.
//
// Note we only update the topo record if we need to, that is if our type or
// health details changed.
func (agent *ActionAgent) RunHealthCheck(targetTabletType topo.TabletType, lockTimeout time.Duration) {
	agent.actionMutex.Lock()
	defer agent.actionMutex.Unlock()

	// read the current tablet record
	agent.mutex.Lock()
	tablet := agent._tablet
	agent.mutex.Unlock()

	// run the health check
	typeForHealthCheck := targetTabletType
	if tablet.Type == topo.TYPE_MASTER {
		typeForHealthCheck = topo.TYPE_MASTER
	}
	health, err := health.Run(typeForHealthCheck)

	// start with no change
	newTabletType := tablet.Type
	if err != nil {
		if tablet.Type != targetTabletType {
			log.Infof("Tablet not healthy and in state %v, not changing it: %v", tablet.Type, err)
			return
		}
		log.Infof("Tablet not healthy, converting it from %v to spare: %v", targetTabletType, err)
		newTabletType = topo.TYPE_SPARE
	} else {
		// We are healthy, maybe with health, see if we need
		// to update the record. We only change from spare to
		// our target type.
		if tablet.Type == topo.TYPE_SPARE {
			newTabletType = targetTabletType
		}
		if tablet.Type == newTabletType && tablet.IsHealthEqual(health) {
			// no change in health, not logging anything,
			// and we're done
			return
		}

		// we need to update our state
		log.Infof("Updating tablet record as healthy type %v -> %v with health details %v -> %v", tablet.Type, newTabletType, tablet.Health, health)
	}

	// Change the Type, update the health. Note we pass in a map
	// that's not nil, meaning if it's empty, we will clear it.
	if err := topotools.ChangeType(agent.TopoServer, tablet.Alias, newTabletType, health, true /*runHooks*/); err != nil {
		log.Infof("Error updating tablet record: %v", err)
		return
	}

	// Rebuild the serving graph in our cell, only if we're dealing with
	// a serving type
	if topo.IsInServingGraph(targetTabletType) {
		// TODO: interrupted may need to be a global one closed when we exit
		interrupted := make(chan struct{})

		if *topotools.UseSrvShardLocks {
			// no need to take the shard lock in this case
			if err := topotools.RebuildShard(agent.TopoServer, tablet.Keyspace, tablet.Shard, topotools.RebuildShardOptions{Cells: []string{tablet.Alias.Cell}, IgnorePartialResult: true}, lockTimeout, interrupted); err != nil {
				log.Warningf("topotools.RebuildShard returned an error: %v", err)
				return
			}
		} else {
			actionNode := actionnode.RebuildShard()
			lockPath, err := actionNode.LockShard(agent.TopoServer, tablet.Keyspace, tablet.Shard, lockTimeout, interrupted)
			if err != nil {
				log.Warningf("Cannot lock shard for rebuild: %v", err)
				return
			}
			err = topotools.RebuildShard(agent.TopoServer, tablet.Keyspace, tablet.Shard, topotools.RebuildShardOptions{Cells: []string{tablet.Alias.Cell}, IgnorePartialResult: true}, lockTimeout, interrupted)
			err = actionNode.UnlockShard(agent.TopoServer, tablet.Keyspace, tablet.Shard, lockPath, err)
			if err != nil {
				log.Warningf("UnlockShard returned an error: %v", err)
				return
			}
		}
	}

	// run the post action callbacks
	agent.afterAction("healthcheck", false /* reloadSchema */)
}
Exemplo n.º 16
0
// Operate on restore tablet.
// Check that the SnapshotManifest is valid and the master has not changed.
// Shutdown mysqld.
// Load the snapshot from source tablet.
// Restart mysqld and replication.
// Put tablet into the replication graph as a spare.
// Should be called under RpcWrapLockAction.
func (agent *ActionAgent) Restore(args *actionnode.RestoreArgs, logger logutil.Logger) error {
	// read our current tablet, verify its state
	tablet, err := agent.TopoServer.GetTablet(agent.TabletAlias)
	if err != nil {
		return err
	}
	if args.WasReserved {
		if tablet.Type != topo.TYPE_RESTORE {
			return fmt.Errorf("expected restore type, not %v", tablet.Type)
		}
	} else {
		if tablet.Type != topo.TYPE_IDLE {
			return fmt.Errorf("expected idle type, not %v", tablet.Type)
		}
	}
	// read the source tablet, compute args.SrcFilePath if default
	sourceTablet, err := agent.TopoServer.GetTablet(args.SrcTabletAlias)
	if err != nil {
		return err
	}
	if strings.ToLower(args.SrcFilePath) == "default" {
		args.SrcFilePath = path.Join(mysqlctl.SnapshotURLPath, mysqlctl.SnapshotManifestFile)
	}

	// read the parent tablet, verify its state
	parentTablet, err := agent.TopoServer.GetTablet(args.ParentAlias)
	if err != nil {
		return err
	}
	if parentTablet.Type != topo.TYPE_MASTER && parentTablet.Type != topo.TYPE_SNAPSHOT_SOURCE {
		return fmt.Errorf("restore expected master or snapshot_source parent: %v %v", parentTablet.Type, args.ParentAlias)
	}

	// read & unpack the manifest
	sm := new(mysqlctl.SnapshotManifest)
	if err := fetchAndParseJsonFile(sourceTablet.Addr(), args.SrcFilePath, sm); err != nil {
		return err
	}

	if !args.WasReserved {
		if err := agent.changeTypeToRestore(tablet, sourceTablet, parentTablet.Alias, sourceTablet.KeyRange); err != nil {
			return err
		}
	}

	// create the loggers: tee to console and source
	l := logutil.NewTeeLogger(logutil.NewConsoleLogger(), logger)

	// do the work
	if err := agent.Mysqld.RestoreFromSnapshot(l, sm, args.FetchConcurrency, args.FetchRetryCount, args.DontWaitForSlaveStart, agent.hookExtraEnv()); err != nil {
		log.Errorf("RestoreFromSnapshot failed (%v), scrapping", err)
		if err := topotools.Scrap(agent.TopoServer, agent.TabletAlias, false); err != nil {
			log.Errorf("Failed to Scrap after failed RestoreFromSnapshot: %v", err)
		}

		return err
	}

	// reload the schema
	agent.ReloadSchema()

	// change to TYPE_SPARE, we're done!
	return topotools.ChangeType(agent.TopoServer, agent.TabletAlias, topo.TYPE_SPARE, nil, true)
}
Exemplo n.º 17
0
// Snapshot takes a db snapshot
// Should be called under RpcWrapLockAction.
func (agent *ActionAgent) Snapshot(args *actionnode.SnapshotArgs, logger logutil.Logger) (*actionnode.SnapshotReply, error) {
	// update our type to TYPE_BACKUP
	tablet, err := agent.TopoServer.GetTablet(agent.TabletAlias)
	if err != nil {
		return nil, err
	}
	originalType := tablet.Type

	// ForceMasterSnapshot: Normally a master is not a viable tablet
	// to snapshot.  However, there are degenerate cases where you need
	// to override this, for instance the initial clone of a new master.
	if tablet.Type == topo.TYPE_MASTER && args.ForceMasterSnapshot {
		// In this case, we don't bother recomputing the serving graph.
		// All queries will have to fail anyway.
		log.Infof("force change type master -> backup")
		// There is a legitimate reason to force in the case of a single
		// master.
		tablet.Tablet.Type = topo.TYPE_BACKUP
		err = topo.UpdateTablet(agent.TopoServer, tablet)
	} else {
		err = topotools.ChangeType(agent.TopoServer, tablet.Alias, topo.TYPE_BACKUP, make(map[string]string), true /*runHooks*/)
	}
	if err != nil {
		return nil, err
	}

	// let's update our internal state (stop query service and other things)
	if err := agent.refreshTablet("snapshotStart"); err != nil {
		return nil, fmt.Errorf("failed to update state before snaphost: %v", err)
	}

	// create the loggers: tee to console and source
	l := logutil.NewTeeLogger(logutil.NewConsoleLogger(), logger)

	// now we can run the backup
	filename, slaveStartRequired, readOnly, returnErr := agent.Mysqld.CreateSnapshot(l, tablet.DbName(), tablet.Addr(), false, args.Concurrency, args.ServerMode, agent.hookExtraEnv())

	// and change our type to the appropriate value
	newType := originalType
	if returnErr != nil {
		log.Errorf("snapshot failed, restoring tablet type back to %v: %v", newType, returnErr)
	} else {
		if args.ServerMode {
			log.Infof("server mode specified, switching tablet to snapshot_source mode")
			newType = topo.TYPE_SNAPSHOT_SOURCE
		} else {
			log.Infof("change type back after snapshot: %v", newType)
		}
	}
	if tablet.Parent.Uid == topo.NO_TABLET && args.ForceMasterSnapshot && newType != topo.TYPE_SNAPSHOT_SOURCE {
		log.Infof("force change type backup -> master: %v", tablet.Alias)
		tablet.Tablet.Type = topo.TYPE_MASTER
		err = topo.UpdateTablet(agent.TopoServer, tablet)
	} else {
		err = topotools.ChangeType(agent.TopoServer, tablet.Alias, newType, nil, true /*runHooks*/)
	}
	if err != nil {
		// failure in changing the topology type is probably worse,
		// so returning that (we logged the snapshot error anyway)
		returnErr = err
	}

	// if anything failed, don't return anything
	if returnErr != nil {
		return nil, returnErr
	}

	// it all worked, return the required information
	sr := &actionnode.SnapshotReply{
		ManifestPath:       filename,
		SlaveStartRequired: slaveStartRequired,
		ReadOnly:           readOnly,
	}
	if tablet.Parent.Uid == topo.NO_TABLET {
		// If this is a master, this will be the new parent.
		// FIXME(msolomon) this doesn't work in hierarchical replication.
		sr.ParentAlias = tablet.Alias
	} else {
		sr.ParentAlias = tablet.Parent
	}
	return sr, nil
}
Exemplo n.º 18
0
func (tm *TabletManager) ChangeType(context *rpcproto.Context, args *topo.TabletType, reply *rpc.UnusedResponse) error {
	return tm.agent.RpcWrapLockAction(context.RemoteAddr, actionnode.TABLET_ACTION_CHANGE_TYPE, args, reply, func() error {
		return topotools.ChangeType(tm.agent.TopoServer, tm.agent.TabletAlias, *args, nil, true /*runHooks*/)
	})
}
Exemplo n.º 19
0
// SlaveWasPromoted promotes a slave to master, no questions asked.
// Should be called under RPCWrapLockAction.
func (agent *ActionAgent) SlaveWasPromoted(ctx context.Context) error {
	_, err := topotools.ChangeType(ctx, agent.TopoServer, agent.TabletAlias, topodatapb.TabletType_MASTER, topotools.ClearHealthMap)
	return err
}
Exemplo n.º 20
0
// ChangeType changes the tablet type
// Should be called under RPCWrapLockAction.
func (agent *ActionAgent) ChangeType(ctx context.Context, tabletType topodatapb.TabletType) error {
	_, err := topotools.ChangeType(ctx, agent.TopoServer, agent.TabletAlias, tabletType, nil)
	return err
}
Exemplo n.º 21
0
// runHealthCheck takes the action mutex, runs the health check,
// and if we need to change our state, do it.
// If we are the master, we don't change our type, healthy or not.
// If we are not the master, we change to spare if not healthy,
// or to the passed in targetTabletType if healthy.
//
// Note we only update the topo record if we need to, that is if our type or
// health details changed.
func (agent *ActionAgent) runHealthCheck(targetTabletType topo.TabletType) {
	agent.actionMutex.Lock()
	defer agent.actionMutex.Unlock()

	// read the current tablet record and blacklisted tables
	agent.mutex.Lock()
	tablet := agent._tablet
	blacklistedTables := agent._blacklistedTables
	agent.mutex.Unlock()

	// run the health check
	typeForHealthCheck := targetTabletType
	if tablet.Type == topo.TYPE_MASTER {
		typeForHealthCheck = topo.TYPE_MASTER
	}
	health, err := health.Run(typeForHealthCheck)

	// Figure out if we should be running QueryService. If we should,
	// and we aren't, and we're otherwise healthy, try to start it.
	if err == nil && topo.IsRunningQueryService(targetTabletType) && agent.BinlogPlayerMap.size() == 0 {
		err = agent.allowQueries(tablet.Tablet, blacklistedTables)
	}

	// save the health record
	record := &HealthRecord{
		Error:  err,
		Result: health,
		Time:   time.Now(),
	}
	agent.History.Add(record)

	// try to figure out the mysql port if we don't have it yet
	if _, ok := tablet.Portmap["mysql"]; !ok {
		// we don't know the port, try to get it from mysqld
		mysqlPort, err := agent.Mysqld.GetMysqlPort()
		if err != nil {
			// Don't log if we're already in a waiting-for-mysql state.
			agent.mutex.Lock()
			if !agent._waitingForMysql {
				log.Warningf("Can't get mysql port, won't populate Tablet record in topology (will retry silently at healthcheck interval %v): %v", *healthCheckInterval, err)
				agent._waitingForMysql = true
			}
			agent.mutex.Unlock()
		} else {
			log.Infof("Updating tablet mysql port to %v", mysqlPort)
			if err := agent.TopoServer.UpdateTabletFields(tablet.Alias, func(tablet *topo.Tablet) error {
				tablet.Portmap["mysql"] = mysqlPort
				return nil
			}); err != nil {
				log.Infof("Error updating mysql port in tablet record: %v", err)
				return
			}

			// save the port so we don't update it again next time
			// we do the health check.
			agent.mutex.Lock()
			agent._tablet.Portmap["mysql"] = mysqlPort
			agent._waitingForMysql = false
			agent.mutex.Unlock()
		}
	}

	// Update our topo.Server state, start with no change
	newTabletType := tablet.Type
	if err != nil {
		// The tablet is not healthy, let's see what we need to do
		if tablet.Type != targetTabletType {
			if tablet.Type != topo.TYPE_SPARE {
				// we only log if we're not in spare,
				// as the spare state is normal for a
				// failed health check.
				log.Infof("Tablet not healthy and in state %v, not changing it: %v", tablet.Type, err)
			}
			return
		}

		// Note that if the query service is running, we may
		// need to stop it. The post-action callback will do
		// it, and it will be done after we change our state,
		// so it's the right order, let it do it.
		log.Infof("Tablet not healthy, converting it from %v to spare: %v", targetTabletType, err)
		newTabletType = topo.TYPE_SPARE
	} else {
		// We are healthy, maybe with health, see if we need
		// to update the record. We only change from spare to
		// our target type.
		if tablet.Type == topo.TYPE_SPARE {
			newTabletType = targetTabletType
		}
		if tablet.Type == newTabletType && tablet.IsHealthEqual(health) {
			// no change in health, not logging anything,
			// and we're done
			return
		}

		// we need to update our state
		log.Infof("Updating tablet record as healthy type %v -> %v with health details %v -> %v", tablet.Type, newTabletType, tablet.Health, health)
		agent.lastHealthMapCount.Set(int64(len(health)))
	}

	// Change the Type, update the health. Note we pass in a map
	// that's not nil, meaning if it's empty, we will clear it.
	if err := topotools.ChangeType(agent.TopoServer, tablet.Alias, newTabletType, health, true /*runHooks*/); err != nil {
		log.Infof("Error updating tablet record: %v", err)
		return
	}

	// Rebuild the serving graph in our cell, only if we're dealing with
	// a serving type
	if err := agent.rebuildShardIfNeeded(tablet, targetTabletType); err != nil {
		log.Warningf("rebuildShardIfNeeded failed (will still run post action callbacks, serving graph might be out of date): %v", err)
	}

	// run the post action callbacks, not much we can do with returned error
	if err := agent.refreshTablet("healthcheck"); err != nil {
		log.Warningf("refreshTablet failed: %v", err)
	}
}
Exemplo n.º 22
0
// Operate on restore tablet.
// Check that the SnapshotManifest is valid and the master has not changed.
// Shutdown mysqld.
// Load the snapshot from source tablet.
// Restart mysqld and replication.
// Put tablet into the replication graph as a spare.
func (ta *TabletActor) restore(actionNode *actionnode.ActionNode) error {
	args := actionNode.Args.(*actionnode.RestoreArgs)

	// read our current tablet, verify its state
	tablet, err := ta.ts.GetTablet(ta.tabletAlias)
	if err != nil {
		return err
	}
	if args.WasReserved {
		if tablet.Type != topo.TYPE_RESTORE {
			return fmt.Errorf("expected restore type, not %v: %v", tablet.Type, ta.tabletAlias)
		}
	} else {
		if tablet.Type != topo.TYPE_IDLE {
			return fmt.Errorf("expected idle type, not %v: %v", tablet.Type, ta.tabletAlias)
		}
	}

	// read the source tablet, compute args.SrcFilePath if default
	sourceTablet, err := ta.ts.GetTablet(args.SrcTabletAlias)
	if err != nil {
		return err
	}
	if strings.ToLower(args.SrcFilePath) == "default" {
		args.SrcFilePath = path.Join(mysqlctl.SnapshotURLPath, mysqlctl.SnapshotManifestFile)
	}

	// read the parent tablet, verify its state
	parentTablet, err := ta.ts.GetTablet(args.ParentAlias)
	if err != nil {
		return err
	}
	if parentTablet.Type != topo.TYPE_MASTER && parentTablet.Type != topo.TYPE_SNAPSHOT_SOURCE {
		return fmt.Errorf("restore expected master or snapshot_source parent: %v %v", parentTablet.Type, args.ParentAlias)
	}

	// read & unpack the manifest
	sm := new(mysqlctl.SnapshotManifest)
	if err := fetchAndParseJsonFile(sourceTablet.Addr(), args.SrcFilePath, sm); err != nil {
		return err
	}

	if !args.WasReserved {
		if err := ta.changeTypeToRestore(tablet, sourceTablet, parentTablet.Alias, sourceTablet.KeyRange); err != nil {
			return err
		}
	}

	// do the work
	if err := ta.mysqld.RestoreFromSnapshot(sm, args.FetchConcurrency, args.FetchRetryCount, args.DontWaitForSlaveStart, ta.hookExtraEnv()); err != nil {
		log.Errorf("RestoreFromSnapshot failed (%v), scrapping", err)
		if err := topotools.Scrap(ta.ts, ta.tabletAlias, false); err != nil {
			log.Errorf("Failed to Scrap after failed RestoreFromSnapshot: %v", err)
		}

		return err
	}

	// change to TYPE_SPARE, we're done!
	return topotools.ChangeType(ta.ts, ta.tabletAlias, topo.TYPE_SPARE, nil, true)
}
Exemplo n.º 23
0
func (ta *TabletActor) changeType(actionNode *actionnode.ActionNode) error {
	dbType := actionNode.Args.(*topo.TabletType)
	return topotools.ChangeType(ta.ts, ta.tabletAlias, *dbType, nil, true /*runHooks*/)
}
Exemplo n.º 24
0
// ChangeType changes the tablet type
// Should be called under RpcWrapLockAction.
func (agent *ActionAgent) ChangeType(tabletType topo.TabletType) error {
	return topotools.ChangeType(agent.TopoServer, agent.TabletAlias, tabletType, nil, true /*runHooks*/)
}
Exemplo n.º 25
0
// runHealthCheck takes the action mutex, runs the health check,
// and if we need to change our state, do it.
// If we are the master, we don't change our type, healthy or not.
// If we are not the master, we change to spare if not healthy,
// or to the passed in targetTabletType if healthy.
//
// Note we only update the topo record if we need to, that is if our type or
// health details changed.
func (agent *ActionAgent) runHealthCheck(targetTabletType pbt.TabletType) {
	agent.actionMutex.Lock()
	defer agent.actionMutex.Unlock()

	// read the current tablet record and tablet control
	agent.mutex.Lock()
	tablet := agent._tablet
	tabletControl := agent._tabletControl
	agent.mutex.Unlock()

	// figure out if we should be running the query service
	shouldQueryServiceBeRunning := false
	var blacklistedTables []string
	if topo.IsRunningQueryService(targetTabletType) && !agent.BinlogPlayerMap.isRunningFilteredReplication() {
		shouldQueryServiceBeRunning = true
		if tabletControl != nil {
			blacklistedTables = tabletControl.BlacklistedTables
			if tabletControl.DisableQueryService {
				shouldQueryServiceBeRunning = false
			}
		}
	}

	// run the health check
	typeForHealthCheck := targetTabletType
	if tablet.Type == pbt.TabletType_MASTER {
		typeForHealthCheck = pbt.TabletType_MASTER
	}
	replicationDelay, err := agent.HealthReporter.Report(topo.IsSlaveType(typeForHealthCheck), shouldQueryServiceBeRunning)
	health := make(map[string]string)
	if err == nil {
		if replicationDelay > *unhealthyThreshold {
			err = fmt.Errorf("reported replication lag: %v higher than unhealthy threshold: %v", replicationDelay.Seconds(), unhealthyThreshold.Seconds())
		} else if replicationDelay > *degradedThreshold {
			health[topo.ReplicationLag] = topo.ReplicationLagHigh
		}
	}
	agent.lastHealthMapCount.Set(int64(len(health)))

	// Figure out if we should be running QueryService, see if we are,
	// and reconcile.
	if err != nil {
		if tablet.Type != pbt.TabletType_WORKER {
			// We are not healthy and must shut down QueryService.
			// At the moment, the only exception to this are "worker" tablets which
			// still must serve queries e.g. as source tablet during a "SplitClone".
			shouldQueryServiceBeRunning = false
		}
	}
	isQueryServiceRunning := agent.QueryServiceControl.IsServing()
	if shouldQueryServiceBeRunning {
		if !isQueryServiceRunning {
			// send the type we want to be, not the type we are
			currentType := tablet.Type
			if tablet.Type == pbt.TabletType_SPARE {
				tablet.Type = targetTabletType
			}

			// we remember this new possible error
			err = agent.allowQueries(tablet.Tablet, blacklistedTables)

			// restore the current type
			tablet.Type = currentType
		}
	} else {
		if isQueryServiceRunning {
			// we are not healthy or should not be running the
			// query service, shut it down.
			agent.stopQueryService()
		}
	}

	// save the health record
	record := &HealthRecord{
		Error:            err,
		ReplicationDelay: replicationDelay,
		Time:             time.Now(),
	}
	agent.History.Add(record)

	// try to figure out the mysql port if we don't have it yet
	if _, ok := tablet.PortMap["mysql"]; !ok {
		// we don't know the port, try to get it from mysqld
		mysqlPort, err := agent.MysqlDaemon.GetMysqlPort()
		if err != nil {
			// Don't log if we're already in a waiting-for-mysql state.
			agent.mutex.Lock()
			if !agent._waitingForMysql {
				log.Warningf("Can't get mysql port, won't populate Tablet record in topology (will retry silently at healthcheck interval %v): %v", *healthCheckInterval, err)
				agent._waitingForMysql = true
			}
			agent.mutex.Unlock()
		} else {
			log.Infof("Updating tablet mysql port to %v", mysqlPort)
			if err := agent.TopoServer.UpdateTabletFields(agent.batchCtx, tablet.Alias, func(tablet *pbt.Tablet) error {
				tablet.PortMap["mysql"] = int32(mysqlPort)
				return nil
			}); err != nil {
				log.Infof("Error updating mysql port in tablet record: %v", err)
				return
			}

			// save the port so we don't update it again next time
			// we do the health check.
			agent.mutex.Lock()
			agent._tablet.PortMap["mysql"] = int32(mysqlPort)
			agent._waitingForMysql = false
			agent.mutex.Unlock()
		}
	}

	// remember our health status
	agent.mutex.Lock()
	agent._healthy = err
	agent._healthyTime = time.Now()
	agent._replicationDelay = replicationDelay
	terTime := agent._tabletExternallyReparentedTime
	agent.mutex.Unlock()

	// send it to our observers
	// (the Target has already been updated when restarting the
	// query service earlier)
	// FIXME(alainjobart,liguo) add CpuUsage
	stats := &pb.RealtimeStats{
		SecondsBehindMaster: uint32(replicationDelay.Seconds()),
	}
	stats.SecondsBehindMasterFilteredReplication, stats.BinlogPlayersCount = agent.BinlogPlayerMap.StatusSummary()
	if err != nil {
		stats.HealthError = err.Error()
	}
	defer func() {
		var ts int64
		if !terTime.IsZero() {
			ts = terTime.Unix()
		}
		agent.QueryServiceControl.BroadcastHealth(ts, stats)
	}()

	// Update our topo.Server state, start with no change
	newTabletType := tablet.Type
	if err != nil {
		// The tablet is not healthy, let's see what we need to do
		if tablet.Type != targetTabletType {
			if tablet.Type != pbt.TabletType_SPARE {
				// we only log if we're not in spare,
				// as the spare state is normal for a
				// failed health check.
				log.Infof("Tablet not healthy and in state %v, not changing it: %v", tablet.Type, err)
			}
			return
		}

		// Note that if the query service is running, we may
		// need to stop it. The post-action callback will do
		// it, and it will be done after we change our state,
		// so it's the right order, let it do it.
		log.Infof("Tablet not healthy, converting it from %v to spare: %v", targetTabletType, err)
		newTabletType = pbt.TabletType_SPARE
	} else {
		// We are healthy, maybe with health, see if we need
		// to update the record. We only change from spare to
		// our target type.
		if tablet.Type == pbt.TabletType_SPARE {
			newTabletType = targetTabletType
		}
		if tablet.Type == newTabletType && topo.IsHealthEqual(health, tablet.HealthMap) {
			// no change in health, not logging anything,
			// and we're done
			return
		}

		// we need to update our state
		log.Infof("Updating tablet record as healthy type %v -> %v with health details %v -> %v", tablet.Type, newTabletType, tablet.HealthMap, health)
	}

	// Change the Type, update the health. Note we pass in a map
	// that's not nil, meaning if it's empty, we will clear it.
	if err := topotools.ChangeType(agent.batchCtx, agent.TopoServer, tablet.Alias, newTabletType, health); err != nil {
		log.Infof("Error updating tablet record: %v", err)
		return
	}
	tablet.HealthMap = health
	tablet.Type = newTabletType

	// Rebuild the serving graph in our cell, only if we're dealing with
	// a serving type
	if err := agent.updateServingGraph(tablet, targetTabletType); err != nil {
		log.Warningf("updateServingGraph failed (will still run post action callbacks, serving graph might be out of date): %v", err)
	}

	// run the post action callbacks, not much we can do with returned error
	if err := agent.refreshTablet(agent.batchCtx, "healthcheck"); err != nil {
		log.Warningf("refreshTablet failed: %v", err)
	}
}
Exemplo n.º 26
0
// ChangeType changes the tablet type
// Should be called under RPCWrapLockAction.
func (agent *ActionAgent) ChangeType(ctx context.Context, tabletType pb.TabletType) error {
	return topotools.ChangeType(ctx, agent.TopoServer, agent.TabletAlias, tabletType, nil)
}
Exemplo n.º 27
0
// runHealthCheck takes the action mutex, runs the health check,
// and if we need to change our state, do it.
// If we are the master, we don't change our type, healthy or not.
// If we are not the master, we change to spare if not healthy,
// or to the passed in targetTabletType if healthy.
//
// Note we only update the topo record if we need to, that is if our type or
// health details changed.
func (agent *ActionAgent) runHealthCheck(targetTabletType topo.TabletType, lockTimeout time.Duration) {
	agent.actionMutex.Lock()
	defer agent.actionMutex.Unlock()

	// read the current tablet record
	agent.mutex.Lock()
	tablet := agent._tablet
	agent.mutex.Unlock()

	// run the health check
	typeForHealthCheck := targetTabletType
	if tablet.Type == topo.TYPE_MASTER {
		typeForHealthCheck = topo.TYPE_MASTER
	}
	health, err := health.Run(typeForHealthCheck)

	// Figure out if we should be running QueryService. If we should,
	// and we aren't, and we're otherwise healthy, try to start it
	if err == nil && topo.IsRunningQueryService(targetTabletType) {
		err = agent.allowQueries(tablet.Tablet)
	}

	// save the health record
	record := &HealthRecord{
		Error:  err,
		Result: health,
		Time:   time.Now(),
	}
	agent.History.Add(record)

	// Update our topo.Server state, start with no change
	newTabletType := tablet.Type
	if err != nil {
		// The tablet is not healthy, let's see what we need to do
		if tablet.Type != targetTabletType {
			if tablet.Type != topo.TYPE_SPARE {
				// we onyl log if we're not in spare,
				// as the spare state is normal for a
				// failed health check.
				log.Infof("Tablet not healthy and in state %v, not changing it: %v", tablet.Type, err)
			}
			return
		}

		// Note that if the query service is running, we may
		// need to stop it. The post-action callback will do
		// it, and it will be done after we change our state,
		// so it's the right order, let it do it.
		log.Infof("Tablet not healthy, converting it from %v to spare: %v", targetTabletType, err)
		newTabletType = topo.TYPE_SPARE
	} else {
		// We are healthy, maybe with health, see if we need
		// to update the record. We only change from spare to
		// our target type.
		if tablet.Type == topo.TYPE_SPARE {
			newTabletType = targetTabletType
		}
		if tablet.Type == newTabletType && tablet.IsHealthEqual(health) {
			// no change in health, not logging anything,
			// and we're done
			return
		}

		// we need to update our state
		log.Infof("Updating tablet record as healthy type %v -> %v with health details %v -> %v", tablet.Type, newTabletType, tablet.Health, health)
		agent.lastHealthMapCount.Set(int64(len(health)))
	}

	// Change the Type, update the health. Note we pass in a map
	// that's not nil, meaning if it's empty, we will clear it.
	if err := topotools.ChangeType(agent.TopoServer, tablet.Alias, newTabletType, health, true /*runHooks*/); err != nil {
		log.Infof("Error updating tablet record: %v", err)
		return
	}

	// Rebuild the serving graph in our cell, only if we're dealing with
	// a serving type
	if err := agent.rebuildShardIfNeeded(tablet, targetTabletType, lockTimeout); err != nil {
		log.Warningf("rebuildShardIfNeeded failed, not running post action callbacks: %v", err)
		return
	}

	// run the post action callbacks
	agent.afterAction("healthcheck", false /* reloadSchema */)
}
Exemplo n.º 28
0
// runHealthCheck takes the action mutex, runs the health check,
// and if we need to change our state, do it.
// If we are the master, we don't change our type, healthy or not.
// If we are not the master, we change to spare if not healthy,
// or to the passed in targetTabletType if healthy.
//
// Note we only update the topo record if we need to, that is if our type or
// health details changed.
func (agent *ActionAgent) runHealthCheck(targetTabletType topo.TabletType) {
	agent.actionMutex.Lock()
	defer agent.actionMutex.Unlock()

	// read the current tablet record and tablet control
	agent.mutex.Lock()
	tablet := agent._tablet
	tabletControl := agent._tabletControl
	agent.mutex.Unlock()

	// figure out if we should be running the query service
	shouldQueryServiceBeRunning := false
	var blacklistedTables []string
	if topo.IsRunningQueryService(targetTabletType) && agent.BinlogPlayerMap.size() == 0 {
		shouldQueryServiceBeRunning = true
		if tabletControl != nil {
			blacklistedTables = tabletControl.BlacklistedTables
			if tabletControl.DisableQueryService {
				shouldQueryServiceBeRunning = false
			}
		}
	}

	// run the health check
	typeForHealthCheck := targetTabletType
	if tablet.Type == topo.TYPE_MASTER {
		typeForHealthCheck = topo.TYPE_MASTER
	}
	replicationDelay, err := agent.HealthReporter.Report(topo.IsSlaveType(typeForHealthCheck), shouldQueryServiceBeRunning)
	health := make(map[string]string)
	if err == nil {
		if replicationDelay > *unhealthyThreshold {
			err = fmt.Errorf("reported replication lag: %v higher than unhealthy threshold: %v", replicationDelay.Seconds(), unhealthyThreshold.Seconds())
		} else if replicationDelay > *degradedThreshold {
			health[topo.ReplicationLag] = topo.ReplicationLagHigh
		}
	}

	// Figure out if we should be running QueryService, see if we are,
	// and reconcile.
	if err != nil {
		// we are not healthy, we should not be running QueryService
		shouldQueryServiceBeRunning = false
	}
	isQueryServiceRunning := agent.QueryServiceControl.IsServing()
	if shouldQueryServiceBeRunning {
		if !isQueryServiceRunning {
			// we remember this new possible error
			err = agent.allowQueries(tablet.Tablet, blacklistedTables)
		}
	} else {
		if isQueryServiceRunning {
			// we are not healthy or should not be running the
			// query service, shut it down.
			agent.disallowQueries()
		}
	}

	// save the health record
	record := &HealthRecord{
		Error:            err,
		ReplicationDelay: replicationDelay,
		Time:             time.Now(),
	}
	agent.History.Add(record)

	// try to figure out the mysql port if we don't have it yet
	if _, ok := tablet.Portmap["mysql"]; !ok {
		// we don't know the port, try to get it from mysqld
		mysqlPort, err := agent.MysqlDaemon.GetMysqlPort()
		if err != nil {
			// Don't log if we're already in a waiting-for-mysql state.
			agent.mutex.Lock()
			if !agent._waitingForMysql {
				log.Warningf("Can't get mysql port, won't populate Tablet record in topology (will retry silently at healthcheck interval %v): %v", *healthCheckInterval, err)
				agent._waitingForMysql = true
			}
			agent.mutex.Unlock()
		} else {
			log.Infof("Updating tablet mysql port to %v", mysqlPort)
			if err := agent.TopoServer.UpdateTabletFields(agent.batchCtx, tablet.Alias, func(tablet *topo.Tablet) error {
				tablet.Portmap["mysql"] = mysqlPort
				return nil
			}); err != nil {
				log.Infof("Error updating mysql port in tablet record: %v", err)
				return
			}

			// save the port so we don't update it again next time
			// we do the health check.
			agent.mutex.Lock()
			agent._tablet.Portmap["mysql"] = mysqlPort
			agent._waitingForMysql = false
			agent.mutex.Unlock()
		}
	}

	// remember our health status
	agent.mutex.Lock()
	agent._healthy = err
	agent._healthyTime = time.Now()
	agent._replicationDelay = replicationDelay
	agent.mutex.Unlock()

	// send it to our observers, after we've updated the tablet state
	// (Tablet is a pointer, and below we will alter the Tablet
	// record to be correct.
	hsr := &actionnode.HealthStreamReply{
		Tablet:              tablet.Tablet,
		BinlogPlayerMapSize: agent.BinlogPlayerMap.size(),
		ReplicationDelay:    replicationDelay,
	}
	if err != nil {
		hsr.HealthError = err.Error()
	}
	defer agent.BroadcastHealthStreamReply(hsr)

	// Update our topo.Server state, start with no change
	newTabletType := tablet.Type
	if err != nil {
		// The tablet is not healthy, let's see what we need to do
		if tablet.Type != targetTabletType {
			if tablet.Type != topo.TYPE_SPARE {
				// we only log if we're not in spare,
				// as the spare state is normal for a
				// failed health check.
				log.Infof("Tablet not healthy and in state %v, not changing it: %v", tablet.Type, err)
			}
			return
		}

		// Note that if the query service is running, we may
		// need to stop it. The post-action callback will do
		// it, and it will be done after we change our state,
		// so it's the right order, let it do it.
		log.Infof("Tablet not healthy, converting it from %v to spare: %v", targetTabletType, err)
		newTabletType = topo.TYPE_SPARE
	} else {
		// We are healthy, maybe with health, see if we need
		// to update the record. We only change from spare to
		// our target type.
		if tablet.Type == topo.TYPE_SPARE {
			newTabletType = targetTabletType
		}
		if tablet.Type == newTabletType && tablet.IsHealthEqual(health) {
			// no change in health, not logging anything,
			// and we're done
			return
		}

		// we need to update our state
		log.Infof("Updating tablet record as healthy type %v -> %v with health details %v -> %v", tablet.Type, newTabletType, tablet.Health, health)
		agent.lastHealthMapCount.Set(int64(len(health)))
	}

	// Change the Type, update the health. Note we pass in a map
	// that's not nil, meaning if it's empty, we will clear it.
	if err := topotools.ChangeType(agent.batchCtx, agent.TopoServer, tablet.Alias, newTabletType, health); err != nil {
		log.Infof("Error updating tablet record: %v", err)
		return
	}
	tablet.Health = health
	tablet.Type = newTabletType

	// Rebuild the serving graph in our cell, only if we're dealing with
	// a serving type
	if err := agent.updateServingGraph(tablet, targetTabletType); err != nil {
		log.Warningf("updateServingGraph failed (will still run post action callbacks, serving graph might be out of date): %v", err)
	}

	// run the post action callbacks, not much we can do with returned error
	if err := agent.refreshTablet(agent.batchCtx, "healthcheck"); err != nil {
		log.Warningf("refreshTablet failed: %v", err)
	}
}