func (agent *ActionAgent) verifyServingAddrs(ctx context.Context) error { ti := agent.Tablet() if !topo.IsRunningQueryService(ti.Type) { return nil } // Check to see our address is registered in the right place. return topotools.UpdateTabletEndpoints(ctx, agent.TopoServer, ti.Tablet) }
// runHealthCheck takes the action mutex, runs the health check, // and if we need to change our state, do it. // If we are the master, we don't change our type, healthy or not. // If we are not the master, we change to spare if not healthy, // or to the passed in targetTabletType if healthy. // // Note we only update the topo record if we need to, that is if our type or // health details changed. func (agent *ActionAgent) runHealthCheck(targetTabletType pbt.TabletType) { agent.actionMutex.Lock() defer agent.actionMutex.Unlock() // read the current tablet record and tablet control agent.mutex.Lock() tablet := agent._tablet tabletControl := agent._tabletControl agent.mutex.Unlock() // figure out if we should be running the query service shouldQueryServiceBeRunning := false var blacklistedTables []string if topo.IsRunningQueryService(targetTabletType) && !agent.BinlogPlayerMap.isRunningFilteredReplication() { shouldQueryServiceBeRunning = true if tabletControl != nil { blacklistedTables = tabletControl.BlacklistedTables if tabletControl.DisableQueryService { shouldQueryServiceBeRunning = false } } } // run the health check typeForHealthCheck := targetTabletType if tablet.Type == pbt.TabletType_MASTER { typeForHealthCheck = pbt.TabletType_MASTER } replicationDelay, err := agent.HealthReporter.Report(topo.IsSlaveType(typeForHealthCheck), shouldQueryServiceBeRunning) health := make(map[string]string) if err == nil { if replicationDelay > *unhealthyThreshold { err = fmt.Errorf("reported replication lag: %v higher than unhealthy threshold: %v", replicationDelay.Seconds(), unhealthyThreshold.Seconds()) } else if replicationDelay > *degradedThreshold { health[topo.ReplicationLag] = topo.ReplicationLagHigh } } agent.lastHealthMapCount.Set(int64(len(health))) // Figure out if we should be running QueryService, see if we are, // and reconcile. if err != nil { if tablet.Type != pbt.TabletType_WORKER { // We are not healthy and must shut down QueryService. // At the moment, the only exception to this are "worker" tablets which // still must serve queries e.g. as source tablet during a "SplitClone". shouldQueryServiceBeRunning = false } } isQueryServiceRunning := agent.QueryServiceControl.IsServing() if shouldQueryServiceBeRunning { if !isQueryServiceRunning { // send the type we want to be, not the type we are currentType := tablet.Type if tablet.Type == pbt.TabletType_SPARE { tablet.Type = targetTabletType } // we remember this new possible error err = agent.allowQueries(tablet.Tablet, blacklistedTables) // restore the current type tablet.Type = currentType } } else { if isQueryServiceRunning { // we are not healthy or should not be running the // query service, shut it down. agent.stopQueryService() } } // save the health record record := &HealthRecord{ Error: err, ReplicationDelay: replicationDelay, Time: time.Now(), } agent.History.Add(record) // try to figure out the mysql port if we don't have it yet if _, ok := tablet.PortMap["mysql"]; !ok { // we don't know the port, try to get it from mysqld mysqlPort, err := agent.MysqlDaemon.GetMysqlPort() if err != nil { // Don't log if we're already in a waiting-for-mysql state. agent.mutex.Lock() if !agent._waitingForMysql { log.Warningf("Can't get mysql port, won't populate Tablet record in topology (will retry silently at healthcheck interval %v): %v", *healthCheckInterval, err) agent._waitingForMysql = true } agent.mutex.Unlock() } else { log.Infof("Updating tablet mysql port to %v", mysqlPort) if err := agent.TopoServer.UpdateTabletFields(agent.batchCtx, tablet.Alias, func(tablet *pbt.Tablet) error { tablet.PortMap["mysql"] = int32(mysqlPort) return nil }); err != nil { log.Infof("Error updating mysql port in tablet record: %v", err) return } // save the port so we don't update it again next time // we do the health check. agent.mutex.Lock() agent._tablet.PortMap["mysql"] = int32(mysqlPort) agent._waitingForMysql = false agent.mutex.Unlock() } } // remember our health status agent.mutex.Lock() agent._healthy = err agent._healthyTime = time.Now() agent._replicationDelay = replicationDelay terTime := agent._tabletExternallyReparentedTime agent.mutex.Unlock() // send it to our observers // (the Target has already been updated when restarting the // query service earlier) // FIXME(alainjobart,liguo) add CpuUsage stats := &pb.RealtimeStats{ SecondsBehindMaster: uint32(replicationDelay.Seconds()), } stats.SecondsBehindMasterFilteredReplication, stats.BinlogPlayersCount = agent.BinlogPlayerMap.StatusSummary() if err != nil { stats.HealthError = err.Error() } defer func() { var ts int64 if !terTime.IsZero() { ts = terTime.Unix() } agent.QueryServiceControl.BroadcastHealth(ts, stats) }() // Update our topo.Server state, start with no change newTabletType := tablet.Type if err != nil { // The tablet is not healthy, let's see what we need to do if tablet.Type != targetTabletType { if tablet.Type != pbt.TabletType_SPARE { // we only log if we're not in spare, // as the spare state is normal for a // failed health check. log.Infof("Tablet not healthy and in state %v, not changing it: %v", tablet.Type, err) } return } // Note that if the query service is running, we may // need to stop it. The post-action callback will do // it, and it will be done after we change our state, // so it's the right order, let it do it. log.Infof("Tablet not healthy, converting it from %v to spare: %v", targetTabletType, err) newTabletType = pbt.TabletType_SPARE } else { // We are healthy, maybe with health, see if we need // to update the record. We only change from spare to // our target type. if tablet.Type == pbt.TabletType_SPARE { newTabletType = targetTabletType } if tablet.Type == newTabletType && topo.IsHealthEqual(health, tablet.HealthMap) { // no change in health, not logging anything, // and we're done return } // we need to update our state log.Infof("Updating tablet record as healthy type %v -> %v with health details %v -> %v", tablet.Type, newTabletType, tablet.HealthMap, health) } // Change the Type, update the health. Note we pass in a map // that's not nil, meaning if it's empty, we will clear it. if err := topotools.ChangeType(agent.batchCtx, agent.TopoServer, tablet.Alias, newTabletType, health); err != nil { log.Infof("Error updating tablet record: %v", err) return } tablet.HealthMap = health tablet.Type = newTabletType // Rebuild the serving graph in our cell, only if we're dealing with // a serving type if err := agent.updateServingGraph(tablet, targetTabletType); err != nil { log.Warningf("updateServingGraph failed (will still run post action callbacks, serving graph might be out of date): %v", err) } // run the post action callbacks, not much we can do with returned error if err := agent.refreshTablet(agent.batchCtx, "healthcheck"); err != nil { log.Warningf("refreshTablet failed: %v", err) } }
// runHealthCheck takes the action mutex, runs the health check, // and if we need to change our state, do it. // If we are the master, we don't change our type, healthy or not. // If we are not the master, we change to spare if not healthy, // or to the passed in targetTabletType if healthy. // // Note we only update the topo record if we need to, that is if our type or // health details changed. func (agent *ActionAgent) runHealthCheck(targetTabletType topo.TabletType) { agent.actionMutex.Lock() defer agent.actionMutex.Unlock() // read the current tablet record and blacklisted tables agent.mutex.Lock() tablet := agent._tablet blacklistedTables := agent._blacklistedTables agent.mutex.Unlock() // run the health check typeForHealthCheck := targetTabletType if tablet.Type == topo.TYPE_MASTER { typeForHealthCheck = topo.TYPE_MASTER } health, err := health.Run(typeForHealthCheck) // Figure out if we should be running QueryService. If we should, // and we aren't, and we're otherwise healthy, try to start it. if err == nil && topo.IsRunningQueryService(targetTabletType) && agent.BinlogPlayerMap.size() == 0 { err = agent.allowQueries(tablet.Tablet, blacklistedTables) } // save the health record record := &HealthRecord{ Error: err, Result: health, Time: time.Now(), } agent.History.Add(record) // try to figure out the mysql port if we don't have it yet if _, ok := tablet.Portmap["mysql"]; !ok { // we don't know the port, try to get it from mysqld mysqlPort, err := agent.Mysqld.GetMysqlPort() if err != nil { // Don't log if we're already in a waiting-for-mysql state. agent.mutex.Lock() if !agent._waitingForMysql { log.Warningf("Can't get mysql port, won't populate Tablet record in topology (will retry silently at healthcheck interval %v): %v", *healthCheckInterval, err) agent._waitingForMysql = true } agent.mutex.Unlock() } else { log.Infof("Updating tablet mysql port to %v", mysqlPort) if err := agent.TopoServer.UpdateTabletFields(tablet.Alias, func(tablet *topo.Tablet) error { tablet.Portmap["mysql"] = mysqlPort return nil }); err != nil { log.Infof("Error updating mysql port in tablet record: %v", err) return } // save the port so we don't update it again next time // we do the health check. agent.mutex.Lock() agent._tablet.Portmap["mysql"] = mysqlPort agent._waitingForMysql = false agent.mutex.Unlock() } } // Update our topo.Server state, start with no change newTabletType := tablet.Type if err != nil { // The tablet is not healthy, let's see what we need to do if tablet.Type != targetTabletType { if tablet.Type != topo.TYPE_SPARE { // we only log if we're not in spare, // as the spare state is normal for a // failed health check. log.Infof("Tablet not healthy and in state %v, not changing it: %v", tablet.Type, err) } return } // Note that if the query service is running, we may // need to stop it. The post-action callback will do // it, and it will be done after we change our state, // so it's the right order, let it do it. log.Infof("Tablet not healthy, converting it from %v to spare: %v", targetTabletType, err) newTabletType = topo.TYPE_SPARE } else { // We are healthy, maybe with health, see if we need // to update the record. We only change from spare to // our target type. if tablet.Type == topo.TYPE_SPARE { newTabletType = targetTabletType } if tablet.Type == newTabletType && tablet.IsHealthEqual(health) { // no change in health, not logging anything, // and we're done return } // we need to update our state log.Infof("Updating tablet record as healthy type %v -> %v with health details %v -> %v", tablet.Type, newTabletType, tablet.Health, health) agent.lastHealthMapCount.Set(int64(len(health))) } // Change the Type, update the health. Note we pass in a map // that's not nil, meaning if it's empty, we will clear it. if err := topotools.ChangeType(agent.TopoServer, tablet.Alias, newTabletType, health, true /*runHooks*/); err != nil { log.Infof("Error updating tablet record: %v", err) return } // Rebuild the serving graph in our cell, only if we're dealing with // a serving type if err := agent.rebuildShardIfNeeded(tablet, targetTabletType); err != nil { log.Warningf("rebuildShardIfNeeded failed (will still run post action callbacks, serving graph might be out of date): %v", err) } // run the post action callbacks, not much we can do with returned error if err := agent.refreshTablet("healthcheck"); err != nil { log.Warningf("refreshTablet failed: %v", err) } }
// changeCallback is run after every action that might // have changed something in the tablet record. func (agent *ActionAgent) changeCallback(ctx context.Context, oldTablet, newTablet *pbt.Tablet) error { span := trace.NewSpanFromContext(ctx) span.StartLocal("ActionAgent.changeCallback") defer span.Finish() allowQuery := topo.IsRunningQueryService(newTablet.Type) // Read the shard to get SourceShards / TabletControlMap if // we're going to use it. var shardInfo *topo.ShardInfo var tabletControl *pbt.Shard_TabletControl var blacklistedTables []string var err error var disallowQueryReason string if allowQuery { shardInfo, err = agent.TopoServer.GetShard(ctx, newTablet.Keyspace, newTablet.Shard) if err != nil { log.Errorf("Cannot read shard for this tablet %v, might have inaccurate SourceShards and TabletControls: %v", newTablet.Alias, err) } else { if newTablet.Type == pbt.TabletType_MASTER { if len(shardInfo.SourceShards) > 0 { allowQuery = false disallowQueryReason = "old master is still in shard info" } } if tc := shardInfo.GetTabletControl(newTablet.Type); tc != nil { if topo.InCellList(newTablet.Alias.Cell, tc.Cells) { if tc.DisableQueryService { allowQuery = false disallowQueryReason = "query service disabled by tablet control" } blacklistedTables = tc.BlacklistedTables tabletControl = tc } } } } else { disallowQueryReason = fmt.Sprintf("not a serving tablet type(%v)", newTablet.Type) } // Read the keyspace on masters to get ShardingColumnType, // for binlog replication, only if source shards are set. var keyspaceInfo *topo.KeyspaceInfo if newTablet.Type == pbt.TabletType_MASTER && shardInfo != nil && len(shardInfo.SourceShards) > 0 { keyspaceInfo, err = agent.TopoServer.GetKeyspace(ctx, newTablet.Keyspace) if err != nil { log.Errorf("Cannot read keyspace for this tablet %v: %v", newTablet.Alias, err) keyspaceInfo = nil } } if allowQuery { // There are a few transitions when we need to restart the query service: switch { // If either InitMaster or InitSlave was called, because those calls // (or a prior call to ResetReplication) may have silently broken the // rowcache invalidator by executing RESET MASTER. // Note that we don't care about fixing it after ResetReplication itself // since that call breaks everything on purpose, and we don't expect // anything to start working until either InitMaster or InitSlave. case agent.initReplication: agent.initReplication = false agent.stopQueryService("initialize replication") // Transitioning from replica to master, so clients that were already // connected don't keep on using the master as replica or rdonly. case newTablet.Type == pbt.TabletType_MASTER && oldTablet.Type != pbt.TabletType_MASTER: agent.stopQueryService("tablet promoted to master") // Having different parameters for the query service. // It needs to stop and restart with the new parameters. // That includes: // - changing KeyRange // - changing the BlacklistedTables list case (newTablet.KeyRange != oldTablet.KeyRange), !reflect.DeepEqual(blacklistedTables, agent.BlacklistedTables()): agent.stopQueryService("keyrange/blacklistedtables changed") } if err := agent.allowQueries(newTablet, blacklistedTables); err != nil { log.Errorf("Cannot start query service: %v", err) } } else { agent.stopQueryService(disallowQueryReason) } // save the tabletControl we've been using, so the background // healthcheck makes the same decisions as we've been making. agent.setTabletControl(tabletControl) // update stream needs to be started or stopped too if agent.DBConfigs != nil { if topo.IsRunningUpdateStream(newTablet.Type) { binlog.EnableUpdateStreamService(agent.DBConfigs.App.DbName, agent.MysqlDaemon) } else { binlog.DisableUpdateStreamService() } } statsType.Set(strings.ToLower(newTablet.Type.String())) statsKeyspace.Set(newTablet.Keyspace) statsShard.Set(newTablet.Shard) if newTablet.KeyRange != nil { statsKeyRangeStart.Set(hex.EncodeToString(newTablet.KeyRange.Start)) statsKeyRangeEnd.Set(hex.EncodeToString(newTablet.KeyRange.End)) } else { statsKeyRangeStart.Set("") statsKeyRangeEnd.Set("") } // See if we need to start or stop any binlog player if agent.BinlogPlayerMap != nil { if newTablet.Type == pbt.TabletType_MASTER { agent.BinlogPlayerMap.RefreshMap(agent.batchCtx, newTablet, keyspaceInfo, shardInfo) } else { agent.BinlogPlayerMap.StopAllPlayersAndReset() } } return nil }
// runHealthCheck takes the action mutex, runs the health check, // and if we need to change our state, do it. // If we are the master, we don't change our type, healthy or not. // If we are not the master, we change to spare if not healthy, // or to the passed in targetTabletType if healthy. // // Note we only update the topo record if we need to, that is if our type or // health details changed. func (agent *ActionAgent) runHealthCheck(targetTabletType topo.TabletType, lockTimeout time.Duration) { agent.actionMutex.Lock() defer agent.actionMutex.Unlock() // read the current tablet record agent.mutex.Lock() tablet := agent._tablet agent.mutex.Unlock() // run the health check typeForHealthCheck := targetTabletType if tablet.Type == topo.TYPE_MASTER { typeForHealthCheck = topo.TYPE_MASTER } health, err := health.Run(typeForHealthCheck) // Figure out if we should be running QueryService. If we should, // and we aren't, and we're otherwise healthy, try to start it if err == nil && topo.IsRunningQueryService(targetTabletType) { err = agent.allowQueries(tablet.Tablet) } // save the health record record := &HealthRecord{ Error: err, Result: health, Time: time.Now(), } agent.History.Add(record) // Update our topo.Server state, start with no change newTabletType := tablet.Type if err != nil { // The tablet is not healthy, let's see what we need to do if tablet.Type != targetTabletType { if tablet.Type != topo.TYPE_SPARE { // we onyl log if we're not in spare, // as the spare state is normal for a // failed health check. log.Infof("Tablet not healthy and in state %v, not changing it: %v", tablet.Type, err) } return } // Note that if the query service is running, we may // need to stop it. The post-action callback will do // it, and it will be done after we change our state, // so it's the right order, let it do it. log.Infof("Tablet not healthy, converting it from %v to spare: %v", targetTabletType, err) newTabletType = topo.TYPE_SPARE } else { // We are healthy, maybe with health, see if we need // to update the record. We only change from spare to // our target type. if tablet.Type == topo.TYPE_SPARE { newTabletType = targetTabletType } if tablet.Type == newTabletType && tablet.IsHealthEqual(health) { // no change in health, not logging anything, // and we're done return } // we need to update our state log.Infof("Updating tablet record as healthy type %v -> %v with health details %v -> %v", tablet.Type, newTabletType, tablet.Health, health) agent.lastHealthMapCount.Set(int64(len(health))) } // Change the Type, update the health. Note we pass in a map // that's not nil, meaning if it's empty, we will clear it. if err := topotools.ChangeType(agent.TopoServer, tablet.Alias, newTabletType, health, true /*runHooks*/); err != nil { log.Infof("Error updating tablet record: %v", err) return } // Rebuild the serving graph in our cell, only if we're dealing with // a serving type if err := agent.rebuildShardIfNeeded(tablet, targetTabletType, lockTimeout); err != nil { log.Warningf("rebuildShardIfNeeded failed, not running post action callbacks: %v", err) return } // run the post action callbacks agent.afterAction("healthcheck", false /* reloadSchema */) }
// runHealthCheck takes the action mutex, runs the health check, // and if we need to change our state, do it. // If we are the master, we don't change our type, healthy or not. // If we are not the master, we change to spare if not healthy, // or to the passed in targetTabletType if healthy. // // Note we only update the topo record if we need to, that is if our type or // health details changed. // // This will not change the BinlogPlayerMap, but if it is not empty, // we will think we should not be running the query service. // // This will not change the TabletControl record, but will use it // to see if we should be running the query service. func (agent *ActionAgent) runHealthCheck(targetTabletType topodatapb.TabletType) { agent.actionMutex.Lock() defer agent.actionMutex.Unlock() // read the current tablet record and tablet control agent.mutex.Lock() tablet := proto.Clone(agent._tablet).(*topodatapb.Tablet) tabletControl := proto.Clone(agent._tabletControl).(*topodatapb.Shard_TabletControl) agent.mutex.Unlock() // figure out if we should be running the query service shouldBeServing := false if topo.IsRunningQueryService(targetTabletType) && !agent.BinlogPlayerMap.isRunningFilteredReplication() { shouldBeServing = true if tabletControl != nil { if tabletControl.DisableQueryService { shouldBeServing = false } } } // run the health check isSlaveType := true if tablet.Type == topodatapb.TabletType_MASTER { isSlaveType = false } // Remember the health error as healthErr to be sure we don't accidentally // overwrite it with some other err. replicationDelay, healthErr := agent.HealthReporter.Report(isSlaveType, shouldBeServing) health := make(map[string]string) if healthErr == nil { if replicationDelay > *unhealthyThreshold { healthErr = fmt.Errorf("reported replication lag: %v higher than unhealthy threshold: %v", replicationDelay.Seconds(), unhealthyThreshold.Seconds()) } else if replicationDelay > *degradedThreshold { health[topo.ReplicationLag] = topo.ReplicationLagHigh } } agent.lastHealthMapCount.Set(int64(len(health))) // Figure out if we should be running QueryService, see if we are, // and reconcile. if healthErr != nil { if tablet.Type != topodatapb.TabletType_WORKER { // We are not healthy and must shut down QueryService. // At the moment, the only exception to this are "worker" tablets which // still must serve queries e.g. as source tablet during a "SplitClone". shouldBeServing = false } } isServing := agent.QueryServiceControl.IsServing() if shouldBeServing { if !isServing { // It might be that we're ready to serve, but we just need to start // queryservice. Send the type we want to be, not the type we are. desiredType := tablet.Type if desiredType == topodatapb.TabletType_SPARE { desiredType = targetTabletType } // If starting queryservice fails, that's our new reason for being unhealthy. healthErr = agent.allowQueries(desiredType) } } else { if isServing { // We are not healthy or should not be running the query service. // // We do NOT enter lameduck in this case, because we should only hit this // in the following scenarios: // // * Healthcheck fails: We're probably serving errors anyway, so no point. // * Replication lag exceeds unhealthy threshold: This is very rare, so it // isn't worth optimizing the potential 1s of errors away. It will also // go away when vtgate is the only one looking at lag. // * We're in a special state where queryservice should be disabled // despite being non-SPARE: This is not a live serving instance anyway. agent.disallowQueries(tablet.Type, fmt.Sprintf("health-check failure(%v)", healthErr), ) } } // save the health record record := &HealthRecord{ Error: healthErr, ReplicationDelay: replicationDelay, Time: time.Now(), } agent.History.Add(record) // try to figure out the mysql port if we don't have it yet if _, ok := tablet.PortMap["mysql"]; !ok { // we don't know the port, try to get it from mysqld mysqlPort, err := agent.MysqlDaemon.GetMysqlPort() if err != nil { // Don't log if we're already in a waiting-for-mysql state. agent.mutex.Lock() if !agent._waitingForMysql { log.Warningf("Can't get mysql port, won't populate Tablet record in topology (will retry silently at healthcheck interval %v): %v", *healthCheckInterval, err) agent._waitingForMysql = true } agent.mutex.Unlock() } else { log.Infof("Updating tablet mysql port to %v", mysqlPort) _, err := agent.TopoServer.UpdateTabletFields(agent.batchCtx, tablet.Alias, func(tablet *topodatapb.Tablet) error { if err := topotools.CheckOwnership(agent.initialTablet, tablet); err != nil { return err } tablet.PortMap["mysql"] = mysqlPort return nil }) if err != nil { log.Infof("Error updating mysql port in tablet record (will try again at healthcheck interval): %v", err) } else { // save the port so we don't update it again next time // we do the health check. agent.mutex.Lock() agent._tablet.PortMap["mysql"] = mysqlPort agent._waitingForMysql = false agent.mutex.Unlock() } } } // remember our health status agent.mutex.Lock() agent._healthy = healthErr agent._healthyTime = time.Now() agent._replicationDelay = replicationDelay agent.mutex.Unlock() // send it to our observers agent.broadcastHealth() // Update our topo.Server state, start with no change newTabletType := tablet.Type if healthErr != nil { // The tablet is not healthy, let's see what we need to do if tablet.Type != targetTabletType { if tablet.Type != topodatapb.TabletType_SPARE { // we only log if we're not in spare, // as the spare state is normal for a // failed health check. log.Infof("Tablet not healthy and in state %v, not changing it: %v", tablet.Type, healthErr) } return } // Note that if the query service is running, we may // need to stop it. The post-action callback will do // it, and it will be done after we change our state, // so it's the right order, let it do it. log.Infof("Tablet not healthy, converting it from %v to spare: %v", targetTabletType, healthErr) newTabletType = topodatapb.TabletType_SPARE } else { // We are healthy, maybe with health, see if we need // to update the record. We only change from spare to // our target type. if tablet.Type == topodatapb.TabletType_SPARE { newTabletType = targetTabletType } if tablet.Type == newTabletType && topo.IsHealthEqual(health, tablet.HealthMap) { // no change in health, not logging anything, // and we're done return } // we need to update our state log.Infof("Updating tablet record as healthy type %v -> %v with health details %v -> %v", tablet.Type, newTabletType, tablet.HealthMap, health) } // Change the Type, update the health. Note we pass in a map // that's not nil, meaning if it's empty, we will clear it. tablet, err := topotools.ChangeOwnType(agent.batchCtx, agent.TopoServer, agent.initialTablet, newTabletType, health) if err != nil { log.Infof("Error updating tablet record: %v", err) return } // Rebuild the serving graph in our cell, only if we're dealing with // a serving type if err := agent.updateServingGraph(tablet, targetTabletType); err != nil { log.Warningf("updateServingGraph failed (will still run post action callbacks, serving graph might be out of date): %v", err) } // Run the post action callbacks. // Note that this is where we might block for *gracePeriod, depending on the // type of state change. See changeCallback() for details. if err := agent.refreshTablet(agent.batchCtx, "healthcheck"); err != nil { log.Warningf("refreshTablet failed: %v", err) } }
// changeCallback is run after every action that might // have changed something in the tablet record or in the topology. // // It owns making changes to the BinlogPlayerMap. The input for this is the // tablet type (has to be master), and the shard's SourceShards. // // It owns updating the blacklisted tables. // // It owns updating the stats record for 'TabletType'. // // It owns starting and stopping the update stream service. // // It owns reading the TabletControl for the current tablet, and storing it. func (agent *ActionAgent) changeCallback(ctx context.Context, oldTablet, newTablet *topodatapb.Tablet) { span := trace.NewSpanFromContext(ctx) span.StartLocal("ActionAgent.changeCallback") defer span.Finish() allowQuery := topo.IsRunningQueryService(newTablet.Type) broadcastHealth := false runUpdateStream := allowQuery // Read the shard to get SourceShards / TabletControlMap if // we're going to use it. var shardInfo *topo.ShardInfo var err error var disallowQueryReason string var blacklistedTables []string updateBlacklistedTables := true if allowQuery { shardInfo, err = agent.TopoServer.GetShard(ctx, newTablet.Keyspace, newTablet.Shard) if err != nil { log.Errorf("Cannot read shard for this tablet %v, might have inaccurate SourceShards and TabletControls: %v", newTablet.Alias, err) updateBlacklistedTables = false } else { if newTablet.Type == topodatapb.TabletType_MASTER { if len(shardInfo.SourceShards) > 0 { allowQuery = false disallowQueryReason = "master tablet with filtered replication on" } } if tc := shardInfo.GetTabletControl(newTablet.Type); tc != nil { if topo.InCellList(newTablet.Alias.Cell, tc.Cells) { if tc.DisableQueryService { allowQuery = false disallowQueryReason = "TabletControl.DisableQueryService set" } blacklistedTables = tc.BlacklistedTables } } } } else { disallowQueryReason = fmt.Sprintf("not a serving tablet type(%v)", newTablet.Type) } agent.setServicesDesiredState(disallowQueryReason, runUpdateStream) if updateBlacklistedTables { if err := agent.loadBlacklistRules(newTablet, blacklistedTables); err != nil { // FIXME(alainjobart) how to handle this error? log.Errorf("Cannot update blacklisted tables rule: %v", err) } else { agent.setBlacklistedTables(blacklistedTables) } } if allowQuery { // Query service should be running. if oldTablet.Type == topodatapb.TabletType_REPLICA && newTablet.Type == topodatapb.TabletType_MASTER { // When promoting from replica to master, allow both master and replica // queries to be served during gracePeriod. if _, err := agent.QueryServiceControl.SetServingType(newTablet.Type, true, []topodatapb.TabletType{oldTablet.Type}); err == nil { // If successful, broadcast to vtgate and then wait. agent.broadcastHealth() time.Sleep(*gracePeriod) } else { log.Errorf("Can't start query service for MASTER+REPLICA mode: %v", err) } } if stateChanged, err := agent.QueryServiceControl.SetServingType(newTablet.Type, true, nil); err == nil { // If the state changed, broadcast to vtgate. // (e.g. this happens when the tablet was already master, but it just // changed from NOT_SERVING to SERVING due to // "vtctl MigrateServedFrom ... master".) if stateChanged { broadcastHealth = true } } else { runUpdateStream = false log.Errorf("Cannot start query service: %v", err) } } else { // Query service should be stopped. if topo.IsSubjectToLameduck(oldTablet.Type) && newTablet.Type == topodatapb.TabletType_SPARE && *gracePeriod > 0 { // When a non-MASTER serving type is going SPARE, // put query service in lameduck during gracePeriod. agent.lameduck(disallowQueryReason) } log.Infof("Disabling query service on type change, reason: %v", disallowQueryReason) if stateChanged, err := agent.QueryServiceControl.SetServingType(newTablet.Type, false, nil); err == nil { // If the state changed, broadcast to vtgate. // (e.g. this happens when the tablet was already master, but it just // changed from SERVING to NOT_SERVING because filtered replication was // enabled.) if stateChanged { broadcastHealth = true } } else { log.Errorf("SetServingType(serving=false) failed: %v", err) } } // update stream needs to be started or stopped too if topo.IsRunningUpdateStream(newTablet.Type) && runUpdateStream { agent.UpdateStream.Enable() } else { agent.UpdateStream.Disable() } // upate the stats to our current type if agent.exportStats { agent.statsTabletType.Set(topoproto.TabletTypeLString(newTablet.Type)) } // See if we need to start or stop any binlog player if agent.BinlogPlayerMap != nil { if newTablet.Type == topodatapb.TabletType_MASTER { agent.BinlogPlayerMap.RefreshMap(agent.batchCtx, newTablet, shardInfo) } else { agent.BinlogPlayerMap.StopAllPlayersAndReset() } } // Broadcast health changes to vtgate immediately. if broadcastHealth { agent.broadcastHealth() } }
// changeCallback is run after every action that might // have changed something in the tablet record or in the topology. func (agent *ActionAgent) changeCallback(ctx context.Context, oldTablet, newTablet *pbt.Tablet) error { span := trace.NewSpanFromContext(ctx) span.StartLocal("ActionAgent.changeCallback") defer span.Finish() allowQuery := topo.IsRunningQueryService(newTablet.Type) // Read the shard to get SourceShards / TabletControlMap if // we're going to use it. var shardInfo *topo.ShardInfo var tabletControl *pbt.Shard_TabletControl var blacklistedTables []string var err error var disallowQueryReason string if allowQuery { shardInfo, err = agent.TopoServer.GetShard(ctx, newTablet.Keyspace, newTablet.Shard) if err != nil { log.Errorf("Cannot read shard for this tablet %v, might have inaccurate SourceShards and TabletControls: %v", newTablet.Alias, err) } else { if newTablet.Type == pbt.TabletType_MASTER { if len(shardInfo.SourceShards) > 0 { allowQuery = false disallowQueryReason = "master tablet with filtered replication on" } } if tc := shardInfo.GetTabletControl(newTablet.Type); tc != nil { if topo.InCellList(newTablet.Alias.Cell, tc.Cells) { if tc.DisableQueryService { allowQuery = false disallowQueryReason = "query service disabled by tablet control" } blacklistedTables = tc.BlacklistedTables tabletControl = tc } } } } else { disallowQueryReason = fmt.Sprintf("not a serving tablet type(%v)", newTablet.Type) } if allowQuery { if err := agent.allowQueries(newTablet, blacklistedTables); err != nil { log.Errorf("Cannot start query service: %v", err) } } else { agent.disallowQueries(newTablet, disallowQueryReason) } // save the tabletControl we've been using, so the background // healthcheck makes the same decisions as we've been making. agent.setTabletControl(tabletControl) // update stream needs to be started or stopped too if topo.IsRunningUpdateStream(newTablet.Type) { agent.UpdateStream.Enable() } else { agent.UpdateStream.Disable() } statsType.Set(strings.ToLower(newTablet.Type.String())) statsKeyspace.Set(newTablet.Keyspace) statsShard.Set(newTablet.Shard) if newTablet.KeyRange != nil { statsKeyRangeStart.Set(hex.EncodeToString(newTablet.KeyRange.Start)) statsKeyRangeEnd.Set(hex.EncodeToString(newTablet.KeyRange.End)) } else { statsKeyRangeStart.Set("") statsKeyRangeEnd.Set("") } // See if we need to start or stop any binlog player if agent.BinlogPlayerMap != nil { if newTablet.Type == pbt.TabletType_MASTER { // Read the keyspace on masters to get // ShardingColumnType, for binlog replication, // only if source shards are set. var keyspaceInfo *topo.KeyspaceInfo if shardInfo != nil && len(shardInfo.SourceShards) > 0 { keyspaceInfo, err = agent.TopoServer.GetKeyspace(ctx, newTablet.Keyspace) if err != nil { keyspaceInfo = nil } } agent.BinlogPlayerMap.RefreshMap(agent.batchCtx, newTablet, keyspaceInfo, shardInfo) } else { agent.BinlogPlayerMap.StopAllPlayersAndReset() } } return nil }
// changeCallback is run after every action that might // have changed something in the tablet record or in the topology. // // It owns making changes to the BinlogPlayerMap. The input for this is the // tablet type (has to be master), and the shard's SourceShards. // // It owns updating the blacklisted tables. // // It owns updating the stats record for 'TabletType'. // // It owns starting and stopping the update stream service. // // It owns reading the TabletControl for the current tablet, and storing it. func (agent *ActionAgent) changeCallback(ctx context.Context, oldTablet, newTablet *topodatapb.Tablet) error { span := trace.NewSpanFromContext(ctx) span.StartLocal("ActionAgent.changeCallback") defer span.Finish() allowQuery := topo.IsRunningQueryService(newTablet.Type) // Read the shard to get SourceShards / TabletControlMap if // we're going to use it. var shardInfo *topo.ShardInfo var tabletControl *topodatapb.Shard_TabletControl var err error var disallowQueryReason string var blacklistedTables []string updateBlacklistedTables := true if allowQuery { shardInfo, err = agent.TopoServer.GetShard(ctx, newTablet.Keyspace, newTablet.Shard) if err != nil { log.Errorf("Cannot read shard for this tablet %v, might have inaccurate SourceShards and TabletControls: %v", newTablet.Alias, err) updateBlacklistedTables = false } else { if newTablet.Type == topodatapb.TabletType_MASTER { if len(shardInfo.SourceShards) > 0 { allowQuery = false disallowQueryReason = "master tablet with filtered replication on" } } if tc := shardInfo.GetTabletControl(newTablet.Type); tc != nil { if topo.InCellList(newTablet.Alias.Cell, tc.Cells) { if tc.DisableQueryService { allowQuery = false disallowQueryReason = "query service disabled by tablet control" } blacklistedTables = tc.BlacklistedTables tabletControl = tc } } } } else { disallowQueryReason = fmt.Sprintf("not a serving tablet type(%v)", newTablet.Type) } if updateBlacklistedTables { if err := agent.loadBlacklistRules(newTablet, blacklistedTables); err != nil { // FIXME(alainjobart) how to handle this error? log.Errorf("Cannot update blacklisted tables rule: %v", err) } } if allowQuery { // Query service should be running. if oldTablet.Type == topodatapb.TabletType_REPLICA && newTablet.Type == topodatapb.TabletType_MASTER { // When promoting from replica to master, allow both master and replica // queries to be served during gracePeriod. if err := agent.QueryServiceControl.SetServingType(newTablet.Type, true, []topodatapb.TabletType{oldTablet.Type}); err != nil { log.Errorf("Can't start query service for MASTER+REPLICA mode: %v", err) } else { // If successful, broadcast to vtgate and then wait. agent.broadcastHealth() time.Sleep(*gracePeriod) } } if err := agent.allowQueries(newTablet.Type); err != nil { log.Errorf("Cannot start query service: %v", err) } } else { // Query service should be stopped. if (oldTablet.Type == topodatapb.TabletType_REPLICA || oldTablet.Type == topodatapb.TabletType_RDONLY) && newTablet.Type == topodatapb.TabletType_SPARE { // When a non-MASTER serving type is going SPARE, // put query service in lameduck during gracePeriod. agent.enterLameduck(disallowQueryReason) agent.broadcastHealth() time.Sleep(*gracePeriod) } agent.disallowQueries(newTablet.Type, disallowQueryReason) } // save the tabletControl we've been using, so the background // healthcheck makes the same decisions as we've been making. agent.setTabletControl(tabletControl) // update stream needs to be started or stopped too if topo.IsRunningUpdateStream(newTablet.Type) { agent.UpdateStream.Enable() } else { agent.UpdateStream.Disable() } // upate the stats to our current type if agent.exportStats { agent.statsTabletType.Set(strings.ToLower(newTablet.Type.String())) } // See if we need to start or stop any binlog player if agent.BinlogPlayerMap != nil { if newTablet.Type == topodatapb.TabletType_MASTER { agent.BinlogPlayerMap.RefreshMap(agent.batchCtx, newTablet, shardInfo) } else { agent.BinlogPlayerMap.StopAllPlayersAndReset() } } return nil }
// runHealthCheck takes the action mutex, runs the health check, // and if we need to change our state, do it. // If we are the master, we don't change our type, healthy or not. // If we are not the master, we change to spare if not healthy, // or to the passed in targetTabletType if healthy. // // Note we only update the topo record if we need to, that is if our type or // health details changed. func (agent *ActionAgent) runHealthCheck(targetTabletType topo.TabletType) { agent.actionMutex.Lock() defer agent.actionMutex.Unlock() // read the current tablet record and tablet control agent.mutex.Lock() tablet := agent._tablet tabletControl := agent._tabletControl agent.mutex.Unlock() // figure out if we should be running the query service shouldQueryServiceBeRunning := false var blacklistedTables []string if topo.IsRunningQueryService(targetTabletType) && agent.BinlogPlayerMap.size() == 0 { shouldQueryServiceBeRunning = true if tabletControl != nil { blacklistedTables = tabletControl.BlacklistedTables if tabletControl.DisableQueryService { shouldQueryServiceBeRunning = false } } } // run the health check typeForHealthCheck := targetTabletType if tablet.Type == topo.TYPE_MASTER { typeForHealthCheck = topo.TYPE_MASTER } replicationDelay, err := agent.HealthReporter.Report(topo.IsSlaveType(typeForHealthCheck), shouldQueryServiceBeRunning) health := make(map[string]string) if err == nil { if replicationDelay > *unhealthyThreshold { err = fmt.Errorf("reported replication lag: %v higher than unhealthy threshold: %v", replicationDelay.Seconds(), unhealthyThreshold.Seconds()) } else if replicationDelay > *degradedThreshold { health[topo.ReplicationLag] = topo.ReplicationLagHigh } } // Figure out if we should be running QueryService, see if we are, // and reconcile. if err != nil { // we are not healthy, we should not be running QueryService shouldQueryServiceBeRunning = false } isQueryServiceRunning := agent.QueryServiceControl.IsServing() if shouldQueryServiceBeRunning { if !isQueryServiceRunning { // we remember this new possible error err = agent.allowQueries(tablet.Tablet, blacklistedTables) } } else { if isQueryServiceRunning { // we are not healthy or should not be running the // query service, shut it down. agent.disallowQueries() } } // save the health record record := &HealthRecord{ Error: err, ReplicationDelay: replicationDelay, Time: time.Now(), } agent.History.Add(record) // try to figure out the mysql port if we don't have it yet if _, ok := tablet.Portmap["mysql"]; !ok { // we don't know the port, try to get it from mysqld mysqlPort, err := agent.MysqlDaemon.GetMysqlPort() if err != nil { // Don't log if we're already in a waiting-for-mysql state. agent.mutex.Lock() if !agent._waitingForMysql { log.Warningf("Can't get mysql port, won't populate Tablet record in topology (will retry silently at healthcheck interval %v): %v", *healthCheckInterval, err) agent._waitingForMysql = true } agent.mutex.Unlock() } else { log.Infof("Updating tablet mysql port to %v", mysqlPort) if err := agent.TopoServer.UpdateTabletFields(agent.batchCtx, tablet.Alias, func(tablet *topo.Tablet) error { tablet.Portmap["mysql"] = mysqlPort return nil }); err != nil { log.Infof("Error updating mysql port in tablet record: %v", err) return } // save the port so we don't update it again next time // we do the health check. agent.mutex.Lock() agent._tablet.Portmap["mysql"] = mysqlPort agent._waitingForMysql = false agent.mutex.Unlock() } } // remember our health status agent.mutex.Lock() agent._healthy = err agent._healthyTime = time.Now() agent._replicationDelay = replicationDelay agent.mutex.Unlock() // send it to our observers, after we've updated the tablet state // (Tablet is a pointer, and below we will alter the Tablet // record to be correct. hsr := &actionnode.HealthStreamReply{ Tablet: tablet.Tablet, BinlogPlayerMapSize: agent.BinlogPlayerMap.size(), ReplicationDelay: replicationDelay, } if err != nil { hsr.HealthError = err.Error() } defer agent.BroadcastHealthStreamReply(hsr) // Update our topo.Server state, start with no change newTabletType := tablet.Type if err != nil { // The tablet is not healthy, let's see what we need to do if tablet.Type != targetTabletType { if tablet.Type != topo.TYPE_SPARE { // we only log if we're not in spare, // as the spare state is normal for a // failed health check. log.Infof("Tablet not healthy and in state %v, not changing it: %v", tablet.Type, err) } return } // Note that if the query service is running, we may // need to stop it. The post-action callback will do // it, and it will be done after we change our state, // so it's the right order, let it do it. log.Infof("Tablet not healthy, converting it from %v to spare: %v", targetTabletType, err) newTabletType = topo.TYPE_SPARE } else { // We are healthy, maybe with health, see if we need // to update the record. We only change from spare to // our target type. if tablet.Type == topo.TYPE_SPARE { newTabletType = targetTabletType } if tablet.Type == newTabletType && tablet.IsHealthEqual(health) { // no change in health, not logging anything, // and we're done return } // we need to update our state log.Infof("Updating tablet record as healthy type %v -> %v with health details %v -> %v", tablet.Type, newTabletType, tablet.Health, health) agent.lastHealthMapCount.Set(int64(len(health))) } // Change the Type, update the health. Note we pass in a map // that's not nil, meaning if it's empty, we will clear it. if err := topotools.ChangeType(agent.batchCtx, agent.TopoServer, tablet.Alias, newTabletType, health); err != nil { log.Infof("Error updating tablet record: %v", err) return } tablet.Health = health tablet.Type = newTabletType // Rebuild the serving graph in our cell, only if we're dealing with // a serving type if err := agent.updateServingGraph(tablet, targetTabletType); err != nil { log.Warningf("updateServingGraph failed (will still run post action callbacks, serving graph might be out of date): %v", err) } // run the post action callbacks, not much we can do with returned error if err := agent.refreshTablet(agent.batchCtx, "healthcheck"); err != nil { log.Warningf("refreshTablet failed: %v", err) } }