// terminateHealthChecks is called when we enter lame duck mode.
// We will clean up our state, and set query service to lame duck mode.
// We only do something if we are in a serving state, and not a master.
func (agent *ActionAgent) terminateHealthChecks() {
	agent.actionMutex.Lock()
	defer agent.actionMutex.Unlock()
	log.Info("agent.terminateHealthChecks is starting")

	// read the current tablet record
	tablet := agent.Tablet()
	if !topo.IsSubjectToLameduck(tablet.Type) {
		// If we're MASTER, SPARE, WORKER, etc. then we
		// shouldn't enter lameduck. We only go lameduck to
		// avoid triggering errors on clients.
		log.Infof("Tablet in state %v, not entering lameduck", tablet.Type)
		return
	}

	// Go lameduck for gracePeriod.
	// We've already checked above that we're not MASTER.

	// Enter new lameduck mode for gracePeriod, then shut down
	// queryservice. New lameduck mode means keep accepting
	// queries, but advertise unhealthy. After we return from
	// this synchronous OnTermSync hook, servenv may decide to
	// wait even longer, for the rest of the time specified by its
	// own "-lameduck-period" flag. During that extra period,
	// queryservice will be in old lameduck mode, meaning stay
	// alive but reject new queries.
	agent.lameduck("terminating healthchecks")

	// Note that we only do this if we entered lameduck. In the
	// master case for instance, we want to keep serving until
	// vttablet dies entirely (where else is the client going to
	// go?). After servenv lameduck, the queryservice is stopped
	// from a servenv.OnClose() hook anyway.
	log.Infof("Disabling query service after lameduck in terminating healthchecks")
	agent.QueryServiceControl.SetServingType(tablet.Type, false, nil)
}
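
// For context, terminateHealthChecks is intended to run as part of the
// graceful-shutdown sequence; the comment above refers to it being invoked
// from a synchronous OnTermSync hook. A minimal sketch of that wiring,
// assuming the registration happens in the agent setup code (illustrative
// only, not the actual call site):
//
//	servenv.OnTermSync(agent.terminateHealthChecks)
//
// Because OnTermSync hooks run synchronously on termination, any grace
// period spent inside agent.lameduck delays process shutdown accordingly.
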
// runHealthCheckLocked runs the health check and reconciles the query
// service, update stream, and tablet record state with the result.
// The caller must hold the actionMutex.
func (agent *ActionAgent) runHealthCheckLocked() {
	// read the current tablet record and tablet control
	agent.mutex.Lock()
	tablet := proto.Clone(agent._tablet).(*topodatapb.Tablet)
	shouldBeServing := agent._disallowQueryService == ""
	runUpdateStream := agent._enableUpdateStream
	ignoreErrorExpr := agent._ignoreHealthErrorExpr
	agent.mutex.Unlock()

	// run the health check
	record := &HealthRecord{}
	isSlaveType := true
	if tablet.Type == topodatapb.TabletType_MASTER {
		isSlaveType = false
	}

	// Remember the health error as healthErr to be sure we don't
	// accidentally overwrite it with some other err.
	replicationDelay, healthErr := agent.HealthReporter.Report(isSlaveType, shouldBeServing)
	if healthErr != nil && ignoreErrorExpr != nil && ignoreErrorExpr.MatchString(healthErr.Error()) {
		// we need to ignore this health error
		record.IgnoredError = healthErr
		record.IgnoreErrorExpr = ignoreErrorExpr.String()
		healthErr = nil
	}
	if healthErr == health.ErrSlaveNotRunning {
		// The slave is not running, so we just don't know the
		// delay. Use a maximum delay, so we can let vtgate
		// find the right replica, instead of erroring out.
		// (this works because the check below uses a strict > operator).
		replicationDelay = *unhealthyThreshold
		healthErr = nil
	}
	if healthErr == nil {
		if replicationDelay > *unhealthyThreshold {
			healthErr = fmt.Errorf("reported replication lag: %v higher than unhealthy threshold: %v", replicationDelay.Seconds(), unhealthyThreshold.Seconds())
		}
	}

	// Figure out if we should be running QueryService, see if we are,
	// and reconcile.
	if healthErr != nil {
		if tablet.Type != topodatapb.TabletType_WORKER {
			// We are not healthy and must shut down QueryService.
			// At the moment, the only exception to this is "worker"
			// tablets, which still must serve queries, e.g. as a
			// source tablet during a "SplitClone".
			shouldBeServing = false
		}
	}
	isServing := agent.QueryServiceControl.IsServing()
	if shouldBeServing {
		if !isServing {
			// If starting queryservice fails, that's our
			// new reason for being unhealthy.
			//
			// We don't care if the QueryService state actually
			// changed because we'll broadcast the latest health
			// status after this immediately anyway.
			_ /* state changed */, healthErr = agent.QueryServiceControl.SetServingType(tablet.Type, true, nil)
			if healthErr == nil {
				// we were unhealthy, are now healthy,
				// make sure we have the right mysql port.
				if updatedTablet := agent.checkTabletMysqlPort(agent.batchCtx, tablet); updatedTablet != nil {
					agent.setTablet(updatedTablet)
					tablet = updatedTablet
				}
			}
		}
	} else {
		if isServing {
			// We are not healthy or should not be running
			// the query service.

			// First enter lameduck during gracePeriod to
			// limit client errors.
			if topo.IsSubjectToLameduck(tablet.Type) && *gracePeriod > 0 {
				agent.lameduck("health check failed")
			}

			// We don't care if the QueryService state actually
			// changed because we'll broadcast the latest health
			// status after this immediately anyway.
log.Infof("Disabling query service because of health-check failure: %v", healthErr) if _ /* state changed */, err := agent.QueryServiceControl.SetServingType(tablet.Type, false, nil); err != nil { log.Errorf("SetServingType(serving=false) failed: %v", err) } } } // change UpdateStream state if necessary if healthErr != nil { runUpdateStream = false } if topo.IsRunningUpdateStream(tablet.Type) && runUpdateStream { agent.UpdateStream.Enable() } else { agent.UpdateStream.Disable() } // save the health record record.Time = time.Now() record.Error = healthErr record.ReplicationDelay = replicationDelay agent.History.Add(record) // try to figure out the mysql port if we don't have it yet if _, ok := tablet.PortMap["mysql"]; !ok && !agent.skipMysqlPortCheck { // we don't know the port, try to get it from mysqld mysqlPort, err := agent.MysqlDaemon.GetMysqlPort() if err != nil { // Don't log if we're already in a waiting-for-mysql state. agent.mutex.Lock() if !agent._waitingForMysql { log.Warningf("Can't get mysql port, won't populate Tablet record in topology (will retry silently at healthcheck interval %v): %v", *healthCheckInterval, err) agent._waitingForMysql = true } agent.mutex.Unlock() } else { log.Infof("Updating tablet mysql port to %v", mysqlPort) _, err := agent.TopoServer.UpdateTabletFields(agent.batchCtx, tablet.Alias, func(tablet *topodatapb.Tablet) error { if err := topotools.CheckOwnership(agent.initialTablet, tablet); err != nil { return err } tablet.PortMap["mysql"] = mysqlPort return nil }) if err != nil { log.Infof("Error updating mysql port in tablet record (will try again at healthcheck interval): %v", err) } else { // save the port so we don't update it again next time // we do the health check. agent.mutex.Lock() agent._tablet.PortMap["mysql"] = mysqlPort agent._waitingForMysql = false agent.mutex.Unlock() } } } // remember our health status agent.mutex.Lock() agent._healthy = healthErr agent._healthyTime = time.Now() agent._replicationDelay = replicationDelay agent.mutex.Unlock() // send it to our observers agent.broadcastHealth() }
// changeCallback is run after every action that might
// have changed something in the tablet record or in the topology.
//
// It owns making changes to the BinlogPlayerMap. The inputs for this are
// the tablet type (it has to be master) and the shard's SourceShards.
//
// It owns updating the blacklisted tables.
//
// It owns updating the stats record for 'TabletType'.
//
// It owns starting and stopping the update stream service.
//
// It owns reading the TabletControl for the current tablet, and storing it.
func (agent *ActionAgent) changeCallback(ctx context.Context, oldTablet, newTablet *topodatapb.Tablet) {
	span := trace.NewSpanFromContext(ctx)
	span.StartLocal("ActionAgent.changeCallback")
	defer span.Finish()

	allowQuery := topo.IsRunningQueryService(newTablet.Type)
	broadcastHealth := false
	runUpdateStream := allowQuery

	// Read the shard to get SourceShards / TabletControlMap if
	// we're going to use it.
	var shardInfo *topo.ShardInfo
	var err error
	var disallowQueryReason string
	var blacklistedTables []string
	updateBlacklistedTables := true
	if allowQuery {
		shardInfo, err = agent.TopoServer.GetShard(ctx, newTablet.Keyspace, newTablet.Shard)
		if err != nil {
			log.Errorf("Cannot read shard for this tablet %v, might have inaccurate SourceShards and TabletControls: %v", newTablet.Alias, err)
			updateBlacklistedTables = false
		} else {
			if newTablet.Type == topodatapb.TabletType_MASTER {
				if len(shardInfo.SourceShards) > 0 {
					allowQuery = false
					disallowQueryReason = "master tablet with filtered replication on"
				}
			}
			if tc := shardInfo.GetTabletControl(newTablet.Type); tc != nil {
				if topo.InCellList(newTablet.Alias.Cell, tc.Cells) {
					if tc.DisableQueryService {
						allowQuery = false
						disallowQueryReason = "TabletControl.DisableQueryService set"
					}
					blacklistedTables = tc.BlacklistedTables
				}
			}
		}
	} else {
		disallowQueryReason = fmt.Sprintf("not a serving tablet type (%v)", newTablet.Type)
	}
	agent.setServicesDesiredState(disallowQueryReason, runUpdateStream)
	if updateBlacklistedTables {
		if err := agent.loadBlacklistRules(newTablet, blacklistedTables); err != nil {
			// FIXME(alainjobart) how to handle this error?
			log.Errorf("Cannot update blacklisted tables rule: %v", err)
		} else {
			agent.setBlacklistedTables(blacklistedTables)
		}
	}

	if allowQuery {
		// Query service should be running.
		if oldTablet.Type == topodatapb.TabletType_REPLICA &&
			newTablet.Type == topodatapb.TabletType_MASTER {
			// When promoting from replica to master, allow both master and replica
			// queries to be served during gracePeriod.
			if _, err := agent.QueryServiceControl.SetServingType(newTablet.Type, true,
				[]topodatapb.TabletType{oldTablet.Type}); err == nil {
				// If successful, broadcast to vtgate and then wait.
				agent.broadcastHealth()
				time.Sleep(*gracePeriod)
			} else {
				log.Errorf("Can't start query service for MASTER+REPLICA mode: %v", err)
			}
		}

		if stateChanged, err := agent.QueryServiceControl.SetServingType(newTablet.Type, true, nil); err == nil {
			// If the state changed, broadcast to vtgate.
			// (e.g. this happens when the tablet was already master, but it just
			// changed from NOT_SERVING to SERVING due to
			// "vtctl MigrateServedFrom ... master".)
			if stateChanged {
				broadcastHealth = true
			}
		} else {
			runUpdateStream = false
			log.Errorf("Cannot start query service: %v", err)
		}
	} else {
		// Query service should be stopped.
		if topo.IsSubjectToLameduck(oldTablet.Type) &&
			newTablet.Type == topodatapb.TabletType_SPARE &&
			*gracePeriod > 0 {
			// When a non-MASTER serving type is going SPARE,
			// put query service in lameduck during gracePeriod.
			agent.lameduck(disallowQueryReason)
		}

		log.Infof("Disabling query service on type change, reason: %v", disallowQueryReason)
		if stateChanged, err := agent.QueryServiceControl.SetServingType(newTablet.Type, false, nil); err == nil {
			// If the state changed, broadcast to vtgate.
			// (e.g. this happens when the tablet was already master, but it just
			// changed from SERVING to NOT_SERVING because filtered replication was
			// enabled.)
			if stateChanged {
				broadcastHealth = true
			}
		} else {
			log.Errorf("SetServingType(serving=false) failed: %v", err)
		}
	}

	// update stream needs to be started or stopped too
	if topo.IsRunningUpdateStream(newTablet.Type) && runUpdateStream {
		agent.UpdateStream.Enable()
	} else {
		agent.UpdateStream.Disable()
	}

	// update the stats to our current type
	if agent.exportStats {
		agent.statsTabletType.Set(topoproto.TabletTypeLString(newTablet.Type))
	}

	// See if we need to start or stop any binlog player
	if agent.BinlogPlayerMap != nil {
		if newTablet.Type == topodatapb.TabletType_MASTER {
			agent.BinlogPlayerMap.RefreshMap(agent.batchCtx, newTablet, shardInfo)
		} else {
			agent.BinlogPlayerMap.StopAllPlayersAndReset()
		}
	}

	// Broadcast health changes to vtgate immediately.
	if broadcastHealth {
		agent.broadcastHealth()
	}
}
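
// For reference, the REPLICA->MASTER promotion path above keeps both types
// serving during gracePeriod by passing the old type in SetServingType's
// third argument (the list of additionally-allowed tablet types). A sketch
// of that call shape (illustrative only):
//
//	// Serve MASTER queries, but keep accepting REPLICA queries as well,
//	// until the follow-up SetServingType(MASTER, true, nil) narrows it.
//	agent.QueryServiceControl.SetServingType(
//		topodatapb.TabletType_MASTER, true,
//		[]topodatapb.TabletType{topodatapb.TabletType_REPLICA})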