func (agent *ActionAgent) runHealthCheckLocked() { // read the current tablet record and tablet control agent.mutex.Lock() tablet := proto.Clone(agent._tablet).(*topodatapb.Tablet) shouldBeServing := agent._disallowQueryService == "" runUpdateStream := agent._enableUpdateStream ignoreErrorExpr := agent._ignoreHealthErrorExpr agent.mutex.Unlock() // run the health check record := &HealthRecord{} isSlaveType := true if tablet.Type == topodatapb.TabletType_MASTER { isSlaveType = false } // Remember the health error as healthErr to be sure we don't // accidentally overwrite it with some other err. replicationDelay, healthErr := agent.HealthReporter.Report(isSlaveType, shouldBeServing) if healthErr != nil && ignoreErrorExpr != nil && ignoreErrorExpr.MatchString(healthErr.Error()) { // we need to ignore this health error record.IgnoredError = healthErr record.IgnoreErrorExpr = ignoreErrorExpr.String() healthErr = nil } if healthErr == health.ErrSlaveNotRunning { // The slave is not running, so we just don't know the // delay. Use a maximum delay, so we can let vtgate // find the right replica, instead of erroring out. // (this works as the check below is a strict > operator). replicationDelay = *unhealthyThreshold healthErr = nil } if healthErr == nil { if replicationDelay > *unhealthyThreshold { healthErr = fmt.Errorf("reported replication lag: %v higher than unhealthy threshold: %v", replicationDelay.Seconds(), unhealthyThreshold.Seconds()) } } // Figure out if we should be running QueryService, see if we are, // and reconcile. if healthErr != nil { if tablet.Type != topodatapb.TabletType_WORKER { // We are not healthy and must shut down QueryService. // At the moment, the only exception to this are "worker" tablets which // still must serve queries e.g. as source tablet during a "SplitClone". shouldBeServing = false } } isServing := agent.QueryServiceControl.IsServing() if shouldBeServing { if !isServing { // If starting queryservice fails, that's our // new reason for being unhealthy. // // We don't care if the QueryService state actually // changed because we'll broadcast the latest health // status after this immediately anway. _ /* state changed */, healthErr = agent.QueryServiceControl.SetServingType(tablet.Type, true, nil) if healthErr == nil { // we were unhealthy, are now healthy, // make sure we have the right mysql port. if updatedTablet := agent.checkTabletMysqlPort(agent.batchCtx, tablet); updatedTablet != nil { agent.setTablet(updatedTablet) tablet = updatedTablet } } } } else { if isServing { // We are not healthy or should not be running // the query service. // First enter lameduck during gracePeriod to // limit client errors. if topo.IsSubjectToLameduck(tablet.Type) && *gracePeriod > 0 { agent.lameduck("health check failed") } // We don't care if the QueryService state actually // changed because we'll broadcast the latest health // status after this immediately anway. log.Infof("Disabling query service because of health-check failure: %v", healthErr) if _ /* state changed */, err := agent.QueryServiceControl.SetServingType(tablet.Type, false, nil); err != nil { log.Errorf("SetServingType(serving=false) failed: %v", err) } } } // change UpdateStream state if necessary if healthErr != nil { runUpdateStream = false } if topo.IsRunningUpdateStream(tablet.Type) && runUpdateStream { agent.UpdateStream.Enable() } else { agent.UpdateStream.Disable() } // save the health record record.Time = time.Now() record.Error = healthErr record.ReplicationDelay = replicationDelay agent.History.Add(record) // try to figure out the mysql port if we don't have it yet if _, ok := tablet.PortMap["mysql"]; !ok && !agent.skipMysqlPortCheck { // we don't know the port, try to get it from mysqld mysqlPort, err := agent.MysqlDaemon.GetMysqlPort() if err != nil { // Don't log if we're already in a waiting-for-mysql state. agent.mutex.Lock() if !agent._waitingForMysql { log.Warningf("Can't get mysql port, won't populate Tablet record in topology (will retry silently at healthcheck interval %v): %v", *healthCheckInterval, err) agent._waitingForMysql = true } agent.mutex.Unlock() } else { log.Infof("Updating tablet mysql port to %v", mysqlPort) _, err := agent.TopoServer.UpdateTabletFields(agent.batchCtx, tablet.Alias, func(tablet *topodatapb.Tablet) error { if err := topotools.CheckOwnership(agent.initialTablet, tablet); err != nil { return err } tablet.PortMap["mysql"] = mysqlPort return nil }) if err != nil { log.Infof("Error updating mysql port in tablet record (will try again at healthcheck interval): %v", err) } else { // save the port so we don't update it again next time // we do the health check. agent.mutex.Lock() agent._tablet.PortMap["mysql"] = mysqlPort agent._waitingForMysql = false agent.mutex.Unlock() } } } // remember our health status agent.mutex.Lock() agent._healthy = healthErr agent._healthyTime = time.Now() agent._replicationDelay = replicationDelay agent.mutex.Unlock() // send it to our observers agent.broadcastHealth() }
// runHealthCheck takes the action mutex, runs the health check, // and if we need to change our state, do it. // If we are the master, we don't change our type, healthy or not. // If we are not the master, we change to spare if not healthy, // or to the passed in targetTabletType if healthy. // // Note we only update the topo record if we need to, that is if our type or // health details changed. // // This will not change the BinlogPlayerMap, but if it is not empty, // we will think we should not be running the query service. // // This will not change the TabletControl record, but will use it // to see if we should be running the query service. func (agent *ActionAgent) runHealthCheck(targetTabletType topodatapb.TabletType) { agent.actionMutex.Lock() defer agent.actionMutex.Unlock() // read the current tablet record and tablet control agent.mutex.Lock() tablet := agent._tablet tabletControl := agent._tabletControl agent.mutex.Unlock() // figure out if we should be running the query service shouldBeServing := false if topo.IsRunningQueryService(targetTabletType) && !agent.BinlogPlayerMap.isRunningFilteredReplication() { shouldBeServing = true if tabletControl != nil { if tabletControl.DisableQueryService { shouldBeServing = false } } } // run the health check isSlaveType := true if tablet.Type == topodatapb.TabletType_MASTER { isSlaveType = false } replicationDelay, err := agent.HealthReporter.Report(isSlaveType, shouldBeServing) health := make(map[string]string) if err == nil { if replicationDelay > *unhealthyThreshold { err = fmt.Errorf("reported replication lag: %v higher than unhealthy threshold: %v", replicationDelay.Seconds(), unhealthyThreshold.Seconds()) } else if replicationDelay > *degradedThreshold { health[topo.ReplicationLag] = topo.ReplicationLagHigh } } agent.lastHealthMapCount.Set(int64(len(health))) // Figure out if we should be running QueryService, see if we are, // and reconcile. if err != nil { if tablet.Type != topodatapb.TabletType_WORKER { // We are not healthy and must shut down QueryService. // At the moment, the only exception to this are "worker" tablets which // still must serve queries e.g. as source tablet during a "SplitClone". shouldBeServing = false } } isServing := agent.QueryServiceControl.IsServing() if shouldBeServing { if !isServing { // send the type we want to be, not the type we are desiredType := tablet.Type if desiredType == topodatapb.TabletType_SPARE { desiredType = targetTabletType } // we remember this new possible error err = agent.allowQueries(desiredType) } } else { if isServing { // We are not healthy or should not be running the // query service, shut it down. // Note this is possibly sending 'spare' as // the tablet type, we will clean it up later. agent.disallowQueries(tablet.Tablet.Type, fmt.Sprintf("health-check failure(%v)", err), ) } } // save the health record record := &HealthRecord{ Error: err, ReplicationDelay: replicationDelay, Time: time.Now(), } agent.History.Add(record) // try to figure out the mysql port if we don't have it yet if _, ok := tablet.PortMap["mysql"]; !ok { // we don't know the port, try to get it from mysqld mysqlPort, err := agent.MysqlDaemon.GetMysqlPort() if err != nil { // Don't log if we're already in a waiting-for-mysql state. agent.mutex.Lock() if !agent._waitingForMysql { log.Warningf("Can't get mysql port, won't populate Tablet record in topology (will retry silently at healthcheck interval %v): %v", *healthCheckInterval, err) agent._waitingForMysql = true } agent.mutex.Unlock() } else { log.Infof("Updating tablet mysql port to %v", mysqlPort) if err := agent.TopoServer.UpdateTabletFields(agent.batchCtx, tablet.Alias, func(tablet *topodatapb.Tablet) error { if err := topotools.CheckOwnership(agent.initialTablet, tablet); err != nil { return err } tablet.PortMap["mysql"] = mysqlPort return nil }); err != nil { log.Infof("Error updating mysql port in tablet record, will try again: %v", err) return } // save the port so we don't update it again next time // we do the health check. agent.mutex.Lock() agent._tablet.PortMap["mysql"] = mysqlPort agent._waitingForMysql = false agent.mutex.Unlock() } } // remember our health status agent.mutex.Lock() agent._healthy = err agent._healthyTime = time.Now() agent._replicationDelay = replicationDelay agent.mutex.Unlock() // send it to our observers agent.broadcastHealth() // Update our topo.Server state, start with no change newTabletType := tablet.Type if err != nil { // The tablet is not healthy, let's see what we need to do if tablet.Type != targetTabletType { if tablet.Type != topodatapb.TabletType_SPARE { // we only log if we're not in spare, // as the spare state is normal for a // failed health check. log.Infof("Tablet not healthy and in state %v, not changing it: %v", tablet.Type, err) } return } // Note that if the query service is running, we may // need to stop it. The post-action callback will do // it, and it will be done after we change our state, // so it's the right order, let it do it. log.Infof("Tablet not healthy, converting it from %v to spare: %v", targetTabletType, err) newTabletType = topodatapb.TabletType_SPARE } else { // We are healthy, maybe with health, see if we need // to update the record. We only change from spare to // our target type. if tablet.Type == topodatapb.TabletType_SPARE { newTabletType = targetTabletType } if tablet.Type == newTabletType && topo.IsHealthEqual(health, tablet.HealthMap) { // no change in health, not logging anything, // and we're done return } // we need to update our state log.Infof("Updating tablet record as healthy type %v -> %v with health details %v -> %v", tablet.Type, newTabletType, tablet.HealthMap, health) } // Change the Type, update the health. Note we pass in a map // that's not nil, meaning if it's empty, we will clear it. if err := topotools.ChangeOwnType(agent.batchCtx, agent.TopoServer, agent.initialTablet, newTabletType, health); err != nil { log.Infof("Error updating tablet record: %v", err) return } tablet.HealthMap = health tablet.Type = newTabletType // Rebuild the serving graph in our cell, only if we're dealing with // a serving type if err := agent.updateServingGraph(tablet, targetTabletType); err != nil { log.Warningf("updateServingGraph failed (will still run post action callbacks, serving graph might be out of date): %v", err) } // run the post action callbacks, not much we can do with returned error if err := agent.refreshTablet(agent.batchCtx, "healthcheck"); err != nil { log.Warningf("refreshTablet failed: %v", err) } }