Ejemplo n.º 1
0
func (agent *ActionAgent) runHealthCheckLocked() {
	// read the current tablet record and tablet control
	agent.mutex.Lock()
	tablet := proto.Clone(agent._tablet).(*topodatapb.Tablet)
	shouldBeServing := agent._disallowQueryService == ""
	runUpdateStream := agent._enableUpdateStream
	ignoreErrorExpr := agent._ignoreHealthErrorExpr
	agent.mutex.Unlock()

	// run the health check
	record := &HealthRecord{}
	isSlaveType := true
	if tablet.Type == topodatapb.TabletType_MASTER {
		isSlaveType = false
	}

	// Remember the health error as healthErr to be sure we don't
	// accidentally overwrite it with some other err.
	replicationDelay, healthErr := agent.HealthReporter.Report(isSlaveType, shouldBeServing)
	if healthErr != nil && ignoreErrorExpr != nil &&
		ignoreErrorExpr.MatchString(healthErr.Error()) {
		// we need to ignore this health error
		record.IgnoredError = healthErr
		record.IgnoreErrorExpr = ignoreErrorExpr.String()
		healthErr = nil
	}
	if healthErr == health.ErrSlaveNotRunning {
		// The slave is not running, so we just don't know the
		// delay.  Use a maximum delay, so we can let vtgate
		// find the right replica, instead of erroring out.
		// (this works as the check below is a strict > operator).
		replicationDelay = *unhealthyThreshold
		healthErr = nil
	}
	if healthErr == nil {
		if replicationDelay > *unhealthyThreshold {
			healthErr = fmt.Errorf("reported replication lag: %v higher than unhealthy threshold: %v", replicationDelay.Seconds(), unhealthyThreshold.Seconds())
		}
	}

	// Figure out if we should be running QueryService, see if we are,
	// and reconcile.
	if healthErr != nil {
		if tablet.Type != topodatapb.TabletType_WORKER {
			// We are not healthy and must shut down QueryService.
			// At the moment, the only exception to this are "worker" tablets which
			// still must serve queries e.g. as source tablet during a "SplitClone".
			shouldBeServing = false
		}
	}
	isServing := agent.QueryServiceControl.IsServing()
	if shouldBeServing {
		if !isServing {
			// If starting queryservice fails, that's our
			// new reason for being unhealthy.
			//
			// We don't care if the QueryService state actually
			// changed because we'll broadcast the latest health
			// status after this immediately anway.
			_ /* state changed */, healthErr = agent.QueryServiceControl.SetServingType(tablet.Type, true, nil)

			if healthErr == nil {
				// we were unhealthy, are now healthy,
				// make sure we have the right mysql port.
				if updatedTablet := agent.checkTabletMysqlPort(agent.batchCtx, tablet); updatedTablet != nil {
					agent.setTablet(updatedTablet)
					tablet = updatedTablet
				}
			}
		}
	} else {
		if isServing {
			// We are not healthy or should not be running
			// the query service.

			// First enter lameduck during gracePeriod to
			// limit client errors.
			if topo.IsSubjectToLameduck(tablet.Type) && *gracePeriod > 0 {
				agent.lameduck("health check failed")
			}

			// We don't care if the QueryService state actually
			// changed because we'll broadcast the latest health
			// status after this immediately anway.
			log.Infof("Disabling query service because of health-check failure: %v", healthErr)
			if _ /* state changed */, err := agent.QueryServiceControl.SetServingType(tablet.Type, false, nil); err != nil {
				log.Errorf("SetServingType(serving=false) failed: %v", err)
			}
		}
	}

	// change UpdateStream state if necessary
	if healthErr != nil {
		runUpdateStream = false
	}
	if topo.IsRunningUpdateStream(tablet.Type) && runUpdateStream {
		agent.UpdateStream.Enable()
	} else {
		agent.UpdateStream.Disable()
	}

	// save the health record
	record.Time = time.Now()
	record.Error = healthErr
	record.ReplicationDelay = replicationDelay
	agent.History.Add(record)

	// try to figure out the mysql port if we don't have it yet
	if _, ok := tablet.PortMap["mysql"]; !ok && !agent.skipMysqlPortCheck {
		// we don't know the port, try to get it from mysqld
		mysqlPort, err := agent.MysqlDaemon.GetMysqlPort()
		if err != nil {
			// Don't log if we're already in a waiting-for-mysql state.
			agent.mutex.Lock()
			if !agent._waitingForMysql {
				log.Warningf("Can't get mysql port, won't populate Tablet record in topology (will retry silently at healthcheck interval %v): %v", *healthCheckInterval, err)
				agent._waitingForMysql = true
			}
			agent.mutex.Unlock()
		} else {
			log.Infof("Updating tablet mysql port to %v", mysqlPort)
			_, err := agent.TopoServer.UpdateTabletFields(agent.batchCtx, tablet.Alias,
				func(tablet *topodatapb.Tablet) error {
					if err := topotools.CheckOwnership(agent.initialTablet, tablet); err != nil {
						return err
					}
					tablet.PortMap["mysql"] = mysqlPort
					return nil
				})
			if err != nil {
				log.Infof("Error updating mysql port in tablet record (will try again at healthcheck interval): %v", err)
			} else {
				// save the port so we don't update it again next time
				// we do the health check.
				agent.mutex.Lock()
				agent._tablet.PortMap["mysql"] = mysqlPort
				agent._waitingForMysql = false
				agent.mutex.Unlock()
			}
		}
	}

	// remember our health status
	agent.mutex.Lock()
	agent._healthy = healthErr
	agent._healthyTime = time.Now()
	agent._replicationDelay = replicationDelay
	agent.mutex.Unlock()

	// send it to our observers
	agent.broadcastHealth()
}
Ejemplo n.º 2
0
// runHealthCheck takes the action mutex, runs the health check,
// and if we need to change our state, do it.
// If we are the master, we don't change our type, healthy or not.
// If we are not the master, we change to spare if not healthy,
// or to the passed in targetTabletType if healthy.
//
// Note we only update the topo record if we need to, that is if our type or
// health details changed.
//
// This will not change the BinlogPlayerMap, but if it is not empty,
// we will think we should not be running the query service.
//
// This will not change the TabletControl record, but will use it
// to see if we should be running the query service.
func (agent *ActionAgent) runHealthCheck(targetTabletType topodatapb.TabletType) {
	agent.actionMutex.Lock()
	defer agent.actionMutex.Unlock()

	// read the current tablet record and tablet control
	agent.mutex.Lock()
	tablet := agent._tablet
	tabletControl := agent._tabletControl
	agent.mutex.Unlock()

	// figure out if we should be running the query service
	shouldBeServing := false
	if topo.IsRunningQueryService(targetTabletType) && !agent.BinlogPlayerMap.isRunningFilteredReplication() {
		shouldBeServing = true
		if tabletControl != nil {
			if tabletControl.DisableQueryService {
				shouldBeServing = false
			}
		}
	}

	// run the health check
	isSlaveType := true
	if tablet.Type == topodatapb.TabletType_MASTER {
		isSlaveType = false
	}
	replicationDelay, err := agent.HealthReporter.Report(isSlaveType, shouldBeServing)
	health := make(map[string]string)
	if err == nil {
		if replicationDelay > *unhealthyThreshold {
			err = fmt.Errorf("reported replication lag: %v higher than unhealthy threshold: %v", replicationDelay.Seconds(), unhealthyThreshold.Seconds())
		} else if replicationDelay > *degradedThreshold {
			health[topo.ReplicationLag] = topo.ReplicationLagHigh
		}
	}
	agent.lastHealthMapCount.Set(int64(len(health)))

	// Figure out if we should be running QueryService, see if we are,
	// and reconcile.
	if err != nil {
		if tablet.Type != topodatapb.TabletType_WORKER {
			// We are not healthy and must shut down QueryService.
			// At the moment, the only exception to this are "worker" tablets which
			// still must serve queries e.g. as source tablet during a "SplitClone".
			shouldBeServing = false
		}
	}
	isServing := agent.QueryServiceControl.IsServing()
	if shouldBeServing {
		if !isServing {
			// send the type we want to be, not the type we are
			desiredType := tablet.Type
			if desiredType == topodatapb.TabletType_SPARE {
				desiredType = targetTabletType
			}

			// we remember this new possible error
			err = agent.allowQueries(desiredType)
		}
	} else {
		if isServing {
			// We are not healthy or should not be running the
			// query service, shut it down.
			// Note this is possibly sending 'spare' as
			// the tablet type, we will clean it up later.
			agent.disallowQueries(tablet.Tablet.Type,
				fmt.Sprintf("health-check failure(%v)", err),
			)
		}
	}

	// save the health record
	record := &HealthRecord{
		Error:            err,
		ReplicationDelay: replicationDelay,
		Time:             time.Now(),
	}
	agent.History.Add(record)

	// try to figure out the mysql port if we don't have it yet
	if _, ok := tablet.PortMap["mysql"]; !ok {
		// we don't know the port, try to get it from mysqld
		mysqlPort, err := agent.MysqlDaemon.GetMysqlPort()
		if err != nil {
			// Don't log if we're already in a waiting-for-mysql state.
			agent.mutex.Lock()
			if !agent._waitingForMysql {
				log.Warningf("Can't get mysql port, won't populate Tablet record in topology (will retry silently at healthcheck interval %v): %v", *healthCheckInterval, err)
				agent._waitingForMysql = true
			}
			agent.mutex.Unlock()
		} else {
			log.Infof("Updating tablet mysql port to %v", mysqlPort)
			if err := agent.TopoServer.UpdateTabletFields(agent.batchCtx, tablet.Alias, func(tablet *topodatapb.Tablet) error {
				if err := topotools.CheckOwnership(agent.initialTablet, tablet); err != nil {
					return err
				}
				tablet.PortMap["mysql"] = mysqlPort
				return nil
			}); err != nil {
				log.Infof("Error updating mysql port in tablet record, will try again: %v", err)
				return
			}

			// save the port so we don't update it again next time
			// we do the health check.
			agent.mutex.Lock()
			agent._tablet.PortMap["mysql"] = mysqlPort
			agent._waitingForMysql = false
			agent.mutex.Unlock()
		}
	}

	// remember our health status
	agent.mutex.Lock()
	agent._healthy = err
	agent._healthyTime = time.Now()
	agent._replicationDelay = replicationDelay
	agent.mutex.Unlock()

	// send it to our observers
	agent.broadcastHealth()

	// Update our topo.Server state, start with no change
	newTabletType := tablet.Type
	if err != nil {
		// The tablet is not healthy, let's see what we need to do
		if tablet.Type != targetTabletType {
			if tablet.Type != topodatapb.TabletType_SPARE {
				// we only log if we're not in spare,
				// as the spare state is normal for a
				// failed health check.
				log.Infof("Tablet not healthy and in state %v, not changing it: %v", tablet.Type, err)
			}
			return
		}

		// Note that if the query service is running, we may
		// need to stop it. The post-action callback will do
		// it, and it will be done after we change our state,
		// so it's the right order, let it do it.
		log.Infof("Tablet not healthy, converting it from %v to spare: %v", targetTabletType, err)
		newTabletType = topodatapb.TabletType_SPARE
	} else {
		// We are healthy, maybe with health, see if we need
		// to update the record. We only change from spare to
		// our target type.
		if tablet.Type == topodatapb.TabletType_SPARE {
			newTabletType = targetTabletType
		}
		if tablet.Type == newTabletType && topo.IsHealthEqual(health, tablet.HealthMap) {
			// no change in health, not logging anything,
			// and we're done
			return
		}

		// we need to update our state
		log.Infof("Updating tablet record as healthy type %v -> %v with health details %v -> %v", tablet.Type, newTabletType, tablet.HealthMap, health)
	}

	// Change the Type, update the health. Note we pass in a map
	// that's not nil, meaning if it's empty, we will clear it.
	if err := topotools.ChangeOwnType(agent.batchCtx, agent.TopoServer, agent.initialTablet, newTabletType, health); err != nil {
		log.Infof("Error updating tablet record: %v", err)
		return
	}
	tablet.HealthMap = health
	tablet.Type = newTabletType

	// Rebuild the serving graph in our cell, only if we're dealing with
	// a serving type
	if err := agent.updateServingGraph(tablet, targetTabletType); err != nil {
		log.Warningf("updateServingGraph failed (will still run post action callbacks, serving graph might be out of date): %v", err)
	}

	// run the post action callbacks, not much we can do with returned error
	if err := agent.refreshTablet(agent.batchCtx, "healthcheck"); err != nil {
		log.Warningf("refreshTablet failed: %v", err)
	}
}