Example #1
// terminateHealthChecks is called when we enter lameduck mode.
// We clean up our state and put the query service into lameduck mode.
// We only do something if we are in a serving state and not a master.
func (agent *ActionAgent) terminateHealthChecks() {
	agent.actionMutex.Lock()
	defer agent.actionMutex.Unlock()
	log.Info("agent.terminateHealthChecks is starting")

	// read the current tablet record
	tablet := agent.Tablet()
	if !topo.IsSubjectToLameduck(tablet.Type) {
		// If we're MASTER, SPARE, WORKER, etc., we shouldn't
		// enter lameduck. Lameduck only exists to avoid
		// triggering errors on clients of serving tablets.
		log.Infof("Tablet in state %v, not entering lameduck", tablet.Type)
		return
	}

	// Enter new lameduck mode for gracePeriod, then shut down
	// queryservice; we've already checked above that we're not
	// MASTER.  New lameduck mode means keep accepting queries,
	// but advertise unhealthy.  After we return from this
	// synchronous OnTermSync hook, servenv may decide to wait
	// even longer, for the rest of the time specified by its
	// own "-lameduck-period" flag. During that extra period,
	// queryservice will be in old lameduck mode, meaning stay
	// alive but reject new queries.
	agent.lameduck("terminating healthchecks")

	// Note we only do this now if we entered lameduck. In the
	// master case for instance, we want to keep serving until
	// vttablet dies entirely (where else is the client going to
	// go?).  After servenv lameduck, the queryservice is stopped
	// from a servenv.OnClose() hook anyway.
	log.Infof("Disabling query service after lameduck in terminating healthchecks")
	if _ /* state changed */, err := agent.QueryServiceControl.SetServingType(tablet.Type, false, nil); err != nil {
		log.Errorf("SetServingType(serving=false) failed: %v", err)
	}
}
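
The helper agent.lameduck used above is not shown in these examples. Below is a minimal sketch of what it plausibly does, inferred from the comments here and from its uses in Examples #2 and #3 (enter "new" lameduck on the query service, broadcast the new health state, then wait out gracePeriod); the EnterLameduck method and the log messages are assumptions, not the actual vttablet implementation.

// lameduck puts the query service into "new" lameduck mode (keep
// serving, advertise unhealthy), notifies health watchers, and blocks
// for gracePeriod before returning. Sketch only: EnterLameduck() is an
// assumed method on the query service controller.
func (agent *ActionAgent) lameduck(reason string) {
	log.Infof("Entering lameduck mode for %v, reason: %v", *gracePeriod, reason)
	agent.QueryServiceControl.EnterLameduck()
	agent.broadcastHealth()
	time.Sleep(*gracePeriod)
	log.Info("Leaving lameduck mode")
}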
Example #2
func (agent *ActionAgent) runHealthCheckLocked() {
	// read the current tablet record and tablet control
	agent.mutex.Lock()
	tablet := proto.Clone(agent._tablet).(*topodatapb.Tablet)
	shouldBeServing := agent._disallowQueryService == ""
	runUpdateStream := agent._enableUpdateStream
	ignoreErrorExpr := agent._ignoreHealthErrorExpr
	agent.mutex.Unlock()

	// run the health check
	record := &HealthRecord{}
	isSlaveType := tablet.Type != topodatapb.TabletType_MASTER

	// Remember the health error as healthErr to be sure we don't
	// accidentally overwrite it with some other err.
	replicationDelay, healthErr := agent.HealthReporter.Report(isSlaveType, shouldBeServing)
	if healthErr != nil && ignoreErrorExpr != nil &&
		ignoreErrorExpr.MatchString(healthErr.Error()) {
		// we need to ignore this health error
		record.IgnoredError = healthErr
		record.IgnoreErrorExpr = ignoreErrorExpr.String()
		healthErr = nil
	}
	if healthErr == health.ErrSlaveNotRunning {
		// The slave is not running, so we just don't know the
		// delay.  Use a maximum delay, so we can let vtgate
		// find the right replica, instead of erroring out.
		// (this works as the check below is a strict > operator).
		replicationDelay = *unhealthyThreshold
		healthErr = nil
	}
	if healthErr == nil && replicationDelay > *unhealthyThreshold {
		healthErr = fmt.Errorf("reported replication lag: %v higher than unhealthy threshold: %v", replicationDelay.Seconds(), unhealthyThreshold.Seconds())
	}

	// Figure out if we should be running QueryService, see if we are,
	// and reconcile.
	if healthErr != nil && tablet.Type != topodatapb.TabletType_WORKER {
		// We are not healthy and must shut down QueryService.
		// The only exception is "worker" tablets, which still must
		// serve queries, e.g. as the source tablet during a "SplitClone".
		shouldBeServing = false
	}
	isServing := agent.QueryServiceControl.IsServing()
	if shouldBeServing {
		if !isServing {
			// If starting queryservice fails, that's our
			// new reason for being unhealthy.
			//
			// We don't care if the QueryService state actually
			// changed because we'll broadcast the latest health
			// status immediately after this anyway.
			_ /* state changed */, healthErr = agent.QueryServiceControl.SetServingType(tablet.Type, true, nil)

			if healthErr == nil {
				// We were unhealthy and are now healthy;
				// make sure we have the right mysql port.
				if updatedTablet := agent.checkTabletMysqlPort(agent.batchCtx, tablet); updatedTablet != nil {
					agent.setTablet(updatedTablet)
					tablet = updatedTablet
				}
			}
		}
	} else {
		if isServing {
			// We are not healthy or should not be running
			// the query service.

			// First enter lameduck during gracePeriod to
			// limit client errors.
			if topo.IsSubjectToLameduck(tablet.Type) && *gracePeriod > 0 {
				agent.lameduck("health check failed")
			}

			// We don't care if the QueryService state actually
			// changed because we'll broadcast the latest health
			// status immediately after this anyway.
			log.Infof("Disabling query service because of health-check failure: %v", healthErr)
			if _ /* state changed */, err := agent.QueryServiceControl.SetServingType(tablet.Type, false, nil); err != nil {
				log.Errorf("SetServingType(serving=false) failed: %v", err)
			}
		}
	}

	// change UpdateStream state if necessary
	if healthErr != nil {
		runUpdateStream = false
	}
	if topo.IsRunningUpdateStream(tablet.Type) && runUpdateStream {
		agent.UpdateStream.Enable()
	} else {
		agent.UpdateStream.Disable()
	}

	// save the health record
	record.Time = time.Now()
	record.Error = healthErr
	record.ReplicationDelay = replicationDelay
	agent.History.Add(record)

	// try to figure out the mysql port if we don't have it yet
	if _, ok := tablet.PortMap["mysql"]; !ok && !agent.skipMysqlPortCheck {
		// we don't know the port, try to get it from mysqld
		mysqlPort, err := agent.MysqlDaemon.GetMysqlPort()
		if err != nil {
			// Don't log if we're already in a waiting-for-mysql state.
			agent.mutex.Lock()
			if !agent._waitingForMysql {
				log.Warningf("Can't get mysql port, won't populate Tablet record in topology (will retry silently at healthcheck interval %v): %v", *healthCheckInterval, err)
				agent._waitingForMysql = true
			}
			agent.mutex.Unlock()
		} else {
			log.Infof("Updating tablet mysql port to %v", mysqlPort)
			_, err := agent.TopoServer.UpdateTabletFields(agent.batchCtx, tablet.Alias,
				func(tablet *topodatapb.Tablet) error {
					if err := topotools.CheckOwnership(agent.initialTablet, tablet); err != nil {
						return err
					}
					tablet.PortMap["mysql"] = mysqlPort
					return nil
				})
			if err != nil {
				log.Infof("Error updating mysql port in tablet record (will try again at healthcheck interval): %v", err)
			} else {
				// save the port so we don't update it again next time
				// we do the health check.
				agent.mutex.Lock()
				agent._tablet.PortMap["mysql"] = mysqlPort
				agent._waitingForMysql = false
				agent.mutex.Unlock()
			}
		}
	}

	// remember our health status
	agent.mutex.Lock()
	agent._healthy = healthErr
	agent._healthyTime = time.Now()
	agent._replicationDelay = replicationDelay
	agent.mutex.Unlock()

	// send it to our observers
	agent.broadcastHealth()
}
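
For reference, here is a sketch of the supporting types this function leans on, reconstructed purely from the field assignments and the Report call above; the real definitions in the health package may differ in detail.

// HealthRecord is one entry in the agent's health history, with the
// field set inferred from the assignments in runHealthCheckLocked.
type HealthRecord struct {
	Time             time.Time
	Error            error // effective health error, if any
	IgnoredError     error // error suppressed by the ignore-error regexp
	IgnoreErrorExpr  string // the regexp that matched the ignored error
	ReplicationDelay time.Duration
}

// Reporter is the interface agent.HealthReporter is assumed to satisfy:
// it returns the current replication delay, and an error if the tablet
// should be considered unhealthy.
type Reporter interface {
	Report(isSlaveType, shouldQueryServiceBeRunning bool) (replicationDelay time.Duration, err error)
}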
Example #3
// changeCallback is run after every action that might
// have changed something in the tablet record or in the topology.
//
// It owns making changes to the BinlogPlayerMap. The input for this is the
// tablet type (which must be MASTER) and the shard's SourceShards.
//
// It owns updating the blacklisted tables.
//
// It owns updating the stats record for 'TabletType'.
//
// It owns starting and stopping the update stream service.
//
// It owns reading the TabletControl for the current tablet, and storing it.
func (agent *ActionAgent) changeCallback(ctx context.Context, oldTablet, newTablet *topodatapb.Tablet) {
	span := trace.NewSpanFromContext(ctx)
	span.StartLocal("ActionAgent.changeCallback")
	defer span.Finish()

	allowQuery := topo.IsRunningQueryService(newTablet.Type)
	broadcastHealth := false
	runUpdateStream := allowQuery

	// Read the shard to get SourceShards / TabletControlMap if
	// we're going to use it.
	var shardInfo *topo.ShardInfo
	var err error
	var disallowQueryReason string
	var blacklistedTables []string
	updateBlacklistedTables := true
	if allowQuery {
		shardInfo, err = agent.TopoServer.GetShard(ctx, newTablet.Keyspace, newTablet.Shard)
		if err != nil {
			log.Errorf("Cannot read shard for this tablet %v, might have inaccurate SourceShards and TabletControls: %v", newTablet.Alias, err)
			updateBlacklistedTables = false
		} else {
			if newTablet.Type == topodatapb.TabletType_MASTER && len(shardInfo.SourceShards) > 0 {
				allowQuery = false
				disallowQueryReason = "master tablet with filtered replication on"
			}
			if tc := shardInfo.GetTabletControl(newTablet.Type); tc != nil {
				if topo.InCellList(newTablet.Alias.Cell, tc.Cells) {
					if tc.DisableQueryService {
						allowQuery = false
						disallowQueryReason = "TabletControl.DisableQueryService set"
					}
					blacklistedTables = tc.BlacklistedTables
				}
			}
		}
	} else {
		disallowQueryReason = fmt.Sprintf("not a serving tablet type (%v)", newTablet.Type)
	}
	agent.setServicesDesiredState(disallowQueryReason, runUpdateStream)
	if updateBlacklistedTables {
		if err := agent.loadBlacklistRules(newTablet, blacklistedTables); err != nil {
			// FIXME(alainjobart) how to handle this error?
			log.Errorf("Cannot update blacklisted tables rule: %v", err)
		} else {
			agent.setBlacklistedTables(blacklistedTables)
		}
	}

	if allowQuery {
		// Query service should be running.
		if oldTablet.Type == topodatapb.TabletType_REPLICA &&
			newTablet.Type == topodatapb.TabletType_MASTER {
			// When promoting from replica to master, allow both master and replica
			// queries to be served during gracePeriod.
			if _, err := agent.QueryServiceControl.SetServingType(newTablet.Type,
				true, []topodatapb.TabletType{oldTablet.Type}); err == nil {
				// If successful, broadcast to vtgate and then wait.
				agent.broadcastHealth()
				time.Sleep(*gracePeriod)
			} else {
				log.Errorf("Can't start query service for MASTER+REPLICA mode: %v", err)
			}
		}

		if stateChanged, err := agent.QueryServiceControl.SetServingType(newTablet.Type, true, nil); err == nil {
			// If the state changed, broadcast to vtgate.
			// (e.g. this happens when the tablet was already master, but it just
			// changed from NOT_SERVING to SERVING due to
			// "vtctl MigrateServedFrom ... master".)
			if stateChanged {
				broadcastHealth = true
			}
		} else {
			runUpdateStream = false
			log.Errorf("Cannot start query service: %v", err)
		}
	} else {
		// Query service should be stopped.
		if topo.IsSubjectToLameduck(oldTablet.Type) &&
			newTablet.Type == topodatapb.TabletType_SPARE &&
			*gracePeriod > 0 {
			// When a non-MASTER serving type is going SPARE,
			// put query service in lameduck during gracePeriod.
			agent.lameduck(disallowQueryReason)
		}

		log.Infof("Disabling query service on type change, reason: %v", disallowQueryReason)
		if stateChanged, err := agent.QueryServiceControl.SetServingType(newTablet.Type, false, nil); err == nil {
			// If the state changed, broadcast to vtgate.
			// (e.g. this happens when the tablet was already master, but it just
			// changed from SERVING to NOT_SERVING because filtered replication was
			// enabled.)
			if stateChanged {
				broadcastHealth = true
			}
		} else {
			log.Errorf("SetServingType(serving=false) failed: %v", err)
		}
	}

	// update stream needs to be started or stopped too
	if topo.IsRunningUpdateStream(newTablet.Type) && runUpdateStream {
		agent.UpdateStream.Enable()
	} else {
		agent.UpdateStream.Disable()
	}

	// update the stats to our current type
	if agent.exportStats {
		agent.statsTabletType.Set(topoproto.TabletTypeLString(newTablet.Type))
	}

	// See if we need to start or stop any binlog player
	if agent.BinlogPlayerMap != nil {
		if newTablet.Type == topodatapb.TabletType_MASTER {
			agent.BinlogPlayerMap.RefreshMap(agent.batchCtx, newTablet, shardInfo)
		} else {
			agent.BinlogPlayerMap.StopAllPlayersAndReset()
		}
	}

	// Broadcast health changes to vtgate immediately.
	if broadcastHealth {
		agent.broadcastHealth()
	}
}
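
changeCallback records the desired service state that runHealthCheckLocked (Example #2) later reads under agent.mutex. Here is a minimal sketch of that hand-off, assuming the underscore-prefixed fields read at the top of Example #2 are its only storage; the real method may do more.

// setServicesDesiredState stores why the query service should be off
// (an empty reason means it should be on) and whether the update stream
// should run. Sketch inferred from the reads at the top of
// runHealthCheckLocked.
func (agent *ActionAgent) setServicesDesiredState(disallowQueryReason string, enableUpdateStream bool) {
	agent.mutex.Lock()
	defer agent.mutex.Unlock()
	agent._disallowQueryService = disallowQueryReason
	agent._enableUpdateStream = enableUpdateStream
}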