// runHealthCheck takes the action mutex, runs the health check, // and if we need to change our state, do it. // If we are the master, we don't change our type, healthy or not. // If we are not the master, we change to spare if not healthy, // or to the passed in targetTabletType if healthy. // // Note we only update the topo record if we need to, that is if our type or // health details changed. func (agent *ActionAgent) runHealthCheck(targetTabletType topo.TabletType) { agent.actionMutex.Lock() defer agent.actionMutex.Unlock() // read the current tablet record and blacklisted tables agent.mutex.Lock() tablet := agent._tablet blacklistedTables := agent._blacklistedTables agent.mutex.Unlock() // run the health check typeForHealthCheck := targetTabletType if tablet.Type == topo.TYPE_MASTER { typeForHealthCheck = topo.TYPE_MASTER } health, err := health.Run(typeForHealthCheck) // Figure out if we should be running QueryService. If we should, // and we aren't, and we're otherwise healthy, try to start it. if err == nil && topo.IsRunningQueryService(targetTabletType) && agent.BinlogPlayerMap.size() == 0 { err = agent.allowQueries(tablet.Tablet, blacklistedTables) } // save the health record record := &HealthRecord{ Error: err, Result: health, Time: time.Now(), } agent.History.Add(record) // try to figure out the mysql port if we don't have it yet if _, ok := tablet.Portmap["mysql"]; !ok { // we don't know the port, try to get it from mysqld mysqlPort, err := agent.Mysqld.GetMysqlPort() if err != nil { // Don't log if we're already in a waiting-for-mysql state. agent.mutex.Lock() if !agent._waitingForMysql { log.Warningf("Can't get mysql port, won't populate Tablet record in topology (will retry silently at healthcheck interval %v): %v", *healthCheckInterval, err) agent._waitingForMysql = true } agent.mutex.Unlock() } else { log.Infof("Updating tablet mysql port to %v", mysqlPort) if err := agent.TopoServer.UpdateTabletFields(tablet.Alias, func(tablet *topo.Tablet) error { tablet.Portmap["mysql"] = mysqlPort return nil }); err != nil { log.Infof("Error updating mysql port in tablet record: %v", err) return } // save the port so we don't update it again next time // we do the health check. agent.mutex.Lock() agent._tablet.Portmap["mysql"] = mysqlPort agent._waitingForMysql = false agent.mutex.Unlock() } } // Update our topo.Server state, start with no change newTabletType := tablet.Type if err != nil { // The tablet is not healthy, let's see what we need to do if tablet.Type != targetTabletType { if tablet.Type != topo.TYPE_SPARE { // we only log if we're not in spare, // as the spare state is normal for a // failed health check. log.Infof("Tablet not healthy and in state %v, not changing it: %v", tablet.Type, err) } return } // Note that if the query service is running, we may // need to stop it. The post-action callback will do // it, and it will be done after we change our state, // so it's the right order, let it do it. log.Infof("Tablet not healthy, converting it from %v to spare: %v", targetTabletType, err) newTabletType = topo.TYPE_SPARE } else { // We are healthy, maybe with health, see if we need // to update the record. We only change from spare to // our target type. if tablet.Type == topo.TYPE_SPARE { newTabletType = targetTabletType } if tablet.Type == newTabletType && tablet.IsHealthEqual(health) { // no change in health, not logging anything, // and we're done return } // we need to update our state log.Infof("Updating tablet record as healthy type %v -> %v with health details %v -> %v", tablet.Type, newTabletType, tablet.Health, health) agent.lastHealthMapCount.Set(int64(len(health))) } // Change the Type, update the health. Note we pass in a map // that's not nil, meaning if it's empty, we will clear it. if err := topotools.ChangeType(agent.TopoServer, tablet.Alias, newTabletType, health, true /*runHooks*/); err != nil { log.Infof("Error updating tablet record: %v", err) return } // Rebuild the serving graph in our cell, only if we're dealing with // a serving type if err := agent.rebuildShardIfNeeded(tablet, targetTabletType); err != nil { log.Warningf("rebuildShardIfNeeded failed (will still run post action callbacks, serving graph might be out of date): %v", err) } // run the post action callbacks, not much we can do with returned error if err := agent.refreshTablet("healthcheck"); err != nil { log.Warningf("refreshTablet failed: %v", err) } }
// RunHealthCheck takes the action mutex, runs the health check, // and if we need to change our state, do it. // If we are the master, we don't change our type, healthy or not. // If we are not the master, we change to spare if not healthy, // or to the passed in targetTabletType if healthy. // // Note we only update the topo record if we need to, that is if our type or // health details changed. func (agent *ActionAgent) RunHealthCheck(targetTabletType topo.TabletType, lockTimeout time.Duration) { agent.actionMutex.Lock() defer agent.actionMutex.Unlock() // read the current tablet record agent.mutex.Lock() tablet := agent._tablet agent.mutex.Unlock() // run the health check typeForHealthCheck := targetTabletType if tablet.Type == topo.TYPE_MASTER { typeForHealthCheck = topo.TYPE_MASTER } health, err := health.Run(typeForHealthCheck) // start with no change newTabletType := tablet.Type if err != nil { if tablet.Type != targetTabletType { log.Infof("Tablet not healthy and in state %v, not changing it: %v", tablet.Type, err) return } log.Infof("Tablet not healthy, converting it from %v to spare: %v", targetTabletType, err) newTabletType = topo.TYPE_SPARE } else { // We are healthy, maybe with health, see if we need // to update the record. We only change from spare to // our target type. if tablet.Type == topo.TYPE_SPARE { newTabletType = targetTabletType } if tablet.Type == newTabletType && tablet.IsHealthEqual(health) { // no change in health, not logging anything, // and we're done return } // we need to update our state log.Infof("Updating tablet record as healthy type %v -> %v with health details %v -> %v", tablet.Type, newTabletType, tablet.Health, health) } // Change the Type, update the health. Note we pass in a map // that's not nil, meaning if it's empty, we will clear it. if err := topotools.ChangeType(agent.TopoServer, tablet.Alias, newTabletType, health, true /*runHooks*/); err != nil { log.Infof("Error updating tablet record: %v", err) return } // Rebuild the serving graph in our cell, only if we're dealing with // a serving type if topo.IsInServingGraph(targetTabletType) { // TODO: interrupted may need to be a global one closed when we exit interrupted := make(chan struct{}) if *topotools.UseSrvShardLocks { // no need to take the shard lock in this case if err := topotools.RebuildShard(agent.TopoServer, tablet.Keyspace, tablet.Shard, topotools.RebuildShardOptions{Cells: []string{tablet.Alias.Cell}, IgnorePartialResult: true}, lockTimeout, interrupted); err != nil { log.Warningf("topotools.RebuildShard returned an error: %v", err) return } } else { actionNode := actionnode.RebuildShard() lockPath, err := actionNode.LockShard(agent.TopoServer, tablet.Keyspace, tablet.Shard, lockTimeout, interrupted) if err != nil { log.Warningf("Cannot lock shard for rebuild: %v", err) return } err = topotools.RebuildShard(agent.TopoServer, tablet.Keyspace, tablet.Shard, topotools.RebuildShardOptions{Cells: []string{tablet.Alias.Cell}, IgnorePartialResult: true}, lockTimeout, interrupted) err = actionNode.UnlockShard(agent.TopoServer, tablet.Keyspace, tablet.Shard, lockPath, err) if err != nil { log.Warningf("UnlockShard returned an error: %v", err) return } } } // run the post action callbacks agent.afterAction("healthcheck", false /* reloadSchema */) }
// runHealthCheck takes the action mutex, runs the health check, // and if we need to change our state, do it. // If we are the master, we don't change our type, healthy or not. // If we are not the master, we change to spare if not healthy, // or to the passed in targetTabletType if healthy. // // Note we only update the topo record if we need to, that is if our type or // health details changed. func (agent *ActionAgent) runHealthCheck(targetTabletType topo.TabletType, lockTimeout time.Duration) { agent.actionMutex.Lock() defer agent.actionMutex.Unlock() // read the current tablet record agent.mutex.Lock() tablet := agent._tablet agent.mutex.Unlock() // run the health check typeForHealthCheck := targetTabletType if tablet.Type == topo.TYPE_MASTER { typeForHealthCheck = topo.TYPE_MASTER } health, err := health.Run(typeForHealthCheck) // Figure out if we should be running QueryService. If we should, // and we aren't, and we're otherwise healthy, try to start it if err == nil && topo.IsRunningQueryService(targetTabletType) { err = agent.allowQueries(tablet.Tablet) } // save the health record record := &HealthRecord{ Error: err, Result: health, Time: time.Now(), } agent.History.Add(record) // Update our topo.Server state, start with no change newTabletType := tablet.Type if err != nil { // The tablet is not healthy, let's see what we need to do if tablet.Type != targetTabletType { if tablet.Type != topo.TYPE_SPARE { // we onyl log if we're not in spare, // as the spare state is normal for a // failed health check. log.Infof("Tablet not healthy and in state %v, not changing it: %v", tablet.Type, err) } return } // Note that if the query service is running, we may // need to stop it. The post-action callback will do // it, and it will be done after we change our state, // so it's the right order, let it do it. log.Infof("Tablet not healthy, converting it from %v to spare: %v", targetTabletType, err) newTabletType = topo.TYPE_SPARE } else { // We are healthy, maybe with health, see if we need // to update the record. We only change from spare to // our target type. if tablet.Type == topo.TYPE_SPARE { newTabletType = targetTabletType } if tablet.Type == newTabletType && tablet.IsHealthEqual(health) { // no change in health, not logging anything, // and we're done return } // we need to update our state log.Infof("Updating tablet record as healthy type %v -> %v with health details %v -> %v", tablet.Type, newTabletType, tablet.Health, health) agent.lastHealthMapCount.Set(int64(len(health))) } // Change the Type, update the health. Note we pass in a map // that's not nil, meaning if it's empty, we will clear it. if err := topotools.ChangeType(agent.TopoServer, tablet.Alias, newTabletType, health, true /*runHooks*/); err != nil { log.Infof("Error updating tablet record: %v", err) return } // Rebuild the serving graph in our cell, only if we're dealing with // a serving type if err := agent.rebuildShardIfNeeded(tablet, targetTabletType, lockTimeout); err != nil { log.Warningf("rebuildShardIfNeeded failed, not running post action callbacks: %v", err) return } // run the post action callbacks agent.afterAction("healthcheck", false /* reloadSchema */) }
// RunHealthCheck takes the action mutex, runs the health check, // and if we need to change our state, do it. // If we are the master, we don't change our type, healthy or not. // If we are not the master, we change to spare if not healthy, // or to the passed in targetTabletType if healthy. // // Note we only update the topo record if we need to, that is if our type or // health details changed. func (agent *ActionAgent) RunHealthCheck(targetTabletType topo.TabletType) { agent.actionMutex.Lock() defer agent.actionMutex.Unlock() // read the current tablet record agent.mutex.Lock() tablet := agent._tablet agent.mutex.Unlock() // run the health check typeForHealthCheck := targetTabletType if tablet.Type == topo.TYPE_MASTER { typeForHealthCheck = topo.TYPE_MASTER } health, err := health.Run(typeForHealthCheck) if len(health) == 0 { health = nil } // start with no change newTabletType := tablet.Type if err != nil { if tablet.Type != targetTabletType { log.Infof("Tablet not healthy and in state %v, not changing it: %v", tablet.Type, err) return } log.Infof("Tablet not healthy, converting it from %v to spare: %v", targetTabletType, err) newTabletType = topo.TYPE_SPARE } else { // We are healthy, maybe with health, see if we need // to update the record. We only change from spare to // our target type. if tablet.Type == topo.TYPE_SPARE { newTabletType = targetTabletType } if tablet.Type == newTabletType && reflect.DeepEqual(health, tablet.Health) { // no change in health, not logging anything, // and we're done return } // we need to update our state log.Infof("Updating tablet record as healthy type %v -> %v with health details %v -> %v", tablet.Type, newTabletType, tablet.Health, health) } // Change the Type, update the health if err := ChangeType(agent.TopoServer, tablet.Alias, newTabletType, health, true /*runHooks*/); err != nil { log.Infof("Error updating tablet record: %v", err) return } // Rebuild the serving graph in our cell. // TODO: timeout should be configurable // TODO: interrupted may need to be a global one closed when we exit interrupted := make(chan struct{}) actionNode := actionnode.RebuildShard() lockPath, err := actionNode.LockShard(agent.TopoServer, tablet.Keyspace, tablet.Shard, 5*time.Second, interrupted) if err != nil { log.Warningf("Cannot lock shard for rebuild: %v", err) return } err = topo.RebuildShard(agent.TopoServer, tablet.Keyspace, tablet.Shard, topo.RebuildShardOptions{Cells: []string{tablet.Alias.Cell}, IgnorePartialResult: true}) err = actionNode.UnlockShard(agent.TopoServer, tablet.Keyspace, tablet.Shard, lockPath, err) if err != nil { log.Warningf("UnlockShard returned an error: %v", err) return } // run the post action callbacks agent.afterAction("healthcheck", false /* reloadSchema */) }