// SeedOperationState reads states for a given seed operation func ReadSeedStates(seedId int64) ([]SeedOperationState, error) { res := []SeedOperationState{} query := ` select agent_seed_state_id, agent_seed_id, state_timestamp, state_action, error_message from agent_seed_state where agent_seed_id = ? order by agent_seed_state_id desc ` err := db.QueryOrchestrator(query, sqlutils.Args(seedId), func(m sqlutils.RowMap) error { seedState := SeedOperationState{} seedState.SeedStateId = m.GetInt64("agent_seed_state_id") seedState.SeedId = m.GetInt64("agent_seed_id") seedState.StateTimestamp = m.GetString("state_timestamp") seedState.Action = m.GetString("state_action") seedState.ErrorMessage = m.GetString("error_message") res = append(res, seedState) return nil }) if err != nil { log.Errore(err) } return res, err }
// readAgentBasicInfo returns the basic data for an agent directly from backend table (no agent access) func readAgentBasicInfo(hostname string) (Agent, string, error) { agent := Agent{} token := "" query := ` select hostname, port, token, last_submitted, mysql_port from host_agent where hostname = ? ` err := db.QueryOrchestrator(query, sqlutils.Args(hostname), func(m sqlutils.RowMap) error { agent.Hostname = m.GetString("hostname") agent.Port = m.GetInt("port") agent.LastSubmitted = m.GetString("last_submitted") agent.MySQLPort = m.GetInt64("mysql_port") token = m.GetString("token") return nil }) if err != nil { return agent, "", err } if token == "" { return agent, "", log.Errorf("Cannot get agent/token: %s", hostname) } return agent, token, nil }
func ReadAvailableNodes(onlyHttpNodes bool) ([]string, error) { res := []string{} extraInfo := "" if onlyHttpNodes { extraInfo = string(OrchestratorExecutionHttpMode) } query := ` select concat(hostname, ';', token, ';', app_version) as node from node_health where last_seen_active > now() - interval ? second and ? in (extra_info, '') order by hostname ` err := db.QueryOrchestrator(query, sqlutils.Args(registrationPollSeconds*2, extraInfo), func(m sqlutils.RowMap) error { res = append(res, m.GetString("node")) return nil }) if err != nil { log.Errore(err) } return res, err }
func ReadClusterPoolInstances(clusterName string) (*PoolInstancesMap, error) { var poolInstancesMap = make(PoolInstancesMap) query := ` select database_instance_pool.* from database_instance join database_instance_pool using (hostname, port) where database_instance.cluster_name = ? ` err := db.QueryOrchestrator(query, sqlutils.Args(clusterName), func(m sqlutils.RowMap) error { pool := m.GetString("pool") hostname := m.GetString("hostname") port := m.GetInt("port") if _, ok := poolInstancesMap[pool]; !ok { poolInstancesMap[pool] = [](*InstanceKey){} } poolInstancesMap[pool] = append(poolInstancesMap[pool], &InstanceKey{Hostname: hostname, Port: port}) return nil }) if err != nil { return nil, err } return &poolInstancesMap, nil }
// ReadMaintenanceInstanceKey will return the instanceKey for active maintenance by maintenanceToken func ReadMaintenanceInstanceKey(maintenanceToken int64) (*InstanceKey, error) { var res *InstanceKey query := ` select hostname, port from database_instance_maintenance where database_instance_maintenance_id = ? ` err := db.QueryOrchestrator(query, sqlutils.Args(maintenanceToken), func(m sqlutils.RowMap) error { instanceKey, merr := NewInstanceKeyFromStrings(m.GetString("hostname"), m.GetString("port")) if merr != nil { return merr } res = instanceKey return nil }) if err != nil { log.Errore(err) } return res, err }
func getHostAttributesByClause(whereClause string, args []interface{}) ([]HostAttributes, error) { res := []HostAttributes{} query := fmt.Sprintf(` select hostname, attribute_name, attribute_value, submit_timestamp , ifnull(expire_timestamp, '') as expire_timestamp from host_attributes %s order by hostname, attribute_name `, whereClause) err := db.QueryOrchestrator(query, args, func(m sqlutils.RowMap) error { hostAttributes := HostAttributes{} hostAttributes.Hostname = m.GetString("hostname") hostAttributes.AttributeName = m.GetString("attribute_name") hostAttributes.AttributeValue = m.GetString("attribute_value") hostAttributes.SubmitTimestamp = m.GetString("submit_timestamp") hostAttributes.ExpireTimestamp = m.GetString("expire_timestamp") res = append(res, hostAttributes) return nil }) if err != nil { log.Errore(err) } return res, err }
// ReadLongRunningProcesses returns the list of current known long running processes of all instances func ReadLongRunningProcesses(filter string) ([]Process, error) { longRunningProcesses := []Process{} if filter != "" { filter = "%" + filter + "%" } else { filter = "%" } query := ` select hostname, port, process_id, process_started_at, process_user, process_host, process_db, process_command, process_time_seconds, process_state, process_info from database_instance_long_running_queries where hostname like ? or process_user like ? or process_host like ? or process_db like ? or process_command like ? or process_state like ? or process_info like ? order by process_time_seconds desc ` args := sqlutils.Args(filter, filter, filter, filter, filter, filter, filter) err := db.QueryOrchestrator(query, args, func(m sqlutils.RowMap) error { process := Process{} process.InstanceHostname = m.GetString("hostname") process.InstancePort = m.GetInt("port") process.Id = m.GetInt64("process_id") process.User = m.GetString("process_user") process.Host = m.GetString("process_host") process.Db = m.GetString("process_db") process.Command = m.GetString("process_command") process.Time = m.GetInt64("process_time_seconds") process.State = m.GetString("process_state") process.Info = m.GetString("process_info") process.StartedAt = m.GetString("process_started_at") longRunningProcesses = append(longRunningProcesses, process) return nil }) if err != nil { log.Errore(err) } return longRunningProcesses, err }
func GetEquivalentMasterCoordinates(instanceCoordinates *InstanceBinlogCoordinates) (result [](*InstanceBinlogCoordinates), err error) { query := ` select master1_hostname as hostname, master1_port as port, master1_binary_log_file as binlog_file, master1_binary_log_pos as binlog_pos from master_position_equivalence where master2_hostname = ? and master2_port = ? and master2_binary_log_file = ? and master2_binary_log_pos = ? union select master2_hostname as hostname, master2_port as port, master2_binary_log_file as binlog_file, master2_binary_log_pos as binlog_pos from master_position_equivalence where master1_hostname = ? and master1_port = ? and master1_binary_log_file = ? and master1_binary_log_pos = ? ` args := sqlutils.Args( instanceCoordinates.Key.Hostname, instanceCoordinates.Key.Port, instanceCoordinates.Coordinates.LogFile, instanceCoordinates.Coordinates.LogPos, instanceCoordinates.Key.Hostname, instanceCoordinates.Key.Port, instanceCoordinates.Coordinates.LogFile, instanceCoordinates.Coordinates.LogPos, ) err = db.QueryOrchestrator(query, args, func(m sqlutils.RowMap) error { equivalentCoordinates := InstanceBinlogCoordinates{} equivalentCoordinates.Key.Hostname = m.GetString("hostname") equivalentCoordinates.Key.Port = m.GetInt("port") equivalentCoordinates.Coordinates.LogFile = m.GetString("binlog_file") equivalentCoordinates.Coordinates.LogPos = m.GetInt64("binlog_pos") result = append(result, &equivalentCoordinates) return nil }) if err != nil { return nil, err } return result, nil }
// readRecoveries reads recovery entry/audit entires from topology_recovery func readFailureDetections(whereCondition string, limit string, args []interface{}) ([]TopologyRecovery, error) { res := []TopologyRecovery{} query := fmt.Sprintf(` select detection_id, hostname, port, in_active_period as is_active, start_active_period, end_active_period_unixtime, processing_node_hostname, processcing_node_token, analysis, cluster_name, cluster_alias, count_affected_slaves, slave_hosts, (select max(recovery_id) from topology_recovery where topology_recovery.last_detection_id = detection_id) as related_recovery_id from topology_failure_detection %s order by detection_id desc %s `, whereCondition, limit) err := db.QueryOrchestrator(query, args, func(m sqlutils.RowMap) error { failureDetection := TopologyRecovery{} failureDetection.Id = m.GetInt64("detection_id") failureDetection.IsActive = m.GetBool("is_active") failureDetection.RecoveryStartTimestamp = m.GetString("start_active_period") failureDetection.ProcessingNodeHostname = m.GetString("processing_node_hostname") failureDetection.ProcessingNodeToken = m.GetString("processcing_node_token") failureDetection.AnalysisEntry.AnalyzedInstanceKey.Hostname = m.GetString("hostname") failureDetection.AnalysisEntry.AnalyzedInstanceKey.Port = m.GetInt("port") failureDetection.AnalysisEntry.Analysis = inst.AnalysisCode(m.GetString("analysis")) failureDetection.AnalysisEntry.ClusterDetails.ClusterName = m.GetString("cluster_name") failureDetection.AnalysisEntry.ClusterDetails.ClusterAlias = m.GetString("cluster_alias") failureDetection.AnalysisEntry.CountSlaves = m.GetUint("count_affected_slaves") failureDetection.AnalysisEntry.ReadSlaveHostsFromString(m.GetString("slave_hosts")) failureDetection.RelatedRecoveryId = m.GetInt64("related_recovery_id") failureDetection.AnalysisEntry.ClusterDetails.ReadRecoveryInfo() res = append(res, failureDetection) return nil }) if err != nil { log.Errore(err) } return res, err }
func ReadPendingAsyncRequests(limit int) (res [](*AsyncRequest), err error) { limitClause := `` args := sqlutils.Args() if limit > 0 { limitClause = `limit ?` args = append(args, limit) } query := fmt.Sprintf(` select request_id, command, hostname, port, destination_hostname, destination_port, pattern, gtid_hint, story from async_request where begin_timestamp IS NULL order by request_id asc %s `, limitClause) err = db.QueryOrchestrator(query, args, func(m sqlutils.RowMap) error { asyncRequest := NewEmptyAsyncRequest() asyncRequest.Id = m.GetInt64("request_id") asyncRequest.Command = m.GetString("command") asyncRequest.OperatedInstanceKey = &inst.InstanceKey{} asyncRequest.OperatedInstanceKey.Hostname = m.GetString("hostname") asyncRequest.OperatedInstanceKey.Port = m.GetInt("port") if m.GetString("destination_hostname") != "" { asyncRequest.DestinationKey = &inst.InstanceKey{} asyncRequest.DestinationKey.Hostname = m.GetString("destination_hostname") asyncRequest.DestinationKey.Port = m.GetInt("destination_port") } asyncRequest.Pattern = m.GetString("pattern") asyncRequest.GTIDHint = inst.OperationGTIDHint(m.GetString("gtid_hint")) asyncRequest.Story = m.GetString("story") res = append(res, asyncRequest) return nil }) if err != nil { log.Errore(err) } return res, err }
// ReadAliasByClusterName returns the cluster alias for the given cluster name, // or the cluster name itself if not explicit alias found func ReadAliasByClusterName(clusterName string) (alias string, err error) { alias = clusterName // default return value query := ` select alias from cluster_alias where cluster_name = ? ` err = db.QueryOrchestrator(query, sqlutils.Args(clusterName), func(m sqlutils.RowMap) error { alias = m.GetString("alias") return nil }) return clusterName, err }
// ReadRecentAudit returns a list of audit entries order chronologically descending, using page number. func ReadRecentAudit(instanceKey *InstanceKey, page int) ([]Audit, error) { res := []Audit{} args := sqlutils.Args() whereCondition := `` if instanceKey != nil { whereCondition = `where hostname=? and port=?` args = append(args, instanceKey.Hostname, instanceKey.Port) } query := fmt.Sprintf(` select audit_id, audit_timestamp, audit_type, hostname, port, message from audit %s order by audit_timestamp desc limit ? offset ? `, whereCondition) args = append(args, config.Config.AuditPageSize, page*config.Config.AuditPageSize) err := db.QueryOrchestrator(query, args, func(m sqlutils.RowMap) error { audit := Audit{} audit.AuditId = m.GetInt64("audit_id") audit.AuditTimestamp = m.GetString("audit_timestamp") audit.AuditType = m.GetString("audit_type") audit.AuditInstanceKey.Hostname = m.GetString("hostname") audit.AuditInstanceKey.Port = m.GetInt("port") audit.Message = m.GetString("message") res = append(res, audit) return nil }) if err != nil { log.Errore(err) } return res, err }
// TokenIsValid checks to see whether a given token exists and is not outdated. func TokenIsValid(publicToken string, secretToken string) (result bool, err error) { query := ` select count(*) as valid_token from access_token where public_token=? and secret_token=? and ( generated_at >= now() - interval ? minute or is_reentrant = 1 ) ` err = db.QueryOrchestrator(query, sqlutils.Args(publicToken, secretToken, config.Config.AccessTokenExpiryMinutes), func(m sqlutils.RowMap) error { result = m.GetInt("valid_token") > 0 return nil }) return result, log.Errore(err) }
func TokenBelongsToHealthyHttpService(token string) (result bool, err error) { extraInfo := string(OrchestratorExecutionHttpMode) query := ` select token from node_health where and token = ? and extra_info = ? ` err = db.QueryOrchestrator(query, sqlutils.Args(token, extraInfo), func(m sqlutils.RowMap) error { // Row exists? We're happy result = true return nil }) return result, log.Errore(err) }
// ReadOutdatedAgentsHosts returns agents that need to be updated func ReadOutdatedAgentsHosts() ([]string, error) { res := []string{} query := ` select hostname from host_agent where IFNULL(last_checked < now() - interval ? minute, true) ` err := db.QueryOrchestrator(query, sqlutils.Args(config.Config.AgentPollMinutes), func(m sqlutils.RowMap) error { hostname := m.GetString("hostname") res = append(res, hostname) return nil }) if err != nil { log.Errore(err) } return res, err }
// ReadClusterPoolInstances reads cluster-pool-instance associationsfor given cluster and pool func ReadClusterPoolInstances(clusterName string, pool string) (result [](*ClusterPoolInstance), err error) { args := sqlutils.Args() whereClause := `` if clusterName != "" { whereClause = ` where database_instance.cluster_name = ? and ? in ('', pool) ` args = append(args, clusterName, pool) } query := fmt.Sprintf(` select cluster_name, ifnull(alias, cluster_name) as alias, database_instance_pool.* from database_instance join database_instance_pool using (hostname, port) left join cluster_alias using (cluster_name) %s `, whereClause) err = db.QueryOrchestrator(query, args, func(m sqlutils.RowMap) error { clusterPoolInstance := ClusterPoolInstance{ ClusterName: m.GetString("cluster_name"), ClusterAlias: m.GetString("alias"), Pool: m.GetString("pool"), Hostname: m.GetString("hostname"), Port: m.GetInt("port"), } result = append(result, &clusterPoolInstance) return nil }) if err != nil { return nil, err } return result, nil }
// ReadBlockedRecoveries reads blocked recovery entries, potentially filtered by cluster name (empty to unfilter) func ReadBlockedRecoveries(clusterName string) ([]BlockedTopologyRecovery, error) { res := []BlockedTopologyRecovery{} whereClause := "" args := sqlutils.Args() if clusterName != "" { whereClause = `where cluster_name = ?` args = append(args, clusterName) } query := fmt.Sprintf(` select hostname, port, cluster_name, analysis, last_blocked_timestamp, blocking_recovery_id from blocked_topology_recovery %s order by last_blocked_timestamp desc `, whereClause) err := db.QueryOrchestrator(query, args, func(m sqlutils.RowMap) error { blockedTopologyRecovery := BlockedTopologyRecovery{} blockedTopologyRecovery.FailedInstanceKey.Hostname = m.GetString("hostname") blockedTopologyRecovery.FailedInstanceKey.Port = m.GetInt("port") blockedTopologyRecovery.ClusterName = m.GetString("cluster_name") blockedTopologyRecovery.Analysis = inst.AnalysisCode(m.GetString("analysis")) blockedTopologyRecovery.LastBlockedTimestamp = m.GetString("last_blocked_timestamp") blockedTopologyRecovery.BlockingRecoveryId = m.GetInt64("blocking_recovery_id") res = append(res, blockedTopologyRecovery) return nil }) if err != nil { log.Errore(err) } return res, err }
// ReadClusterNameByAlias func ReadClusterNameByAlias(alias string) (clusterName string, err error) { query := ` select cluster_name from cluster_alias where alias = ? or cluster_name = ? ` err = db.QueryOrchestrator(query, sqlutils.Args(alias, alias), func(m sqlutils.RowMap) error { clusterName = m.GetString("cluster_name") return nil }) if err != nil { return "", err } if clusterName == "" { err = fmt.Errorf("No cluster found for alias %s", alias) } return clusterName, err }
// AcquireAccessToken attempts to acquire a hopefully free token; returning in such case // the secretToken as proof of ownership. func AcquireAccessToken(publicToken string) (secretToken string, err error) { secretToken = "" sqlResult, err := db.ExecOrchestrator(` update access_token set is_acquired=1, acquired_at=now() where public_token=? and ( ( is_acquired=0 and generated_at > now() - interval ? second ) or is_reentrant=1 ) `, publicToken, config.Config.AccessTokenUseExpirySeconds, ) if err != nil { return secretToken, log.Errore(err) } rows, err := sqlResult.RowsAffected() if err != nil { return secretToken, log.Errore(err) } if rows == 0 { return secretToken, log.Errorf("Cannot acquire token %s", publicToken) } // Seems like we made it! query := ` select secret_token from access_token where public_token=? ` err = db.QueryOrchestrator(query, sqlutils.Args(publicToken), func(m sqlutils.RowMap) error { secretToken = m.GetString("secret_token") return nil }) return secretToken, log.Errore(err) }
// ReadClusterDomainName reads the domain name associated with a cluster, if any func ReadClusterDomainName(clusterName string) (string, error) { domainName := "" query := ` select domain_name from cluster_domain_name where cluster_name = ? ` err := db.QueryOrchestrator(query, sqlutils.Args(clusterName), func(m sqlutils.RowMap) error { domainName = m.GetString("domain_name") return nil }) if err != nil { return "", err } if domainName == "" { err = fmt.Errorf("No domain name found for cluster %s", clusterName) } return domainName, err }
// ReadResolvedHostname returns the resolved hostname given a hostname, or empty if not exists func ReadResolvedHostname(hostname string) (string, error) { var resolvedHostname string = "" query := ` select resolved_hostname from hostname_resolve where hostname = ? ` err := db.QueryOrchestrator(query, sqlutils.Args(hostname), func(m sqlutils.RowMap) error { resolvedHostname = m.GetString("resolved_hostname") return nil }) readResolvedHostnameCounter.Inc(1) if err != nil { log.Errore(err) } return resolvedHostname, err }
// IsRecoveryDisabled returns true if Recoveries are disabled globally func IsRecoveryDisabled() (bool, error) { var ( disabled bool // default is false! err error ) query := ` SELECT COUNT(*) as mycount FROM global_recovery_disable WHERE disable_recovery=? ` err = db.QueryOrchestrator(query, sqlutils.Args(1), func(m sqlutils.RowMap) error { mycount := m.GetInt("mycount") disabled = (mycount > 0) return nil }) if err != nil { err = log.Errorf("recovery.IsRecoveryDisabled(): %v", err) } return disabled, err }
// readSeeds reads seed from the backend table func readSeeds(whereCondition string, args []interface{}, limit string) ([]SeedOperation, error) { res := []SeedOperation{} query := fmt.Sprintf(` select agent_seed_id, target_hostname, source_hostname, start_timestamp, end_timestamp, is_complete, is_successful from agent_seed %s order by agent_seed_id desc %s `, whereCondition, limit) err := db.QueryOrchestrator(query, args, func(m sqlutils.RowMap) error { seedOperation := SeedOperation{} seedOperation.SeedId = m.GetInt64("agent_seed_id") seedOperation.TargetHostname = m.GetString("target_hostname") seedOperation.SourceHostname = m.GetString("source_hostname") seedOperation.StartTimestamp = m.GetString("start_timestamp") seedOperation.EndTimestamp = m.GetString("end_timestamp") seedOperation.IsComplete = m.GetBool("is_complete") seedOperation.IsSuccessful = m.GetBool("is_successful") res = append(res, seedOperation) return nil }) if err != nil { log.Errore(err) } return res, err }
// GetReplicationAnalysis will check for replication problems (dead master; unreachable master; etc) func GetReplicationAnalysis(clusterName string, includeDowntimed bool, auditAnalysis bool) ([]ReplicationAnalysis, error) { result := []ReplicationAnalysis{} args := sqlutils.Args(config.Config.InstancePollSeconds, clusterName) analysisQueryReductionClause := `` if config.Config.ReduceReplicationAnalysisCount { analysisQueryReductionClause = ` HAVING (MIN( master_instance.last_checked <= master_instance.last_seen AND master_instance.last_attempted_check <= master_instance.last_seen + INTERVAL (2 * ?) SECOND ) IS TRUE /* AS is_last_check_valid */) = 0 OR (IFNULL(SUM(slave_instance.last_checked <= slave_instance.last_seen AND slave_instance.slave_io_running = 0 AND slave_instance.last_io_error RLIKE 'error (connecting|reconnecting) to master' AND slave_instance.slave_sql_running = 1), 0) /* AS count_slaves_failing_to_connect_to_master */ > 0) OR (IFNULL(SUM(slave_instance.last_checked <= slave_instance.last_seen), 0) /* AS count_valid_slaves */ < COUNT(slave_instance.server_id) /* AS count_slaves */) OR (IFNULL(SUM(slave_instance.last_checked <= slave_instance.last_seen AND slave_instance.slave_io_running != 0 AND slave_instance.slave_sql_running != 0), 0) /* AS count_valid_replicating_slaves */ < COUNT(slave_instance.server_id) /* AS count_slaves */) OR (MIN( master_instance.slave_sql_running = 1 AND master_instance.slave_io_running = 0 AND master_instance.last_io_error RLIKE 'error (connecting|reconnecting) to master' ) /* AS is_failing_to_connect_to_master */) OR (COUNT(slave_instance.server_id) /* AS count_slaves */ > 0) ` args = append(args, config.Config.InstancePollSeconds) } // "OR count_slaves > 0" above is a recent addition, which, granted, makes some previous conditions redundant. // It gives more output, and more "NoProblem" messages that I am now interested in for purpose of auditing in database_instance_analysis_changelog query := fmt.Sprintf(` SELECT master_instance.hostname, master_instance.port, MIN(master_instance.master_host) AS master_host, MIN(master_instance.master_port) AS master_port, MIN(master_instance.cluster_name) AS cluster_name, MIN(IFNULL(cluster_alias.alias, master_instance.cluster_name)) AS cluster_alias, MIN( master_instance.last_checked <= master_instance.last_seen AND master_instance.last_attempted_check <= master_instance.last_seen + INTERVAL (2 * ?) SECOND ) IS TRUE AS is_last_check_valid, MIN(master_instance.master_host IN ('' , '_') OR master_instance.master_port = 0) AS is_master, MIN(master_instance.is_co_master) AS is_co_master, MIN(CONCAT(master_instance.hostname, ':', master_instance.port) = master_instance.cluster_name) AS is_cluster_master, COUNT(slave_instance.server_id) AS count_slaves, IFNULL(SUM(slave_instance.last_checked <= slave_instance.last_seen), 0) AS count_valid_slaves, IFNULL(SUM(slave_instance.last_checked <= slave_instance.last_seen AND slave_instance.slave_io_running != 0 AND slave_instance.slave_sql_running != 0), 0) AS count_valid_replicating_slaves, IFNULL(SUM(slave_instance.last_checked <= slave_instance.last_seen AND slave_instance.slave_io_running = 0 AND slave_instance.last_io_error RLIKE 'error (connecting|reconnecting) to master' AND slave_instance.slave_sql_running = 1), 0) AS count_slaves_failing_to_connect_to_master, MIN(master_instance.replication_depth) AS replication_depth, GROUP_CONCAT(slave_instance.Hostname, ':', slave_instance.Port) as slave_hosts, MIN( master_instance.slave_sql_running = 1 AND master_instance.slave_io_running = 0 AND master_instance.last_io_error RLIKE 'error (connecting|reconnecting) to master' ) AS is_failing_to_connect_to_master, MIN( database_instance_downtime.downtime_active IS NULL OR database_instance_downtime.end_timestamp < NOW() ) IS FALSE AS is_downtimed, MIN( IFNULL(database_instance_downtime.end_timestamp, '') ) AS downtime_end_timestamp, MIN( IFNULL(TIMESTAMPDIFF(SECOND, NOW(), database_instance_downtime.end_timestamp), 0) ) AS downtime_remaining_seconds, MIN( master_instance.binlog_server ) AS is_binlog_server, MIN( master_instance.pseudo_gtid ) AS is_pseudo_gtid, MIN( master_instance.supports_oracle_gtid ) AS supports_oracle_gtid, SUM( slave_instance.oracle_gtid ) AS count_oracle_gtid_slaves, IFNULL(SUM(slave_instance.last_checked <= slave_instance.last_seen AND slave_instance.oracle_gtid != 0), 0) AS count_valid_oracle_gtid_slaves, SUM( slave_instance.binlog_server ) AS count_binlog_server_slaves, IFNULL(SUM(slave_instance.last_checked <= slave_instance.last_seen AND slave_instance.binlog_server != 0), 0) AS count_valid_binlog_server_slaves, MIN( master_instance.mariadb_gtid ) AS is_mariadb_gtid, SUM( slave_instance.mariadb_gtid ) AS count_mariadb_gtid_slaves, IFNULL(SUM(slave_instance.last_checked <= slave_instance.last_seen AND slave_instance.mariadb_gtid != 0), 0) AS count_valid_mariadb_gtid_slaves FROM database_instance master_instance LEFT JOIN hostname_resolve ON (master_instance.hostname = hostname_resolve.hostname) LEFT JOIN database_instance slave_instance ON (COALESCE(hostname_resolve.resolved_hostname, master_instance.hostname) = slave_instance.master_host AND master_instance.port = slave_instance.master_port) LEFT JOIN database_instance_maintenance ON (master_instance.hostname = database_instance_maintenance.hostname AND master_instance.port = database_instance_maintenance.port AND database_instance_maintenance.maintenance_active = 1) LEFT JOIN database_instance_downtime ON (master_instance.hostname = database_instance_downtime.hostname AND master_instance.port = database_instance_downtime.port AND database_instance_downtime.downtime_active = 1) LEFT JOIN cluster_alias ON (cluster_alias.cluster_name = master_instance.cluster_name) WHERE database_instance_maintenance.database_instance_maintenance_id IS NULL AND ? IN ('', master_instance.cluster_name) GROUP BY master_instance.hostname, master_instance.port %s ORDER BY is_master DESC , is_cluster_master DESC, count_slaves DESC `, analysisQueryReductionClause) err := db.QueryOrchestrator(query, args, func(m sqlutils.RowMap) error { a := ReplicationAnalysis{Analysis: NoProblem} a.IsMaster = m.GetBool("is_master") a.IsCoMaster = m.GetBool("is_co_master") a.AnalyzedInstanceKey = InstanceKey{Hostname: m.GetString("hostname"), Port: m.GetInt("port")} a.AnalyzedInstanceMasterKey = InstanceKey{Hostname: m.GetString("master_host"), Port: m.GetInt("master_port")} a.ClusterDetails.ClusterName = m.GetString("cluster_name") a.ClusterDetails.ClusterAlias = m.GetString("cluster_alias") a.LastCheckValid = m.GetBool("is_last_check_valid") a.CountSlaves = m.GetUint("count_slaves") a.CountValidSlaves = m.GetUint("count_valid_slaves") a.CountValidReplicatingSlaves = m.GetUint("count_valid_replicating_slaves") a.CountSlavesFailingToConnectToMaster = m.GetUint("count_slaves_failing_to_connect_to_master") a.ReplicationDepth = m.GetUint("replication_depth") a.IsFailingToConnectToMaster = m.GetBool("is_failing_to_connect_to_master") a.IsDowntimed = m.GetBool("is_downtimed") a.DowntimeEndTimestamp = m.GetString("downtime_end_timestamp") a.DowntimeRemainingSeconds = m.GetInt("downtime_remaining_seconds") a.IsBinlogServer = m.GetBool("is_binlog_server") a.ClusterDetails.ReadRecoveryInfo() a.SlaveHosts = *NewInstanceKeyMap() a.SlaveHosts.ReadCommaDelimitedList(m.GetString("slave_hosts")) countValidOracleGTIDSlaves := m.GetUint("count_valid_oracle_gtid_slaves") a.OracleGTIDImmediateTopology = countValidOracleGTIDSlaves == a.CountValidSlaves && a.CountValidSlaves > 0 countValidMariaDBGTIDSlaves := m.GetUint("count_valid_mariadb_gtid_slaves") a.MariaDBGTIDImmediateTopology = countValidMariaDBGTIDSlaves == a.CountValidSlaves && a.CountValidSlaves > 0 countValidBinlogServerSlaves := m.GetUint("count_valid_binlog_server_slaves") a.BinlogServerImmediateTopology = countValidBinlogServerSlaves == a.CountValidSlaves && a.CountValidSlaves > 0 a.PseudoGTIDImmediateTopology = m.GetBool("is_pseudo_gtid") if a.IsMaster && !a.LastCheckValid && a.CountSlaves == 0 { a.Analysis = DeadMasterWithoutSlaves a.Description = "Master cannot be reached by orchestrator and has no slave" // } else if a.IsMaster && !a.LastCheckValid && a.CountValidSlaves == a.CountSlaves && a.CountValidReplicatingSlaves == 0 { a.Analysis = DeadMaster a.Description = "Master cannot be reached by orchestrator and none of its slaves is replicating" // } else if a.IsMaster && !a.LastCheckValid && a.CountSlaves > 0 && a.CountValidSlaves == 0 && a.CountValidReplicatingSlaves == 0 { a.Analysis = DeadMasterAndSlaves a.Description = "Master cannot be reached by orchestrator and none of its slaves is replicating" // } else if a.IsMaster && !a.LastCheckValid && a.CountValidSlaves < a.CountSlaves && a.CountValidSlaves > 0 && a.CountValidReplicatingSlaves == 0 { a.Analysis = DeadMasterAndSomeSlaves a.Description = "Master cannot be reached by orchestrator; some of its slaves are unreachable and none of its reachable slaves is replicating" // } else if a.IsMaster && !a.LastCheckValid && a.CountValidSlaves > 0 && a.CountValidReplicatingSlaves > 0 { a.Analysis = UnreachableMaster a.Description = "Master cannot be reached by orchestrator but it has replicating slaves; possibly a network/host issue" // } else if a.IsMaster && a.LastCheckValid && a.CountSlaves == 1 && a.CountValidSlaves == a.CountSlaves && a.CountValidReplicatingSlaves == 0 { a.Analysis = MasterSingleSlaveNotReplicating a.Description = "Master is reachable but its single slave is not replicating" // } else if a.IsMaster && a.LastCheckValid && a.CountSlaves == 1 && a.CountValidSlaves == 0 { a.Analysis = MasterSingleSlaveDead a.Description = "Master is reachable but its single slave is dead" // } else if a.IsMaster && a.LastCheckValid && a.CountSlaves > 1 && a.CountValidSlaves == a.CountSlaves && a.CountValidReplicatingSlaves == 0 { a.Analysis = AllMasterSlavesNotReplicating a.Description = "Master is reachable but none of its slaves is replicating" // } else if a.IsMaster && a.LastCheckValid && a.CountSlaves > 1 && a.CountValidSlaves < a.CountSlaves && a.CountValidSlaves > 0 && a.CountValidReplicatingSlaves == 0 { a.Analysis = AllMasterSlavesNotReplicatingOrDead a.Description = "Master is reachable but none of its slaves is replicating" // } else /* co-master */ if a.IsCoMaster && !a.LastCheckValid && a.CountSlaves > 0 && a.CountValidSlaves == a.CountSlaves && a.CountValidReplicatingSlaves == 0 { a.Analysis = DeadCoMaster a.Description = "Co-master cannot be reached by orchestrator and none of its slaves is replicating" // } else if a.IsCoMaster && !a.LastCheckValid && a.CountSlaves > 0 && a.CountValidSlaves < a.CountSlaves && a.CountValidSlaves > 0 && a.CountValidReplicatingSlaves == 0 { a.Analysis = DeadCoMasterAndSomeSlaves a.Description = "Co-master cannot be reached by orchestrator; some of its slaves are unreachable and none of its reachable slaves is replicating" // } else if a.IsCoMaster && !a.LastCheckValid && a.CountValidSlaves > 0 && a.CountValidReplicatingSlaves > 0 { a.Analysis = UnreachableCoMaster a.Description = "Co-master cannot be reached by orchestrator but it has replicating slaves; possibly a network/host issue" // } else if a.IsCoMaster && a.LastCheckValid && a.CountSlaves > 0 && a.CountValidReplicatingSlaves == 0 { a.Analysis = AllCoMasterSlavesNotReplicating a.Description = "Co-master is reachable but none of its slaves is replicating" // } else /* intermediate-master */ if !a.IsMaster && !a.LastCheckValid && a.CountSlaves == 1 && a.CountValidSlaves == a.CountSlaves && a.CountSlavesFailingToConnectToMaster == a.CountSlaves && a.CountValidReplicatingSlaves == 0 { a.Analysis = DeadIntermediateMasterWithSingleSlaveFailingToConnect a.Description = "Intermediate master cannot be reached by orchestrator and its (single) slave is failing to connect" // } else /* intermediate-master */ if !a.IsMaster && !a.LastCheckValid && a.CountSlaves == 1 && a.CountValidSlaves == a.CountSlaves && a.CountValidReplicatingSlaves == 0 { a.Analysis = DeadIntermediateMasterWithSingleSlave a.Description = "Intermediate master cannot be reached by orchestrator and its (single) slave is not replicating" // } else /* intermediate-master */ if !a.IsMaster && !a.LastCheckValid && a.CountSlaves > 1 && a.CountValidSlaves == a.CountSlaves && a.CountValidReplicatingSlaves == 0 { a.Analysis = DeadIntermediateMaster a.Description = "Intermediate master cannot be reached by orchestrator and none of its slaves is replicating" // } else if !a.IsMaster && !a.LastCheckValid && a.CountValidSlaves < a.CountSlaves && a.CountValidSlaves > 0 && a.CountValidReplicatingSlaves == 0 { a.Analysis = DeadIntermediateMasterAndSomeSlaves a.Description = "Intermediate master cannot be reached by orchestrator; some of its slaves are unreachable and none of its reachable slaves is replicating" // } else if !a.IsMaster && !a.LastCheckValid && a.CountValidSlaves > 0 && a.CountValidReplicatingSlaves > 0 { a.Analysis = UnreachableIntermediateMaster a.Description = "Intermediate master cannot be reached by orchestrator but it has replicating slaves; possibly a network/host issue" // } else if !a.IsMaster && a.LastCheckValid && a.CountSlaves > 1 && a.CountValidReplicatingSlaves == 0 && a.CountSlavesFailingToConnectToMaster > 0 && a.CountSlavesFailingToConnectToMaster == a.CountValidSlaves { // All slaves are either failing to connect to master (and at least one of these have to exist) // or completely dead. // Must have at least two slaves to reach such conclusion -- do note that the intermediate master is still // reachable to orchestrator, so we base our conclusion on slaves only at this point. a.Analysis = AllIntermediateMasterSlavesFailingToConnectOrDead a.Description = "Intermediate master is reachable but all of its slaves are failing to connect" // } else if !a.IsMaster && a.LastCheckValid && a.CountSlaves > 0 && a.CountValidReplicatingSlaves == 0 { a.Analysis = AllIntermediateMasterSlavesNotReplicating a.Description = "Intermediate master is reachable but none of its slaves is replicating" // } else if a.IsBinlogServer && a.IsFailingToConnectToMaster { a.Analysis = BinlogServerFailingToConnectToMaster a.Description = "Binlog server is unable to connect to its master" // } else if a.ReplicationDepth == 1 && a.IsFailingToConnectToMaster { a.Analysis = FirstTierSlaveFailingToConnectToMaster a.Description = "1st tier slave (directly replicating from topology master) is unable to connect to the master" // } // else if a.IsMaster && a.CountSlaves == 0 { // a.Analysis = MasterWithoutSlaves // a.Description = "Master has no slaves" // } if a.Analysis != NoProblem { skipThisHost := false for _, filter := range config.Config.RecoveryIgnoreHostnameFilters { if matched, _ := regexp.MatchString(filter, a.AnalyzedInstanceKey.Hostname); matched { skipThisHost = true } } if a.IsDowntimed && !includeDowntimed { skipThisHost = true } if !skipThisHost { result = append(result, a) } } if a.CountSlaves > 0 && auditAnalysis { // Interesting enough for analysis go auditInstanceAnalysisInChangelog(&a.AnalyzedInstanceKey, a.Analysis) } return nil }) if err != nil { log.Errore(err) } return result, err }
// readRecoveries reads recovery entry/audit entires from topology_recovery func readRecoveries(whereCondition string, limit string, args []interface{}) ([]TopologyRecovery, error) { res := []TopologyRecovery{} query := fmt.Sprintf(` select recovery_id, hostname, port, (IFNULL(end_active_period_unixtime, 0) = 0) as is_active, start_active_period, IFNULL(end_active_period_unixtime, 0) as end_active_period_unixtime, IFNULL(end_recovery, '') AS end_recovery, is_successful, processing_node_hostname, processcing_node_token, ifnull(successor_hostname, '') as successor_hostname, ifnull(successor_port, 0) as successor_port, ifnull(successor_alias, '') as successor_alias, analysis, cluster_name, cluster_alias, count_affected_slaves, slave_hosts, participating_instances, lost_slaves, all_errors, acknowledged, acknowledged_at, acknowledged_by, acknowledge_comment, last_detection_id from topology_recovery %s order by recovery_id desc %s `, whereCondition, limit) err := db.QueryOrchestrator(query, args, func(m sqlutils.RowMap) error { topologyRecovery := *NewTopologyRecovery(inst.ReplicationAnalysis{}) topologyRecovery.Id = m.GetInt64("recovery_id") topologyRecovery.IsActive = m.GetBool("is_active") topologyRecovery.RecoveryStartTimestamp = m.GetString("start_active_period") topologyRecovery.RecoveryEndTimestamp = m.GetString("end_recovery") topologyRecovery.IsSuccessful = m.GetBool("is_successful") topologyRecovery.ProcessingNodeHostname = m.GetString("processing_node_hostname") topologyRecovery.ProcessingNodeToken = m.GetString("processcing_node_token") topologyRecovery.AnalysisEntry.AnalyzedInstanceKey.Hostname = m.GetString("hostname") topologyRecovery.AnalysisEntry.AnalyzedInstanceKey.Port = m.GetInt("port") topologyRecovery.AnalysisEntry.Analysis = inst.AnalysisCode(m.GetString("analysis")) topologyRecovery.AnalysisEntry.ClusterDetails.ClusterName = m.GetString("cluster_name") topologyRecovery.AnalysisEntry.ClusterDetails.ClusterAlias = m.GetString("cluster_alias") topologyRecovery.AnalysisEntry.CountSlaves = m.GetUint("count_affected_slaves") topologyRecovery.AnalysisEntry.ReadSlaveHostsFromString(m.GetString("slave_hosts")) topologyRecovery.SuccessorKey = &inst.InstanceKey{} topologyRecovery.SuccessorKey.Hostname = m.GetString("successor_hostname") topologyRecovery.SuccessorKey.Port = m.GetInt("successor_port") topologyRecovery.SuccessorAlias = m.GetString("successor_alias") topologyRecovery.AnalysisEntry.ClusterDetails.ReadRecoveryInfo() topologyRecovery.AllErrors = strings.Split(m.GetString("all_errors"), "\n") topologyRecovery.LostSlaves.ReadCommaDelimitedList(m.GetString("lost_slaves")) topologyRecovery.ParticipatingInstanceKeys.ReadCommaDelimitedList(m.GetString("participating_instances")) topologyRecovery.Acknowledged = m.GetBool("acknowledged") topologyRecovery.AcknowledgedAt = m.GetString("acknowledged_at") topologyRecovery.AcknowledgedBy = m.GetString("acknowledged_by") topologyRecovery.AcknowledgedComment = m.GetString("acknowledge_comment") topologyRecovery.LastDetectionId = m.GetInt64("last_detection_id") res = append(res, topologyRecovery) return nil }) if err != nil { log.Errore(err) } return res, err }