// readMissingHostnamesToResolve gets those (unresolved, e.g. VIP) hostnames that *should* be present in // the hostname_resolve table, but aren't. func readMissingKeysToResolve() (result InstanceKeyMap, err error) { query := ` select hostname_unresolve.unresolved_hostname, database_instance.port from database_instance join hostname_unresolve on (database_instance.hostname = hostname_unresolve.hostname) left join hostname_resolve on (database_instance.hostname = hostname_resolve.resolved_hostname) where hostname_resolve.hostname is null ` db, err := db.OpenOrchestrator() if err != nil { goto Cleanup } err = sqlutils.QueryRowsMap(db, query, func(m sqlutils.RowMap) error { instanceKey := InstanceKey{Hostname: m.GetString("unresolved_hostname"), Port: m.GetInt("port")} result.AddKey(instanceKey) return nil }) Cleanup: if err != nil { log.Errore(err) } return result, err }
// ReadOutdatedAgentsHosts returns agents that need to be updated func ReadOutdatedAgentsHosts() ([]string, error) { res := []string{} query := fmt.Sprintf(` select hostname from host_agent where IFNULL(last_checked < now() - interval %d minute, true) `, config.Config.AgentPollMinutes) db, err := db.OpenOrchestrator() if err != nil { goto Cleanup } err = sqlutils.QueryRowsMap(db, query, func(m sqlutils.RowMap) error { hostname := m.GetString("hostname") res = append(res, hostname) return nil }) Cleanup: if err != nil { log.Errore(err) } return res, err }
// readAgentBasicInfo returns the basic data for an agent directly from backend table (no agent access) func readAgentBasicInfo(hostname string) (Agent, string, error) { agent := Agent{} token := "" query := fmt.Sprintf(` select hostname, port, token, last_submitted, mysql_port from host_agent where hostname = '%s' `, hostname) db, err := db.OpenOrchestrator() if err != nil { return agent, "", err } err = sqlutils.QueryRowsMap(db, query, func(m sqlutils.RowMap) error { agent.Hostname = m.GetString("hostname") agent.Port = m.GetInt("port") agent.LastSubmitted = m.GetString("last_submitted") agent.MySQLPort = m.GetInt64("mysql_port") token = m.GetString("token") return nil }) if token == "" { return agent, "", log.Errorf("Cannot get agent/token: %s", hostname) } return agent, token, nil }
// ReadClusterAliases reads the entrie cluster name aliases mapping func ReadClusterByAlias(alias string) (string, error) { clusterName := "" query := fmt.Sprintf(` select cluster_name from cluster_alias where alias = '%s' `, alias) db, err := db.OpenOrchestrator() if err != nil { goto Cleanup } err = sqlutils.QueryRowsMap(db, query, func(m sqlutils.RowMap) error { clusterName = m.GetString("cluster_name") return nil }) Cleanup: if err != nil { return "", err } if clusterName == "" { err = fmt.Errorf("No cluster found for alias %s", alias) } return clusterName, err }
// EndMaintenance will terminate an active maintenance via maintenanceToken func EndMaintenance(maintenanceToken int64) error { db, err := db.OpenOrchestrator() if err != nil { return log.Errore(err) } res, err := sqlutils.Exec(db, ` update database_instance_maintenance set maintenance_active = NULL, end_timestamp = NOW() where database_instance_maintenance_id = ? `, maintenanceToken, ) if err != nil { return log.Errore(err) } if affected, _ := res.RowsAffected(); affected == 0 { err = fmt.Errorf("Instance is not in maintenance mode; token = %+v", maintenanceToken) } else { // success instanceKey, _ := ReadMaintenanceInstanceKey(maintenanceToken) AuditOperation("end-maintenance", instanceKey, fmt.Sprintf("maintenanceToken: %d", maintenanceToken)) } return err }
// ReadMaintenanceInstanceKey will return the instanceKey for active maintenance by maintenanceToken func ReadMaintenanceInstanceKey(maintenanceToken int64) (*InstanceKey, error) { var res *InstanceKey query := fmt.Sprintf(` select hostname, port from database_instance_maintenance where database_instance_maintenance_id = %d `, maintenanceToken) db, err := db.OpenOrchestrator() if err != nil { goto Cleanup } err = sqlutils.QueryRowsMap(db, query, func(m sqlutils.RowMap) error { instanceKey, merr := NewInstanceKeyFromStrings(m.GetString("hostname"), m.GetString("port")) if merr != nil { return merr } res = instanceKey return nil }) Cleanup: if err != nil { log.Errore(err) } return res, err }
// writePoolInstances will write (and override) a single cluster name mapping func writePoolInstances(pool string, instanceKeys [](*InstanceKey)) error { writeFunc := func() error { db, err := db.OpenOrchestrator() if err != nil { return log.Errore(err) } tx, err := db.Begin() stmt, err := tx.Prepare(`delete from database_instance_pool where pool = ?`) _, err = stmt.Exec(pool) if err != nil { tx.Rollback() return log.Errore(err) } stmt, err = tx.Prepare(`insert into database_instance_pool values (?, ?, ?)`) for _, instanceKey := range instanceKeys { _, err := stmt.Exec(instanceKey.Hostname, instanceKey.Port, pool) if err != nil { tx.Rollback() return log.Errore(err) } } if err != nil { tx.Rollback() return log.Errore(err) } tx.Commit() return nil } return ExecDBWriteFunc(writeFunc) }
// IsElected checks whether this node is the elected active node func IsElected() (bool, error) { isElected := false query := fmt.Sprintf(` select count(*) as is_elected from active_node where anchor = 1 and hostname = '%s' and token = '%s' `, ThisHostname, ProcessToken.Hash) db, err := db.OpenOrchestrator() if err != nil { goto Cleanup } err = sqlutils.QueryRowsMap(db, query, func(m sqlutils.RowMap) error { isElected = m.GetBool("is_elected") return nil }) Cleanup: if err != nil { log.Errore(err) } return isElected, err }
// ElectedNode returns the hostname of the elected node func ElectedNode() (string, string, bool, error) { hostname := "" token := "" isElected := false query := fmt.Sprintf(` select ifnull(max(hostname), '') as hostname, ifnull(max(token), '') as token, (ifnull(max(hostname), '') = '%s') and (ifnull(max(token), '') = '%s') as is_elected from active_node where anchor = 1 `, ThisHostname, ProcessToken.Hash) db, err := db.OpenOrchestrator() if err != nil { goto Cleanup } err = sqlutils.QueryRowsMap(db, query, func(m sqlutils.RowMap) error { hostname = m.GetString("hostname") token = m.GetString("token") isElected = m.GetBool("is_elected") return nil }) Cleanup: if err != nil { log.Errore(err) } return hostname, token, isElected, err }
func GetEquivalentMasterCoordinates(instanceCoordinates *InstanceBinlogCoordinates) (result [](*InstanceBinlogCoordinates), err error) { query := fmt.Sprintf(` select master1_hostname as hostname, master1_port as port, master1_binary_log_file as binlog_file, master1_binary_log_pos as binlog_pos from master_position_equivalence where master2_hostname = '%s' and master2_port = '%d' and master2_binary_log_file = '%s' and master2_binary_log_pos = '%d' union select master2_hostname as hostname, master2_port as port, master2_binary_log_file as binlog_file, master2_binary_log_pos as binlog_pos from master_position_equivalence where master1_hostname = '%s' and master1_port = '%d' and master1_binary_log_file = '%s' and master1_binary_log_pos = '%d' `, instanceCoordinates.Key.Hostname, instanceCoordinates.Key.Port, instanceCoordinates.Coordinates.LogFile, instanceCoordinates.Coordinates.LogPos, instanceCoordinates.Key.Hostname, instanceCoordinates.Key.Port, instanceCoordinates.Coordinates.LogFile, instanceCoordinates.Coordinates.LogPos) db, err := db.OpenOrchestrator() if err != nil { goto Cleanup } err = sqlutils.QueryRowsMap(db, query, func(m sqlutils.RowMap) error { equivalentCoordinates := InstanceBinlogCoordinates{} equivalentCoordinates.Key.Hostname = m.GetString("hostname") equivalentCoordinates.Key.Port = m.GetInt("port") equivalentCoordinates.Coordinates.LogFile = m.GetString("binlog_file") equivalentCoordinates.Coordinates.LogPos = m.GetInt64("binlog_pos") result = append(result, &equivalentCoordinates) return nil }) Cleanup: if err != nil { return nil, err } return result, nil }
// EndDowntime will remove downtime flag from an instance func EndDowntime(instanceKey *InstanceKey) error { db, err := db.OpenOrchestrator() if err != nil { return log.Errore(err) } res, err := sqlutils.Exec(db, ` update database_instance_downtime set downtime_active = NULL, end_timestamp = NOW() where hostname = ? and port = ? and downtime_active = 1 `, instanceKey.Hostname, instanceKey.Port, ) if err != nil { return log.Errore(err) } if affected, _ := res.RowsAffected(); affected == 0 { err = fmt.Errorf("Instance is not in downtime mode: %+v", instanceKey) } else { // success AuditOperation("end-downtime", instanceKey, "") } return err }
// WriteClusterDomainName will write (and override) the domain name of a cluster func WriteClusterDomainName(clusterName string, domainName string) error { writeFunc := func() error { db, err := db.OpenOrchestrator() if err != nil { return log.Errore(err) } _, err = sqlutils.Exec(db, ` insert into cluster_domain_name (cluster_name, domain_name, last_registered) values (?, ?, NOW()) on duplicate key update domain_name=values(domain_name), last_registered=values(last_registered) `, clusterName, domainName) if err != nil { return log.Errore(err) } return nil } return ExecDBWriteFunc(writeFunc) }
// ReadClusterDomainName reads the domain name associated with a cluster, if any func ReadClusterDomainName(clusterName string) (string, error) { domainName := "" query := fmt.Sprintf(` select domain_name from cluster_domain_name where cluster_name = '%s' `, clusterName) db, err := db.OpenOrchestrator() if err != nil { goto Cleanup } err = sqlutils.QueryRowsMap(db, query, func(m sqlutils.RowMap) error { domainName = m.GetString("domain_name") return nil }) Cleanup: if err != nil { return "", err } if domainName == "" { err = fmt.Errorf("No domain name found for cluster %s", clusterName) } return domainName, err }
// ElectedNode returns the details of the elected node, as well as answering the question "is this process the elected one"? func ElectedNode() (hostname string, token string, isElected bool, err error) { query := ` select hostname, token from active_node where anchor = 1 ` db, err := db.OpenOrchestrator() if err != nil { goto Cleanup } err = sqlutils.QueryRowsMap(db, query, func(m sqlutils.RowMap) error { hostname = m.GetString("hostname") token = m.GetString("token") return nil }) Cleanup: if err != nil { log.Errore(err) } isElected = (hostname == ThisHostname && token == ProcessToken.Hash) return hostname, token, isElected, err }
// submitSeedStateEntry submits a seed state: a single step in the overall seed process func submitSeedStateEntry(seedId int64, action string, errorMessage string) (int64, error) { db, err := db.OpenOrchestrator() if err != nil { return 0, log.Errore(err) } res, err := sqlutils.Exec(db, ` insert into agent_seed_state ( agent_seed_id, state_timestamp, state_action, error_message ) VALUES ( ?, NOW(), ?, ? ) `, seedId, action, errorMessage, ) if err != nil { return 0, log.Errore(err) } id, err := res.LastInsertId() return id, err }
// AttemptElection tries to grab leadership (become active node) func AttemptElection() (bool, error) { db, err := db.OpenOrchestrator() if err != nil { return false, log.Errore(err) } sqlResult, err := sqlutils.Exec(db, ` update active_node set hostname = ?, token = ?, last_seen_active = now() where anchor = 1 and ( last_seen_active < now() - interval ? second or hostname = '' or (hostname = ? and token = ?) ) `, ThisHostname, ProcessToken.Hash, config.Config.ActiveNodeExpireSeconds, ThisHostname, ProcessToken.Hash, ) if err != nil { return false, log.Errore(err) } rows, err := sqlResult.RowsAffected() return (rows > 0), err }
// FailStaleSeeds marks as failed seeds where no progress have been seen recently func FailStaleSeeds() error { db, err := db.OpenOrchestrator() if err != nil { return log.Errore(err) } _, err = sqlutils.Exec(db, ` update agent_seed set is_complete=1, is_successful=0 where is_complete=0 and ( select max(state_timestamp) as last_state_timestamp from agent_seed_state where agent_seed.agent_seed_id = agent_seed_state.agent_seed_id ) < now() - interval ? minute`, config.Config.StaleSeedFailMinutes, ) return err }
// ReadResolvedHostname returns the resolved hostname given a hostname, or empty if not exists func ReadResolvedHostname(hostname string) (string, error) { var resolvedHostname string = "" query := fmt.Sprintf(` select resolved_hostname from hostname_resolve where hostname = '%s' `, hostname) db, err := db.OpenOrchestrator() if err != nil { goto Cleanup } err = sqlutils.QueryRowsMap(db, query, func(m sqlutils.RowMap) error { resolvedHostname = m.GetString("resolved_hostname") return nil }) Cleanup: if err != nil { log.Errore(err) } return resolvedHostname, err }
func readAvailableNodes() ([]string, error) { res := []string{} query := fmt.Sprintf(` select concat(hostname, ';', token) as node from node_health where last_seen_active > now() - interval 5 minute order by hostname `) db, err := db.OpenOrchestrator() if err != nil { goto Cleanup } err = sqlutils.QueryRowsMap(db, query, func(m sqlutils.RowMap) error { res = append(res, m.GetString("node")) return nil }) Cleanup: if err != nil { log.Errore(err) } return res, err }
func readAllHostnameResolves() ([]HostnameResolve, error) { res := []HostnameResolve{} query := fmt.Sprintf(` select hostname, resolved_hostname from hostname_resolve `) db, err := db.OpenOrchestrator() if err != nil { goto Cleanup } err = sqlutils.QueryRowsMap(db, query, func(m sqlutils.RowMap) error { hostnameResolve := HostnameResolve{hostname: m.GetString("hostname"), resolvedHostname: m.GetString("resolved_hostname")} res = append(res, hostnameResolve) return nil }) Cleanup: if err != nil { log.Errore(err) } return res, err }
// ReadClusterAliases reads the entrie cluster name aliases mapping func ReadClusterAliases() error { updatedMap := make(map[string]string) query := fmt.Sprintf(` select cluster_name, alias from cluster_alias `) db, err := db.OpenOrchestrator() if err != nil { goto Cleanup } err = sqlutils.QueryRowsMap(db, query, func(m sqlutils.RowMap) error { updatedMap[m.GetString("cluster_name")] = m.GetString("alias") return err }) Cleanup: if err != nil { log.Errore(err) } clusterAliasMapMutex.Lock() defer clusterAliasMapMutex.Unlock() clusterAliasMap = updatedMap return err }
// UpdateAgentInfo updates some agent state in backend table func UpdateAgentInfo(hostname string, agent Agent) error { db, err := db.OpenOrchestrator() if err != nil { return log.Errore(err) } _, err = sqlutils.Exec(db, ` update host_agent set last_seen = NOW(), mysql_port = ?, count_mysql_snapshots = ? where hostname = ?`, agent.MySQLPort, len(agent.LogicalVolumes), hostname, ) if err != nil { return log.Errore(err) } return nil }
// WriteLongRunningProcesses rewrites current state of long running processes for given instance func WriteLongRunningProcesses(instanceKey *InstanceKey, processes []Process) error { writeFunc := func() error { db, err := db.OpenOrchestrator() if err != nil { return log.Errore(err) } _, err = sqlutils.Exec(db, ` delete from database_instance_long_running_queries where hostname = ? and port = ? `, instanceKey.Hostname, instanceKey.Port) if err != nil { return log.Errore(err) } for _, process := range processes { _, merr := sqlutils.Exec(db, ` insert into database_instance_long_running_queries ( hostname, port, process_id, process_started_at, process_user, process_host, process_db, process_command, process_time_seconds, process_state, process_info ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, instanceKey.Hostname, instanceKey.Port, process.Id, process.StartedAt, process.User, process.Host, process.Db, process.Command, process.Time, process.State, process.Info, ) if merr != nil { err = merr } } if err != nil { return log.Errore(err) } return nil } return ExecDBWriteFunc(writeFunc) }
// readRecoveries reads recovery entry/audit entires from topology_recovery func readFailureDetections(whereCondition string, limit string) ([]TopologyRecovery, error) { res := []TopologyRecovery{} query := fmt.Sprintf(` select detection_id, hostname, port, in_active_period as is_active, start_active_period, end_active_period_unixtime, processing_node_hostname, processcing_node_token, analysis, cluster_name, cluster_alias, count_affected_slaves, slave_hosts from topology_failure_detection %s order by detection_id desc %s `, whereCondition, limit) db, err := db.OpenOrchestrator() if err != nil { goto Cleanup } err = sqlutils.QueryRowsMap(db, query, func(m sqlutils.RowMap) error { failureDetection := TopologyRecovery{} failureDetection.Id = m.GetInt64("detection_id") failureDetection.IsActive = m.GetBool("is_active") failureDetection.RecoveryStartTimestamp = m.GetString("start_active_period") failureDetection.ProcessingNodeHostname = m.GetString("processing_node_hostname") failureDetection.ProcessingNodeToken = m.GetString("processcing_node_token") failureDetection.AnalysisEntry.AnalyzedInstanceKey.Hostname = m.GetString("hostname") failureDetection.AnalysisEntry.AnalyzedInstanceKey.Port = m.GetInt("port") failureDetection.AnalysisEntry.Analysis = inst.AnalysisCode(m.GetString("analysis")) failureDetection.AnalysisEntry.ClusterDetails.ClusterName = m.GetString("cluster_name") failureDetection.AnalysisEntry.ClusterDetails.ClusterAlias = m.GetString("cluster_alias") failureDetection.AnalysisEntry.CountSlaves = m.GetUint("count_affected_slaves") failureDetection.AnalysisEntry.ReadSlaveHostsFromString(m.GetString("slave_hosts")) failureDetection.AnalysisEntry.ClusterDetails.ReadRecoveryInfo() res = append(res, failureDetection) return nil }) Cleanup: if err != nil { log.Errore(err) } return res, err }
// ForgetLongUnseenAgents will remove entries of all agents that have long since been last seen. func ForgetLongUnseenAgents() error { db, err := db.OpenOrchestrator() if err != nil { return log.Errore(err) } _, err = sqlutils.Exec(db, ` delete from host_agent where last_submitted < NOW() - interval ? hour`, config.Config.UnseenAgentForgetHours, ) return err }
// ReadRecentAudit returns a list of audit entries order chronologically descending, using page number. func ReadRecentAudit(instanceKey *InstanceKey, page int) ([]Audit, error) { res := []Audit{} var whereCondition string if instanceKey != nil { whereCondition = fmt.Sprintf(`where hostname='%s' and port='%d'`, instanceKey.Hostname, instanceKey.Port) } query := fmt.Sprintf(` select audit_id, audit_timestamp, audit_type, hostname, port, message from audit %s order by audit_timestamp desc limit %d offset %d `, whereCondition, config.Config.AuditPageSize, page*config.Config.AuditPageSize) db, err := db.OpenOrchestrator() if err != nil { goto Cleanup } err = sqlutils.QueryRowsMap(db, query, func(m sqlutils.RowMap) error { audit := Audit{} audit.AuditId = m.GetInt64("audit_id") audit.AuditTimestamp = m.GetString("audit_timestamp") audit.AuditType = m.GetString("audit_type") audit.AuditInstanceKey.Hostname = m.GetString("hostname") audit.AuditInstanceKey.Port = m.GetInt("port") audit.Message = m.GetString("message") res = append(res, audit) return err }) Cleanup: if err != nil { log.Errore(err) } return res, err }
// Just check to make sure we can connect to the database func SimpleHealthTest() (*HealthStatus, error) { health := HealthStatus{Healthy: false, Hostname: ThisHostname, Token: ProcessToken.Hash} db, err := db.OpenOrchestrator() if err != nil { health.Error = err return &health, log.Errore(err) } if err = db.Ping(); err != nil { health.Error = err return &health, log.Errore(err) } else { health.Healthy = true return &health, nil } }
// ReadActiveMaintenance returns the list of currently active maintenance entries func ReadActiveMaintenance() ([]Maintenance, error) { res := []Maintenance{} query := fmt.Sprintf(` select database_instance_maintenance_id, hostname, port, begin_timestamp, timestampdiff(second, begin_timestamp, now()) as seconds_elapsed, maintenance_active, owner, reason from database_instance_maintenance where maintenance_active = 1 order by database_instance_maintenance_id `) db, err := db.OpenOrchestrator() if err != nil { goto Cleanup } err = sqlutils.QueryRowsMap(db, query, func(m sqlutils.RowMap) error { maintenance := Maintenance{} maintenance.MaintenanceId = m.GetUint("database_instance_maintenance_id") maintenance.Key.Hostname = m.GetString("hostname") maintenance.Key.Port = m.GetInt("port") maintenance.BeginTimestamp = m.GetString("begin_timestamp") maintenance.SecondsElapsed = m.GetUint("seconds_elapsed") maintenance.IsActive = m.GetBool("maintenance_active") maintenance.Owner = m.GetString("owner") maintenance.Reason = m.GetString("reason") res = append(res, maintenance) return err }) Cleanup: if err != nil { log.Errore(err) } return res, err }
// AttemptRecoveryRegistration tries to add a recovery entry; if this fails that means recovery is already in place. func AttemptRecoveryRegistration(analysisEntry *inst.ReplicationAnalysis) (bool, error) { db, err := db.OpenOrchestrator() if err != nil { return false, log.Errore(err) } sqlResult, err := sqlutils.Exec(db, ` insert ignore into topology_recovery ( hostname, port, in_active_period, start_active_period, end_active_period_unixtime, processing_node_hostname, processcing_node_token, analysis, cluster_name, cluster_alias, count_affected_slaves, slave_hosts ) values ( ?, ?, 1, NOW(), 0, ?, ?, ?, ?, ?, ?, ? ) `, analysisEntry.AnalyzedInstanceKey.Hostname, analysisEntry.AnalyzedInstanceKey.Port, ThisHostname, ProcessToken.Hash, string(analysisEntry.Analysis), analysisEntry.ClusterDetails.ClusterName, analysisEntry.ClusterDetails.ClusterAlias, analysisEntry.CountSlaves, analysisEntry.GetSlaveHostsAsString(), ) if err != nil { return false, log.Errore(err) } rows, err := sqlResult.RowsAffected() return (err == nil && rows > 0), err }
// ExpireDowntime will remove the maintenance flag on old downtimes func ExpireDowntime() error { db, err := db.OpenOrchestrator() if err != nil { return log.Errore(err) } { res, err := sqlutils.Exec(db, ` delete from database_instance_downtime where downtime_active is null and end_timestamp < NOW() - INTERVAL ? DAY `, config.Config.MaintenancePurgeDays, ) if err != nil { return log.Errore(err) } if rowsAffected, _ := res.RowsAffected(); rowsAffected > 0 { AuditOperation("expire-downtime", nil, fmt.Sprintf("Purged %d historical entries", rowsAffected)) } } { res, err := sqlutils.Exec(db, ` update database_instance_downtime set downtime_active = NULL where downtime_active = 1 and end_timestamp < NOW() `, ) if err != nil { return log.Errore(err) } if rowsAffected, _ := res.RowsAffected(); rowsAffected > 0 { AuditOperation("expire-downtime", nil, fmt.Sprintf("Expired %d entries", rowsAffected)) } } return err }