// HealthTest attempts to write to the backend database and get a result
func HealthTest() (*HealthStatus, error) {
	health := HealthStatus{Healthy: false, Hostname: ThisHostname, Token: ProcessToken.Hash}

	sqlResult, err := RegisterNode("", "", false)
	if err != nil {
		health.Error = err
		return &health, log.Errore(err)
	}
	rows, err := sqlResult.RowsAffected()
	if err != nil {
		health.Error = err
		return &health, log.Errore(err)
	}
	health.Healthy = (rows > 0)
	activeHostname, activeToken, isActive, err := ElectedNode()
	if err != nil {
		health.Error = err
		return &health, log.Errore(err)
	}
	health.ActiveNode = fmt.Sprintf("%s;%s", activeHostname, activeToken)
	health.IsActiveNode = isActive

	health.AvailableNodes, err = readAvailableNodes(true)
	if err != nil {
		health.Error = err
		return &health, log.Errore(err)
	}

	return &health, nil
}
// StopSlave stops replication on a given instance
func StopSlave(instanceKey *InstanceKey) (*Instance, error) {
	instance, err := ReadTopologyInstance(instanceKey)
	if err != nil {
		return instance, log.Errore(err)
	}

	if !instance.IsSlave() {
		return instance, fmt.Errorf("instance is not a slave: %+v", instanceKey)
	}
	_, err = ExecInstanceNoPrepare(instanceKey, `stop slave`)
	if err != nil {
		// Patch; current MaxScale behavior for STOP SLAVE is to throw an error if slave already stopped.
		if instance.isMaxScale() && err.Error() == "Error 1199: Slave connection is not running" {
			err = nil
		}
	}
	if err != nil {
		return instance, log.Errore(err)
	}
	instance, err = ReadTopologyInstance(instanceKey)

	log.Infof("Stopped slave on %+v, Self:%+v, Exec:%+v", *instanceKey, instance.SelfBinlogCoordinates, instance.ExecBinlogCoordinates)
	return instance, err
}
// writePoolInstances will write (and override) a single cluster name mapping
func writePoolInstances(pool string, instanceKeys [](*InstanceKey)) error {
	writeFunc := func() error {
		db, err := db.OpenOrchestrator()
		if err != nil {
			return log.Errore(err)
		}

		tx, err := db.Begin()
		if err != nil {
			return log.Errore(err)
		}
		stmt, err := tx.Prepare(`delete from database_instance_pool where pool = ?`)
		if err != nil {
			tx.Rollback()
			return log.Errore(err)
		}
		if _, err := stmt.Exec(pool); err != nil {
			tx.Rollback()
			return log.Errore(err)
		}
		stmt, err = tx.Prepare(`insert into database_instance_pool values (?, ?, ?)`)
		if err != nil {
			tx.Rollback()
			return log.Errore(err)
		}
		for _, instanceKey := range instanceKeys {
			if _, err := stmt.Exec(instanceKey.Hostname, instanceKey.Port, pool); err != nil {
				tx.Rollback()
				return log.Errore(err)
			}
		}
		if err := tx.Commit(); err != nil {
			return log.Errore(err)
		}
		return nil
	}
	return ExecDBWriteFunc(writeFunc)
}
// SkipQuery skips a single query in a failed replication instance
func SkipQuery(instanceKey *InstanceKey) (*Instance, error) {
	instance, err := ReadTopologyInstance(instanceKey)
	if err != nil {
		return instance, log.Errore(err)
	}

	if !instance.IsSlave() {
		return instance, fmt.Errorf("instance is not a slave: %+v", instanceKey)
	}
	if instance.Slave_SQL_Running {
		return instance, fmt.Errorf("Slave SQL thread is running on %+v", instanceKey)
	}
	if instance.LastSQLError == "" {
		return instance, fmt.Errorf("No SQL error on %+v", instanceKey)
	}

	if *config.RuntimeCLIFlags.Noop {
		return instance, fmt.Errorf("noop: aborting skip-query operation on %+v; signalling error but nothing went wrong.", *instanceKey)
	}

	log.Debugf("Skipping one query on %+v", instanceKey)
	if instance.UsingOracleGTID {
		err = skipQueryOracleGtid(instance)
	} else if instance.UsingMariaDBGTID {
		return instance, log.Errorf("%+v is replicating with MariaDB GTID. To skip a query first disable GTID, then skip, then enable GTID again", *instanceKey)
	} else {
		err = skipQueryClassic(instance)
	}
	if err != nil {
		return instance, log.Errore(err)
	}
	AuditOperation("skip-query", instanceKey, "Skipped one query")
	return StartSlave(instanceKey)
}
// ResetSlave resets a slave, breaking replication
func ResetSlave(instanceKey *InstanceKey) (*Instance, error) {
	instance, err := ReadTopologyInstance(instanceKey)
	if err != nil {
		return instance, log.Errore(err)
	}

	if instance.SlaveRunning() {
		return instance, fmt.Errorf("Cannot reset slave on: %+v because slave is running", instanceKey)
	}

	if *config.RuntimeCLIFlags.Noop {
		return instance, fmt.Errorf("noop: aborting reset-slave operation on %+v; signalling error but nothing went wrong.", *instanceKey)
	}

	// MySQL's RESET SLAVE is done correctly; however SHOW SLAVE STATUS still returns the old hostnames etc.
	// until after the next restart. This leads to orchestrator still thinking the instance replicates
	// from the old host. We therefore forcibly modify the hostname.
	// RESET SLAVE ALL solves this, but only exists as of 5.6.3.
	_, err = ExecInstanceNoPrepare(instanceKey, `change master to master_host='_'`)
	if err != nil {
		return instance, log.Errore(err)
	}
	// The /*!50603 all */ version-comment makes the ALL keyword take effect only on MySQL >= 5.6.3.
	_, err = ExecInstanceNoPrepare(instanceKey, `reset slave /*!50603 all */`)
	if err != nil {
		return instance, log.Errore(err)
	}
	log.Infof("Reset slave %+v", instanceKey)

	instance, err = ReadTopologyInstance(instanceKey)
	return instance, err
}
// ReattachSlave restores a detached slave back into replication
func ReattachSlave(instanceKey *InstanceKey) (*Instance, error) {
	instance, err := ReadTopologyInstance(instanceKey)
	if err != nil {
		return instance, log.Errore(err)
	}

	if instance.SlaveRunning() {
		return instance, fmt.Errorf("Cannot (need not) reattach slave on: %+v because slave is running", instanceKey)
	}

	isDetached, detachedLogFile, detachedLogPos := instance.ExecBinlogCoordinates.DetachedCoordinates()

	if !isDetached {
		return instance, fmt.Errorf("Cannot reattach slave on: %+v because slave is not detached", instanceKey)
	}

	if *config.RuntimeCLIFlags.Noop {
		return instance, fmt.Errorf("noop: aborting reattach-slave operation on %+v; signalling error but nothing went wrong.", *instanceKey)
	}

	_, err = ExecInstanceNoPrepare(instanceKey, fmt.Sprintf(`change master to master_log_file='%s', master_log_pos=%s`, detachedLogFile, detachedLogPos))
	if err != nil {
		return instance, log.Errore(err)
	}

	log.Infof("Reattach slave %+v", instanceKey)

	instance, err = ReadTopologyInstance(instanceKey)
	return instance, err
}
// DetachSlave detaches a slave from replication, forcibly corrupting the binlog coordinates (though in such
// a way that it is reversible)
func DetachSlave(instanceKey *InstanceKey) (*Instance, error) {
	instance, err := ReadTopologyInstance(instanceKey)
	if err != nil {
		return instance, log.Errore(err)
	}

	if instance.SlaveRunning() {
		return instance, fmt.Errorf("Cannot detach slave on: %+v because slave is running", instanceKey)
	}

	isDetached, _, _ := instance.ExecBinlogCoordinates.DetachedCoordinates()

	if isDetached {
		return instance, fmt.Errorf("Cannot (need not) detach slave on: %+v because slave is already detached", instanceKey)
	}

	if *config.RuntimeCLIFlags.Noop {
		return instance, fmt.Errorf("noop: aborting detach-slave operation on %+v; signalling error but nothing went wrong.", *instanceKey)
	}

	// Encode the current coordinates within the log file name, in such a way that replication is broken, but
	// the info can still be resurrected
	detachedCoordinates := BinlogCoordinates{LogFile: fmt.Sprintf("//%s:%d", instance.ExecBinlogCoordinates.LogFile, instance.ExecBinlogCoordinates.LogPos), LogPos: instance.ExecBinlogCoordinates.LogPos}

	_, err = ExecInstanceNoPrepare(instanceKey, fmt.Sprintf(`change master to master_log_file='%s', master_log_pos=%d`, detachedCoordinates.LogFile, detachedCoordinates.LogPos))
	if err != nil {
		return instance, log.Errore(err)
	}

	log.Infof("Detach slave %+v", instanceKey)

	instance, err = ReadTopologyInstance(instanceKey)
	return instance, err
}
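// Illustrative sketch (standalone, not part of the orchestrator codebase): DetachSlave
// encodes the original coordinates as "//file:pos" inside master_log_file, and
// ReattachSlave recovers them via BinlogCoordinates.DetachedCoordinates(). The regexp
// and helper below are hypothetical stand-ins for how that parsing may work.
package main

import (
	"fmt"
	"regexp"
)

// detachedPattern matches the "//originalFile:originalPos" form written by DetachSlave.
var detachedPattern = regexp.MustCompile(`^//([^:]+):([0-9]+)$`)

// detachedCoordinates mimics what DetachedCoordinates is assumed to do:
// recognize the encoded form and recover the original file name and position.
func detachedCoordinates(logFile string) (isDetached bool, file string, pos string) {
	m := detachedPattern.FindStringSubmatch(logFile)
	if m == nil {
		return false, "", ""
	}
	return true, m[1], m[2]
}

func main() {
	// DetachSlave would encode mysql-bin.000123 at position 4567 as:
	encoded := fmt.Sprintf("//%s:%d", "mysql-bin.000123", 4567)
	fmt.Println(encoded) // //mysql-bin.000123:4567

	// ReattachSlave recovers the original coordinates from the encoded form:
	isDetached, file, pos := detachedCoordinates(encoded)
	fmt.Println(isDetached, file, pos) // true mysql-bin.000123 4567
}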
func UnresolveHostname(instanceKey *InstanceKey) (InstanceKey, bool, error) {
	if *config.RuntimeCLIFlags.SkipUnresolve {
		return *instanceKey, false, nil
	}
	unresolvedHostname, err := readUnresolvedHostname(instanceKey.Hostname)
	if err != nil {
		return *instanceKey, false, log.Errore(err)
	}
	if unresolvedHostname == instanceKey.Hostname {
		// unchanged. Nothing to do
		return *instanceKey, false, nil
	}
	// We unresolved to a different hostname. We will now re-resolve to double-check!
	unresolvedKey := &InstanceKey{Hostname: unresolvedHostname, Port: instanceKey.Port}

	instance, err := ReadTopologyInstance(unresolvedKey)
	if err != nil {
		return *instanceKey, false, log.Errore(err)
	}
	if instance.IsBinlogServer() && config.Config.SkipBinlogServerUnresolveCheck {
		// Do nothing. Everything is assumed to be fine.
	} else if instance.Key.Hostname != instanceKey.Hostname {
		// Resolve(Unresolve(hostname)) != hostname ==> Bad; reject
		if *config.RuntimeCLIFlags.SkipUnresolveCheck {
			return *instanceKey, false, nil
		}
		return *instanceKey, false, log.Errorf("Error unresolving; hostname=%s, unresolved=%s, re-resolved=%s; mismatch. Skip/ignore with --skip-unresolve-check", instanceKey.Hostname, unresolvedKey.Hostname, instance.Key.Hostname)
	}
	return *unresolvedKey, true, nil
}
// acknowledgeRecoveries sets acknowledged* details and clears the in_active_period flags from a set of entries
func acknowledgeRecoveries(owner string, comment string, markEndRecovery bool, whereClause string, args []interface{}) (countAcknowledgedEntries int64, err error) {
	additionalSet := ``
	if markEndRecovery {
		additionalSet = `
			end_recovery=IFNULL(end_recovery, NOW()),
		`
	}
	query := fmt.Sprintf(`
			update topology_recovery set
				in_active_period = 0,
				end_active_period_unixtime = IF(end_active_period_unixtime = 0, UNIX_TIMESTAMP(), end_active_period_unixtime),
				%s
				acknowledged = 1,
				acknowledged_at = NOW(),
				acknowledged_by = ?,
				acknowledge_comment = ?
			where
				acknowledged = 0
				and
				%s
		`, additionalSet, whereClause)
	args = append(sqlutils.Args(owner, comment), args...)
	sqlResult, err := db.ExecOrchestrator(query, args...)
	if err != nil {
		return 0, log.Errore(err)
	}
	rows, err := sqlResult.RowsAffected()
	return rows, log.Errore(err)
}
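// Illustrative sketch (standalone, hypothetical values): the owner and comment
// placeholders appear in the UPDATE text before the caller's whereClause, so
// acknowledgeRecoveries must prepend its own arguments to the caller's args —
// placeholder order and argument order must match.
package main

import "fmt"

// args mimics sqlutils.Args: it packs values into a []interface{}.
func args(values ...interface{}) []interface{} { return values }

func main() {
	// A hypothetical caller acknowledges all recoveries of one cluster:
	whereClause := `cluster_name = ?`
	callerArgs := args("mycluster")

	// owner and comment bind to the first two '?' of the statement,
	// so their values are prepended to the caller's arguments:
	finalArgs := append(args("wallace", "resolved manually"), callerArgs...)
	fmt.Println(whereClause, finalArgs) // cluster_name = ? [wallace resolved manually mycluster]
}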
// ExpireBlockedRecoveries clears listing of blocked recoveries that are no longer actually blocked.
func ExpireBlockedRecoveries() error {
	// Older recovery is acknowledged by now, hence blocked recovery should be released.
	// Do NOTE that the data in blocked_topology_recovery is only used for auditing: it is NOT the data
	// based on which we make automated decisions.
	_, err := db.ExecOrchestrator(`
			delete
				from blocked_topology_recovery
				using
					blocked_topology_recovery
					left join topology_recovery on (blocking_recovery_id = topology_recovery.recovery_id and acknowledged = 0)
				where
					acknowledged is null
			`,
	)
	if err != nil {
		return log.Errore(err)
	}
	// Some oversampling, if a problem has not been noticed for some time (e.g. the server came up alive
	// before action was taken), expire it.
	// Recall that RegisterBlockedRecoveries continuously updates the last_blocked_timestamp column.
	_, err = db.ExecOrchestrator(`
			delete
				from blocked_topology_recovery
				where
					last_blocked_timestamp < NOW() - interval ? second
			`, (config.Config.RecoveryPollSeconds * 2),
	)
	if err != nil {
		return log.Errore(err)
	}
	return nil
}
// WriteLongRunningProcesses rewrites current state of long running processes for given instance
func WriteLongRunningProcesses(instanceKey *InstanceKey, processes []Process) error {
	writeFunc := func() error {
		_, err := db.ExecOrchestrator(`
			delete from
					database_instance_long_running_queries
				where
					hostname = ?
					and port = ?
			`,
			instanceKey.Hostname,
			instanceKey.Port)
		if err != nil {
			return log.Errore(err)
		}

		for _, process := range processes {
			_, merr := db.ExecOrchestrator(`
				insert into database_instance_long_running_queries (
					hostname,
					port,
					process_id,
					process_started_at,
					process_user,
					process_host,
					process_db,
					process_command,
					process_time_seconds,
					process_state,
					process_info
				) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
				instanceKey.Hostname,
				instanceKey.Port,
				process.Id,
				process.StartedAt,
				process.User,
				process.Host,
				process.Db,
				process.Command,
				process.Time,
				process.State,
				process.Info,
			)
			if merr != nil {
				err = merr
			}
		}
		if err != nil {
			return log.Errore(err)
		}
		return nil
	}
	return ExecDBWriteFunc(writeFunc)
}
// RestartSlave stops & starts replication on a given instance
func RestartSlave(instanceKey *InstanceKey) (instance *Instance, err error) {
	instance, err = StopSlave(instanceKey)
	if err != nil {
		return instance, log.Errore(err)
	}
	instance, err = StartSlave(instanceKey)
	if err != nil {
		return instance, log.Errore(err)
	}
	return instance, nil
}
// CommandRun executes a command
func CommandRun(commandText string, arguments ...string) error {
	cmd, tmpFileName, err := execCmd(commandText, arguments...)
	defer os.Remove(tmpFileName)
	if err != nil {
		return log.Errore(err)
	}
	err = cmd.Run()
	if err != nil {
		return log.Errore(err)
	}
	return nil
}
// AuditOperation creates and writes a new audit entry by given params
func AuditOperation(auditType string, instanceKey *InstanceKey, message string) error {
	if instanceKey == nil {
		instanceKey = &InstanceKey{}
	}
	clusterName := ""
	if instanceKey.Hostname != "" {
		clusterName, _ = GetClusterName(instanceKey)
	}

	if config.Config.AuditLogFile != "" {
		go func() error {
			f, err := os.OpenFile(config.Config.AuditLogFile, os.O_RDWR|os.O_CREATE|os.O_APPEND, 0600)
			if err != nil {
				return log.Errore(err)
			}
			defer f.Close()
			text := fmt.Sprintf("%s\t%s\t%s\t%d\t[%s]\t%s\t\n", time.Now().Format(log.TimeFormat), auditType, instanceKey.Hostname, instanceKey.Port, clusterName, message)
			if _, err = f.WriteString(text); err != nil {
				return log.Errore(err)
			}
			return nil
		}()
	}
	_, err := db.ExecOrchestrator(`
			insert
				into audit (
					audit_timestamp, audit_type, hostname, port, cluster_name, message
				) VALUES (
					NOW(), ?, ?, ?, ?, ?
				)
			`,
		auditType,
		instanceKey.Hostname,
		instanceKey.Port,
		clusterName,
		message,
	)
	if err != nil {
		return log.Errore(err)
	}
	logMessage := fmt.Sprintf("auditType:%s instance:%s cluster:%s message:%s", auditType, instanceKey.DisplayString(), clusterName, message)
	if syslogWriter != nil {
		go func() {
			syslogWriter.Info(logMessage)
		}()
	}
	// Pass the message as an argument, not as the format string, to avoid misinterpreting any '%' it contains.
	log.Debugf("%s", logMessage)
	auditOperationCounter.Inc(1)
	return err
}
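// Illustrative sketch (standalone): the audit log file line written by AuditOperation is
// tab-separated. The time layout below is an assumption standing in for log.TimeFormat.
package main

import (
	"fmt"
	"time"
)

func main() {
	line := fmt.Sprintf("%s\t%s\t%s\t%d\t[%s]\t%s\t\n",
		time.Now().Format("2006-01-02 15:04:05"), // stand-in for log.TimeFormat
		"skip-query",                             // auditType
		"db-1.example.com",                       // instanceKey.Hostname
		3306,                                     // instanceKey.Port
		"mycluster",                              // clusterName
		"Skipped one query",                      // message
	)
	fmt.Print(line)
}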
// auditInstanceAnalysisInChangelog will write down an instance's analysis in the database_instance_analysis_changelog table.
// To avoid repeating recurring analysis code, the database_instance_last_analysis table is used, so that only changes to
// analysis codes are written.
func auditInstanceAnalysisInChangelog(instanceKey *InstanceKey, analysisCode AnalysisCode) error {
	if lastWrittenAnalysis, found := recentInstantAnalysis.Get(instanceKey.DisplayString()); found {
		if lastWrittenAnalysis == analysisCode {
			// Surely nothing new.
			// And let's expand the timeout
			recentInstantAnalysis.Set(instanceKey.DisplayString(), analysisCode, cache.DefaultExpiration)
			return nil
		}
	}
	// Passed the in-memory cache; but does the database agree that there's a change? Here's a persistent cache;
	// it exists to verify that no two orchestrator services are doing this without coordinating (namely, one dies,
	// the other takes its place and has no familiarity with the former's cache)
	analysisChangeWriteAttemptCounter.Inc(1)
	sqlResult, err := db.ExecOrchestrator(`
			insert ignore into database_instance_last_analysis (
					hostname, port, analysis_timestamp, analysis
				) values (
					?, ?, now(), ?
				) on duplicate key update
					analysis = values(analysis),
					analysis_timestamp = if(analysis = values(analysis), analysis_timestamp, values(analysis_timestamp))
			`,
		instanceKey.Hostname, instanceKey.Port, string(analysisCode),
	)
	if err != nil {
		return log.Errore(err)
	}
	rows, err := sqlResult.RowsAffected()
	if err != nil {
		return log.Errore(err)
	}
	recentInstantAnalysis.Set(instanceKey.DisplayString(), analysisCode, cache.DefaultExpiration)
	lastAnalysisChanged := (rows > 0)
	if !lastAnalysisChanged {
		return nil
	}

	_, err = db.ExecOrchestrator(`
			insert into database_instance_analysis_changelog (
					hostname, port, analysis_timestamp, analysis
				) values (
					?, ?, now(), ?
				)
			`,
		instanceKey.Hostname, instanceKey.Port, string(analysisCode),
	)
	if err == nil {
		analysisChangeWriteCounter.Inc(1)
	}
	return log.Errore(err)
}
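// Illustrative note (standalone sketch): auditInstanceAnalysisInChangelog detects a change
// via RowsAffected() of the INSERT ... ON DUPLICATE KEY UPDATE. With MySQL's default client
// flags (no CLIENT_FOUND_ROWS), affected rows are: 1 for a fresh insert, 2 for an update
// that changed the row, 0 for a duplicate key with no change — hence "rows > 0" means the
// analysis changed.
package main

import "fmt"

// analysisChanged mirrors the "rows > 0" test in auditInstanceAnalysisInChangelog.
func analysisChanged(rowsAffected int64) bool {
	return rowsAffected > 0
}

func main() {
	fmt.Println(analysisChanged(0)) // false: duplicate key, same analysis as before
	fmt.Println(analysisChanged(1)) // true: first analysis recorded for this instance
	fmt.Println(analysisChanged(2)) // true: analysis changed
}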
func pollAgent(hostname string) error {
	polledAgent, err := agent.GetAgent(hostname)
	agent.UpdateAgentLastChecked(hostname)

	if err != nil {
		return log.Errore(err)
	}

	err = agent.UpdateAgentInfo(hostname, polledAgent)
	if err != nil {
		return log.Errore(err)
	}

	return nil
}
// MasterPosWait issues a MASTER_POS_WAIT() on a given instance according to given coordinates.
func MasterPosWait(instanceKey *InstanceKey, binlogCoordinates *BinlogCoordinates) (*Instance, error) {
	instance, err := ReadTopologyInstance(instanceKey)
	if err != nil {
		return instance, log.Errore(err)
	}

	_, err = ExecInstance(instanceKey, `select master_pos_wait(?, ?)`, binlogCoordinates.LogFile, binlogCoordinates.LogPos)
	if err != nil {
		return instance, log.Errore(err)
	}
	log.Infof("Instance %+v has reached coordinates: %+v", instanceKey, binlogCoordinates)

	instance, err = ReadTopologyInstance(instanceKey)
	return instance, err
}
// StopSlaveNicely stops a slave such that SQL_thread and IO_thread are aligned (i.e.
// SQL_thread consumes all relay log entries)
// It will actually START the sql_thread even if the slave is completely stopped.
func StopSlaveNicely(instanceKey *InstanceKey, timeout time.Duration) (*Instance, error) {
	instance, err := ReadTopologyInstance(instanceKey)
	if err != nil {
		return instance, log.Errore(err)
	}

	if !instance.IsSlave() {
		return instance, fmt.Errorf("instance is not a slave: %+v", instanceKey)
	}

	_, err = ExecInstanceNoPrepare(instanceKey, `stop slave io_thread`)
	_, err = ExecInstanceNoPrepare(instanceKey, `start slave sql_thread`)

	if instance.SQLDelay == 0 {
		// Otherwise we don't bother.
		startTime := time.Now()
		for upToDate := false; !upToDate; {
			if timeout > 0 && time.Since(startTime) >= timeout {
				// timeout
				return nil, log.Errorf("StopSlaveNicely timeout on %+v", *instanceKey)
			}
			instance, err = ReadTopologyInstance(instanceKey)
			if err != nil {
				return instance, log.Errore(err)
			}

			if instance.SQLThreadUpToDate() {
				upToDate = true
			} else {
				time.Sleep(sqlThreadPollDuration)
			}
		}
	}
	_, err = ExecInstanceNoPrepare(instanceKey, `stop slave`)
	if err != nil {
		// Patch; current MaxScale behavior for STOP SLAVE is to throw an error if slave already stopped.
		if instance.isMaxScale() && err.Error() == "Error 1199: Slave connection is not running" {
			err = nil
		}
	}
	if err != nil {
		return instance, log.Errore(err)
	}

	instance, err = ReadTopologyInstance(instanceKey)
	log.Infof("Stopped slave nicely on %+v, Self:%+v, Exec:%+v", *instanceKey, instance.SelfBinlogCoordinates, instance.ExecBinlogCoordinates)
	return instance, err
}
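// Illustrative sketch (standalone): StopSlaveNicely's wait loop is a generic
// poll-until-condition-or-timeout pattern; pollInterval here stands in for
// sqlThreadPollDuration.
package main

import (
	"errors"
	"fmt"
	"time"
)

// waitUntil re-checks condition at pollInterval until it holds or timeout elapses;
// a timeout of 0 means wait indefinitely, as in StopSlaveNicely.
func waitUntil(condition func() bool, timeout, pollInterval time.Duration) error {
	startTime := time.Now()
	for !condition() {
		if timeout > 0 && time.Since(startTime) >= timeout {
			return errors.New("waitUntil timeout")
		}
		time.Sleep(pollInterval)
	}
	return nil
}

func main() {
	deadline := time.Now().Add(30 * time.Millisecond)
	err := waitUntil(func() bool { return time.Now().After(deadline) },
		time.Second, 10*time.Millisecond)
	fmt.Println(err) // <nil>: the condition became true before the timeout
}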
func UpdateClusterAliases() error {
	writeFunc := func() error {
		_, err := db.ExecOrchestrator(`
			replace into
					cluster_alias (alias, cluster_name, last_registered)
				select
					suggested_cluster_alias,
					substring_index(group_concat(cluster_name order by cluster_name), ',', 1) as cluster_name,
					NOW()
				from
					database_instance
					left join database_instance_downtime using (hostname, port)
				where
					suggested_cluster_alias!=''
					and not (
						(hostname, port) in (select hostname, port from topology_recovery where start_active_period >= now() - interval 11111 day)
						and (
							database_instance_downtime.downtime_active IS NULL
							or database_instance_downtime.end_timestamp < NOW()
						) is false
					)
				group by
					suggested_cluster_alias
			`)
		if err == nil {
			err = ReadClusterAliases()
		}
		return log.Errore(err)
	}
	return ExecDBWriteFunc(writeFunc)
}
// ReadAgents returns a list of all known agents
func ReadAgents() ([]Agent, error) {
	res := []Agent{}
	query := `
		select
			hostname,
			port,
			token,
			last_submitted,
			mysql_port
		from
			host_agent
		order by
			hostname
		`
	err := db.QueryOrchestratorRowsMap(query, func(m sqlutils.RowMap) error {
		agent := Agent{}
		agent.Hostname = m.GetString("hostname")
		agent.Port = m.GetInt("port")
		agent.MySQLPort = m.GetInt64("mysql_port")
		agent.Token = "" // the token is blanked out: it is not exposed in the listing
		agent.LastSubmitted = m.GetString("last_submitted")

		res = append(res, agent)
		return nil
	})

	if err != nil {
		log.Errore(err)
	}
	return res, err
}
// executeAgentCommand requests an agent to execute a command via HTTP api
func executeAgentCommand(hostname string, command string, onResponse *func([]byte)) (Agent, error) {
	agent, token, err := readAgentBasicInfo(hostname)
	if err != nil {
		return agent, err
	}

	// All seems to be in order. Now make some inquiries from orchestrator-agent service:
	uri := baseAgentUri(agent.Hostname, agent.Port)

	var fullCommand string
	if strings.Contains(command, "?") {
		fullCommand = fmt.Sprintf("%s&token=%s", command, token)
	} else {
		fullCommand = fmt.Sprintf("%s?token=%s", command, token)
	}
	log.Debugf("orchestrator-agent command: %s", fullCommand)
	agentCommandUri := fmt.Sprintf("%s/%s", uri, fullCommand)

	body, err := readResponse(httpGet(agentCommandUri))
	if err != nil {
		return agent, log.Errore(err)
	}
	if onResponse != nil {
		(*onResponse)(body)
	}
	auditAgentOperation("agent-command", &agent, command)

	return agent, err
}
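// Illustrative sketch (standalone): the token handling in executeAgentCommand appends
// the agent token with '&' when the command already carries query parameters, and with
// '?' otherwise. The command strings below are hypothetical.
package main

import (
	"fmt"
	"strings"
)

// appendToken mirrors executeAgentCommand's query-string handling.
func appendToken(command, token string) string {
	if strings.Contains(command, "?") {
		return fmt.Sprintf("%s&token=%s", command, token)
	}
	return fmt.Sprintf("%s?token=%s", command, token)
}

func main() {
	fmt.Println(appendToken("mysql-stop", "abc123"))          // mysql-stop?token=abc123
	fmt.Println(appendToken("seed?targetHost=db2", "abc123")) // seed?targetHost=db2&token=abc123
}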
// ReadSeedStates reads the states of a given seed operation
func ReadSeedStates(seedId int64) ([]SeedOperationState, error) {
	res := []SeedOperationState{}
	query := `
		select
			agent_seed_state_id,
			agent_seed_id,
			state_timestamp,
			state_action,
			error_message
		from
			agent_seed_state
		where
			agent_seed_id = ?
		order by
			agent_seed_state_id desc
		`
	err := db.QueryOrchestrator(query, sqlutils.Args(seedId), func(m sqlutils.RowMap) error {
		seedState := SeedOperationState{}
		seedState.SeedStateId = m.GetInt64("agent_seed_state_id")
		seedState.SeedId = m.GetInt64("agent_seed_id")
		seedState.StateTimestamp = m.GetString("state_timestamp")
		seedState.Action = m.GetString("state_action")
		seedState.ErrorMessage = m.GetString("error_message")

		res = append(res, seedState)
		return nil
	})

	if err != nil {
		log.Errore(err)
	}
	return res, err
}
func InitGraphiteMetrics() error {
	if config.Config.GraphiteAddr == "" {
		return nil
	}
	if config.Config.GraphitePath == "" {
		return log.Errorf("No graphite path provided (see GraphitePath config variable). Will not log to graphite")
	}
	addr, err := net.ResolveTCPAddr("tcp", config.Config.GraphiteAddr)
	if err != nil {
		return log.Errore(err)
	}
	graphitePathHostname := process.ThisHostname
	if config.Config.GraphiteConvertHostnameDotsToUnderscores {
		graphitePathHostname = strings.Replace(graphitePathHostname, ".", "_", -1)
	}
	graphitePath := config.Config.GraphitePath
	graphitePath = strings.Replace(graphitePath, "{hostname}", graphitePathHostname, -1)

	log.Debugf("Will log to graphite on %+v, %+v", config.Config.GraphiteAddr, graphitePath)

	go func() {
		go graphite.Graphite(metrics.DefaultRegistry, 1*time.Minute, graphitePath, addr)
		for range graphiteCallbackTick {
			for _, f := range graphiteTickCallbacks {
				go f()
			}
		}
	}()

	return nil
}
// WriteHostnameUnresolve upserts an entry in hostname_unresolve
func WriteHostnameUnresolve(instanceKey *InstanceKey, unresolvedHostname string) error {
	writeFunc := func() error {
		_, err := db.ExecOrchestrator(`
			insert into hostname_unresolve (
				hostname,
				unresolved_hostname,
				last_registered)
			values (?, ?, NOW())
			on duplicate key update
				unresolved_hostname=values(unresolved_hostname),
				last_registered=now()
			`, instanceKey.Hostname, unresolvedHostname,
		)
		if err != nil {
			return log.Errore(err)
		}
		_, err = db.ExecOrchestrator(`
			replace into hostname_unresolve_history (
				hostname,
				unresolved_hostname,
				last_registered)
			values (?, ?, NOW())
			`, instanceKey.Hostname, unresolvedHostname,
		)
		if err != nil {
			return log.Errore(err)
		}
		writeUnresolvedHostnameCounter.Inc(1)
		return nil
	}
	return ExecDBWriteFunc(writeFunc)
}
// DeleteInvalidHostnameResolves removes invalid resolves. At this time these are:
// - infinite loop resolves (A->B and B->A), remove earlier mapping
func DeleteInvalidHostnameResolves() error {
	var invalidHostnames []string

	query := `
		select
			early.hostname
		from
			hostname_resolve as latest
			join hostname_resolve early on (latest.resolved_hostname = early.hostname and latest.hostname = early.resolved_hostname)
		where
			latest.hostname != latest.resolved_hostname
			and latest.resolved_timestamp > early.resolved_timestamp
		`

	err := db.QueryOrchestratorRowsMap(query, func(m sqlutils.RowMap) error {
		invalidHostnames = append(invalidHostnames, m.GetString("hostname"))
		return nil
	})
	if err != nil {
		return err
	}

	for _, invalidHostname := range invalidHostnames {
		_, err = db.ExecOrchestrator(`
			delete
				from hostname_resolve
			where
				hostname = ?`,
			invalidHostname,
		)
		log.Errore(err)
	}
	return err
}
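// Illustrative sketch (standalone): what the self-join in DeleteInvalidHostnameResolves
// selects — when A resolves to B and B resolves back to A, the earlier of the two
// mappings is considered invalid and deleted; the later one wins.
package main

import "fmt"

// resolveEntry stands in for a hostname_resolve row.
type resolveEntry struct {
	hostname         string
	resolvedHostname string
	resolvedAt       int64 // stand-in for resolved_timestamp
}

// invalidHostnames reproduces the SQL self-join in memory.
func invalidHostnames(entries []resolveEntry) (invalid []string) {
	byHost := map[string]resolveEntry{}
	for _, e := range entries {
		byHost[e.hostname] = e
	}
	for _, latest := range entries {
		early, ok := byHost[latest.resolvedHostname]
		if !ok {
			continue
		}
		if early.resolvedHostname == latest.hostname &&
			latest.hostname != latest.resolvedHostname &&
			latest.resolvedAt > early.resolvedAt {
			invalid = append(invalid, early.hostname)
		}
	}
	return invalid
}

func main() {
	entries := []resolveEntry{
		{"db-a", "db-b", 100}, // the earlier mapping: invalid
		{"db-b", "db-a", 200}, // the later mapping wins
	}
	fmt.Println(invalidHostnames(entries)) // [db-a]
}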
// PurgeBinaryLogsToCurrent attempts to 'PURGE BINARY LOGS' up to the current binary log of the given instance
func PurgeBinaryLogsToCurrent(instanceKey *InstanceKey) (*Instance, error) {
	instance, err := ReadTopologyInstance(instanceKey)
	if err != nil {
		return instance, log.Errore(err)
	}
	return PurgeBinaryLogsTo(instanceKey, instance.SelfBinlogCoordinates.LogFile)
}
// queryResultData returns a raw array of rows for a given query, optionally reading and returning column names
func queryResultData(db *sql.DB, query string, retrieveColumns bool, args ...interface{}) (ResultData, []string, error) {
	var err error
	defer func() {
		if derr := recover(); derr != nil {
			err = fmt.Errorf("QueryRowsMap unexpected error: %+v", derr)
		}
	}()

	columns := []string{}
	rows, err := db.Query(query, args...)
	if err != nil && err != sql.ErrNoRows {
		// Check the error before deferring Close(): on a query error, rows may be nil.
		return EmptyResultData, columns, log.Errore(err)
	}
	defer rows.Close()
	if retrieveColumns {
		// Don't pay if you don't want to
		columns, _ = rows.Columns()
	}
	resultData := ResultData{}
	err = ScanRowsToArrays(rows, func(rowData []CellData) error {
		resultData = append(resultData, rowData)
		return nil
	})
	return resultData, columns, err
}
// CheckAndRecover is the main entry point for the recovery mechanism
func CheckAndRecover(specificInstance *inst.InstanceKey, candidateInstanceKey *inst.InstanceKey, skipProcesses bool) (recoveryAttempted bool, promotedSlaveKey *inst.InstanceKey, err error) {
	replicationAnalysis, err := inst.GetReplicationAnalysis("", true, true)
	if err != nil {
		return false, nil, log.Errore(err)
	}
	if *config.RuntimeCLIFlags.Noop {
		log.Debugf("--noop provided; will not execute processes")
		skipProcesses = true
	}
	for _, analysisEntry := range replicationAnalysis {
		if specificInstance != nil {
			// We are looking for a specific instance; if this is not the one, skip!
			if !specificInstance.Equals(&analysisEntry.AnalyzedInstanceKey) {
				continue
			}
		}
		if analysisEntry.IsDowntimed && specificInstance == nil {
			// Only recover a downtimed server if explicitly requested
			continue
		}

		if specificInstance != nil {
			// force mode. Keep it synchronous
			var topologyRecovery *TopologyRecovery
			recoveryAttempted, topologyRecovery, err = executeCheckAndRecoverFunction(analysisEntry, candidateInstanceKey, true, skipProcesses)
			if topologyRecovery != nil {
				promotedSlaveKey = topologyRecovery.SuccessorKey
			}
		} else {
			go executeCheckAndRecoverFunction(analysisEntry, candidateInstanceKey, false, skipProcesses)
		}
	}
	return recoveryAttempted, promotedSlaveKey, err
}
// ReadReplicationAnalysisChangelog reads the analysis changelog entries from database_instance_analysis_changelog
func ReadReplicationAnalysisChangelog() ([]ReplicationAnalysisChangelog, error) {
	res := []ReplicationAnalysisChangelog{}
	query := `
		select
			hostname,
			port,
			group_concat(analysis_timestamp,';',analysis order by changelog_id) as changelog
		from
			database_instance_analysis_changelog
		group by
			hostname, port
		`
	err := db.QueryOrchestratorRowsMap(query, func(m sqlutils.RowMap) error {
		analysisChangelog := ReplicationAnalysisChangelog{}

		analysisChangelog.AnalyzedInstanceKey.Hostname = m.GetString("hostname")
		analysisChangelog.AnalyzedInstanceKey.Port = m.GetInt("port")
		analysisChangelog.Changelog = m.GetString("changelog")

		res = append(res, analysisChangelog)
		return nil
	})

	if err != nil {
		log.Errore(err)
	}
	return res, err
}
func readAvailableNodes(onlyHttpNodes bool) ([]string, error) {
	res := []string{}
	extraInfo := ""
	if onlyHttpNodes {
		extraInfo = string(OrchestratorExecutionHttpMode)
	}
	query := `
		select
			concat(hostname, ';', token) as node
		from
			node_health
		where
			last_seen_active > now() - interval ? second
			and ? in (extra_info, '')
		order by
			hostname
		`
	err := db.QueryOrchestrator(query, sqlutils.Args(registrationPollSeconds*2, extraInfo), func(m sqlutils.RowMap) error {
		res = append(res, m.GetString("node"))
		return nil
	})
	if err != nil {
		log.Errore(err)
	}
	return res, err
}