// ExpireBlockedRecoveries clears listing of blocked recoveries that are no longer actually blocked. func ExpireBlockedRecoveries() error { // Older recovery is acknowledged by now, hence blocked recovery should be released. // Do NOTE that the data in blocked_topology_recovery is only used for auditing: it is NOT the data // based on which we make automated decisions. _, err := db.ExecOrchestrator(` delete from blocked_topology_recovery using blocked_topology_recovery left join topology_recovery on (blocking_recovery_id = topology_recovery.recovery_id and acknowledged = 0) where acknowledged is null `, ) if err != nil { return log.Errore(err) } // Some oversampling, if a problem has not been noticed for some time (e.g. the server came up alive // before action was taken), expire it. // Recall that RegisterBlockedRecoveries continuously updated the last_blocked_timestamp column. _, err = db.ExecOrchestrator(` delete from blocked_topology_recovery where last_blocked_timestamp < NOW() - interval ? second `, (config.Config.RecoveryPollSeconds * 5), ) if err != nil { return log.Errore(err) } return nil }
// WriteHostnameUnresolve upserts an entry in hostname_unresolve func WriteHostnameUnresolve(instanceKey *InstanceKey, unresolvedHostname string) error { writeFunc := func() error { _, err := db.ExecOrchestrator(` insert into hostname_unresolve ( hostname, unresolved_hostname, last_registered) values (?, ?, NOW()) on duplicate key update unresolved_hostname=values(unresolved_hostname), last_registered=now() `, instanceKey.Hostname, unresolvedHostname, ) if err != nil { return log.Errore(err) } _, err = db.ExecOrchestrator(` replace into hostname_unresolve_history ( hostname, unresolved_hostname, last_registered) values (?, ?, NOW()) `, instanceKey.Hostname, unresolvedHostname, ) writeUnresolvedHostnameCounter.Inc(1) return nil } return ExecDBWriteFunc(writeFunc) }
// RegisterNode writes down this node in the node_health table func RegisterNode(extraInfo string, command string, firstTime bool) (sql.Result, error) { if firstTime { db.ExecOrchestrator(` insert ignore into node_health_history (hostname, token, first_seen_active, extra_info, command, app_version) values (?, ?, NOW(), ?, ?, ?) `, ThisHostname, ProcessToken.Hash, extraInfo, command, config.RuntimeCLIFlags.ConfiguredVersion, ) } return db.ExecOrchestrator(` insert into node_health (hostname, token, last_seen_active, extra_info, command, app_version) values (?, ?, NOW(), ?, ?, ?) on duplicate key update token=values(token), last_seen_active=values(last_seen_active), extra_info=if(values(extra_info) != '', values(extra_info), extra_info), app_version=values(app_version) `, ThisHostname, ProcessToken.Hash, extraInfo, command, config.RuntimeCLIFlags.ConfiguredVersion, ) }
// ExpireMaintenance will remove the maintenance flag on old maintenances and on bounded maintenances func ExpireMaintenance() error { { res, err := db.ExecOrchestrator(` delete from database_instance_maintenance where maintenance_active is null and end_timestamp < NOW() - INTERVAL ? DAY `, config.Config.MaintenancePurgeDays, ) if err != nil { return log.Errore(err) } if rowsAffected, _ := res.RowsAffected(); rowsAffected > 0 { AuditOperation("expire-maintenance", nil, fmt.Sprintf("Purged historical entries: %d", rowsAffected)) } } { res, err := db.ExecOrchestrator(` update database_instance_maintenance set maintenance_active = NULL where maintenance_active = 1 and end_timestamp < NOW() `, ) if err != nil { return log.Errore(err) } if rowsAffected, _ := res.RowsAffected(); rowsAffected > 0 { AuditOperation("expire-maintenance", nil, fmt.Sprintf("Expired bounded: %d", rowsAffected)) } } { res, err := db.ExecOrchestrator(` update database_instance_maintenance left join node_health on (processing_node_hostname = node_health.hostname AND processing_node_token = node_health.token) set database_instance_maintenance.maintenance_active = NULL where node_health.last_seen_active IS NULL and explicitly_bounded = 0 `, ) if err != nil { return log.Errore(err) } if rowsAffected, _ := res.RowsAffected(); rowsAffected > 0 { AuditOperation("expire-maintenance", nil, fmt.Sprintf("Expired dead: %d", rowsAffected)) } } return nil }
// WriteLongRunningProcesses rewrites current state of long running processes for given instance func WriteLongRunningProcesses(instanceKey *InstanceKey, processes []Process) error { writeFunc := func() error { _, err := db.ExecOrchestrator(` delete from database_instance_long_running_queries where hostname = ? and port = ? `, instanceKey.Hostname, instanceKey.Port) if err != nil { return log.Errore(err) } for _, process := range processes { _, merr := db.ExecOrchestrator(` insert into database_instance_long_running_queries ( hostname, port, process_id, process_started_at, process_user, process_host, process_db, process_command, process_time_seconds, process_state, process_info ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, instanceKey.Hostname, instanceKey.Port, process.Id, process.StartedAt, process.User, process.Host, process.Db, process.Command, process.Time, process.State, process.Info, ) if merr != nil { err = merr } } if err != nil { return log.Errore(err) } return nil } return ExecDBWriteFunc(writeFunc) }
// auditInstanceAnalysisInChangelog will write down an instance's analysis in the database_instance_analysis_changelog table. // To not repeat recurring analysis code, the database_instance_last_analysis table is used, so that only changes to // analysis codes are written. func auditInstanceAnalysisInChangelog(instanceKey *InstanceKey, analysisCode AnalysisCode) error { if lastWrittenAnalysis, found := recentInstantAnalysis.Get(instanceKey.DisplayString()); found { if lastWrittenAnalysis == analysisCode { // Surely nothing new. // And let's expand the timeout recentInstantAnalysis.Set(instanceKey.DisplayString(), analysisCode, cache.DefaultExpiration) return nil } } // Passed the cache; but does database agree that there's a change? Here's a persistent cache; this comes here // to verify no two orchestrator services are doing this without coordinating (namely, one dies, the other taking its place // and has no familiarity of the former's cache) analysisChangeWriteAttemptCounter.Inc(1) sqlResult, err := db.ExecOrchestrator(` insert ignore into database_instance_last_analysis ( hostname, port, analysis_timestamp, analysis ) values ( ?, ?, now(), ? ) on duplicate key update analysis = values(analysis), analysis_timestamp = if(analysis = values(analysis), analysis_timestamp, values(analysis_timestamp)) `, instanceKey.Hostname, instanceKey.Port, string(analysisCode), ) if err != nil { return log.Errore(err) } rows, err := sqlResult.RowsAffected() if err != nil { return log.Errore(err) } recentInstantAnalysis.Set(instanceKey.DisplayString(), analysisCode, cache.DefaultExpiration) lastAnalysisChanged := (rows > 0) if !lastAnalysisChanged { return nil } _, err = db.ExecOrchestrator(` insert into database_instance_analysis_changelog ( hostname, port, analysis_timestamp, analysis ) values ( ?, ?, now(), ? ) `, instanceKey.Hostname, instanceKey.Port, string(analysisCode), ) if err == nil { analysisChangeWriteCounter.Inc(1) } return log.Errore(err) }
func (s *TestSuite) TestDiscover(c *C) { var err error _, err = db.ExecOrchestrator("delete from database_instance where hostname = ? and port = ?", masterKey.Hostname, masterKey.Port) _, err = db.ExecOrchestrator("delete from database_instance where hostname = ? and port = ?", slave1Key.Hostname, slave1Key.Port) _, err = db.ExecOrchestrator("delete from database_instance where hostname = ? and port = ?", slave2Key.Hostname, slave2Key.Port) _, err = db.ExecOrchestrator("delete from database_instance where hostname = ? and port = ?", slave3Key.Hostname, slave3Key.Port) _, found, _ := ReadInstance(&masterKey) c.Assert(found, Equals, false) _, _ = ReadTopologyInstance(&slave1Key) _, found, err = ReadInstance(&slave1Key) c.Assert(found, Equals, true) c.Assert(err, IsNil) }
// deleteHostnameResolves compeltely erases the database cache func deleteHostnameResolves() error { _, err := db.ExecOrchestrator(` delete from hostname_resolve`, ) return err }
// DeleteInvalidHostnameResolves removes invalid resolves. At this time these are: // - infinite loop resolves (A->B and B->A), remove earlier mapping func DeleteInvalidHostnameResolves() error { var invalidHostnames []string query := ` select early.hostname from hostname_resolve as latest join hostname_resolve early on (latest.resolved_hostname = early.hostname and latest.hostname = early.resolved_hostname) where latest.hostname != latest.resolved_hostname and latest.resolved_timestamp > early.resolved_timestamp ` err := db.QueryOrchestratorRowsMap(query, func(m sqlutils.RowMap) error { invalidHostnames = append(invalidHostnames, m.GetString("hostname")) return nil }) if err != nil { return err } for _, invalidHostname := range invalidHostnames { _, err = db.ExecOrchestrator(` delete from hostname_resolve where hostname = ?`, invalidHostname, ) log.Errore(err) } return err }
// EnableRecovery ensures recoveries are enabled globally func EnableRecovery() error { _, err := db.ExecOrchestrator(` DELETE FROM global_recovery_disable `, ) return err }
// ResolveRecovery is called on completion of a recovery process and updates the recovery status. // It does not clear the "active period" as this still takes place in order to avoid flapping. func ResolveRecovery(topologyRecovery *TopologyRecovery, successorInstance *inst.Instance) error { isSuccessful := false var successorKeyToWrite inst.InstanceKey if successorInstance != nil { topologyRecovery.SuccessorKey = &successorInstance.Key isSuccessful = true successorKeyToWrite = successorInstance.Key } _, err := db.ExecOrchestrator(` update topology_recovery set is_successful = ?, successor_hostname = ?, successor_port = ?, lost_slaves = ?, participating_instances = ?, all_errors = ?, end_recovery = NOW() where recovery_id = ? AND in_active_period = 1 AND processing_node_hostname = ? AND processcing_node_token = ? `, isSuccessful, successorKeyToWrite.Hostname, successorKeyToWrite.Port, topologyRecovery.LostSlaves.ToCommaDelimitedList(), topologyRecovery.ParticipatingInstanceKeys.ToCommaDelimitedList(), strings.Join(topologyRecovery.AllErrors, "\n"), topologyRecovery.Id, process.ThisHostname, process.ProcessToken.Hash, ) return log.Errore(err) }
// UpdateClusterAliases writes down the cluster_alias table based on information // gained from database_instance func UpdateClusterAliases() error { writeFunc := func() error { _, err := db.ExecOrchestrator(` replace into cluster_alias (alias, cluster_name, last_registered) select suggested_cluster_alias, substring_index(group_concat( cluster_name order by ((last_checked <= last_seen) is true) desc, read_only asc, num_slave_hosts desc ), ',', 1) as cluster_name, NOW() from database_instance left join database_instance_downtime using (hostname, port) where suggested_cluster_alias!='' /* exclude newly demoted, downtimed masters */ and ifnull( database_instance_downtime.downtime_active = 1 and database_instance_downtime.end_timestamp > now() and database_instance_downtime.reason = ? , false) is false group by suggested_cluster_alias `, DowntimeLostInRecoveryMessage) return log.Errore(err) } return ExecDBWriteFunc(writeFunc) }
func UpdateClusterAliases() error { writeFunc := func() error { _, err := db.ExecOrchestrator(` replace into cluster_alias (alias, cluster_name, last_registered) select suggested_cluster_alias, substring_index(group_concat(cluster_name order by cluster_name), ',', 1) as cluster_name, NOW() from database_instance left join database_instance_downtime using (hostname, port) where suggested_cluster_alias!='' and not ( (hostname, port) in (select hostname, port from topology_recovery where start_active_period >= now() - interval 11111 day) and ( database_instance_downtime.downtime_active IS NULL or database_instance_downtime.end_timestamp < NOW() ) is false ) group by suggested_cluster_alias `) if err == nil { err = ReadClusterAliases() } return log.Errore(err) } return ExecDBWriteFunc(writeFunc) }
// acknowledgeRecoveries sets acknowledged* details and clears the in_active_period flags from a set of entries func acknowledgeRecoveries(owner string, comment string, markEndRecovery bool, whereClause string, args []interface{}) (countAcknowledgedEntries int64, err error) { additionalSet := `` if markEndRecovery { additionalSet = ` end_recovery=IFNULL(end_recovery, NOW()), ` } query := fmt.Sprintf(` update topology_recovery set in_active_period = 0, end_active_period_unixtime = IF(end_active_period_unixtime = 0, UNIX_TIMESTAMP(), end_active_period_unixtime), %s acknowledged = 1, acknowledged_at = NOW(), acknowledged_by = ?, acknowledge_comment = ? where acknowledged = 0 and %s `, additionalSet, whereClause) args = append(sqlutils.Args(owner, comment), args...) sqlResult, err := db.ExecOrchestrator(query, args...) if err != nil { return 0, log.Errore(err) } rows, err := sqlResult.RowsAffected() return rows, log.Errore(err) }
// BeginDowntime will make mark an instance as downtimed (or override existing downtime period) func BeginDowntime(instanceKey *InstanceKey, owner string, reason string, durationSeconds uint) error { if durationSeconds == 0 { durationSeconds = config.Config.MaintenanceExpireMinutes * 60 } _, err := db.ExecOrchestrator(` insert into database_instance_downtime ( hostname, port, downtime_active, begin_timestamp, end_timestamp, owner, reason ) VALUES ( ?, ?, 1, NOW(), NOW() + INTERVAL ? SECOND, ?, ? ) on duplicate key update downtime_active=values(downtime_active), begin_timestamp=values(begin_timestamp), end_timestamp=values(end_timestamp), owner=values(owner), reason=values(reason) `, instanceKey.Hostname, instanceKey.Port, durationSeconds, owner, reason, ) if err != nil { return log.Errore(err) } AuditOperation("begin-downtime", instanceKey, fmt.Sprintf("owner: %s, reason: %s", owner, reason)) return nil }
// BeginBoundedMaintenance will make new maintenance entry for given instanceKey. func BeginBoundedMaintenance(instanceKey *InstanceKey, owner string, reason string, durationSeconds uint) (int64, error) { var maintenanceToken int64 = 0 if durationSeconds == 0 { durationSeconds = config.Config.MaintenanceExpireMinutes * 60 } res, err := db.ExecOrchestrator(` insert ignore into database_instance_maintenance ( hostname, port, maintenance_active, begin_timestamp, end_timestamp, owner, reason ) VALUES ( ?, ?, 1, NOW(), NOW() + INTERVAL ? SECOND, ?, ? ) `, instanceKey.Hostname, instanceKey.Port, durationSeconds, owner, reason, ) if err != nil { return maintenanceToken, log.Errore(err) } if affected, _ := res.RowsAffected(); affected == 0 { err = fmt.Errorf("Cannot begin maintenance for instance: %+v; maintenance reason: %+v", instanceKey, reason) } else { // success maintenanceToken, _ = res.LastInsertId() AuditOperation("begin-maintenance", instanceKey, fmt.Sprintf("maintenanceToken: %d, owner: %s, reason: %s", maintenanceToken, owner, reason)) } return maintenanceToken, err }
// EndMaintenanceByInstanceKey will terminate an active maintenance using given instanceKey as hint func EndMaintenanceByInstanceKey(instanceKey *InstanceKey) error { res, err := db.ExecOrchestrator(` update database_instance_maintenance set maintenance_active = NULL, end_timestamp = NOW() where hostname = ? and port = ? and maintenance_active = 1 `, instanceKey.Hostname, instanceKey.Port, ) if err != nil { return log.Errore(err) } if affected, _ := res.RowsAffected(); affected == 0 { err = fmt.Errorf("Instance is not in maintenance mode: %+v", instanceKey) } else { // success AuditOperation("end-maintenance", instanceKey, "") } return err }
// AuditInstanceAnalysisInChangelog will write down an instance's analysis in the database_instance_analysis_changelog table. // To not repeat recurring analysis code, the database_instance_last_analysis table is used, so that only changes to // analysis codes are written. func AuditInstanceAnalysisInChangelog(instanceKey *InstanceKey, analysisCode AnalysisCode) error { if lastWrittenAnalysis, found := recentInstantAnalysis.Get(instanceKey.DisplayString()); found { if lastWrittenAnalysis == analysisCode { // Surely nothing new. return nil } } sqlResult, err := db.ExecOrchestrator(` insert ignore into database_instance_last_analysis ( hostname, port, analysis_timestamp, analysis ) values ( ?, ?, now(), ? ) on duplicate key update analysis = values(analysis), analysis_timestamp = if(analysis = values(analysis), analysis_timestamp, values(analysis_timestamp)) `, instanceKey.Hostname, instanceKey.Port, string(analysisCode), ) if err != nil { return log.Errore(err) } rows, err := sqlResult.RowsAffected() if err != nil { return log.Errore(err) } lastAnalysisChanged := (rows > 0) if !lastAnalysisChanged { return nil } _, err = db.ExecOrchestrator(` insert into database_instance_analysis_changelog ( hostname, port, analysis_timestamp, analysis ) values ( ?, ?, now(), ? ) `, instanceKey.Hostname, instanceKey.Port, string(analysisCode), ) if err == nil { recentInstantAnalysis.Set(instanceKey.DisplayString(), analysisCode, cache.DefaultExpiration) } return log.Errore(err) }
// DisableRecovery ensures recoveries are disabled globally func DisableRecovery() error { _, err := db.ExecOrchestrator(` INSERT IGNORE INTO global_recovery_disable (disable_recovery) VALUES (1) `, ) return err }
// ForgetLongUnseenAgents will remove entries of all agents that have long since been last seen. func ForgetLongUnseenAgents() error { _, err := db.ExecOrchestrator(` delete from host_agent where last_submitted < NOW() - interval ? hour`, config.Config.UnseenAgentForgetHours, ) return err }
// ForgetExpiredHostnameResolves func ForgetExpiredHostnameResolves() error { _, err := db.ExecOrchestrator(` delete from hostname_resolve where resolved_timestamp < NOW() - interval (? * 2) minute`, config.Config.ExpiryHostnameResolvesMinutes, ) return err }
// ExpireNodesHistory cleans up the nodes history func ExpireNodesHistory() error { _, err := db.ExecOrchestrator(` delete from node_health_history where first_seen_active < now() - interval ? hour `, config.Config.UnseenInstanceForgetHours, ) return log.Errore(err) }
// ExpireMasterPositionEquivalence expires old master_position_equivalence func ExpireMasterPositionEquivalence() error { writeFunc := func() error { _, err := db.ExecOrchestrator(` delete from master_position_equivalence where last_suggested < NOW() - INTERVAL ? HOUR `, config.Config.UnseenInstanceForgetHours, ) return log.Errore(err) } return ExecDBWriteFunc(writeFunc) }
// expireAvailableNodes is an aggressive puring method to remove node entries who have skipped // their keepalive for two times func expireAvailableNodes() error { _, err := db.ExecOrchestrator(` delete from node_health where last_seen_active < now() - interval ? second `, registrationPollSeconds*2, ) return log.Errore(err) }
// ExpireInstanceAnalysisChangelog removes old-enough analysis entries from the changelog func ExpireInstanceAnalysisChangelog() error { _, err := db.ExecOrchestrator(` delete from database_instance_analysis_changelog where analysis_timestamp < now() - interval ? hour `, config.Config.UnseenInstanceForgetHours, ) return log.Errore(err) }
// DeregisterHostnameUnresolve removes an unresovle entry func DeregisterHostnameUnresolve(instanceKey *InstanceKey) error { writeFunc := func() error { _, err := db.ExecOrchestrator(` delete from hostname_unresolve where hostname=? `, instanceKey.Hostname, ) return log.Errore(err) } return ExecDBWriteFunc(writeFunc) }
// ExpirePoolInstances cleans up the database_instance_pool table from expired items func ExpirePoolInstances() error { _, err := db.ExecOrchestrator(` delete from database_instance_pool where registered_at < now() - interval ? minute `, config.Config.InstancePoolExpiryMinutes, ) return log.Errore(err) }
// ExpireHostnameUnresolve expires hostname_unresolve entries that haven't been updated recently. func ExpireHostnameUnresolve() error { writeFunc := func() error { _, err := db.ExecOrchestrator(` delete from hostname_unresolve where last_registered < NOW() - INTERVAL ? MINUTE `, config.Config.ExpiryHostnameResolvesMinutes, ) return log.Errore(err) } return ExecDBWriteFunc(writeFunc) }
// ExpireAccessTokens removes old, known to be uneligible tokens func ExpireAccessTokens() error { _, err := db.ExecOrchestrator(` delete from access_token where generated_at < now() - interval ? minute and is_reentrant = 0 `, config.Config.AccessTokenExpiryMinutes, ) return log.Errore(err) }
// clearAcknowledgedFailureDetections clears the "in_active_period" flag for detections // that were acknowledged func clearAcknowledgedFailureDetections(whereClause string, args []interface{}) error { query := fmt.Sprintf(` update topology_failure_detection set in_active_period = 0, end_active_period_unixtime = UNIX_TIMESTAMP() where in_active_period = 1 and %s `, whereClause) _, err := db.ExecOrchestrator(query, args...) return log.Errore(err) }