// Return the next chunk of binlog events; skip to next binary log file if need be; return empty result only
// if reached end of binary logs
func getNextBinlogEventsChunk(instance *Instance, startingCoordinates BinlogCoordinates, numEmptyBinlogs int) ([]BinlogEvent, error) {
	if numEmptyBinlogs > maxEmptyBinlogFiles {
		log.Debugf("Reached maxEmptyBinlogFiles (%d) at %+v", maxEmptyBinlogFiles, startingCoordinates)
		// Give up and return empty results
		return []BinlogEvent{}, nil
	}
	coordinatesExceededCurrent := false
	switch startingCoordinates.Type {
	case BinaryLog:
		coordinatesExceededCurrent = instance.SelfBinlogCoordinates.FileSmallerThan(&startingCoordinates)
	case RelayLog:
		coordinatesExceededCurrent = instance.RelaylogCoordinates.FileSmallerThan(&startingCoordinates)
	}
	if coordinatesExceededCurrent {
		// We're past the last file. This is a non-error: there are no more events.
		log.Debugf("Coordinates overflow: %+v; terminating search", startingCoordinates)
		return []BinlogEvent{}, nil
	}
	events, err := readBinlogEventsChunk(&instance.Key, startingCoordinates)
	if err != nil {
		return events, err
	}
	if len(events) > 0 {
		log.Debugf("Returning %d events at %+v", len(events), startingCoordinates)
		return events, nil
	}

	// events are empty; skip to the next binary log file
	nextCoordinates, err := instance.GetNextBinaryLog(startingCoordinates)
	if err != nil {
		// on error
		return events, err
	}
	log.Debugf("Recursing into %+v", nextCoordinates)
	return getNextBinlogEventsChunk(instance, nextCoordinates, numEmptyBinlogs+1)
}
// deployIfNotAlreadyDeployed will issue given sql queries that are not already known to be deployed.
// This iterates both lists (to-run and already-deployed) and also verifies there are no contradictions.
func deployIfNotAlreadyDeployed(db *sql.DB, queries []string, deployedQueries []string, deploymentType string, fatalOnError bool) error {
	tx, err := db.Begin()
	if err != nil {
		log.Fatale(err)
	}
	// Ugly workaround ahead.
	// The origin of this workaround is the existence of some "timestamp NOT NULL" column definitions,
	// which are invalid under the NO_ZERO_IN_DATE,NO_ZERO_DATE sql_mode (since the implicit default is "0").
	// This means installation of orchestrator fails on servers configured that way, and in particular on 5.7,
	// where this setting is the default.
	// For backwards compatibility, we force sql_mode to be more relaxed, create the schemas
	// along with the "invalid" definitions, and then go ahead and fix those definitions via subsequent ALTER statements.
	// My bad.
	originalSqlMode := ""
	if err := tx.QueryRow(`select @@session.sql_mode`).Scan(&originalSqlMode); err != nil {
		log.Fatale(err)
	}
	if _, err := tx.Exec(`set @@session.sql_mode=REPLACE(@@session.sql_mode, 'NO_ZERO_DATE', '')`); err != nil {
		log.Fatale(err)
	}
	if _, err := tx.Exec(`set @@session.sql_mode=REPLACE(@@session.sql_mode, 'NO_ZERO_IN_DATE', '')`); err != nil {
		log.Fatale(err)
	}

	for i, query := range queries {
		queryAlreadyExecuted := false
		// While iterating 'queries', also iterate 'deployedQueries'. Expect identity
		if len(deployedQueries) > i {
			if deployedQueries[i] != query {
				log.Fatalf("initOrchestratorDB() PANIC: non-matching %s queries between deployment requests and _orchestrator_db_deployment. Please execute 'orchestrator -c reset-internal-db-deployment'", deploymentType)
			}
			queryAlreadyExecuted = true
		}
		if queryAlreadyExecuted {
			continue
		}
		if i == 0 {
			log.Debugf("sql_mode is: %+v", originalSqlMode)
		}
		if config.Config.SmartOrchestratorDatabaseUpdate {
			log.Debugf("initOrchestratorDB executing: %.80s", strings.TrimSpace(strings.Replace(query, "\n", "", -1)))
		}
		if fatalOnError {
			if _, err := tx.Exec(query); err != nil {
				return log.Fatalf("Cannot initiate orchestrator: %+v", err)
			}
		} else {
			tx.Exec(query) // And ignore any error
		}
		writeInternalDeployment(db, deploymentType, query, i)
	}
	if _, err := tx.Exec(`set session sql_mode=?`, originalSqlMode); err != nil {
		log.Fatale(err)
	}
	if err := tx.Commit(); err != nil {
		log.Fatale(err)
	}
	return nil
}
// Attempt to resolve a hostname. This may return a database cached hostname or otherwise
// it may resolve the hostname via CNAME
func ResolveHostname(hostname string) (string, error) {
	hostname = strings.TrimSpace(hostname)
	if hostname == "" {
		return hostname, errors.New("Will not resolve empty hostname")
	}
	if strings.Contains(hostname, ",") {
		return hostname, fmt.Errorf("Will not resolve multi-hostname: %+v", hostname)
	}
	if (&InstanceKey{Hostname: hostname}).IsDetached() {
		return hostname, fmt.Errorf("Will not resolve detached hostname: %+v", hostname)
	}

	// First go to lightweight cache
	if resolvedHostname, found := hostnameResolvesLightweightCache.Get(hostname); found {
		return resolvedHostname.(string), nil
	}

	if !hostnameResolvesLightweightCacheLoadedOnceFromDB {
		// A continuous-discovery will first make sure to load all resolves from DB.
		// However, the CLI does not do so.
		// Anyway, it seems like the cache was not loaded from DB. Before doing real resolves,
		// let's try and get the resolved hostname from database.
		if !HostnameResolveMethodIsNone() {
			if resolvedHostname, err := ReadResolvedHostname(hostname); err == nil && resolvedHostname != "" {
				hostnameResolvesLightweightCache.Set(hostname, resolvedHostname, 0)
				return resolvedHostname, nil
			}
		}
	}

	// Unfound: resolve!
	log.Debugf("Hostname unresolved yet: %s", hostname)
	resolvedHostname, err := resolveHostname(hostname)
	if config.Config.RejectHostnameResolvePattern != "" {
		// Reject, don't even cache
		if matched, _ := regexp.MatchString(config.Config.RejectHostnameResolvePattern, resolvedHostname); matched {
			log.Warningf("ResolveHostname: %+v resolved to %+v but rejected due to RejectHostnameResolvePattern '%+v'", hostname, resolvedHostname, config.Config.RejectHostnameResolvePattern)
			return hostname, nil
		}
	}

	if err != nil {
		// Problem. What we'll do is cache the hostname for just one minute, so as to avoid flooding requests
		// on one hand, yet make it refresh shortly on the other hand. Anyway do not write to database.
		hostnameResolvesLightweightCache.Set(hostname, resolvedHostname, time.Minute)
		return hostname, err
	}
	// Good result! Cache it, also to DB
	log.Debugf("Cache hostname resolve %s as %s", hostname, resolvedHostname)
	UpdateResolvedHostname(hostname, resolvedHostname)
	return resolvedHostname, nil
}
func execCmd(commandText string, arguments ...string) (*exec.Cmd, string, error) {
	commandBytes := []byte(commandText)
	tmpFile, err := ioutil.TempFile("", "orchestrator-process-cmd-")
	if err != nil {
		return nil, "", log.Errore(err)
	}
	if err := ioutil.WriteFile(tmpFile.Name(), commandBytes, 0644); err != nil {
		return nil, "", log.Errore(err)
	}
	log.Debugf("execCmd: %s", commandText)
	shellArguments := append([]string{}, tmpFile.Name())
	shellArguments = append(shellArguments, arguments...)
	log.Debugf("%+v", shellArguments)
	return exec.Command("bash", shellArguments...), tmpFile.Name(), nil
	//return exec.Command(commandText, arguments...) , "", nil
}
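The returned *exec.Cmd is not started by execCmd itself, and the temporary script file stays on disk. A minimal, hypothetical caller sketch follows (assumes the os package is imported; cleaning up the temp script is an assumption about caller responsibility, not something execCmd enforces):

// runHookSketch is a hypothetical illustration of how execCmd's result might be consumed.
func runHookSketch() error {
	cmd, scriptFile, err := execCmd("echo 'hello from an orchestrator hook'")
	if err != nil {
		return err
	}
	// execCmd leaves the temp script on disk; removing it here is assumed to be the caller's job
	defer os.Remove(scriptFile)
	return cmd.Run()
}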
func InitGraphiteMetrics() error {
	if config.Config.GraphiteAddr == "" {
		return nil
	}
	if config.Config.GraphitePath == "" {
		return log.Errorf("No graphite path provided (see GraphitePath config variable). Will not log to graphite")
	}
	addr, err := net.ResolveTCPAddr("tcp", config.Config.GraphiteAddr)
	if err != nil {
		return log.Errore(err)
	}
	graphitePathHostname := process.ThisHostname
	if config.Config.GraphiteConvertHostnameDotsToUnderscores {
		graphitePathHostname = strings.Replace(graphitePathHostname, ".", "_", -1)
	}
	graphitePath := config.Config.GraphitePath
	graphitePath = strings.Replace(graphitePath, "{hostname}", graphitePathHostname, -1)

	log.Debugf("Will log to graphite on %+v, %+v", config.Config.GraphiteAddr, graphitePath)

	go func() {
		go graphite.Graphite(metrics.DefaultRegistry, 1*time.Minute, graphitePath, addr)
		for range graphiteCallbackTick {
			for _, f := range graphiteTickCallbacks {
				go f()
			}
		}
	}()

	return nil
}
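A minimal sketch of how a periodic metrics callback might be registered, assuming graphiteTickCallbacks is an appendable package-level slice of func() (as its use above suggests); the helper name is hypothetical:

// registerGraphiteCallbackSketch appends a function that will be fired (in its own goroutine)
// on every graphiteCallbackTick iteration inside InitGraphiteMetrics.
func registerGraphiteCallbackSketch() {
	graphiteTickCallbacks = append(graphiteTickCallbacks, func() {
		// refresh gauges/counters registered in metrics.DefaultRegistry here
	})
}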
// checkAndRecoverDeadIntermediateMaster checks a given analysis, decides whether to take action, and possibly takes action
// Returns true when action was taken.
func checkAndRecoverDeadIntermediateMaster(analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool) (bool, *TopologyRecovery, error) {
	if !(forceInstanceRecovery || analysisEntry.ClusterDetails.HasAutomatedIntermediateMasterRecovery) {
		return false, nil, nil
	}
	topologyRecovery, err := AttemptRecoveryRegistration(&analysisEntry, !forceInstanceRecovery, !forceInstanceRecovery)
	if topologyRecovery == nil {
		log.Debugf("topology_recovery: found an active or recent recovery on %+v. Will not issue another RecoverDeadIntermediateMaster.", analysisEntry.AnalyzedInstanceKey)
		return false, nil, err
	}

	// That's it! We must do recovery!
	recoverDeadIntermediateMasterCounter.Inc(1)
	promotedSlave, err := RecoverDeadIntermediateMaster(topologyRecovery, skipProcesses)
	if promotedSlave != nil {
		// success
		recoverDeadIntermediateMasterSuccessCounter.Inc(1)

		if !skipProcesses {
			// Execute post intermediate-master-failover processes
			topologyRecovery.SuccessorKey = &promotedSlave.Key
			executeProcesses(config.Config.PostIntermediateMasterFailoverProcesses, "PostIntermediateMasterFailoverProcesses", topologyRecovery, false)
		}
	} else {
		recoverDeadIntermediateMasterFailureCounter.Inc(1)
	}
	return true, topologyRecovery, err
}
// SkipQuery skips a single query on a failed replication instance
func SkipQuery(instanceKey *InstanceKey) (*Instance, error) {
	instance, err := ReadTopologyInstance(instanceKey)
	if err != nil {
		return instance, log.Errore(err)
	}

	if !instance.IsSlave() {
		return instance, fmt.Errorf("instance is not a slave: %+v", instanceKey)
	}
	if instance.Slave_SQL_Running {
		return instance, fmt.Errorf("Slave SQL thread is running on %+v", instanceKey)
	}
	if instance.LastSQLError == "" {
		return instance, fmt.Errorf("No SQL error on %+v", instanceKey)
	}

	if *config.RuntimeCLIFlags.Noop {
		return instance, fmt.Errorf("noop: aborting skip-query operation on %+v; signalling error but nothing went wrong.", *instanceKey)
	}

	log.Debugf("Skipping one query on %+v", instanceKey)
	if instance.UsingOracleGTID {
		err = skipQueryOracleGtid(instance)
	} else if instance.UsingMariaDBGTID {
		return instance, log.Errorf("%+v is replicating with MariaDB GTID. To skip a query first disable GTID, then skip, then enable GTID again", *instanceKey)
	} else {
		err = skipQueryClassic(instance)
	}
	if err != nil {
		return instance, log.Errore(err)
	}
	AuditOperation("skip-query", instanceKey, "Skipped one query")
	return StartSlave(instanceKey)
}
// CheckAndRecover is the main entry point for the recovery mechanism
func CheckAndRecover(specificInstance *inst.InstanceKey, candidateInstanceKey *inst.InstanceKey, skipProcesses bool) (recoveryAttempted bool, promotedSlaveKey *inst.InstanceKey, err error) {
	replicationAnalysis, err := inst.GetReplicationAnalysis("", true, true)
	if err != nil {
		return false, nil, log.Errore(err)
	}
	if *config.RuntimeCLIFlags.Noop {
		log.Debugf("--noop provided; will not execute processes")
		skipProcesses = true
	}
	for _, analysisEntry := range replicationAnalysis {
		if specificInstance != nil {
			// We are looking for a specific instance; if this is not the one, skip!
			if !specificInstance.Equals(&analysisEntry.AnalyzedInstanceKey) {
				continue
			}
		}
		if analysisEntry.IsDowntimed && specificInstance == nil {
			// Only recover a downtimed server if explicitly requested
			continue
		}

		if specificInstance != nil {
			// force mode. Keep it synchronous
			var topologyRecovery *TopologyRecovery
			recoveryAttempted, topologyRecovery, err = executeCheckAndRecoverFunction(analysisEntry, candidateInstanceKey, true, skipProcesses)
			if topologyRecovery != nil {
				promotedSlaveKey = topologyRecovery.SuccessorKey
			}
		} else {
			go executeCheckAndRecoverFunction(analysisEntry, candidateInstanceKey, false, skipProcesses)
		}
	}
	return recoveryAttempted, promotedSlaveKey, err
}
// executeAgentCommand requests an agent to execute a command via HTTP api
func executeAgentCommand(hostname string, command string, onResponse *func([]byte)) (Agent, error) {
	agent, token, err := readAgentBasicInfo(hostname)
	if err != nil {
		return agent, err
	}

	// All seems to be in order. Now make some inquiries from orchestrator-agent service:
	uri := baseAgentUri(agent.Hostname, agent.Port)

	var fullCommand string
	if strings.Contains(command, "?") {
		fullCommand = fmt.Sprintf("%s&token=%s", command, token)
	} else {
		fullCommand = fmt.Sprintf("%s?token=%s", command, token)
	}
	log.Debugf("orchestrator-agent command: %s", fullCommand)
	agentCommandUri := fmt.Sprintf("%s/%s", uri, fullCommand)

	body, err := readResponse(httpGet(agentCommandUri))
	if err != nil {
		return agent, log.Errore(err)
	}
	if onResponse != nil {
		(*onResponse)(body)
	}
	auditAgentOperation("agent-command", &agent, command)

	return agent, err
}
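A hypothetical caller sketch, showing how the optional onResponse callback receives the raw HTTP body returned by the agent; the command string here is a placeholder, not a documented agent command:

// agentCommandSketch is illustrative only.
func agentCommandSketch(hostname string) (Agent, error) {
	onResponse := func(body []byte) {
		log.Debugf("agent replied with %d bytes", len(body))
	}
	return executeAgentCommand(hostname, "some-agent-command", &onResponse)
}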
// StopSlavesNicely will attempt to stop all given slaves nicely, up to timeout
func StopSlavesNicely(slaves [](*Instance), timeout time.Duration) [](*Instance) {
	refreshedSlaves := [](*Instance){}

	log.Debugf("Stopping %d slaves nicely", len(slaves))
	// use concurrency but wait for all to complete
	barrier := make(chan *Instance)
	for _, slave := range slaves {
		slave := slave
		go func() {
			updatedSlave := &slave
			// Signal completed slave
			defer func() { barrier <- *updatedSlave }()
			// Wait your turn to read a slave
			ExecuteOnTopology(func() {
				StopSlaveNicely(&slave.Key, timeout)
				slave, _ = StopSlave(&slave.Key)
				updatedSlave = &slave
			})
		}()
	}
	for range slaves {
		refreshedSlaves = append(refreshedSlaves, <-barrier)
	}
	return refreshedSlaves
}
// baseAgentUri returns the base URI for accessing an agent
func baseAgentUri(agentHostname string, agentPort int) string {
	protocol := "http"
	if config.Config.AgentsUseSSL {
		protocol = "https"
	}
	uri := fmt.Sprintf("%s://%s:%d/api", protocol, agentHostname, agentPort)
	log.Debugf("orchestrator-agent uri: %s", uri)
	return uri
}
// checkAndRecoverDeadMaster checks a given analysis, decides whether to take action, and possibly takes action
// Returns true when action was taken.
func checkAndRecoverDeadMaster(analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool) (bool, *TopologyRecovery, error) {
	if !(forceInstanceRecovery || analysisEntry.ClusterDetails.HasAutomatedMasterRecovery) {
		return false, nil, nil
	}
	topologyRecovery, err := AttemptRecoveryRegistration(&analysisEntry, !forceInstanceRecovery, !forceInstanceRecovery)
	if topologyRecovery == nil {
		log.Debugf("topology_recovery: found an active or recent recovery on %+v. Will not issue another RecoverDeadMaster.", analysisEntry.AnalyzedInstanceKey)
		return false, nil, err
	}

	// That's it! We must do recovery!
	log.Debugf("topology_recovery: will handle DeadMaster event on %+v", analysisEntry.ClusterDetails.ClusterName)
	recoverDeadMasterCounter.Inc(1)
	promotedSlave, lostSlaves, err := RecoverDeadMaster(topologyRecovery, skipProcesses)
	topologyRecovery.LostSlaves.AddInstances(lostSlaves)

	if promotedSlave != nil {
		promotedSlave, err = replacePromotedSlaveWithCandidate(&analysisEntry.AnalyzedInstanceKey, promotedSlave, candidateInstanceKey)
		topologyRecovery.AddError(err)
	}
	// And this is the end; whether successful or not, we're done.
	ResolveRecovery(topologyRecovery, promotedSlave)
	if promotedSlave != nil {
		// Success!
		recoverDeadMasterSuccessCounter.Inc(1)

		if config.Config.ApplyMySQLPromotionAfterMasterFailover {
			log.Debugf("topology_recovery: - RecoverDeadMaster: will apply MySQL changes to promoted master")
			inst.ResetSlaveOperation(&promotedSlave.Key)
			inst.SetReadOnly(&promotedSlave.Key, false)
		}
		if !skipProcesses {
			// Execute post master-failover processes
			executeProcesses(config.Config.PostMasterFailoverProcesses, "PostMasterFailoverProcesses", topologyRecovery, false)
		}
	} else {
		recoverDeadMasterFailureCounter.Inc(1)
	}

	return true, topologyRecovery, err
}
// nextEvent will return the next event entry from binary logs; it will automatically skip to next
// binary log if need be.
// Internally, it uses the cachedEvents array, so that it does not go to the MySQL server upon each call.
// Returns nil upon reaching end of binary logs.
func (this *BinlogEventCursor) nextEvent(numEmptyEventsEvents int) (*BinlogEvent, error) {
	if numEmptyEventsEvents > maxEmptyEventsEvents {
		log.Debugf("End of logs. currentEventIndex: %d, nextCoordinates: %+v", this.currentEventIndex, this.nextCoordinates)
		// End of logs
		return nil, nil
	}
	if len(this.cachedEvents) == 0 {
		// Cache exhausted; get next bulk of entries and return the next entry
		nextFileCoordinates, err := this.nextCoordinates.NextFileCoordinates()
		if err != nil {
			return nil, err
		}
		log.Debugf("zero cached events, next file: %+v", nextFileCoordinates)
		this.cachedEvents, err = this.fetchNextEvents(nextFileCoordinates)
		if err != nil {
			return nil, err
		}
		this.currentEventIndex = -1
		// While this seems recursive do note that recursion level is at most 1, since we either have
		// entries in the next binlog (no further recursion) or we don't (immediate termination)
		return this.nextEvent(numEmptyEventsEvents + 1)
	}
	if this.currentEventIndex+1 < len(this.cachedEvents) {
		// We have enough cache to go by
		this.currentEventIndex++
		event := &this.cachedEvents[this.currentEventIndex]
		this.nextCoordinates = event.NextBinlogCoordinates()
		return event, nil
	} else {
		// Cache exhausted; get next bulk of entries and return the next entry
		var err error
		this.cachedEvents, err = this.fetchNextEvents(this.cachedEvents[len(this.cachedEvents)-1].NextBinlogCoordinates())
		if err != nil {
			return nil, err
		}
		this.currentEventIndex = -1
		// While this seems recursive do note that recursion level is at most 1, since we either have
		// entries in the next binlog (no further recursion) or we don't (immediate termination)
		return this.nextEvent(numEmptyEventsEvents + 1)
	}
}
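A sketch of draining a cursor, assuming a *BinlogEventCursor has already been constructed elsewhere; per the contract above, a nil event with a nil error marks the end of the binary logs. The wrapping function is hypothetical:

func drainCursorSketch(cursor *BinlogEventCursor) error {
	for {
		event, err := cursor.nextEvent(0)
		if err != nil {
			return err
		}
		if event == nil {
			// reached end of binary logs
			return nil
		}
		log.Debugf("event ends at %+v", event.NextBinlogCoordinates())
	}
}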
// checkAndExecuteFailureDetectionProcesses tries to register for failure detection and potentially executes
// failure-detection processes.
func checkAndExecuteFailureDetectionProcesses(analysisEntry inst.ReplicationAnalysis, skipProcesses bool) (processesExecutionAttempted bool, err error) {
	if ok, _ := AttemptFailureDetectionRegistration(&analysisEntry); !ok {
		return false, nil
	}
	log.Debugf("topology_recovery: detected %+v failure on %+v", analysisEntry.Analysis, analysisEntry.AnalyzedInstanceKey)
	// Execute on-detection processes
	if skipProcesses {
		return false, nil
	}
	err = executeProcesses(config.Config.OnFailureDetectionProcesses, "OnFailureDetectionProcesses", NewTopologyRecovery(analysisEntry), true)
	return true, err
}
// AuditOperation creates and writes a new audit entry by given params
func AuditOperation(auditType string, instanceKey *InstanceKey, message string) error {
	if instanceKey == nil {
		instanceKey = &InstanceKey{}
	}
	clusterName := ""
	if instanceKey.Hostname != "" {
		clusterName, _ = GetClusterName(instanceKey)
	}

	if config.Config.AuditLogFile != "" {
		go func() error {
			f, err := os.OpenFile(config.Config.AuditLogFile, os.O_RDWR|os.O_CREATE|os.O_APPEND, 0600)
			if err != nil {
				return log.Errore(err)
			}
			defer f.Close()
			text := fmt.Sprintf("%s\t%s\t%s\t%d\t[%s]\t%s\t\n", time.Now().Format(log.TimeFormat), auditType, instanceKey.Hostname, instanceKey.Port, clusterName, message)
			if _, err = f.WriteString(text); err != nil {
				return log.Errore(err)
			}
			return nil
		}()
	}
	_, err := db.ExecOrchestrator(`
			insert into audit (
				audit_timestamp, audit_type, hostname, port, cluster_name, message
			) VALUES (
				NOW(), ?, ?, ?, ?, ?
			)
		`,
		auditType,
		instanceKey.Hostname,
		instanceKey.Port,
		clusterName,
		message,
	)
	if err != nil {
		return log.Errore(err)
	}
	logMessage := fmt.Sprintf("auditType:%s instance:%s cluster:%s message:%s", auditType, instanceKey.DisplayString(), clusterName, message)
	if syslogWriter != nil {
		go func() {
			syslogWriter.Info(logMessage)
		}()
	}
	log.Debugf(logMessage)
	auditOperationCounter.Inc(1)

	return err
}
// handleDiscoveryRequests iterates the discoveryInstanceKeys channel and calls upon
// instance discovery per entry.
func handleDiscoveryRequests() {
	for instanceKey := range discoveryInstanceKeys {
		// Possibly this node used to be the elected node, but has since been demoted,
		// while the queue is still full.
		// Just don't process the queue when not elected.
		if isElectedNode {
			go discoverInstance(instanceKey)
		} else {
			log.Debugf("Node apparently demoted. Skipping discovery of %+v. Remaining queue size: %+v", instanceKey, len(discoveryInstanceKeys))
		}
	}
}
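A hypothetical producer sketch: discovery requests enter the loop above simply by being written onto the discoveryInstanceKeys channel (hostname and port are placeholders):

// enqueueDiscoverySketch is illustrative only.
func enqueueDiscoverySketch() {
	discoveryInstanceKeys <- inst.InstanceKey{Hostname: "db-1.example.com", Port: 3306}
}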
func postReadAdjustments() {
	if Config.MySQLOrchestratorCredentialsConfigFile != "" {
		mySQLConfig := struct {
			Client struct {
				User     string
				Password string
			}
		}{}
		err := gcfg.ReadFileInto(&mySQLConfig, Config.MySQLOrchestratorCredentialsConfigFile)
		if err != nil {
			log.Fatalf("Failed to parse gcfg data from file: %+v", err)
		} else {
			log.Debugf("Parsed orchestrator credentials from %s", Config.MySQLOrchestratorCredentialsConfigFile)
			Config.MySQLOrchestratorUser = mySQLConfig.Client.User
			Config.MySQLOrchestratorPassword = mySQLConfig.Client.Password
		}
	}
	if Config.MySQLTopologyCredentialsConfigFile != "" {
		mySQLConfig := struct {
			Client struct {
				User     string
				Password string
			}
		}{}
		err := gcfg.ReadFileInto(&mySQLConfig, Config.MySQLTopologyCredentialsConfigFile)
		if err != nil {
			log.Fatalf("Failed to parse gcfg data from file: %+v", err)
		} else {
			log.Debugf("Parsed topology credentials from %s", Config.MySQLTopologyCredentialsConfigFile)
			Config.MySQLTopologyUser = mySQLConfig.Client.User
			Config.MySQLTopologyPassword = mySQLConfig.Client.Password
		}
	}

	if Config.RecoveryPeriodBlockSeconds == 0 && Config.RecoveryPeriodBlockMinutes > 0 {
		// RecoveryPeriodBlockSeconds is a newer addition that overrides RecoveryPeriodBlockMinutes.
		// The code no longer considers RecoveryPeriodBlockMinutes, but it is still supported
		// in the config file for backwards compatibility.
		Config.RecoveryPeriodBlockSeconds = Config.RecoveryPeriodBlockMinutes * 60
	}
}
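For illustration only, a guess at what the credentials file parsed above looks like: gcfg reads an ini-style file whose [client] section supplies user and password (section and key names inferred from the anonymous struct; actual contents are site-specific):

// exampleCredentialsFileContents is hypothetical; it shows the kind of content expected in the file
// referenced by MySQLTopologyCredentialsConfigFile / MySQLOrchestratorCredentialsConfigFile.
const exampleCredentialsFileContents = `
[client]
user = orchestrator
password = example-password
`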
func getLastPseudoGTIDEntryInInstance(instance *Instance, maxBinlogCoordinates *BinlogCoordinates, exhaustiveSearch bool) (*BinlogCoordinates, string, error) {
	// Look for last GTID in instance:
	currentBinlog := instance.SelfBinlogCoordinates

	var err error = nil
	for err == nil {
		log.Debugf("Searching for latest pseudo gtid entry in binlog %+v of %+v", currentBinlog.LogFile, instance.Key)
		// Note: the search result must not shadow err; the loop condition relies on err being
		// updated by PreviousFileCoordinates() below.
		if resultCoordinates, entryInfo, searchErr := getLastPseudoGTIDEntryInBinlog(&instance.Key, currentBinlog.LogFile, BinaryLog, maxBinlogCoordinates); searchErr != nil {
			return nil, "", searchErr
		} else if resultCoordinates != nil {
			log.Debugf("Found pseudo gtid entry in %+v, %+v", instance.Key, resultCoordinates)
			return resultCoordinates, entryInfo, nil
		}
		if !exhaustiveSearch {
			break
		}
		currentBinlog, err = currentBinlog.PreviousFileCoordinates()
	}
	return nil, "", log.Errorf("Cannot find pseudo GTID entry in binlogs of %+v", instance.Key)
}
func getLastPseudoGTIDEntryInRelayLogs(instance *Instance, recordedInstanceRelayLogCoordinates BinlogCoordinates, exhaustiveSearch bool) (*BinlogCoordinates, string, error) {
	// Look for last GTID in relay logs:
	// Since MySQL does not provide a SHOW RELAY LOGS command, we heuristically start from the current
	// relay log (indicated by Relay_log_file) and walk backwards.
	// Eventually we will hit a relay log name which does not exist.
	currentRelayLog := recordedInstanceRelayLogCoordinates

	var err error = nil
	for err == nil {
		log.Debugf("Searching for latest pseudo gtid entry in relaylog %+v of %+v, up to pos %+v", currentRelayLog.LogFile, instance.Key, recordedInstanceRelayLogCoordinates)
		if resultCoordinates, entryInfo, err := getLastPseudoGTIDEntryInBinlog(&instance.Key, currentRelayLog.LogFile, RelayLog, &recordedInstanceRelayLogCoordinates); err != nil {
			return nil, "", err
		} else if resultCoordinates != nil {
			log.Debugf("Found pseudo gtid entry in %+v, %+v", instance.Key, resultCoordinates)
			return resultCoordinates, entryInfo, err
		}
		if !exhaustiveSearch {
			break
		}
		currentRelayLog, err = currentRelayLog.PreviousFileCoordinates()
	}
	return nil, "", log.Errorf("Cannot find pseudo GTID entry in relay logs of %+v", instance.Key)
}
// checkAndRecoverDeadCoMaster checks a given analysis, decides whether to take action, and possibly takes action
// Returns true when action was taken.
func checkAndRecoverDeadCoMaster(analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool) (bool, *TopologyRecovery, error) {
	failedInstanceKey := &analysisEntry.AnalyzedInstanceKey
	if !(forceInstanceRecovery || analysisEntry.ClusterDetails.HasAutomatedMasterRecovery) {
		return false, nil, nil
	}
	topologyRecovery, err := AttemptRecoveryRegistration(&analysisEntry, !forceInstanceRecovery, !forceInstanceRecovery)
	if topologyRecovery == nil {
		log.Debugf("topology_recovery: found an active or recent recovery on %+v. Will not issue another RecoverDeadCoMaster.", analysisEntry.AnalyzedInstanceKey)
		return false, nil, err
	}

	// That's it! We must do recovery!
	recoverDeadCoMasterCounter.Inc(1)
	promotedSlave, lostSlaves, err := RecoverDeadCoMaster(topologyRecovery, skipProcesses)
	ResolveRecovery(topologyRecovery, promotedSlave)
	if promotedSlave == nil {
		inst.AuditOperation("recover-dead-co-master", failedInstanceKey, "Failure: no slave promoted.")
	} else {
		inst.AuditOperation("recover-dead-co-master", failedInstanceKey, fmt.Sprintf("promoted: %+v", promotedSlave.Key))
	}
	topologyRecovery.LostSlaves.AddInstances(lostSlaves)
	if promotedSlave != nil {
		// success
		recoverDeadCoMasterSuccessCounter.Inc(1)

		if config.Config.ApplyMySQLPromotionAfterMasterFailover {
			log.Debugf("topology_recovery: - RecoverDeadCoMaster: will apply MySQL changes to promoted master")
			inst.SetReadOnly(&promotedSlave.Key, false)
		}
		if !skipProcesses {
			// Execute post master-failover processes
			topologyRecovery.SuccessorKey = &promotedSlave.Key
			executeProcesses(config.Config.PostMasterFailoverProcesses, "PostMasterFailoverProcesses", topologyRecovery, false)
		}
	} else {
		recoverDeadCoMasterFailureCounter.Inc(1)
	}
	return true, topologyRecovery, err
}
func (this *PostponedFunctionsContainer) InvokePostponed() (err error) {
	if len(this.PostponedFunctions) == 0 {
		return
	}
	log.Debugf("PostponedFunctionsContainer: invoking %+v postponed functions", len(this.PostponedFunctions))
	for _, postponedFunction := range this.PostponedFunctions {
		ferr := postponedFunction()
		if err == nil {
			err = ferr
		}
	}
	return err
}
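A usage sketch, assuming PostponedFunctions is an exported slice of func() error on the container (which is how InvokePostponed consumes it); the wrapping function is hypothetical:

func postponedSketch() {
	container := &PostponedFunctionsContainer{}
	container.PostponedFunctions = append(container.PostponedFunctions, func() error {
		// non-urgent work, deferred until the end of a larger operation, goes here
		return nil
	})
	if err := container.InvokePostponed(); err != nil {
		log.Errore(err)
	}
}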
// SearchEntryInInstanceBinlogs will search for a specific text entry within the binary logs of a given instance.
func SearchEntryInInstanceBinlogs(instance *Instance, entryText string, monotonicPseudoGTIDEntries bool) (*BinlogCoordinates, error) {
	cacheKey := getInstanceBinlogEntryKey(instance, entryText)
	coords, found := instanceBinlogEntryCache.Get(cacheKey)
	if found {
		// This is wonderful. We can skip the tedious GTID search in the binary log
		log.Debugf("Found instance Pseudo GTID entry coordinates in cache: %+v, %+v, %+v", instance.Key, entryText, coords)
		return coords.(*BinlogCoordinates), nil
	}

	// Look for GTID entry in given instance:
	log.Debugf("Searching for given pseudo gtid entry in %+v. monotonicPseudoGTIDEntries=%+v", instance.Key, monotonicPseudoGTIDEntries)
	currentBinlog := instance.SelfBinlogCoordinates
	var err error = nil
	for {
		log.Debugf("Searching for given pseudo gtid entry in binlog %+v of %+v", currentBinlog.LogFile, instance.Key)
		// loop iteration per binary log. This might turn out to be a heavyweight operation. We wish to throttle the operation such that
		// the instance does not suffer. If it is a slave, we will only act as long as it's not lagging too much.
		if instance.SlaveRunning() {
			for {
				log.Debugf("%+v is a replicating slave. Verifying lag", instance.Key)
				instance, err = ReadTopologyInstance(&instance.Key)
				if err != nil {
					break
				}
				if instance.HasReasonableMaintenanceReplicationLag() {
					// is good to go!
					break
				}
				log.Debugf("lag is too high on %+v. Throttling the search for pseudo gtid entry", instance.Key)
				time.Sleep(time.Duration(config.Config.ReasonableMaintenanceReplicationLagSeconds) * time.Second)
			}
		}
		var resultCoordinates BinlogCoordinates
		var found bool = false
		resultCoordinates, found, err = SearchEntryInBinlog(&instance.Key, currentBinlog.LogFile, entryText, monotonicPseudoGTIDEntries)
		if err != nil {
			break
		}
		if found {
			log.Debugf("Matched entry in %+v: %+v", instance.Key, resultCoordinates)
			instanceBinlogEntryCache.Set(cacheKey, &resultCoordinates, 0)
			return &resultCoordinates, nil
		}
		// Got here? Unfound. Keep looking
		currentBinlog, err = currentBinlog.PreviousFileCoordinates()
		if err != nil {
			break
		}
		log.Debugf("- Will move next to binlog %+v", currentBinlog.LogFile)
	}
	return nil, log.Errorf("Cannot match pseudo GTID entry in binlogs of %+v; err: %+v", instance.Key, err)
}
// acceptSignals registers for OS signals
func acceptSignals() {
	c := make(chan os.Signal, 1)

	signal.Notify(c, syscall.SIGHUP)
	go func() {
		for sig := range c {
			switch sig {
			case syscall.SIGHUP:
				log.Debugf("Received SIGHUP. Reloading configuration")
				config.Reload()
				inst.AuditOperation("reload-configuration", nil, "Triggered via SIGHUP")
			}
		}
	}()
}
// StartSlaves will do concurrent start-slave
func StartSlaves(slaves [](*Instance)) {
	// use concurrency but wait for all to complete
	log.Debugf("Starting %d slaves", len(slaves))
	barrier := make(chan InstanceKey)
	for _, instance := range slaves {
		instance := instance
		go func() {
			// Signal completed slave
			defer func() { barrier <- instance.Key }()
			// Wait your turn to read a slave
			ExecuteOnTopology(func() { StartSlave(&instance.Key) })
		}()
	}
	for range slaves {
		<-barrier
	}
}
func ApplyPoolInstances(pool string, instancesList string) error {
	var instanceKeys [](*InstanceKey)
	if instancesList != "" {
		instancesStrings := strings.Split(instancesList, ",")
		for _, instanceString := range instancesStrings {
			instanceKey, err := ParseInstanceKeyLoose(instanceString)
			if err != nil {
				return log.Errore(err)
			}
			log.Debugf("%+v", instanceKey)
			instanceKeys = append(instanceKeys, instanceKey)
		}
	}
	writePoolInstances(pool, instanceKeys)
	return nil
}
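A hypothetical call, illustrating that instancesList is a comma-delimited set of instance addresses accepted by ParseInstanceKeyLoose (pool name and hosts are placeholders):

func applyPoolSketch() error {
	return ApplyPoolInstances("reporting-pool", "db-report-01.example.com:3306,db-report-02.example.com:3306")
}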
// discoverInstance will attempt discovering an instance (unless it is already up to date) and will
// list down its master and slaves (if any) for further discovery.
func discoverInstance(instanceKey inst.InstanceKey) {
	instanceKey.Formalize()
	if !instanceKey.IsValid() {
		return
	}

	if existsInCacheError := recentDiscoveryOperationKeys.Add(instanceKey.DisplayString(), true, cache.DefaultExpiration); existsInCacheError != nil {
		// Just recently attempted
		return
	}

	instance, found, err := inst.ReadInstance(&instanceKey)

	if found && instance.IsUpToDate && instance.IsLastCheckValid {
		// we've already discovered this one. Skip!
		return
	}
	discoveriesCounter.Inc(1)
	// First we've ever heard of this instance. Continue investigation:
	instance, err = inst.ReadTopologyInstance(&instanceKey)
	// panic can occur (IO stuff). Therefore it may happen
	// that instance is nil. Check it.
	if instance == nil {
		failedDiscoveriesCounter.Inc(1)
		log.Warningf("instance is nil in discoverInstance. key=%+v, error=%+v", instanceKey, err)
		return
	}

	log.Debugf("Discovered host: %+v, master: %+v, version: %+v", instance.Key, instance.MasterKey, instance.Version)

	if !isElectedNode {
		// Maybe this node was elected before, but isn't elected anymore.
		// If not elected, stop drilling up/down the topology
		return
	}

	// Investigate slaves:
	for _, slaveKey := range instance.SlaveHosts.GetInstanceKeys() {
		slaveKey := slaveKey
		discoveryInstanceKeys <- slaveKey
	}
	// Investigate master:
	discoveryInstanceKeys <- instance.MasterKey
}
// readInternalDeployments reads orchestrator db deployment statements that are known to have been executed
func readInternalDeployments() (baseDeployments []string, patchDeployments []string, err error) {
	if !config.Config.SmartOrchestratorDatabaseUpdate {
		return baseDeployments, patchDeployments, nil
	}
	query := `
		select
			deployment_type,
			sql_statement
		from
			_orchestrator_db_deployment
		order by
			deployment_id
		`
	db, err := OpenOrchestrator()
	if err != nil {
		log.Fatalf("Cannot initiate orchestrator internal deployment: %+v", err)
	}
	err = sqlutils.QueryRowsMap(db, query, func(m sqlutils.RowMap) error {
		deploymentType := m.GetString("deployment_type")
		sqlStatement := m.GetString("sql_statement")

		if deploymentType == "base" {
			baseDeployments = append(baseDeployments, sqlStatement)
		} else if deploymentType == "patch" {
			patchDeployments = append(patchDeployments, sqlStatement)
		} else {
			log.Fatalf("Unknown deployment type (%+v) encountered in _orchestrator_db_deployment", deploymentType)
		}

		return nil
	})
	if err != nil {
		log.Debugf("Deploying internal orchestrator tables to fix the above; this is a one-time operation")
		// Table does not exist? Create it for first time
		for _, query := range internalDBDeploymentSQL {
			if _, err = execInternal(db, query); err != nil {
				log.Fatalf("Cannot initiate orchestrator internal deployment: %+v", err)
			}
		}
	}
	return baseDeployments, patchDeployments, nil
}
// RefreshTopologyInstances will do a blocking (though concurrent) refresh of all given instances
func RefreshTopologyInstances(instances [](*Instance)) {
	// use concurrency but wait for all to complete
	barrier := make(chan InstanceKey)
	for _, instance := range instances {
		instance := instance
		go func() {
			// Signal completed slave
			defer func() { barrier <- instance.Key }()
			// Wait your turn to read a slave
			ExecuteOnTopology(func() {
				log.Debugf("... reading instance: %+v", instance.Key)
				ReadTopologyInstance(&instance.Key)
			})
		}()
	}
	for range instances {
		<-barrier
	}
}
// SkipToNextBinaryLog changes master position to beginning of next binlog
// USE WITH CARE!
// Use case is binlog servers where the master was gone & replaced by another.
func SkipToNextBinaryLog(instanceKey *InstanceKey) (*Instance, error) {
	instance, err := ReadTopologyInstance(instanceKey)
	if err != nil {
		return instance, log.Errore(err)
	}

	nextFileCoordinates, err := instance.ExecBinlogCoordinates.NextFileCoordinates()
	if err != nil {
		return instance, log.Errore(err)
	}
	nextFileCoordinates.LogPos = 4
	log.Debugf("Will skip replication on %+v to next binary log: %+v", instance.Key, nextFileCoordinates.LogFile)

	instance, err = ChangeMasterTo(&instance.Key, &instance.MasterKey, &nextFileCoordinates, false, GTIDHintNeutral)
	if err != nil {
		return instance, log.Errore(err)
	}
	AuditOperation("skip-binlog", instanceKey, fmt.Sprintf("Skipped replication to next binary log: %+v", nextFileCoordinates.LogFile))
	return StartSlave(instanceKey)
}
// ContinuousAgentsPoll starts an asynchronous infinite process where agents are
// periodically investigated and their status captured, and long since unseen agents are
// purged and forgotten.
func ContinuousAgentsPoll() {
	log.Infof("Starting continuous agents poll")

	go discoverSeededAgents()

	tick := time.Tick(time.Duration(config.Config.DiscoveryPollSeconds) * time.Second)
	caretakingTick := time.Tick(time.Hour)
	for range tick {
		agentsHosts, _ := agent.ReadOutdatedAgentsHosts()
		log.Debugf("outdated agents hosts: %+v", agentsHosts)
		for _, hostname := range agentsHosts {
			go pollAgent(hostname)
		}

		// See if we should also forget agents (lower frequency)
		select {
		case <-caretakingTick:
			agent.ForgetLongUnseenAgents()
			agent.FailStaleSeeds()
		default:
		}
	}
}