func getLastPseudoGTIDEntryInRelayLogs(instance *Instance, minBinlogCoordinates *BinlogCoordinates, recordedInstanceRelayLogCoordinates BinlogCoordinates, exhaustiveSearch bool) (*BinlogCoordinates, string, error) {
	// Look for last GTID in relay logs:
	// Since MySQL does not provide a SHOW RELAY LOGS command, we heuristically start from the current
	// relay log (indicated by Relay_log_file) and walk backwards.
	// Eventually we will hit a relay log name which does not exist.
	pseudoGTIDRegexp, err := compilePseudoGTIDPattern()
	if err != nil {
		return nil, "", err
	}

	currentRelayLog := recordedInstanceRelayLogCoordinates
	err = nil
	for err == nil {
		log.Debugf("Searching for latest pseudo gtid entry in relaylog %+v of %+v, up to pos %+v", currentRelayLog.LogFile, instance.Key, recordedInstanceRelayLogCoordinates)
		if resultCoordinates, entryInfo, err := getLastPseudoGTIDEntryInBinlog(pseudoGTIDRegexp, &instance.Key, currentRelayLog.LogFile, RelayLog, minBinlogCoordinates, &recordedInstanceRelayLogCoordinates); err != nil {
			return nil, "", err
		} else if resultCoordinates != nil {
			log.Debugf("Found pseudo gtid entry in %+v, %+v", instance.Key, resultCoordinates)
			return resultCoordinates, entryInfo, err
		}
		if !exhaustiveSearch {
			break
		}
		if minBinlogCoordinates != nil && minBinlogCoordinates.LogFile == currentRelayLog.LogFile {
			// We tried and failed with the minBinlogCoordinates hint. We no longer require it,
			// and continue with exhaustive search.
			minBinlogCoordinates = nil
			log.Debugf("Heuristic relaylog search failed; continuing exhaustive search")
			// And we do NOT iterate to the previous log file: we scan the same log file again, with no heuristic
		} else {
			currentRelayLog, err = currentRelayLog.PreviousFileCoordinates()
		}
	}
	return nil, "", log.Errorf("Cannot find pseudo GTID entry in relay logs of %+v", instance.Key)
}
// Return the next chunk of binlog events; skip to next binary log file if need be; return empty result only
// if reached end of binary logs
func getNextBinlogEventsChunk(instance *Instance, startingCoordinates BinlogCoordinates, numEmptyBinlogs int) ([]BinlogEvent, error) {
	if numEmptyBinlogs > maxEmptyBinlogFiles {
		log.Debugf("Reached maxEmptyBinlogFiles (%d) at %+v", maxEmptyBinlogFiles, startingCoordinates)
		// Give up and return empty results
		return []BinlogEvent{}, nil
	}
	coordinatesExceededCurrent := false
	switch startingCoordinates.Type {
	case BinaryLog:
		coordinatesExceededCurrent = instance.SelfBinlogCoordinates.FileSmallerThan(&startingCoordinates)
	case RelayLog:
		coordinatesExceededCurrent = instance.RelaylogCoordinates.FileSmallerThan(&startingCoordinates)
	}
	if coordinatesExceededCurrent {
		// We're past the last file. This is a non-error: there are no more events.
		log.Debugf("Coordinates overflow: %+v; terminating search", startingCoordinates)
		return []BinlogEvent{}, nil
	}
	events, err := readBinlogEventsChunk(&instance.Key, startingCoordinates)
	if err != nil {
		return events, err
	}
	if len(events) > 0 {
		log.Debugf("Returning %d events at %+v", len(events), startingCoordinates)
		return events, nil
	}
	// events are empty
	if nextCoordinates, err := instance.GetNextBinaryLog(startingCoordinates); err == nil {
		log.Debugf("Recursing into %+v", nextCoordinates)
		return getNextBinlogEventsChunk(instance, nextCoordinates, numEmptyBinlogs+1)
	}
	// on error
	return events, err
}
func GetMasterConnectionConfigSafe(connectionConfig *ConnectionConfig, visitedKeys *InstanceKeyMap, allowMasterMaster bool) (masterConfig *ConnectionConfig, err error) {
	log.Debugf("Looking for master on %+v", connectionConfig.Key)

	masterKey, err := GetMasterKeyFromSlaveStatus(connectionConfig)
	if err != nil {
		return nil, err
	}
	if masterKey == nil {
		return connectionConfig, nil
	}
	if !masterKey.IsValid() {
		return connectionConfig, nil
	}
	masterConfig = connectionConfig.Duplicate()
	masterConfig.Key = *masterKey

	log.Debugf("Master of %+v is %+v", connectionConfig.Key, masterConfig.Key)
	if visitedKeys.HasKey(masterConfig.Key) {
		if allowMasterMaster {
			return connectionConfig, nil
		}
		return nil, fmt.Errorf("There seems to be a master-master setup at %+v. This is unsupported. Bailing out", masterConfig.Key)
	}
	visitedKeys.AddKey(masterConfig.Key)
	return GetMasterConnectionConfigSafe(masterConfig, visitedKeys, allowMasterMaster)
}
// OpenOrchestrator returns the DB instance for the orchestrator backend database
func OpenOrchestrator() (*sql.DB, error) {
	if config.Config.DatabaselessMode__experimental {
		return nil, nil
	}
	mysql_uri := fmt.Sprintf("%s:%s@tcp(%s:%d)/%s?timeout=%ds&readTimeout=%ds",
		config.Config.MySQLOrchestratorUser,
		config.Config.MySQLOrchestratorPassword,
		config.Config.MySQLOrchestratorHost,
		config.Config.MySQLOrchestratorPort,
		config.Config.MySQLOrchestratorDatabase,
		config.Config.MySQLConnectTimeoutSeconds,
		config.Config.MySQLOrchestratorReadTimeoutSeconds,
	)
	if config.Config.MySQLOrchestratorUseMutualTLS {
		mysql_uri, _ = SetupMySQLOrchestratorTLS(mysql_uri)
	}
	db, fromCache, err := sqlutils.GetDB(mysql_uri)
	if err == nil && !fromCache {
		initOrchestratorDB(db)
		// do not show the password but do show what we connect to.
		safe_mysql_uri := fmt.Sprintf("%s:?@tcp(%s:%d)/%s?timeout=%ds", config.Config.MySQLOrchestratorUser,
			config.Config.MySQLOrchestratorHost, config.Config.MySQLOrchestratorPort, config.Config.MySQLOrchestratorDatabase, config.Config.MySQLConnectTimeoutSeconds)
		log.Debugf("Connected to orchestrator backend: %v", safe_mysql_uri)
		if config.Config.MySQLOrchestratorMaxPoolConnections > 0 {
			log.Debugf("Orchestrator pool SetMaxOpenConns: %d", config.Config.MySQLOrchestratorMaxPoolConnections)
			db.SetMaxOpenConns(config.Config.MySQLOrchestratorMaxPoolConnections)
		}
		db.SetMaxIdleConns(10)
	}
	return db, err
}
// checkAndRecoverDeadMaster checks a given analysis, decides whether to take action, and possibly takes action
// Returns true when action was taken.
func checkAndRecoverDeadMaster(analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool) (bool, *TopologyRecovery, error) {
	if !(forceInstanceRecovery || analysisEntry.ClusterDetails.HasAutomatedMasterRecovery) {
		return false, nil, nil
	}
	topologyRecovery, err := AttemptRecoveryRegistration(&analysisEntry, !forceInstanceRecovery, !forceInstanceRecovery)
	if topologyRecovery == nil {
		log.Debugf("topology_recovery: found an active or recent recovery on %+v. Will not issue another RecoverDeadMaster.", analysisEntry.AnalyzedInstanceKey)
		return false, nil, err
	}

	// That's it! We must do recovery!
	log.Debugf("topology_recovery: will handle DeadMaster event on %+v", analysisEntry.ClusterDetails.ClusterName)
	recoverDeadMasterCounter.Inc(1)
	promotedSlave, lostSlaves, err := RecoverDeadMaster(topologyRecovery, skipProcesses)
	topologyRecovery.LostSlaves.AddInstances(lostSlaves)

	if promotedSlave != nil {
		promotedSlave, err = replacePromotedSlaveWithCandidate(&analysisEntry.AnalyzedInstanceKey, promotedSlave, candidateInstanceKey)
		topologyRecovery.AddError(err)
	}
	// And this is the end; whether successful or not, we're done.
	ResolveRecovery(topologyRecovery, promotedSlave)
	if promotedSlave != nil {
		// Success!
		recoverDeadMasterSuccessCounter.Inc(1)

		if !skipProcesses {
			// Execute post master-failover processes
			executeProcesses(config.Config.PostMasterFailoverProcesses, "PostMasterFailoverProcesses", topologyRecovery, false)
		}
	} else {
		recoverDeadMasterFailureCounter.Inc(1)
	}

	return true, topologyRecovery, err
}
// deployIfNotAlreadyDeployed will issue given SQL queries that are not already known to be deployed.
// This iterates both lists (to-run and already-deployed) and also verifies no contradictions.
func deployIfNotAlreadyDeployed(db *sql.DB, queries []string, deployedQueries []string, deploymentType string, fatalOnError bool) error {
	tx, err := db.Begin()
	if err != nil {
		log.Fatale(err)
	}
	// Ugly workaround ahead.
	// Origin of this workaround is the existence of some "timestamp NOT NULL," column definitions,
	// which under NO_ZERO_IN_DATE,NO_ZERO_DATE sql_mode are invalid (since the default is implicitly "0").
	// This means installation of orchestrator fails on such configured servers, and in particular on 5.7,
	// where this setting is the default.
	// For purposes of backwards compatibility, what we do is force sql_mode to be more relaxed, create the schemas
	// along with the "invalid" definition, and then go ahead and fix those definitions via following ALTER statements.
	// My bad.
	originalSqlMode := ""
	err = tx.QueryRow(`select @@session.sql_mode`).Scan(&originalSqlMode)
	if _, err := tx.Exec(`set @@session.sql_mode=REPLACE(@@session.sql_mode, 'NO_ZERO_DATE', '')`); err != nil {
		log.Fatale(err)
	}
	if _, err := tx.Exec(`set @@session.sql_mode=REPLACE(@@session.sql_mode, 'NO_ZERO_IN_DATE', '')`); err != nil {
		log.Fatale(err)
	}

	for i, query := range queries {
		queryAlreadyExecuted := false
		// While iterating 'queries', also iterate 'deployedQueries'. Expect identity
		if len(deployedQueries) > i {
			if deployedQueries[i] != query {
				log.Fatalf("initOrchestratorDB() PANIC: non matching %s queries between deployment requests and _orchestrator_db_deployment. Please execute 'orchestrator -c reset-internal-db-deployment'", deploymentType)
			}
			queryAlreadyExecuted = true
		}
		if queryAlreadyExecuted {
			continue
		}
		if i == 0 {
			log.Debugf("sql_mode is: %+v", originalSqlMode)
		}
		if config.Config.SmartOrchestratorDatabaseUpdate {
			log.Debugf("initOrchestratorDB executing: %.80s", strings.TrimSpace(strings.Replace(query, "\n", "", -1)))
		}

		if fatalOnError {
			if _, err := tx.Exec(query); err != nil {
				return log.Fatalf("Cannot initiate orchestrator: %+v", err)
			}
		} else {
			tx.Exec(query)
			// And ignore any error
		}
		writeInternalDeployment(db, deploymentType, query, i)
	}
	if _, err := tx.Exec(`set session sql_mode=?`, originalSqlMode); err != nil {
		log.Fatale(err)
	}
	if err := tx.Commit(); err != nil {
		log.Fatale(err)
	}
	return nil
}
func compilePseudoGTIDPattern() (pseudoGTIDRegexp *regexp.Regexp, err error) {
	log.Debugf("PseudoGTIDPatternIsFixedSubstring: %+v", config.Config.PseudoGTIDPatternIsFixedSubstring)
	if config.Config.PseudoGTIDPatternIsFixedSubstring {
		return nil, nil
	}
	log.Debugf("Compiling PseudoGTIDPattern")
	return regexp.Compile(config.Config.PseudoGTIDPattern)
}
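// A minimal caller-side sketch (not part of the original source) of the nil-regexp contract above:
// when PseudoGTIDPatternIsFixedSubstring is set, compilePseudoGTIDPattern returns a nil regexp and
// a plain substring match is assumed instead of regexp matching. The helper name
// entryMatchesPseudoGTID is hypothetical and shown for illustration only.
func entryMatchesPseudoGTID(pseudoGTIDRegexp *regexp.Regexp, binlogEntryInfo string) bool {
	if pseudoGTIDRegexp == nil {
		// Fixed-substring mode: cheaper than regexp matching on every binlog entry.
		return strings.Contains(binlogEntryInfo, config.Config.PseudoGTIDPattern)
	}
	return pseudoGTIDRegexp.MatchString(binlogEntryInfo)
}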
func postReadAdjustments() {
	if Config.MySQLOrchestratorCredentialsConfigFile != "" {
		mySQLConfig := struct {
			Client struct {
				User     string
				Password string
			}
		}{}
		err := gcfg.ReadFileInto(&mySQLConfig, Config.MySQLOrchestratorCredentialsConfigFile)
		if err != nil {
			log.Fatalf("Failed to parse gcfg data from file: %+v", err)
		} else {
			log.Debugf("Parsed orchestrator credentials from %s", Config.MySQLOrchestratorCredentialsConfigFile)
			Config.MySQLOrchestratorUser = mySQLConfig.Client.User
			Config.MySQLOrchestratorPassword = mySQLConfig.Client.Password
		}
	}
	{
		// We accept the password in the form "${SOME_ENV_VARIABLE}", in which case we pull
		// the given variable from the OS environment
		submatch := envVariableRegexp.FindStringSubmatch(Config.MySQLOrchestratorPassword)
		if len(submatch) > 1 {
			Config.MySQLOrchestratorPassword = os.Getenv(submatch[1])
		}
	}
	if Config.MySQLTopologyCredentialsConfigFile != "" {
		mySQLConfig := struct {
			Client struct {
				User     string
				Password string
			}
		}{}
		err := gcfg.ReadFileInto(&mySQLConfig, Config.MySQLTopologyCredentialsConfigFile)
		if err != nil {
			log.Fatalf("Failed to parse gcfg data from file: %+v", err)
		} else {
			log.Debugf("Parsed topology credentials from %s", Config.MySQLTopologyCredentialsConfigFile)
			Config.MySQLTopologyUser = mySQLConfig.Client.User
			Config.MySQLTopologyPassword = mySQLConfig.Client.Password
		}
	}
	{
		// We accept the password in the form "${SOME_ENV_VARIABLE}", in which case we pull
		// the given variable from the OS environment
		submatch := envVariableRegexp.FindStringSubmatch(Config.MySQLTopologyPassword)
		if len(submatch) > 1 {
			Config.MySQLTopologyPassword = os.Getenv(submatch[1])
		}
	}

	if Config.RecoveryPeriodBlockSeconds == 0 && Config.RecoveryPeriodBlockMinutes > 0 {
		// RecoveryPeriodBlockSeconds is a newer addition that overrides RecoveryPeriodBlockMinutes.
		// The code no longer considers RecoveryPeriodBlockMinutes, but it is still supported
		// in the config file for backwards compatibility.
		Config.RecoveryPeriodBlockSeconds = Config.RecoveryPeriodBlockMinutes * 60
	}
}
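// A self-contained sketch (not from the original source) of the "${SOME_ENV_VARIABLE}" convention
// used above. envVariableRegexp is defined elsewhere in the config package and is not shown in this
// section; envVariablePattern below is an assumed equivalent, and resolveFromEnv is a hypothetical
// helper for illustration only. Assumes the standard "os" and "regexp" imports.
var envVariablePattern = regexp.MustCompile(`\$\{(.+)\}`) // assumed shape of the real pattern

func resolveFromEnv(value string) string {
	if submatch := envVariablePattern.FindStringSubmatch(value); len(submatch) > 1 {
		// "${MY_VAR}" resolves to os.Getenv("MY_VAR"); anything else is returned unchanged.
		return os.Getenv(submatch[1])
	}
	return value
}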
// executeCheckAndRecoverFunction will choose the correct check & recovery function based on analysis.
// It executes the function synchronously
func executeCheckAndRecoverFunction(analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, skipFilters bool, skipProcesses bool) (bool, *inst.Instance, error) {
	var checkAndRecoverFunction func(analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, skipFilters bool, skipProcesses bool) (bool, *inst.Instance, error) = nil

	switch analysisEntry.Analysis {
	case inst.DeadMaster:
		checkAndRecoverFunction = checkAndRecoverDeadMaster
	case inst.DeadMasterAndSomeSlaves:
		checkAndRecoverFunction = checkAndRecoverDeadMaster
	case inst.DeadIntermediateMaster:
		checkAndRecoverFunction = checkAndRecoverDeadIntermediateMaster
	case inst.DeadIntermediateMasterAndSomeSlaves:
		checkAndRecoverFunction = checkAndRecoverDeadIntermediateMaster
	case inst.DeadIntermediateMasterWithSingleSlaveFailingToConnect:
		checkAndRecoverFunction = checkAndRecoverDeadIntermediateMaster
	case inst.AllIntermediateMasterSlavesFailingToConnectOrDead:
		checkAndRecoverFunction = checkAndRecoverDeadIntermediateMaster
	case inst.DeadCoMaster:
		checkAndRecoverFunction = checkAndRecoverDeadIntermediateMaster
	case inst.DeadMasterAndSlaves:
		go emergentlyReadTopologyInstance(&analysisEntry.AnalyzedInstanceMasterKey, analysisEntry.Analysis)
	case inst.UnreachableMaster:
		go emergentlyReadTopologyInstanceSlaves(&analysisEntry.AnalyzedInstanceKey, analysisEntry.Analysis)
	case inst.AllMasterSlavesNotReplicating:
		//checkAndRecoverFunction = checkAndRecoverGenericProblem
		go emergentlyReadTopologyInstance(&analysisEntry.AnalyzedInstanceKey, analysisEntry.Analysis)
	case inst.FirstTierSlaveFailingToConnectToMaster:
		go emergentlyReadTopologyInstance(&analysisEntry.AnalyzedInstanceMasterKey, analysisEntry.Analysis)
	}

	if checkAndRecoverFunction == nil {
		// Unhandled problem type
		return false, nil, nil
	}
	// we have a recovery function; its execution still depends on filters if not disabled.
	log.Debugf("executeCheckAndRecoverFunction: proceeding with %+v; skipProcesses: %+v", analysisEntry.AnalyzedInstanceKey, skipProcesses)

	if ok, _ := AttemptFailureDetectionRegistration(&analysisEntry); ok {
		log.Debugf("topology_recovery: detected %+v failure on %+v", analysisEntry.Analysis, analysisEntry.AnalyzedInstanceKey)
		// Execute on-detection processes
		if !skipProcesses {
			if err := executeProcesses(config.Config.OnFailureDetectionProcesses, "OnFailureDetectionProcesses", analysisEntry, nil, emptySlavesList, true); err != nil {
				return false, nil, err
			}
		}
	}

	actionTaken, promotedSlave, err := checkAndRecoverFunction(analysisEntry, candidateInstanceKey, skipFilters, skipProcesses)
	if actionTaken {
		if !skipProcesses {
			// Execute post-failover processes
			executeProcesses(config.Config.PostFailoverProcesses, "PostFailoverProcesses", analysisEntry, promotedSlave, emptySlavesList, false)
		}
	}
	return actionTaken, promotedSlave, err
}
// Attempt to resolve a hostname. This may return a database-cached hostname, or otherwise
// it may resolve the hostname via CNAME
func ResolveHostname(hostname string) (string, error) {
	hostname = strings.TrimSpace(hostname)
	if hostname == "" {
		return hostname, errors.New("Will not resolve empty hostname")
	}
	if strings.Contains(hostname, ",") {
		return hostname, fmt.Errorf("Will not resolve multi-hostname: %+v", hostname)
	}
	if (&InstanceKey{Hostname: hostname}).IsDetached() {
		// quietly abort. Nothing to do. The hostname is detached for a reason: it
		// will not be resolved, for sure.
		return hostname, nil
	}

	// First go to lightweight cache
	if resolvedHostname, found := hostnameResolvesLightweightCache.Get(hostname); found {
		return resolvedHostname.(string), nil
	}

	if !hostnameResolvesLightweightCacheLoadedOnceFromDB {
		// A continuous-discovery will first make sure to load all resolves from DB.
		// However cli does not do so.
		// Anyway, it seems like the cache was not loaded from DB. Before doing real resolves,
		// let's try and get the resolved hostname from database.
		if !HostnameResolveMethodIsNone() {
			if resolvedHostname, err := ReadResolvedHostname(hostname); err == nil && resolvedHostname != "" {
				hostnameResolvesLightweightCache.Set(hostname, resolvedHostname, 0)
				return resolvedHostname, nil
			}
		}
	}

	// Unfound: resolve!
	log.Debugf("Hostname unresolved yet: %s", hostname)
	resolvedHostname, err := resolveHostname(hostname)
	if config.Config.RejectHostnameResolvePattern != "" {
		// Reject, don't even cache
		if matched, _ := regexp.MatchString(config.Config.RejectHostnameResolvePattern, resolvedHostname); matched {
			log.Warningf("ResolveHostname: %+v resolved to %+v but rejected due to RejectHostnameResolvePattern '%+v'", hostname, resolvedHostname, config.Config.RejectHostnameResolvePattern)
			return hostname, nil
		}
	}

	if err != nil {
		// Problem. What we'll do is cache the hostname for just one minute, so as to avoid flooding requests
		// on one hand, yet make it refresh shortly on the other hand. Anyway do not write to database.
		hostnameResolvesLightweightCache.Set(hostname, resolvedHostname, time.Minute)
		return hostname, err
	}
	// Good result! Cache it, also to DB
	log.Debugf("Cache hostname resolve %s as %s", hostname, resolvedHostname)
	UpdateResolvedHostname(hostname, resolvedHostname)
	return resolvedHostname, nil
}
// executeWriteFuncs writes data via applier: both the rowcopy and the events backlog.
// This is where the ghost table gets the data. The function fills the data single-threaded.
// Both event backlog and rowcopy events are polled; the backlog events have precedence.
func (this *Migrator) executeWriteFuncs() error {
	if this.migrationContext.Noop {
		log.Debugf("Noop operation; not really executing write funcs")
		return nil
	}
	for {
		if atomic.LoadInt64(&this.inCutOverCriticalActionFlag) == 0 {
			// we don't throttle when cutting over. We _do_ throttle:
			// - during copy phase
			// - just before cut-over
			// - in between cut-over retries
			this.throttle(nil)
			// When cutting over, we need to be aggressive. Cut-over holds table locks.
			// We need to release those asap.
		}
		// We give higher priority to event processing, then secondary priority to
		// rowcopy
		select {
		case applyEventFunc := <-this.applyEventsQueue:
			{
				if err := this.retryOperation(applyEventFunc); err != nil {
					return log.Errore(err)
				}
			}
		default:
			{
				select {
				case copyRowsFunc := <-this.copyRowsQueue:
					{
						copyRowsStartTime := time.Now()
						// Retries are handled within the copyRowsFunc
						if err := copyRowsFunc(); err != nil {
							return log.Errore(err)
						}
						if niceRatio := this.migrationContext.GetNiceRatio(); niceRatio > 0 {
							copyRowsDuration := time.Now().Sub(copyRowsStartTime)
							sleepTimeNanosecondFloat64 := niceRatio * float64(copyRowsDuration.Nanoseconds())
							sleepTime := time.Duration(time.Duration(int64(sleepTimeNanosecondFloat64)) * time.Nanosecond)
							time.Sleep(sleepTime)
						}
					}
				default:
					{
						// Hmmmmm... nothing in the queue; no events, but also no row copy.
						// This is possible upon load. Let's just sleep it over.
						log.Debugf("Getting nothing in the write queue. Sleeping...")
						time.Sleep(time.Second)
					}
				}
			}
		}
	}
	return nil
}
func execCmd(commandText string, arguments ...string) (*exec.Cmd, string, error) {
	commandBytes := []byte(commandText)
	tmpFile, err := ioutil.TempFile("", "gh-ost-process-cmd-")
	if err != nil {
		return nil, "", log.Errore(err)
	}
	ioutil.WriteFile(tmpFile.Name(), commandBytes, 0644)
	log.Debugf("execCmd: %s", commandText)
	shellArguments := append([]string{}, tmpFile.Name())
	shellArguments = append(shellArguments, arguments...)
	log.Debugf("%+v", shellArguments)
	return exec.Command("bash", shellArguments...), tmpFile.Name(), nil
}
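// A caller-side sketch (not part of the original source) of how execCmd's return values might be
// consumed: run the prepared bash command and remove the temporary script file afterwards.
// The wrapper name runCommand is hypothetical; it assumes the standard "os" import alongside "os/exec".
func runCommand(commandText string, arguments ...string) error {
	cmd, tmpFileName, err := execCmd(commandText, arguments...)
	if err != nil {
		return err
	}
	// The temp file holds the command text; clean it up once the command has finished.
	defer os.Remove(tmpFileName)
	output, err := cmd.CombinedOutput()
	if err != nil {
		return err
	}
	log.Debugf("command output: %s", output)
	return nil
}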
// checkAndRecoverDeadIntermediateMaster checks a given analysis, decides whether to take action, and possibly takes action
// Returns true when action was taken.
func checkAndRecoverDeadIntermediateMaster(analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool) (bool, *TopologyRecovery, error) {
	if !(forceInstanceRecovery || analysisEntry.ClusterDetails.HasAutomatedIntermediateMasterRecovery) {
		return false, nil, nil
	}
	topologyRecovery, err := AttemptRecoveryRegistration(&analysisEntry, !forceInstanceRecovery, !forceInstanceRecovery)
	if topologyRecovery == nil {
		log.Debugf("topology_recovery: found an active or recent recovery on %+v. Will not issue another RecoverDeadIntermediateMaster.", analysisEntry.AnalyzedInstanceKey)
		return false, nil, err
	}

	// That's it! We must do recovery!
	recoverDeadIntermediateMasterCounter.Inc(1)
	promotedSlave, err := RecoverDeadIntermediateMaster(topologyRecovery, skipProcesses)
	if promotedSlave != nil {
		// success
		recoverDeadIntermediateMasterSuccessCounter.Inc(1)

		if !skipProcesses {
			// Execute post intermediate-master-failover processes
			topologyRecovery.SuccessorKey = &promotedSlave.Key
			executeProcesses(config.Config.PostIntermediateMasterFailoverProcesses, "PostIntermediateMasterFailoverProcesses", topologyRecovery, false)
		}
	} else {
		recoverDeadIntermediateMasterFailureCounter.Inc(1)
	}
	return true, topologyRecovery, err
}
// SkipQuery skips a single query in a failed replication instance
func SkipQuery(instanceKey *InstanceKey) (*Instance, error) {
	instance, err := ReadTopologyInstance(instanceKey)
	if err != nil {
		return instance, log.Errore(err)
	}

	if !instance.IsSlave() {
		return instance, fmt.Errorf("instance is not a slave: %+v", instanceKey)
	}
	if instance.Slave_SQL_Running {
		return instance, fmt.Errorf("Slave SQL thread is running on %+v", instanceKey)
	}
	if instance.LastSQLError == "" {
		return instance, fmt.Errorf("No SQL error on %+v", instanceKey)
	}

	if *config.RuntimeCLIFlags.Noop {
		return instance, fmt.Errorf("noop: aborting skip-query operation on %+v; signalling error but nothing went wrong.", *instanceKey)
	}

	log.Debugf("Skipping one query on %+v", instanceKey)
	if instance.UsingOracleGTID {
		err = skipQueryOracleGtid(instance)
	} else if instance.UsingMariaDBGTID {
		return instance, log.Errorf("%+v is replicating with MariaDB GTID. To skip a query first disable GTID, then skip, then enable GTID again", *instanceKey)
	} else {
		err = skipQueryClassic(instance)
	}
	if err != nil {
		return instance, log.Errore(err)
	}
	AuditOperation("skip-query", instanceKey, "Skipped one query")
	return StartSlave(instanceKey)
}
// ChangeMasterCredentials issues a CHANGE MASTER TO... MASTER_USER=, MASTER_PASSWORD=...
func ChangeMasterCredentials(instanceKey *InstanceKey, masterUser string, masterPassword string) (*Instance, error) {
	instance, err := ReadTopologyInstance(instanceKey)
	if err != nil {
		return instance, log.Errore(err)
	}
	if masterUser == "" {
		return instance, log.Errorf("Empty user in ChangeMasterCredentials() for %+v", *instanceKey)
	}

	if instance.SlaveRunning() {
		return instance, fmt.Errorf("ChangeMasterTo: Cannot change master on: %+v because slave is running", *instanceKey)
	}
	log.Debugf("ChangeMasterTo: will attempt changing master credentials on %+v", *instanceKey)

	if *config.RuntimeCLIFlags.Noop {
		return instance, fmt.Errorf("noop: aborting CHANGE MASTER TO operation on %+v; signalling error but nothing went wrong.", *instanceKey)
	}
	_, err = ExecInstanceNoPrepare(instanceKey, fmt.Sprintf("change master to master_user='%s', master_password='%s'", masterUser, masterPassword))
	if err != nil {
		return instance, log.Errore(err)
	}

	log.Infof("ChangeMasterTo: Changed master credentials on %+v", *instanceKey)

	instance, err = ReadTopologyInstance(instanceKey)
	return instance, err
}
// StopSlavesNicely will attempt to stop all given slaves nicely, up to timeout
func StopSlavesNicely(slaves [](*Instance), timeout time.Duration) [](*Instance) {
	refreshedSlaves := [](*Instance){}

	log.Debugf("Stopping %d slaves nicely", len(slaves))
	// use concurrency but wait for all to complete
	barrier := make(chan *Instance)
	for _, slave := range slaves {
		slave := slave
		go func() {
			updatedSlave := &slave
			// Signal completed slave
			defer func() { barrier <- *updatedSlave }()
			// Wait your turn to read a slave
			ExecuteOnTopology(func() {
				StopSlaveNicely(&slave.Key, timeout)
				slave, _ = StopSlave(&slave.Key)
				updatedSlave = &slave
			})
		}()
	}
	for range slaves {
		refreshedSlaves = append(refreshedSlaves, <-barrier)
	}

	return refreshedSlaves
}
// ApplyIterationInsertQuery issues a chunk-INSERT query on the ghost table. It is where
// data actually gets copied from original table.
func (this *Applier) ApplyIterationInsertQuery() (chunkSize int64, rowsAffected int64, duration time.Duration, err error) {
	startTime := time.Now()
	chunkSize = atomic.LoadInt64(&this.migrationContext.ChunkSize)

	query, explodedArgs, err := sql.BuildRangeInsertPreparedQuery(
		this.migrationContext.DatabaseName,
		this.migrationContext.OriginalTableName,
		this.migrationContext.GetGhostTableName(),
		this.migrationContext.SharedColumns.Names,
		this.migrationContext.MappedSharedColumns.Names,
		this.migrationContext.UniqueKey.Name,
		this.migrationContext.UniqueKey.Columns.Names,
		this.migrationContext.MigrationIterationRangeMinValues.AbstractValues(),
		this.migrationContext.MigrationIterationRangeMaxValues.AbstractValues(),
		this.migrationContext.GetIteration() == 0,
		this.migrationContext.IsTransactionalTable(),
	)
	if err != nil {
		return chunkSize, rowsAffected, duration, err
	}
	sqlResult, err := sqlutils.Exec(this.db, query, explodedArgs...)
	if err != nil {
		return chunkSize, rowsAffected, duration, err
	}
	rowsAffected, _ = sqlResult.RowsAffected()
	duration = time.Now().Sub(startTime)
	log.Debugf(
		"Issued INSERT on range: [%s]..[%s]; iteration: %d; chunk-size: %d",
		this.migrationContext.MigrationIterationRangeMinValues,
		this.migrationContext.MigrationIterationRangeMaxValues,
		this.migrationContext.GetIteration(),
		chunkSize)
	return chunkSize, rowsAffected, duration, nil
}
// StopSlavesNicely will attempt to stop all given replicas nicely, up to timeout
func StopSlavesNicely(replicas [](*Instance), timeout time.Duration) [](*Instance) {
	refreshedReplicas := [](*Instance){}

	log.Debugf("Stopping %d replicas nicely", len(replicas))
	// use concurrency but wait for all to complete
	barrier := make(chan *Instance)
	for _, replica := range replicas {
		replica := replica
		go func() {
			updatedReplica := &replica
			// Signal completed replica
			defer func() { barrier <- *updatedReplica }()
			// Wait your turn to read a replica
			ExecuteOnTopology(func() {
				StopSlaveNicely(&replica.Key, timeout)
				replica, _ = StopSlave(&replica.Key)
				updatedReplica = &replica
			})
		}()
	}
	for range replicas {
		refreshedReplicas = append(refreshedReplicas, <-barrier)
	}

	return refreshedReplicas
}
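// A minimal, generic sketch (not from the original source) of the fan-out/barrier pattern used by
// both StopSlavesNicely variants above: launch one goroutine per item, then collect exactly one
// result per item from an unbuffered channel before returning. All names here are illustrative only.
func forEachConcurrently(items []string, work func(item string) string) []string {
	results := []string{}
	barrier := make(chan string)
	for _, item := range items {
		item := item // capture the loop variable per goroutine
		go func() {
			barrier <- work(item)
		}()
	}
	// Receive exactly len(items) results; this blocks until every goroutine has reported.
	for range items {
		results = append(results, <-barrier)
	}
	return results
}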
// validateTableForeignKeys makes sure no foreign keys exist on the migrated table
func (this *Inspector) validateTableForeignKeys() error {
	query := `
		SELECT COUNT(*) AS num_foreign_keys
		FROM INFORMATION_SCHEMA.KEY_COLUMN_USAGE
		WHERE
			REFERENCED_TABLE_NAME IS NOT NULL
			AND ((TABLE_SCHEMA=? AND TABLE_NAME=?)
				OR (REFERENCED_TABLE_SCHEMA=? AND REFERENCED_TABLE_NAME=?)
			)
	`
	numForeignKeys := 0
	err := sqlutils.QueryRowsMap(this.db, query, func(rowMap sqlutils.RowMap) error {
		numForeignKeys = rowMap.GetInt("num_foreign_keys")
		return nil
	},
		this.migrationContext.DatabaseName,
		this.migrationContext.OriginalTableName,
		this.migrationContext.DatabaseName,
		this.migrationContext.OriginalTableName,
	)
	if err != nil {
		return err
	}
	if numForeignKeys > 0 {
		return log.Errorf("Found %d foreign keys on %s.%s. Foreign keys are not supported. Bailing out", numForeignKeys, sql.EscapeName(this.migrationContext.DatabaseName), sql.EscapeName(this.migrationContext.OriginalTableName))
	}
	log.Debugf("Validated no foreign keys exist on table")
	return nil
}
// nextRealEvent returns the next event from binlog that is not a meta/control event (these are start-of-binary-log,
// rotate-binary-log etc.)
func (this *BinlogEventCursor) nextRealEvent(recursionLevel int) (*BinlogEvent, error) {
	if recursionLevel > maxEmptyEventsEvents {
		log.Debugf("End of real events")
		return nil, nil
	}
	event, err := this.nextEvent(0)
	if err != nil {
		return event, err
	}
	if event == nil {
		return event, err
	}

	if _, found := skippedEventTypes[event.EventType]; found {
		// Recursion will not be deep here. A few entries (end-of-binlog followed by start-of-bin-log) are possible,
		// but we really don't expect a huge sequence of those.
		return this.nextRealEvent(recursionLevel + 1)
	}
	for _, skipSubstring := range config.Config.SkipBinlogEventsContaining {
		if strings.Index(event.Info, skipSubstring) >= 0 {
			// Recursion might go deeper here.
			return this.nextRealEvent(recursionLevel + 1)
		}
	}
	event.NormalizeInfo()
	return event, err
}
// executeAgentCommand requests an agent to execute a command via HTTP api
func executeAgentCommand(hostname string, command string, onResponse *func([]byte)) (Agent, error) {
	agent, token, err := readAgentBasicInfo(hostname)
	if err != nil {
		return agent, err
	}

	// All seems to be in order. Now make some inquiries from orchestrator-agent service:
	uri := baseAgentUri(agent.Hostname, agent.Port)

	var fullCommand string
	if strings.Contains(command, "?") {
		fullCommand = fmt.Sprintf("%s&token=%s", command, token)
	} else {
		fullCommand = fmt.Sprintf("%s?token=%s", command, token)
	}
	log.Debugf("orchestrator-agent command: %s", fullCommand)
	agentCommandUri := fmt.Sprintf("%s/%s", uri, fullCommand)

	body, err := readResponse(httpGet(agentCommandUri))
	if err != nil {
		return agent, log.Errore(err)
	}
	if onResponse != nil {
		(*onResponse)(body)
	}
	auditAgentOperation("agent-command", &agent, command)

	return agent, err
}
// checkAndRecoverDeadCoMaster checks a given analysis, decides whether to take action, and possibly takes action
// Returns true when action was taken.
func checkAndRecoverDeadCoMaster(analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool) (bool, *TopologyRecovery, error) {
	failedInstanceKey := &analysisEntry.AnalyzedInstanceKey
	if !(forceInstanceRecovery || analysisEntry.ClusterDetails.HasAutomatedMasterRecovery) {
		return false, nil, nil
	}
	topologyRecovery, err := AttemptRecoveryRegistration(&analysisEntry, !forceInstanceRecovery, !forceInstanceRecovery)
	if topologyRecovery == nil {
		log.Debugf("topology_recovery: found an active or recent recovery on %+v. Will not issue another RecoverDeadCoMaster.", analysisEntry.AnalyzedInstanceKey)
		return false, nil, err
	}

	// That's it! We must do recovery!
	recoverDeadCoMasterCounter.Inc(1)
	coMaster, lostSlaves, err := RecoverDeadCoMaster(topologyRecovery, skipProcesses)
	ResolveRecovery(topologyRecovery, coMaster)
	if coMaster == nil {
		inst.AuditOperation("recover-dead-co-master", failedInstanceKey, "Failure: no slave promoted.")
	} else {
		inst.AuditOperation("recover-dead-co-master", failedInstanceKey, fmt.Sprintf("promoted co-master: %+v", coMaster.Key))
	}
	topologyRecovery.LostSlaves.AddInstances(lostSlaves)
	if coMaster != nil {
		// success
		recoverDeadCoMasterSuccessCounter.Inc(1)

		if !skipProcesses {
			// Execute post master-failover processes
			topologyRecovery.SuccessorKey = &coMaster.Key
			executeProcesses(config.Config.PostMasterFailoverProcesses, "PostMasterFailoverProcesses", topologyRecovery, false)
		}
	} else {
		recoverDeadCoMasterFailureCounter.Inc(1)
	}

	return true, topologyRecovery, err
}
// CheckAndRecover is the main entry point for the recovery mechanism
func CheckAndRecover(specificInstance *inst.InstanceKey, candidateInstanceKey *inst.InstanceKey, skipProcesses bool) (recoveryAttempted bool, promotedSlaveKey *inst.InstanceKey, err error) {
	replicationAnalysis, err := inst.GetReplicationAnalysis("", true, true)
	if err != nil {
		return false, nil, log.Errore(err)
	}
	if *config.RuntimeCLIFlags.Noop {
		log.Debugf("--noop provided; will not execute processes")
		skipProcesses = true
	}
	for _, analysisEntry := range replicationAnalysis {
		if specificInstance != nil {
			// We are looking for a specific instance; if this is not the one, skip!
			if !specificInstance.Equals(&analysisEntry.AnalyzedInstanceKey) {
				continue
			}
		}
		if analysisEntry.IsDowntimed && specificInstance == nil {
			// Only recover a downtimed server if explicitly requested
			continue
		}

		if specificInstance != nil {
			// force mode. Keep it synchronous
			var topologyRecovery *TopologyRecovery
			recoveryAttempted, topologyRecovery, err = executeCheckAndRecoverFunction(analysisEntry, candidateInstanceKey, true, skipProcesses)
			if topologyRecovery != nil {
				promotedSlaveKey = topologyRecovery.SuccessorKey
			}
		} else {
			go executeCheckAndRecoverFunction(analysisEntry, candidateInstanceKey, false, skipProcesses)
		}
	}
	return recoveryAttempted, promotedSlaveKey, err
}
// onChangelogStateEvent is called when a binlog event operation on the changelog table is intercepted.
func (this *Migrator) onChangelogStateEvent(dmlEvent *binlog.BinlogDMLEvent) (err error) {
	// Hey, I created the changelog table, I know the type of columns it has!
	if hint := dmlEvent.NewColumnValues.StringColumn(2); hint != "state" {
		return nil
	}
	changelogState := ChangelogState(dmlEvent.NewColumnValues.StringColumn(3))
	switch changelogState {
	case TablesInPlace:
		{
			this.tablesInPlace <- true
		}
	case AllEventsUpToLockProcessed:
		{
			applyEventFunc := func() error {
				this.allEventsUpToLockProcessed <- true
				return nil
			}
			// at this point we know all events up to lock have been read from the streamer,
			// because the streamer works sequentially. So those events are either already handled,
			// or have event functions in applyEventsQueue.
			// So as not to create a potential deadlock, we write this func to applyEventsQueue
			// asynchronously, understanding it doesn't really matter.
			go func() {
				this.applyEventsQueue <- applyEventFunc
			}()
		}
	default:
		{
			return fmt.Errorf("Unknown changelog state: %+v", changelogState)
		}
	}
	log.Debugf("Received state %+v", changelogState)
	return nil
}
// validateTable makes sure the table we need to operate on actually exists
func (this *Inspector) validateTable() error {
	query := fmt.Sprintf(`show /* gh-ost */ table status from %s like '%s'`, sql.EscapeName(this.migrationContext.DatabaseName), this.migrationContext.OriginalTableName)

	tableFound := false
	err := sqlutils.QueryRowsMap(this.db, query, func(rowMap sqlutils.RowMap) error {
		this.migrationContext.TableEngine = rowMap.GetString("Engine")
		this.migrationContext.RowsEstimate = rowMap.GetInt64("Rows")
		this.migrationContext.UsedRowsEstimateMethod = base.TableStatusRowsEstimate
		if rowMap.GetString("Comment") == "VIEW" {
			return fmt.Errorf("%s.%s is a VIEW, not a real table. Bailing out", sql.EscapeName(this.migrationContext.DatabaseName), sql.EscapeName(this.migrationContext.OriginalTableName))
		}
		tableFound = true

		return nil
	})
	if err != nil {
		return err
	}
	if !tableFound {
		return log.Errorf("Cannot find table %s.%s!", sql.EscapeName(this.migrationContext.DatabaseName), sql.EscapeName(this.migrationContext.OriginalTableName))
	}
	log.Infof("Table found. Engine=%s", this.migrationContext.TableEngine)
	log.Debugf("Estimated number of rows via STATUS: %d", this.migrationContext.RowsEstimate)
	return nil
}
// checkAndRecoverDeadMaster checks a given analysis, decides whether to take action, and possibly takes action
// Returns true when action was taken.
func checkAndRecoverDeadMaster(analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, skipFilters bool, skipProcesses bool) (bool, *inst.Instance, error) {
	if !(skipFilters || analysisEntry.ClusterDetails.HasAutomatedMasterRecovery) {
		return false, nil, nil
	}

	// Let's do dead master recovery!
	log.Debugf("topology_recovery: will handle DeadMaster event on %+v", analysisEntry.ClusterDetails.ClusterName)
	promotedSlave, lostSlaves, err := RecoverDeadMaster(analysisEntry, skipProcesses)

	if promotedSlave != nil {
		promotedSlave, _ = replacePromotedSlaveWithCandidate(&analysisEntry.AnalyzedInstanceKey, promotedSlave, candidateInstanceKey)
	}
	if promotedSlave != nil {
		ResolveRecovery(&analysisEntry.AnalyzedInstanceKey, &promotedSlave.Key)
		if !skipProcesses {
			// Execute post master-failover processes
			executeProcesses(config.Config.PostMasterFailoverProcesses, "PostMasterFailoverProcesses", analysisEntry, promotedSlave, lostSlaves, false)
		}
	} else {
		// Failure
		ResolveRecovery(&analysisEntry.AnalyzedInstanceKey, nil)
	}

	return (promotedSlave != nil), promotedSlave, err
}
// deployIfNotAlreadyDeployed will issue given SQL queries that are not already known to be deployed.
// This iterates both lists (to-run and already-deployed) and also verifies no contradictions.
func deployIfNotAlreadyDeployed(db *sql.DB, queries []string, deployedQueries []string, deploymentType string, fatalOnError bool) error {
	for i, query := range queries {
		queryAlreadyExecuted := false
		// While iterating 'queries', also iterate 'deployedQueries'. Expect identity
		if len(deployedQueries) > i {
			if deployedQueries[i] != query {
				log.Fatalf("initOrchestratorDB() PANIC: non matching %s queries between deployment requests and _orchestrator_db_deployment. Please execute 'orchestrator -c reset-internal-db-deployment'", deploymentType)
			}
			queryAlreadyExecuted = true
		}
		if queryAlreadyExecuted {
			continue
		}
		if config.Config.SmartOrchestratorDatabaseUpdate {
			log.Debugf("initOrchestratorDB executing: %.80s", strings.TrimSpace(strings.Replace(query, "\n", "", -1)))
		}

		if fatalOnError {
			if _, err := execInternal(db, query); err != nil {
				return log.Fatalf("Cannot initiate orchestrator: %+v", err)
			}
		} else {
			execInternalSilently(db, query)
		}
		writeInternalDeployment(db, deploymentType, query, i)
	}
	return nil
}
func InitGraphiteMetrics() error {
	if config.Config.GraphiteAddr == "" {
		return nil
	}
	if config.Config.GraphitePath == "" {
		return log.Errorf("No graphite path provided (see GraphitePath config variable). Will not log to graphite")
	}
	addr, err := net.ResolveTCPAddr("tcp", config.Config.GraphiteAddr)
	if err != nil {
		return log.Errore(err)
	}
	graphitePathHostname := process.ThisHostname
	if config.Config.GraphiteConvertHostnameDotsToUnderscores {
		graphitePathHostname = strings.Replace(graphitePathHostname, ".", "_", -1)
	}
	graphitePath := config.Config.GraphitePath
	graphitePath = strings.Replace(graphitePath, "{hostname}", graphitePathHostname, -1)

	log.Debugf("Will log to graphite on %+v, %+v", config.Config.GraphiteAddr, graphitePath)

	go func() {
		go graphite.Graphite(metrics.DefaultRegistry, 1*time.Minute, graphitePath, addr)
		for range graphiteCallbackTick {
			for _, f := range graphiteTickCallbacks {
				go f()
			}
		}
	}()

	return nil
}
func SubmitAgent() error {
	hostname, err := osagent.Hostname()
	if err != nil {
		return log.Errore(err)
	}

	url := fmt.Sprintf("%s/api/submit-agent/%s/%d/%s", config.Config.AgentsServer, hostname, config.Config.HTTPPort, ProcessToken.Hash)
	log.Debugf("Submitting this agent: %s", url)

	response, err := httpGet(url)
	if err != nil {
		return log.Errore(err)
	}

	log.Debugf("response: %+v", response)
	return err
}
// read reads configuration from given file, or silently skips if the file does not exist.
// If the file does exist, then it is expected to be in valid JSON format or the function bails out.
func read(file_name string) (*Configuration, error) {
	file, err := os.Open(file_name)
	if err == nil {
		decoder := json.NewDecoder(file)
		err := decoder.Decode(Config)
		if err == nil {
			log.Infof("Read config: %s", file_name)
		} else {
			log.Fatal("Cannot read config file:", file_name, err)
		}
		if Config.MySQLOrchestratorCredentialsConfigFile != "" {
			mySQLConfig := struct {
				Client struct {
					User     string
					Password string
				}
			}{}
			err := gcfg.ReadFileInto(&mySQLConfig, Config.MySQLOrchestratorCredentialsConfigFile)
			if err != nil {
				log.Fatalf("Failed to parse gcfg data from file: %+v", err)
			} else {
				log.Debugf("Parsed orchestrator credentials from %s", Config.MySQLOrchestratorCredentialsConfigFile)
				Config.MySQLOrchestratorUser = mySQLConfig.Client.User
				Config.MySQLOrchestratorPassword = mySQLConfig.Client.Password
			}
		}
		if Config.MySQLTopologyCredentialsConfigFile != "" {
			mySQLConfig := struct {
				Client struct {
					User     string
					Password string
				}
			}{}
			err := gcfg.ReadFileInto(&mySQLConfig, Config.MySQLTopologyCredentialsConfigFile)
			if err != nil {
				log.Fatalf("Failed to parse gcfg data from file: %+v", err)
			} else {
				log.Debugf("Parsed topology credentials from %s", Config.MySQLTopologyCredentialsConfigFile)
				Config.MySQLTopologyUser = mySQLConfig.Client.User
				Config.MySQLTopologyPassword = mySQLConfig.Client.Password
			}
		}
	}
	return Config, err
}