// CheckAndRecover is the main entry point for the recovery mechanism
func CheckAndRecover(specificInstance *inst.InstanceKey, candidateInstanceKey *inst.InstanceKey, skipFilters bool, skipProcesses bool) (actionTaken bool, instance *inst.Instance, err error) {
	replicationAnalysis, err := inst.GetReplicationAnalysis(true)
	if err != nil {
		return false, nil, log.Errore(err)
	}
	for _, analysisEntry := range replicationAnalysis {
		if specificInstance != nil {
			// We are looking for a specific instance; if this is not the one, skip!
			if !specificInstance.Equals(&analysisEntry.AnalyzedInstanceKey) {
				continue
			}
		}
		if analysisEntry.IsDowntimed && specificInstance == nil {
			// Only recover a downtimed server if explicitly requested
			continue
		}

		if specificInstance != nil && skipFilters {
			// force mode. Keep it synchronous
			actionTaken, instance, err = executeCheckAndRecoverFunction(analysisEntry, candidateInstanceKey, skipFilters, skipProcesses)
		} else {
			go executeCheckAndRecoverFunction(analysisEntry, candidateInstanceKey, skipFilters, skipProcesses)
		}
	}
	return actionTaken, instance, err
}
// CheckAndRecover is the main entry point for the recovery mechanism
func CheckAndRecover(specificInstance *inst.InstanceKey, candidateInstanceKey *inst.InstanceKey, skipProcesses bool) (recoveryAttempted bool, promotedSlaveKey *inst.InstanceKey, err error) {
	replicationAnalysis, err := inst.GetReplicationAnalysis("", true, true)
	if err != nil {
		return false, nil, log.Errore(err)
	}
	if *config.RuntimeCLIFlags.Noop {
		log.Debugf("--noop provided; will not execute processes")
		skipProcesses = true
	}
	for _, analysisEntry := range replicationAnalysis {
		if specificInstance != nil {
			// We are looking for a specific instance; if this is not the one, skip!
			if !specificInstance.Equals(&analysisEntry.AnalyzedInstanceKey) {
				continue
			}
		}
		if analysisEntry.IsDowntimed && specificInstance == nil {
			// Only recover a downtimed server if explicitly requested
			continue
		}

		if specificInstance != nil {
			// force mode. Keep it synchronous
			var topologyRecovery *TopologyRecovery
			recoveryAttempted, topologyRecovery, err = executeCheckAndRecoverFunction(analysisEntry, candidateInstanceKey, true, skipProcesses)
			if topologyRecovery != nil {
				promotedSlaveKey = topologyRecovery.SuccessorKey
			}
		} else {
			go executeCheckAndRecoverFunction(analysisEntry, candidateInstanceKey, false, skipProcesses)
		}
	}
	return recoveryAttempted, promotedSlaveKey, err
}
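// A hedged usage sketch, not part of the original source: it shows how a caller inside
// this package might invoke the later CheckAndRecover above. The hostname/port values
// are hypothetical, and inst.InstanceKey is assumed to expose Hostname/Port fields.
func checkAndRecoverUsageSketch() {
	failedKey := &inst.InstanceKey{Hostname: "db-master-1.example.com", Port: 3306}

	// Naming a specific instance forces a synchronous recovery and reports the promoted slave:
	recoveryAttempted, promotedSlaveKey, err := CheckAndRecover(failedKey, nil, false)
	if err != nil {
		log.Errore(err)
		return
	}
	if recoveryAttempted && promotedSlaveKey != nil {
		log.Debugf("recovery attempted; promoted slave: %+v", *promotedSlaveKey)
	}

	// Passing nil instead scans every analysis entry and recovers asynchronously, in goroutines:
	CheckAndRecover(nil, nil, false)
}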
// Force a re-read of a topology instance; this is done because we need to substantiate a suspicion
// that we may have a failover scenario. We want to speed up reading the complete picture.
func emergentlyReadTopologyInstance(instanceKey *inst.InstanceKey, analysisCode inst.AnalysisCode) {
	if err := emergencyReadTopologyInstanceMap.Add(instanceKey.DisplayString(), true, 0); err == nil {
		emergencyReadTopologyInstanceMap.Set(instanceKey.DisplayString(), true, 0)
		go inst.ExecuteOnTopology(func() {
			inst.ReadTopologyInstance(instanceKey)
			inst.AuditOperation("emergently-read-topology-instance", instanceKey, string(analysisCode))
		})
	}
}
// Force a re-read of a topology instance; this is done because we need to substantiate a suspicion
// that we may have a failover scenario. We want to speed up reading the complete picture.
func emergentlyReadTopologyInstance(instanceKey *inst.InstanceKey, analysisCode inst.AnalysisCode) {
	if existsInCacheError := emergencyReadTopologyInstanceMap.Add(instanceKey.DisplayString(), true, cache.DefaultExpiration); existsInCacheError != nil {
		// Just recently attempted
		return
	}
	go inst.ExecuteOnTopology(func() {
		inst.ReadTopologyInstance(instanceKey)
		inst.AuditOperation("emergently-read-topology-instance", instanceKey, string(analysisCode))
	})
}
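// A self-contained sketch, assuming github.com/patrickmn/go-cache (which the code above
// appears to use): Add() returns an error when the key already exists and is unexpired,
// so a single call doubles as a "did we just do this?" guard, with no separate Set needed.
// The 10-second window and all names below are illustrative, not from the original source.
package main

import (
	"fmt"
	"time"

	"github.com/patrickmn/go-cache"
)

// recentOps remembers keys for 10 seconds; expired entries are purged every second.
var recentOps = cache.New(10*time.Second, time.Second)

func maybeReadInstance(key string) {
	// Add fails if the key is already cached: treat that as "just recently attempted".
	if existsInCacheError := recentOps.Add(key, true, cache.DefaultExpiration); existsInCacheError != nil {
		fmt.Printf("%s: just recently attempted; skipping\n", key)
		return
	}
	fmt.Printf("%s: performing the emergency read\n", key)
}

func main() {
	maybeReadInstance("db1:3306") // performs the read
	maybeReadInstance("db1:3306") // skipped: still inside the 10s window
}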
// discoverInstance will attempt discovering an instance (unless it is already up to date) and will
// list down its master and slaves (if any) for further discovery.
func discoverInstance(instanceKey inst.InstanceKey) {
	start := time.Now()
	instanceKey.Formalize()
	if !instanceKey.IsValid() {
		return
	}
	if existsInCacheError := recentDiscoveryOperationKeys.Add(instanceKey.DisplayString(), true, cache.DefaultExpiration); existsInCacheError != nil {
		// Just recently attempted
		return
	}

	instance, found, err := inst.ReadInstance(&instanceKey)

	if found && instance.IsUpToDate && instance.IsLastCheckValid {
		// we've already discovered this one. Skip!
		return
	}
	discoveriesCounter.Inc(1)
	// First we've ever heard of this instance. Continue investigation:
	instance, err = inst.ReadTopologyInstance(&instanceKey)
	// panic can occur (IO stuff). Therefore it may happen
	// that instance is nil. Check it.
	if instance == nil {
		failedDiscoveriesCounter.Inc(1)
		log.Warningf("discoverInstance(%+v) instance is nil in %.3fs, error=%+v", instanceKey, time.Since(start).Seconds(), err)
		return
	}

	log.Debugf("Discovered host: %+v, master: %+v, version: %+v in %.3fs", instance.Key, instance.MasterKey, instance.Version, time.Since(start).Seconds())

	if atomic.LoadInt64(&isElectedNode) == 0 {
		// Maybe this node was elected before, but isn't elected anymore.
		// If not elected, stop drilling up/down the topology
		return
	}

	// Investigate slaves:
	for _, slaveKey := range instance.SlaveHosts.GetInstanceKeys() {
		slaveKey := slaveKey
		if slaveKey.IsValid() {
			discoveryQueue.Push(slaveKey)
		}
	}
	// Investigate master:
	if instance.MasterKey.IsValid() {
		discoveryQueue.Push(instance.MasterKey)
	}
}
// discoverInstance will attempt discovering an instance (unless it is already up to date) and will
// list down its master and slaves (if any) for further discovery.
func discoverInstance(instanceKey inst.InstanceKey) {
	instanceKey.Formalize()
	if !instanceKey.IsValid() {
		return
	}
	if existsInCacheError := recentDiscoveryOperationKeys.Add(instanceKey.DisplayString(), true, cache.DefaultExpiration); existsInCacheError != nil {
		// Just recently attempted
		return
	}

	instance, found, err := inst.ReadInstance(&instanceKey)

	if found && instance.IsUpToDate && instance.IsLastCheckValid {
		// we've already discovered this one. Skip!
		return
	}
	discoveriesCounter.Inc(1)
	// First we've ever heard of this instance. Continue investigation:
	instance, err = inst.ReadTopologyInstance(&instanceKey)
	// panic can occur (IO stuff). Therefore it may happen
	// that instance is nil. Check it.
	if instance == nil {
		failedDiscoveriesCounter.Inc(1)
		log.Warningf("instance is nil in discoverInstance. key=%+v, error=%+v", instanceKey, err)
		return
	}

	log.Debugf("Discovered host: %+v, master: %+v", instance.Key, instance.MasterKey)

	if !isElectedNode {
		// Maybe this node was elected before, but isn't elected anymore.
		// If not elected, stop drilling down to further investigate slaves.
		return
	}

	// Investigate slaves:
	for _, slaveKey := range instance.SlaveHosts.GetInstanceKeys() {
		discoveryInstanceKeys <- slaveKey
	}
	// Investigate master:
	discoveryInstanceKeys <- instance.MasterKey
}
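// A minimal sketch, assumed rather than taken from the original source, of the two ideas
// the versions above differ on: a channel-fed discovery queue drained by worker goroutines,
// and an election flag read atomically (as in the atomic.LoadInt64 variant) rather than as
// a plain bool, which is racy under concurrent writes. All names here are hypothetical.
package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

var elected int64 = 1 // 1 while this node holds leadership

func main() {
	// Stands in for discoveryInstanceKeys: discovered keys queue up here.
	keys := make(chan string, 16)
	var wg sync.WaitGroup

	// A small worker pool drains the channel.
	for i := 0; i < 3; i++ {
		wg.Add(1)
		go func(id int) {
			defer wg.Done()
			for key := range keys {
				if atomic.LoadInt64(&elected) == 0 {
					// No longer the elected node: stop expanding the topology.
					continue
				}
				fmt.Printf("worker %d discovering %s\n", id, key)
			}
		}(i)
	}

	// Discovering one instance pushes its master and slaves for further discovery:
	for _, key := range []string{"db-master:3306", "db-slave-1:3306", "db-slave-2:3306"} {
		keys <- key
	}
	close(keys)
	wg.Wait()
}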