// ContinuousDiscovery starts an asynchronous infinite discovery process where instances are
// periodically investigated and their status captured, and long since unseen instances are
// purged and forgotten.
func ContinuousDiscovery() {
	if config.Config.DatabaselessMode__experimental {
		log.Fatal("Cannot execute continuous mode in databaseless mode")
	}
	log.Infof("Starting continuous discovery")
	recentDiscoveryOperationKeys = cache.New(time.Duration(config.Config.InstancePollSeconds)*time.Second, time.Second)
	inst.LoadHostnameResolveCache()
	go handleDiscoveryRequests()

	discoveryTick := time.Tick(time.Duration(config.Config.GetDiscoveryPollSeconds()) * time.Second)
	instancePollTick := time.Tick(time.Duration(config.Config.InstancePollSeconds) * time.Second)
	caretakingTick := time.Tick(time.Minute)
	recoveryTick := time.Tick(time.Duration(config.Config.RecoveryPollSeconds) * time.Second)
	var snapshotTopologiesTick <-chan time.Time
	if config.Config.SnapshotTopologiesIntervalHours > 0 {
		snapshotTopologiesTick = time.Tick(time.Duration(config.Config.SnapshotTopologiesIntervalHours) * time.Hour)
	}

	go ometrics.InitGraphiteMetrics()
	go acceptSignals()

	if *config.RuntimeCLIFlags.GrabElection {
		process.GrabElection()
	}
	for {
		select {
		case <-discoveryTick:
			go func() {
				wasAlreadyElected := atomic.LoadInt64(&isElectedNode)
				myIsElectedNode, err := process.AttemptElection()
				if err != nil {
					log.Errore(err)
				}
				if myIsElectedNode {
					atomic.StoreInt64(&isElectedNode, 1)
				} else {
					atomic.StoreInt64(&isElectedNode, 0)
				}
				if myIsElectedNode {
					instanceKeys, err := inst.ReadOutdatedInstanceKeys()
					if err != nil {
						log.Errore(err)
					}
					log.Debugf("outdated keys: %+v", instanceKeys)
					for _, instanceKey := range instanceKeys {
						instanceKey := instanceKey // shadow loop variable (pre-Go 1.22 capture semantics)
						if instanceKey.IsValid() {
							discoveryQueue.Push(instanceKey)
						}
					}
					if wasAlreadyElected == 0 {
						// Just turned to be leader!
						go process.RegisterNode("", "", false)
					}
				} else {
					log.Debugf("Not elected as active node; polling")
				}
			}()
		case <-instancePollTick:
			go func() {
				// This tick does NOT do instance poll (these are handled by the oversampling discoveryTick),
				// but rather invokes such routine operations that need to be as (or roughly as) frequent
				// as instance poll.
				if atomic.LoadInt64(&isElectedNode) == 1 {
					go inst.UpdateInstanceRecentRelaylogHistory()
					go inst.RecordInstanceCoordinatesHistory()
				}
			}()
		case <-caretakingTick:
			// Various periodic internal maintenance tasks
			go func() {
				if atomic.LoadInt64(&isElectedNode) == 1 {
					go inst.RecordInstanceBinlogFileHistory()
					go inst.ForgetLongUnseenInstances()
					go inst.ForgetUnseenInstancesDifferentlyResolved()
					go inst.ForgetExpiredHostnameResolves()
					go inst.DeleteInvalidHostnameResolves()
					go inst.ReviewUnseenInstances()
					go inst.InjectUnseenMasters()
					go inst.ResolveUnknownMasterHostnameResolves()
					go inst.UpdateClusterAliases()
					go inst.ExpireMaintenance()
					go inst.ExpireDowntime()
					go inst.ExpireCandidateInstances()
					go inst.ExpireHostnameUnresolve()
					go inst.ExpireClusterDomainName()
					go inst.ExpireAudit()
					go inst.ExpireMasterPositionEquivalence()
					go inst.ExpirePoolInstances()
					go inst.FlushNontrivialResolveCacheToDatabase()
					go process.ExpireNodesHistory()
					go process.ExpireAccessTokens()
				} else {
					// Take this opportunity to refresh yourself
					go inst.LoadHostnameResolveCache()
				}
			}()
		case <-recoveryTick:
			go func() {
				if atomic.LoadInt64(&isElectedNode) == 1 {
					go ClearActiveFailureDetections()
					go ClearActiveRecoveries()
					go ExpireBlockedRecoveries()
					go AcknowledgeCrashedRecoveries()
					go inst.ExpireInstanceAnalysisChangelog()
					go CheckAndRecover(nil, nil, false)
				}
			}()
		case <-snapshotTopologiesTick:
			go func() {
				go inst.SnapshotTopologies()
			}()
		}
	}
}
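// In the version above, the elected-leader state is shared between the discovery,
// caretaking and recovery goroutines, so it is read and written via sync/atomic
// rather than as a plain bool. A minimal standalone sketch of that flag pattern
// (hypothetical names, not part of the original file; assumes "sync/atomic" is imported):

var exampleElectedFlag int64 // 1 when this node is the elected leader, 0 otherwise

// exampleSetElected publishes the election outcome to other goroutines.
func exampleSetElected(elected bool) {
	if elected {
		atomic.StoreInt64(&exampleElectedFlag, 1)
	} else {
		atomic.StoreInt64(&exampleElectedFlag, 0)
	}
}

// exampleIsElected is safe to call concurrently from any goroutine.
func exampleIsElected() bool {
	return atomic.LoadInt64(&exampleElectedFlag) == 1
}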
// ContinuousDiscovery starts an asynchronous infinite discovery process where instances are
// periodically investigated and their status captured, and long since unseen instances are
// purged and forgotten.
func ContinuousDiscovery() {
	if config.Config.DatabaselessMode__experimental {
		log.Fatal("Cannot execute continuous mode in databaseless mode")
	}
	log.Infof("Starting continuous discovery")
	inst.LoadHostnameResolveCacheFromDatabase()
	go handleDiscoveryRequests()

	discoveryTick := time.Tick(time.Duration(config.Config.DiscoveryPollSeconds) * time.Second)
	caretakingTick := time.Tick(time.Minute)
	recoveryTick := time.Tick(time.Duration(config.Config.RecoveryPollSeconds) * time.Second)
	var snapshotTopologiesTick <-chan time.Time
	if config.Config.SnapshotTopologiesIntervalHours > 0 {
		snapshotTopologiesTick = time.Tick(time.Duration(config.Config.SnapshotTopologiesIntervalHours) * time.Hour)
	}

	go ometrics.InitGraphiteMetrics()
	go acceptSignals()

	for {
		select {
		case <-discoveryTick:
			go func() {
				wasAlreadyElected := isElectedNode
				if isElectedNode, _ = process.AttemptElection(); isElectedNode {
					instanceKeys, _ := inst.ReadOutdatedInstanceKeys()
					log.Debugf("outdated keys: %+v", instanceKeys)
					for _, instanceKey := range instanceKeys {
						instanceKey := instanceKey // shadow the loop variable so each goroutine sends its own key
						go func() {
							discoveryInstanceKeys <- instanceKey
						}()
					}
					if !wasAlreadyElected {
						// Just turned to be leader!
						go process.RegisterNode("", "", false)
					}
				} else {
					log.Debugf("Not elected as active node; polling")
				}
			}()
		case <-caretakingTick:
			// Various periodic internal maintenance tasks
			go func() {
				if isElectedNode {
					go inst.ForgetLongUnseenInstances()
					go inst.ForgetUnseenInstancesDifferentlyResolved()
					go inst.ForgetExpiredHostnameResolves()
					go inst.DeleteInvalidHostnameResolves()
					go inst.ReviewUnseenInstances()
					go inst.InjectUnseenMasters()
					go inst.ResolveUnknownMasterHostnameResolves()
					go inst.UpdateClusterAliases()
					go inst.ExpireMaintenance()
					go inst.ExpireDowntime()
					go inst.ExpireCandidateInstances()
					go inst.ExpireHostnameUnresolve()
					go inst.ExpireClusterDomainName()
					go inst.ExpireAudit()
					go inst.ExpireMasterPositionEquivalence()
					go inst.FlushNontrivialResolveCacheToDatabase()
					go process.ExpireNodesHistory()
				}
				if !isElectedNode {
					// Take this opportunity to refresh yourself
					go inst.LoadHostnameResolveCacheFromDatabase()
				}
				go inst.ReadClusterAliases()
			}()
		case <-recoveryTick:
			go func() {
				if isElectedNode {
					go ClearActiveFailureDetections()
					go ClearActiveRecoveries()
					go ExpireBlockedRecoveries()
					go AcknowledgeCrashedRecoveries()
					go inst.ExpireInstanceAnalysisChangelog()
					go CheckAndRecover(nil, nil, false)
				}
			}()
		case <-snapshotTopologiesTick:
			go func() {
				go inst.SnapshotTopologies()
			}()
		}
	}
}
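// Pre-Go 1.22, a closure launched inside a range loop captures the loop variable
// itself rather than its per-iteration value, so without the `instanceKey := instanceKey`
// shadowing above, every goroutine could send the same (last) key. A minimal
// standalone sketch of the pitfall and its fix (hypothetical helper, not part of
// the original file; assumes "fmt" and "sync" are imported):
func exampleLoopVariableCapture() {
	keys := []string{"a", "b", "c"}
	var wg sync.WaitGroup
	for _, k := range keys {
		k := k // without this shadow, all goroutines may print "c" (pre-Go 1.22)
		wg.Add(1)
		go func() {
			defer wg.Done()
			fmt.Println(k) // with the shadow, each key prints exactly once
		}()
	}
	wg.Wait()
}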
// DiscoverInstance will attempt discovering an instance (unless it is already up to date) and will
// list down its master and slaves (if any) for further discovery.
func DiscoverInstance(instanceKey inst.InstanceKey) {
	instanceKey.Formalize()
	if !instanceKey.IsValid() {
		return
	}

	instance, found, err := inst.ReadInstance(&instanceKey)

	if found && instance.IsUpToDate && instance.IsLastCheckValid {
		// we've already discovered this one. Skip!
		goto Cleanup
	}
	// First we've ever heard of this instance. Continue investigation:
	instance, err = inst.ReadTopologyInstance(&instanceKey)
	// panic can occur (IO stuff). Therefore it may happen
	// that instance is nil. Check it.
	if err != nil || instance == nil {
		log.Warningf("instance is nil in DiscoverInstance. key=%+v, error=%+v", instanceKey, err)
		goto Cleanup
	}

	log.Debugf("Discovered host: %+v, master: %+v", instance.Key, instance.MasterKey)

	// Investigate slaves:
	for _, slaveKey := range instance.SlaveHosts.GetInstanceKeys() {
		discoveryInstanceKeys <- slaveKey
	}
	// Investigate master:
	discoveryInstanceKeys <- instance.MasterKey

Cleanup:
}

// StartDiscovery begins a one-time asynchronous discovery process for the given
// instance and all of its topology-connected instances.
// That is, the instance will be investigated for master and slaves, and the routines will follow on
// each and every such found master/slave.
// In essence, assuming all slaves in a replication topology are running, and given a single instance
// in such topology, this function will detect the entire topology.
func StartDiscovery(instanceKey inst.InstanceKey) {
	log.Infof("Starting discovery at %+v", instanceKey)
	pendingTokens := make(chan bool, maxConcurrency)
	completedTokens := make(chan bool, maxConcurrency)

	AccountedDiscoverInstance(instanceKey, pendingTokens, completedTokens)
	go handleDiscoveryRequests(pendingTokens, completedTokens)

	// Block until all are complete
	for {
		select {
		case <-pendingTokens:
			<-completedTokens
		default:
			inst.AuditOperation("start-discovery", &instanceKey, "")
			return
		}
	}
}

// ContinuousDiscovery starts an asynchronous infinite discovery process where instances are
// periodically investigated and their status captured, and long since unseen instances are
// purged and forgotten.
func ContinuousDiscovery() {
	log.Infof("Starting continuous discovery")
	inst.LoadHostnameResolveCacheFromDatabase()
	go handleDiscoveryRequests(nil, nil)

	tick := time.Tick(time.Duration(config.Config.DiscoveryPollSeconds) * time.Second)
	forgetUnseenTick := time.Tick(time.Minute)
	recoverTick := time.Tick(10 * time.Second)
	var snapshotTopologiesTick <-chan time.Time
	if config.Config.SnapshotTopologiesIntervalHours > 0 {
		snapshotTopologiesTick = time.Tick(time.Duration(config.Config.SnapshotTopologiesIntervalHours) * time.Hour)
	}

	elected := false
	_ = CreateElectionAnchor(false)
	for {
		select {
		case <-tick:
			if elected, _ = AttemptElection(); elected {
				instanceKeys, _ := inst.ReadOutdatedInstanceKeys()
				log.Debugf("outdated keys: %+v", instanceKeys)
				for _, instanceKey := range instanceKeys {
					discoveryInstanceKeys <- instanceKey
				}
			} else {
				log.Debugf("Not elected as active node; polling")
			}
		case <-forgetUnseenTick:
			// See if we should also forget objects (lower frequency)
			go func() {
				if elected {
					inst.ForgetLongUnseenInstances()
					inst.ForgetUnseenInstancesDifferentlyResolved()
					inst.ForgetExpiredHostnameResolves()
					inst.DeleteInvalidHostnameResolves()
					inst.ReviewUnseenInstances()
					inst.InjectUnseenMasters()
					inst.ResolveUnknownMasterHostnameResolves()
					inst.ExpireMaintenance()
					inst.ExpireDowntime()
					inst.ExpireCandidateInstances()
					inst.ExpireHostnameUnresolve()
					inst.ExpireClusterDomainName()
				}
				if !elected {
					// Take this opportunity to refresh yourself
					inst.LoadHostnameResolveCacheFromDatabase()
				}
				inst.ReadClusterAliases()
				HealthTest()
			}()
		case <-recoverTick:
			go func() {
				if elected {
					ClearActiveFailureDetections()
					ClearActiveRecoveries()
					CheckAndRecover(nil, nil, false, false)
				}
			}()
		case <-snapshotTopologiesTick:
			go func() {
				inst.SnapshotTopologies()
			}()
		}
	}
}

func pollAgent(hostname string) error {
	polledAgent, err := agent.GetAgent(hostname)
	agent.UpdateAgentLastChecked(hostname)

	if err != nil {
		return log.Errore(err)
	}

	err = agent.UpdateAgentInfo(hostname, polledAgent)
	if err != nil {
		return log.Errore(err)
	}

	return nil
}

// ContinuousAgentsPoll starts an asynchronous infinite process where agents are
// periodically investigated and their status captured, and long since unseen agents are
// purged and forgotten.
func ContinuousAgentsPoll() {
	log.Infof("Starting continuous agents poll")

	go discoverSeededAgents()

	tick := time.Tick(time.Duration(config.Config.DiscoveryPollSeconds) * time.Second)
	forgetUnseenTick := time.Tick(time.Hour)
	for range tick {
		agentsHosts, _ := agent.ReadOutdatedAgentsHosts()
		log.Debugf("outdated agents hosts: %+v", agentsHosts)
		for _, hostname := range agentsHosts {
			go pollAgent(hostname)
		}

		// See if we should also forget agents (lower frequency)
		select {
		case <-forgetUnseenTick:
			agent.ForgetLongUnseenAgents()
			agent.FailStaleSeeds()
		default:
		}
	}
}

func discoverSeededAgents() {
	for seededAgent := range agent.SeededAgents {
		instanceKey := inst.InstanceKey{Hostname: seededAgent.Hostname, Port: int(seededAgent.MySQLPort)}
		go StartDiscovery(instanceKey)
	}
}
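// StartDiscovery above pairs off tokens from two buffered channels to decide when
// the recursive discovery fan-out has settled: each unit of work registers on
// pendingTokens before it starts and on completedTokens when done, and the caller
// drains matching pairs until no pending token is immediately available. A minimal
// standalone sketch of that accounting pattern (hypothetical names and the same
// best-effort termination check as the original; not part of the original file;
// assumes "fmt" is imported):
func exampleTokenAccounting() {
	const maxConcurrency = 5
	pending := make(chan bool, maxConcurrency)
	completed := make(chan bool, maxConcurrency)

	// The seed unit registers its pending token synchronously (as
	// AccountedDiscoverInstance does), so the drain loop sees at least one token.
	pending <- true
	go func() {
		// A unit of work may fan out into further units, each registering
		// pending before completed.
		for i := 0; i < 2; i++ {
			pending <- true
			go func(id int) {
				fmt.Println("sub-work", id)
				completed <- true
			}(i)
		}
		completed <- true
	}()

	// Pair off tokens; once no pending token is immediately available, the
	// fan-out is considered settled (best-effort, exactly as in StartDiscovery).
	for {
		select {
		case <-pending:
			<-completed
		default:
			fmt.Println("discovery settled")
			return
		}
	}
}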
// discoverInstance will attempt discovering an instance (unless it is already up to date) and will
// list down its master and slaves (if any) for further discovery.
func discoverInstance(instanceKey inst.InstanceKey) {
	instanceKey.Formalize()
	if !instanceKey.IsValid() {
		return
	}

	instance, found, err := inst.ReadInstance(&instanceKey)

	if found && instance.IsUpToDate && instance.IsLastCheckValid {
		// we've already discovered this one. Skip!
		goto Cleanup
	}
	discoveriesCounter.Inc(1)
	// First we've ever heard of this instance. Continue investigation:
	instance, err = inst.ReadTopologyInstance(&instanceKey)
	// panic can occur (IO stuff). Therefore it may happen
	// that instance is nil. Check it.
	if err != nil || instance == nil {
		failedDiscoveriesCounter.Inc(1)
		log.Warningf("instance is nil in discoverInstance. key=%+v, error=%+v", instanceKey, err)
		goto Cleanup
	}

	log.Debugf("Discovered host: %+v, master: %+v", instance.Key, instance.MasterKey)

	if !isElectedNode {
		// Maybe this node was elected before, but isn't elected anymore.
		// If not elected, stop drilling down to further investigate slaves.
		return
	}

	// Investigate slaves:
	for _, slaveKey := range instance.SlaveHosts.GetInstanceKeys() {
		discoveryInstanceKeys <- slaveKey
	}
	// Investigate master:
	discoveryInstanceKeys <- instance.MasterKey

Cleanup:
}

// StartDiscovery begins a one-time asynchronous discovery process for the given
// instance and all of its topology-connected instances.
// That is, the instance will be investigated for master and slaves, and the routines will follow on
// each and every such found master/slave.
// In essence, assuming all slaves in a replication topology are running, and given a single instance
// in such topology, this function will detect the entire topology.
func StartDiscovery(instanceKey inst.InstanceKey) {
	log.Infof("Starting discovery at %+v", instanceKey)
	pendingTokens := make(chan bool, maxConcurrency)
	completedTokens := make(chan bool, maxConcurrency)

	accountedDiscoverInstance(instanceKey, pendingTokens, completedTokens)
	go handleDiscoveryRequests(pendingTokens, completedTokens)

	// Block until all are complete
	for {
		select {
		case <-pendingTokens:
			<-completedTokens
		default:
			inst.AuditOperation("start-discovery", &instanceKey, "")
			return
		}
	}
}

func initGraphiteMetrics() error {
	if config.Config.GraphiteAddr == "" {
		return nil
	}
	if config.Config.GraphitePath == "" {
		return log.Errorf("No graphite path provided (see GraphitePath config variable). Will not log to graphite")
	}
	addr, err := net.ResolveTCPAddr("tcp", config.Config.GraphiteAddr)
	if err != nil {
		return log.Errore(err)
	}
	graphitePathHostname := ThisHostname
	if config.Config.GraphiteConvertHostnameDotsToUnderscores {
		graphitePathHostname = strings.Replace(graphitePathHostname, ".", "_", -1)
	}
	graphitePath := config.Config.GraphitePath
	graphitePath = strings.Replace(graphitePath, "{hostname}", graphitePathHostname, -1)

	log.Debugf("Will log to graphite on %+v, %+v", config.Config.GraphiteAddr, graphitePath)

	go graphite.Graphite(metrics.DefaultRegistry, 1*time.Minute, graphitePath, addr)

	return nil
}

// ContinuousDiscovery starts an asynchronous infinite discovery process where instances are
// periodically investigated and their status captured, and long since unseen instances are
// purged and forgotten.
func ContinuousDiscovery() {
	log.Infof("Starting continuous discovery")
	inst.LoadHostnameResolveCacheFromDatabase()
	go handleDiscoveryRequests(nil, nil)

	tick := time.Tick(time.Duration(config.Config.DiscoveryPollSeconds) * time.Second)
	forgetUnseenTick := time.Tick(time.Minute)
	recoverTick := time.Tick(10 * time.Second)
	var snapshotTopologiesTick <-chan time.Time
	if config.Config.SnapshotTopologiesIntervalHours > 0 {
		snapshotTopologiesTick = time.Tick(time.Duration(config.Config.SnapshotTopologiesIntervalHours) * time.Hour)
	}

	go initGraphiteMetrics()

	for {
		select {
		case <-tick:
			go func() {
				if isElectedNode, _ = attemptElection(); isElectedNode {
					instanceKeys, _ := inst.ReadOutdatedInstanceKeys()
					log.Debugf("outdated keys: %+v", instanceKeys)
					for _, instanceKey := range instanceKeys {
						discoveryInstanceKeys <- instanceKey
					}
				} else {
					log.Debugf("Not elected as active node; polling")
				}
				discoveryQueueLengthGauge.Update(int64(len(discoveryInstanceKeys)))
			}()
		case <-forgetUnseenTick:
			// See if we should also forget objects (lower frequency)
			go func() {
				if isElectedNode {
					inst.ForgetLongUnseenInstances()
					inst.ForgetUnseenInstancesDifferentlyResolved()
					inst.ForgetExpiredHostnameResolves()
					inst.DeleteInvalidHostnameResolves()
					inst.ReviewUnseenInstances()
					inst.InjectUnseenMasters()
					inst.ResolveUnknownMasterHostnameResolves()
					inst.ExpireMaintenance()
					inst.ExpireDowntime()
					inst.ExpireCandidateInstances()
					inst.ExpireHostnameUnresolve()
					inst.ExpireClusterDomainName()
					inst.ExpireAudit()
					inst.ExpireMasterPositionEquivalence()
				}
				if !isElectedNode {
					// Take this opportunity to refresh yourself
					inst.LoadHostnameResolveCacheFromDatabase()
				}
				inst.ReadClusterAliases()
				HealthTest()
			}()
		case <-recoverTick:
			go func() {
				if isElectedNode {
					ClearActiveFailureDetections()
					ClearActiveRecoveries()
					CheckAndRecover(nil, nil, false, false)
				}
			}()
		case <-snapshotTopologiesTick:
			go func() {
				inst.SnapshotTopologies()
			}()
		}
	}
}

func pollAgent(hostname string) error {
	polledAgent, err := agent.GetAgent(hostname)
	agent.UpdateAgentLastChecked(hostname)

	if err != nil {
		return log.Errore(err)
	}

	err = agent.UpdateAgentInfo(hostname, polledAgent)
	if err != nil {
		return log.Errore(err)
	}

	return nil
}

// ContinuousAgentsPoll starts an asynchronous infinite process where agents are
// periodically investigated and their status captured, and long since unseen agents are
// purged and forgotten.
func ContinuousAgentsPoll() {
	log.Infof("Starting continuous agents poll")

	go discoverSeededAgents()

	tick := time.Tick(time.Duration(config.Config.DiscoveryPollSeconds) * time.Second)
	forgetUnseenTick := time.Tick(time.Hour)
	for range tick {
		agentsHosts, _ := agent.ReadOutdatedAgentsHosts()
		log.Debugf("outdated agents hosts: %+v", agentsHosts)
		for _, hostname := range agentsHosts {
			go pollAgent(hostname)
		}

		// See if we should also forget agents (lower frequency)
		select {
		case <-forgetUnseenTick:
			agent.ForgetLongUnseenAgents()
			agent.FailStaleSeeds()
		default:
		}
	}
}

func discoverSeededAgents() {
	for seededAgent := range agent.SeededAgents {
		instanceKey := inst.InstanceKey{Hostname: seededAgent.Hostname, Port: int(seededAgent.MySQLPort)}
		go StartDiscovery(instanceKey)
	}
}
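// initGraphiteMetrics above derives the Graphite prefix by substituting {hostname}
// into GraphitePath, optionally rewriting dots to underscores so a FQDN occupies a
// single node in the Graphite tree. A minimal standalone sketch of that substitution
// (hypothetical helper, not part of the original file; assumes "strings" is imported):
func exampleGraphitePath(pathTemplate string, hostname string, convertDots bool) string {
	if convertDots {
		// e.g. "db1.example.com" becomes "db1_example_com"
		hostname = strings.Replace(hostname, ".", "_", -1)
	}
	return strings.Replace(pathTemplate, "{hostname}", hostname, -1)
}

// For instance, exampleGraphitePath("orchestrator.{hostname}", "db1.example.com", true)
// yields "orchestrator.db1_example_com".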