Example #1
0
// ContinuousDiscovery starts an asynchronuous infinite discovery process where instances are
// periodically investigated and their status captured, and long since unseen instances are
// purged and forgotten.
func ContinuousDiscovery() {
	if config.Config.DatabaselessMode__experimental {
		log.Fatal("Cannot execute continuous mode in databaseless mode")
	}

	log.Infof("Starting continuous discovery")
	recentDiscoveryOperationKeys = cache.New(time.Duration(config.Config.InstancePollSeconds)*time.Second, time.Second)

	inst.LoadHostnameResolveCache()
	go handleDiscoveryRequests()

	discoveryTick := time.Tick(time.Duration(config.Config.GetDiscoveryPollSeconds()) * time.Second)
	instancePollTick := time.Tick(time.Duration(config.Config.InstancePollSeconds) * time.Second)
	caretakingTick := time.Tick(time.Minute)
	recoveryTick := time.Tick(time.Duration(config.Config.RecoveryPollSeconds) * time.Second)
	var snapshotTopologiesTick <-chan time.Time
	if config.Config.SnapshotTopologiesIntervalHours > 0 {
		snapshotTopologiesTick = time.Tick(time.Duration(config.Config.SnapshotTopologiesIntervalHours) * time.Hour)
	}

	go ometrics.InitGraphiteMetrics()
	go acceptSignals()

	if *config.RuntimeCLIFlags.GrabElection {
		process.GrabElection()
	}
	for {
		select {
		case <-discoveryTick:
			go func() {
				wasAlreadyElected := atomic.LoadInt64(&isElectedNode)
				myIsElectedNode, err := process.AttemptElection()
				if err != nil {
					log.Errore(err)
				}
				if myIsElectedNode {
					atomic.StoreInt64(&isElectedNode, 1)
				} else {
					atomic.StoreInt64(&isElectedNode, 0)
				}

				if myIsElectedNode {
					instanceKeys, err := inst.ReadOutdatedInstanceKeys()
					if err != nil {
						log.Errore(err)
					}

					log.Debugf("outdated keys: %+v", instanceKeys)
					for _, instanceKey := range instanceKeys {
						instanceKey := instanceKey

						if instanceKey.IsValid() {
							discoveryQueue.Push(instanceKey)
						}
					}
					if wasAlreadyElected == 0 {
						// Just turned to be leader!
						go process.RegisterNode("", "", false)
					}
				} else {
					log.Debugf("Not elected as active node; polling")
				}
			}()
		case <-instancePollTick:
			go func() {
				// This tick does NOT do instance poll (these are handled by the oversmapling discoveryTick)
				// But rather should invoke such routinely operations that need to be as (or roughly as) frequent
				// as instance poll
				if atomic.LoadInt64(&isElectedNode) == 1 {
					go inst.UpdateInstanceRecentRelaylogHistory()
					go inst.RecordInstanceCoordinatesHistory()
				}
			}()
		case <-caretakingTick:
			// Various periodic internal maintenance tasks
			go func() {
				if atomic.LoadInt64(&isElectedNode) == 1 {
					go inst.RecordInstanceBinlogFileHistory()
					go inst.ForgetLongUnseenInstances()
					go inst.ForgetUnseenInstancesDifferentlyResolved()
					go inst.ForgetExpiredHostnameResolves()
					go inst.DeleteInvalidHostnameResolves()
					go inst.ReviewUnseenInstances()
					go inst.InjectUnseenMasters()
					go inst.ResolveUnknownMasterHostnameResolves()
					go inst.UpdateClusterAliases()
					go inst.ExpireMaintenance()
					go inst.ExpireDowntime()
					go inst.ExpireCandidateInstances()
					go inst.ExpireHostnameUnresolve()
					go inst.ExpireClusterDomainName()
					go inst.ExpireAudit()
					go inst.ExpireMasterPositionEquivalence()
					go inst.ExpirePoolInstances()
					go inst.FlushNontrivialResolveCacheToDatabase()
					go process.ExpireNodesHistory()
					go process.ExpireAccessTokens()
				} else {
					// Take this opportunity to refresh yourself
					go inst.LoadHostnameResolveCache()
				}
			}()
		case <-recoveryTick:
			go func() {
				if atomic.LoadInt64(&isElectedNode) == 1 {
					go ClearActiveFailureDetections()
					go ClearActiveRecoveries()
					go ExpireBlockedRecoveries()
					go AcknowledgeCrashedRecoveries()
					go inst.ExpireInstanceAnalysisChangelog()
					go CheckAndRecover(nil, nil, false)
				}
			}()
		case <-snapshotTopologiesTick:
			go func() {
				go inst.SnapshotTopologies()
			}()
		}
	}
}
Example #2
0
// ContinuousDiscovery starts an asynchronuous infinite discovery process where instances are
// periodically investigated and their status captured, and long since unseen instances are
// purged and forgotten.
func ContinuousDiscovery() {
	if config.Config.DatabaselessMode__experimental {
		log.Fatal("Cannot execute continuous mode in databaseless mode")
	}

	log.Infof("Starting continuous discovery")
	inst.LoadHostnameResolveCacheFromDatabase()
	go handleDiscoveryRequests()

	discoveryTick := time.Tick(time.Duration(config.Config.DiscoveryPollSeconds) * time.Second)
	caretakingTick := time.Tick(time.Minute)
	recoveryTick := time.Tick(time.Duration(config.Config.RecoveryPollSeconds) * time.Second)
	var snapshotTopologiesTick <-chan time.Time
	if config.Config.SnapshotTopologiesIntervalHours > 0 {
		snapshotTopologiesTick = time.Tick(time.Duration(config.Config.SnapshotTopologiesIntervalHours) * time.Hour)
	}

	go ometrics.InitGraphiteMetrics()
	go acceptSignals()

	for {
		select {
		case <-discoveryTick:
			go func() {
				wasAlreadyElected := isElectedNode
				if isElectedNode, _ = process.AttemptElection(); isElectedNode {
					instanceKeys, _ := inst.ReadOutdatedInstanceKeys()
					log.Debugf("outdated keys: %+v", instanceKeys)
					for _, instanceKey := range instanceKeys {
						go func() { discoveryInstanceKeys <- instanceKey }()
					}
					if !wasAlreadyElected {
						// Just turned to be leader!
						go process.RegisterNode("", "", false)
					}
				} else {
					log.Debugf("Not elected as active node; polling")
				}
			}()
		case <-caretakingTick:
			// Various periodic internal maintenance tasks
			go func() {
				if isElectedNode {
					go inst.ForgetLongUnseenInstances()
					go inst.ForgetUnseenInstancesDifferentlyResolved()
					go inst.ForgetExpiredHostnameResolves()
					go inst.DeleteInvalidHostnameResolves()
					go inst.ReviewUnseenInstances()
					go inst.InjectUnseenMasters()
					go inst.ResolveUnknownMasterHostnameResolves()
					go inst.UpdateClusterAliases()
					go inst.ExpireMaintenance()
					go inst.ExpireDowntime()
					go inst.ExpireCandidateInstances()
					go inst.ExpireHostnameUnresolve()
					go inst.ExpireClusterDomainName()
					go inst.ExpireAudit()
					go inst.ExpireMasterPositionEquivalence()
					go inst.FlushNontrivialResolveCacheToDatabase()
					go process.ExpireNodesHistory()
				}
				if !isElectedNode {
					// Take this opportunity to refresh yourself
					go inst.LoadHostnameResolveCacheFromDatabase()
				}
				go inst.ReadClusterAliases()
			}()
		case <-recoveryTick:
			go func() {
				if isElectedNode {
					go ClearActiveFailureDetections()
					go ClearActiveRecoveries()
					go ExpireBlockedRecoveries()
					go AcknowledgeCrashedRecoveries()
					go inst.ExpireInstanceAnalysisChangelog()
					go CheckAndRecover(nil, nil, false)
				}
			}()
		case <-snapshotTopologiesTick:
			go func() {
				go inst.SnapshotTopologies()
			}()
		}
	}
}
Example #3
0
// DiscoverInstance will attempt discovering an instance (unless it is already up to date) and will
// list down its master and slaves (if any) for further discovery.
func DiscoverInstance(instanceKey inst.InstanceKey) {
	instanceKey.Formalize()
	if !instanceKey.IsValid() {
		return
	}

	instance, found, err := inst.ReadInstance(&instanceKey)

	if found && instance.IsUpToDate && instance.IsLastCheckValid {
		// we've already discovered this one. Skip!
		goto Cleanup
	}
	// First we've ever heard of this instance. Continue investigation:
	instance, err = inst.ReadTopologyInstance(&instanceKey)
	// panic can occur (IO stuff). Therefore it may happen
	// that instance is nil. Check it.
	if err != nil || instance == nil {
		log.Warningf("instance is nil in DiscoverInstance. key=%+v, error=%+v", instanceKey, err)
		goto Cleanup
	}

	log.Debugf("Discovered host: %+v, master: %+v", instance.Key, instance.MasterKey)

	// Investigate slaves:
	for _, slaveKey := range instance.SlaveHosts.GetInstanceKeys() {
		discoveryInstanceKeys <- slaveKey
	}
	// Investigate master:
	discoveryInstanceKeys <- instance.MasterKey

Cleanup:
}

// Start discovery begins a one time asynchronuous discovery process for the given
// instance and all of its topology connected instances.
// That is, the instance will be investigated for master and slaves, and the routines will follow on
// each and every such found master/slave.
// In essense, assuming all slaves in a replication topology are running, and given a single instance
// in such topology, this function will detect the entire topology.
func StartDiscovery(instanceKey inst.InstanceKey) {
	log.Infof("Starting discovery at %+v", instanceKey)
	pendingTokens := make(chan bool, maxConcurrency)
	completedTokens := make(chan bool, maxConcurrency)

	AccountedDiscoverInstance(instanceKey, pendingTokens, completedTokens)
	go handleDiscoveryRequests(pendingTokens, completedTokens)

	// Block until all are complete
	for {
		select {
		case <-pendingTokens:
			<-completedTokens
		default:
			inst.AuditOperation("start-discovery", &instanceKey, "")
			return
		}
	}
}

// ContinuousDiscovery starts an asynchronuous infinite discovery process where instances are
// periodically investigated and their status captured, and long since unseen instances are
// purged and forgotten.
func ContinuousDiscovery() {
	log.Infof("Starting continuous discovery")
	inst.LoadHostnameResolveCacheFromDatabase()
	go handleDiscoveryRequests(nil, nil)
	tick := time.Tick(time.Duration(config.Config.DiscoveryPollSeconds) * time.Second)
	forgetUnseenTick := time.Tick(time.Minute)
	recoverTick := time.Tick(10 * time.Second)

	var snapshotTopologiesTick <-chan time.Time
	if config.Config.SnapshotTopologiesIntervalHours > 0 {
		snapshotTopologiesTick = time.Tick(time.Duration(config.Config.SnapshotTopologiesIntervalHours) * time.Hour)
	}

	elected := false
	_ = CreateElectionAnchor(false)
	for {
		select {
		case <-tick:
			if elected, _ = AttemptElection(); elected {
				instanceKeys, _ := inst.ReadOutdatedInstanceKeys()
				log.Debugf("outdated keys: %+v", instanceKeys)
				for _, instanceKey := range instanceKeys {
					discoveryInstanceKeys <- instanceKey
				}
			} else {
				log.Debugf("Not elected as active node; polling")
			}
		case <-forgetUnseenTick:
			// See if we should also forget objects (lower frequency)
			go func() {
				if elected {
					inst.ForgetLongUnseenInstances()
					inst.ForgetUnseenInstancesDifferentlyResolved()
					inst.ForgetExpiredHostnameResolves()
					inst.DeleteInvalidHostnameResolves()
					inst.ReviewUnseenInstances()
					inst.InjectUnseenMasters()
					inst.ResolveUnknownMasterHostnameResolves()
					inst.ExpireMaintenance()
					inst.ExpireDowntime()
					inst.ExpireCandidateInstances()
					inst.ExpireHostnameUnresolve()
					inst.ExpireClusterDomainName()
				}
				if !elected {
					// Take this opportunity to refresh yourself
					inst.LoadHostnameResolveCacheFromDatabase()
				}
				inst.ReadClusterAliases()
				HealthTest()
			}()
		case <-recoverTick:
			go func() {
				if elected {
					ClearActiveFailureDetections()
					ClearActiveRecoveries()
					CheckAndRecover(nil, nil, false, false)
				}
			}()
		case <-snapshotTopologiesTick:
			go func() {
				inst.SnapshotTopologies()
			}()
		}
	}
}

func pollAgent(hostname string) error {
	polledAgent, err := agent.GetAgent(hostname)
	agent.UpdateAgentLastChecked(hostname)

	if err != nil {
		return log.Errore(err)
	}

	err = agent.UpdateAgentInfo(hostname, polledAgent)
	if err != nil {
		return log.Errore(err)
	}

	return nil
}

// ContinuousAgentsPoll starts an asynchronuous infinite process where agents are
// periodically investigated and their status captured, and long since unseen agents are
// purged and forgotten.
func ContinuousAgentsPoll() {
	log.Infof("Starting continuous agents poll")

	go discoverSeededAgents()

	tick := time.Tick(time.Duration(config.Config.DiscoveryPollSeconds) * time.Second)
	forgetUnseenTick := time.Tick(time.Hour)
	for _ = range tick {
		agentsHosts, _ := agent.ReadOutdatedAgentsHosts()
		log.Debugf("outdated agents hosts: %+v", agentsHosts)
		for _, hostname := range agentsHosts {
			go pollAgent(hostname)
		}
		// See if we should also forget agents (lower frequency)
		select {
		case <-forgetUnseenTick:
			agent.ForgetLongUnseenAgents()
			agent.FailStaleSeeds()
		default:
		}
	}
}

func discoverSeededAgents() {
	for seededAgent := range agent.SeededAgents {
		instanceKey := inst.InstanceKey{Hostname: seededAgent.Hostname, Port: int(seededAgent.MySQLPort)}
		go StartDiscovery(instanceKey)
	}
}
Example #4
0
// discoverInstance will attempt discovering an instance (unless it is already up to date) and will
// list down its master and slaves (if any) for further discovery.
func discoverInstance(instanceKey inst.InstanceKey) {
	instanceKey.Formalize()
	if !instanceKey.IsValid() {
		return
	}

	instance, found, err := inst.ReadInstance(&instanceKey)

	if found && instance.IsUpToDate && instance.IsLastCheckValid {
		// we've already discovered this one. Skip!
		goto Cleanup
	}
	discoveriesCounter.Inc(1)
	// First we've ever heard of this instance. Continue investigation:
	instance, err = inst.ReadTopologyInstance(&instanceKey)
	// panic can occur (IO stuff). Therefore it may happen
	// that instance is nil. Check it.
	if err != nil || instance == nil {
		failedDiscoveriesCounter.Inc(1)
		log.Warningf("instance is nil in discoverInstance. key=%+v, error=%+v", instanceKey, err)
		goto Cleanup
	}

	log.Debugf("Discovered host: %+v, master: %+v", instance.Key, instance.MasterKey)

	if !isElectedNode {
		// Maybe this node was elected before, but isn't elected anymore.
		// If not elected, stop drilling down to further investigate slaves.
		return
	}

	// Investigate slaves:
	for _, slaveKey := range instance.SlaveHosts.GetInstanceKeys() {
		discoveryInstanceKeys <- slaveKey
	}
	// Investigate master:
	discoveryInstanceKeys <- instance.MasterKey

Cleanup:
}

// Start discovery begins a one time asynchronuous discovery process for the given
// instance and all of its topology connected instances.
// That is, the instance will be investigated for master and slaves, and the routines will follow on
// each and every such found master/slave.
// In essense, assuming all slaves in a replication topology are running, and given a single instance
// in such topology, this function will detect the entire topology.
func StartDiscovery(instanceKey inst.InstanceKey) {
	log.Infof("Starting discovery at %+v", instanceKey)
	pendingTokens := make(chan bool, maxConcurrency)
	completedTokens := make(chan bool, maxConcurrency)

	accountedDiscoverInstance(instanceKey, pendingTokens, completedTokens)
	go handleDiscoveryRequests(pendingTokens, completedTokens)

	// Block until all are complete
	for {
		select {
		case <-pendingTokens:
			<-completedTokens
		default:
			inst.AuditOperation("start-discovery", &instanceKey, "")
			return
		}
	}
}

func initGraphiteMetrics() error {
	if config.Config.GraphiteAddr == "" {
		return nil
	}
	if config.Config.GraphitePath == "" {
		return log.Errorf("No graphite path provided (see GraphitePath config variable). Will not log to graphite")
	}
	addr, err := net.ResolveTCPAddr("tcp", config.Config.GraphiteAddr)
	if err != nil {
		return log.Errore(err)
	}
	graphitePathHostname := ThisHostname
	if config.Config.GraphiteConvertHostnameDotsToUnderscores {
		graphitePathHostname = strings.Replace(graphitePathHostname, ".", "_", -1)
	}
	graphitePath := config.Config.GraphitePath
	graphitePath = strings.Replace(graphitePath, "{hostname}", graphitePathHostname, -1)

	log.Debugf("Will log to graphite on %+v, %+v", config.Config.GraphiteAddr, graphitePath)
	go graphite.Graphite(metrics.DefaultRegistry, 1*time.Minute, graphitePath, addr)

	return nil

}

// ContinuousDiscovery starts an asynchronuous infinite discovery process where instances are
// periodically investigated and their status captured, and long since unseen instances are
// purged and forgotten.
func ContinuousDiscovery() {
	log.Infof("Starting continuous discovery")
	inst.LoadHostnameResolveCacheFromDatabase()
	go handleDiscoveryRequests(nil, nil)
	tick := time.Tick(time.Duration(config.Config.DiscoveryPollSeconds) * time.Second)
	forgetUnseenTick := time.Tick(time.Minute)
	recoverTick := time.Tick(10 * time.Second)

	var snapshotTopologiesTick <-chan time.Time
	if config.Config.SnapshotTopologiesIntervalHours > 0 {
		snapshotTopologiesTick = time.Tick(time.Duration(config.Config.SnapshotTopologiesIntervalHours) * time.Hour)
	}

	go initGraphiteMetrics()

	for {
		select {
		case <-tick:
			go func() {
				if isElectedNode, _ = attemptElection(); isElectedNode {
					instanceKeys, _ := inst.ReadOutdatedInstanceKeys()
					log.Debugf("outdated keys: %+v", instanceKeys)
					for _, instanceKey := range instanceKeys {
						discoveryInstanceKeys <- instanceKey
					}
				} else {
					log.Debugf("Not elected as active node; polling")
				}
				discoveryQueueLengthGauge.Update(int64(len(discoveryInstanceKeys)))
			}()
		case <-forgetUnseenTick:
			// See if we should also forget objects (lower frequency)
			go func() {
				if isElectedNode {
					inst.ForgetLongUnseenInstances()
					inst.ForgetUnseenInstancesDifferentlyResolved()
					inst.ForgetExpiredHostnameResolves()
					inst.DeleteInvalidHostnameResolves()
					inst.ReviewUnseenInstances()
					inst.InjectUnseenMasters()
					inst.ResolveUnknownMasterHostnameResolves()
					inst.ExpireMaintenance()
					inst.ExpireDowntime()
					inst.ExpireCandidateInstances()
					inst.ExpireHostnameUnresolve()
					inst.ExpireClusterDomainName()
					inst.ExpireAudit()
					inst.ExpireMasterPositionEquivalence()
				}
				if !isElectedNode {
					// Take this opportunity to refresh yourself
					inst.LoadHostnameResolveCacheFromDatabase()
				}
				inst.ReadClusterAliases()
				HealthTest()
			}()
		case <-recoverTick:
			go func() {
				if isElectedNode {
					ClearActiveFailureDetections()
					ClearActiveRecoveries()
					CheckAndRecover(nil, nil, false, false)
				}
			}()
		case <-snapshotTopologiesTick:
			go func() {
				inst.SnapshotTopologies()
			}()
		}
	}
}

func pollAgent(hostname string) error {
	polledAgent, err := agent.GetAgent(hostname)
	agent.UpdateAgentLastChecked(hostname)

	if err != nil {
		return log.Errore(err)
	}

	err = agent.UpdateAgentInfo(hostname, polledAgent)
	if err != nil {
		return log.Errore(err)
	}

	return nil
}

// ContinuousAgentsPoll starts an asynchronuous infinite process where agents are
// periodically investigated and their status captured, and long since unseen agents are
// purged and forgotten.
func ContinuousAgentsPoll() {
	log.Infof("Starting continuous agents poll")

	go discoverSeededAgents()

	tick := time.Tick(time.Duration(config.Config.DiscoveryPollSeconds) * time.Second)
	forgetUnseenTick := time.Tick(time.Hour)
	for range tick {
		agentsHosts, _ := agent.ReadOutdatedAgentsHosts()
		log.Debugf("outdated agents hosts: %+v", agentsHosts)
		for _, hostname := range agentsHosts {
			go pollAgent(hostname)
		}
		// See if we should also forget agents (lower frequency)
		select {
		case <-forgetUnseenTick:
			agent.ForgetLongUnseenAgents()
			agent.FailStaleSeeds()
		default:
		}
	}
}

func discoverSeededAgents() {
	for seededAgent := range agent.SeededAgents {
		instanceKey := inst.InstanceKey{Hostname: seededAgent.Hostname, Port: int(seededAgent.MySQLPort)}
		go StartDiscovery(instanceKey)
	}
}