func addHooks(preparerConfig *PreparerConfig, logger logging.Logger) {
	for _, dest := range preparerConfig.ExtraLogDestinations {
		logger.WithFields(logrus.Fields{
			"type": dest.Type,
			"path": dest.Path,
		}).Infoln("Adding log destination")
		if err := logger.AddHook(dest.Type, dest.Path); err != nil {
			logger.WithError(err).Errorf("Unable to add log hook. Proceeding.")
		}
	}
}

// RetryOrQuit retries a given function until it returns a nil error or the
// quit channel is closed. It returns true if it exited in the former case,
// false in the latter. Errors are sent to the given logger with the given
// string as the message.
func RetryOrQuit(f func() error, quit <-chan struct{}, logger logging.Logger, errtext string) bool {
	for err := f(); err != nil; err = f() {
		logger.WithError(err).Errorln(errtext)
		select {
		case <-quit:
			return false
		case <-time.After(1 * time.Second):
			// unblock the select and loop again
		}
	}
	return true
}

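// Example (illustrative sketch, not from the original source): using
// RetryOrQuit to retry a flaky operation until shutdown. "fetchConfig" is a
// hypothetical stand-in for any idempotent call that may fail transiently.
func ExampleRetryOrQuit(fetchConfig func() error, quit <-chan struct{}, logger logging.Logger) {
	if !RetryOrQuit(fetchConfig, quit, logger, "could not fetch config, retrying") {
		// quit was closed before fetchConfig ever succeeded; the caller is
		// shutting down, so return without side effects.
		return
	}
	logger.NoFields().Info("config fetched successfully")
}
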
// SessionManager continually creates and maintains Consul sessions. It is intended to be
// run in its own goroutine. If one session expires, a new one will be created. As
// sessions come and go, the session ID (or "" for an expired session) will be sent on the
// output channel.
//
// Parameters:
//   config: Configuration passed to Consul when creating a new session.
//   client: The Consul client to use.
//   output: The channel used for exposing Consul session IDs. This method takes
//           ownership of this channel and will close it once no new IDs will be created.
//   done:   Close this channel to close the current session (if any) and stop creating
//           new sessions.
//   logger: Errors will be logged to this logger.
func SessionManager(
	config api.SessionEntry,
	client ConsulClient,
	output chan<- string,
	done chan struct{},
	logger logging.Logger,
) {
	logger.NoFields().Info("session manager: starting up")
	for {
		// Check for exit signal
		select {
		case <-done:
			logger.NoFields().Info("session manager: shutting down")
			close(output)
			return
		default:
		}

		// Establish a new session
		id, _, err := client.Session().CreateNoChecks(&config, nil)
		if err != nil {
			logger.WithError(err).Error("session manager: error creating Consul session")
			time.Sleep(time.Duration(*SessionRetrySeconds) * time.Second)
			continue
		}
		sessionLogger := logger.SubLogger(logrus.Fields{
			"session": id,
		})
		sessionLogger.NoFields().Info("session manager: new Consul session")

		select {
		case output <- id:
			// Maintain the session
			err = client.Session().RenewPeriodic(config.TTL, id, nil, done)
			if err != nil {
				sessionLogger.WithError(err).Error("session manager: lost session")
			} else {
				sessionLogger.NoFields().Info("session manager: released session")
			}
			select {
			case output <- "":
			case <-done:
			}
		case <-done:
			// Don't bother reporting the new session if exiting
			_, _ = client.Session().Destroy(id, nil)
			sessionLogger.NoFields().Info("session manager: released session")
		}
	}
}

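// Example (illustrative sketch, not from the original source): consuming the
// IDs that SessionManager publishes. SessionManager owns the output channel,
// so the consumer can simply range over it; an empty string signals that the
// previous session expired and any locks tied to it must be treated as lost.
// The SessionEntry values here are arbitrary.
func ExampleSessionManagerConsumer(client ConsulClient, logger logging.Logger) {
	sessions := make(chan string)
	done := make(chan struct{})
	go SessionManager(api.SessionEntry{
		Name:     "example-session",
		TTL:      "15s",
		Behavior: api.SessionBehaviorDelete,
	}, client, sessions, done, logger)

	for id := range sessions {
		if id == "" {
			// the previous session expired; stop relying on its locks
			continue
		}
		// use id to acquire locks, write ephemeral keys, etc.
		_ = id
	}
}
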
// Helper to processHealthUpdater()
func sendHealthUpdate(
	logger logging.Logger,
	w chan<- writeResult,
	health *WatchResult,
	doThrottle bool,
	sender func() error,
) {
	if err := sender(); err != nil {
		logger.WithError(err).Error("error writing health")

		// Try not to overwhelm Consul
		time.Sleep(time.Duration(*HealthRetryTimeSec) * time.Second)
		w <- writeResult{nil, false, doThrottle}
	} else {
		w <- writeResult{health, true, doThrottle}
	}
}

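// The writeResult type is not defined in this excerpt. Judging from the two
// positional composite literals above, it presumably looks roughly like the
// following sketch (field names are guesses):
//
//	type writeResult struct {
//		result     *WatchResult // nil when the write failed
//		written    bool         // whether the write succeeded
//		doThrottle bool         // propagated throttling hint
//	}
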
func (p *Preparer) installAndLaunchPod(pair ManifestPair, pod Pod, logger logging.Logger) bool {
	p.tryRunHooks(hooks.BEFORE_INSTALL, pod, pair.Intent, logger)

	logger.NoFields().Infoln("Installing pod and launchables")

	err := pod.Install(pair.Intent, p.artifactVerifier, p.artifactRegistry)
	if err != nil {
		// install failed, abort and retry
		logger.WithError(err).Errorln("Install failed")
		return false
	}

	err = pod.Verify(pair.Intent, p.authPolicy)
	if err != nil {
		logger.WithError(err).
			Errorln("Pod digest verification failed")
		p.tryRunHooks(hooks.AFTER_AUTH_FAIL, pod, pair.Intent, logger)
		return false
	}

	p.tryRunHooks(hooks.AFTER_INSTALL, pod, pair.Intent, logger)

	if pair.Reality != nil {
		logger.NoFields().Infoln("Invoking the disable hook and halting runit services")
		success, err := pod.Halt(pair.Reality)
		if err != nil {
			logger.WithError(err).
				Errorln("Pod halt failed")
		} else if !success {
			logger.NoFields().Warnln("One or more launchables did not halt successfully")
		}
	}

	p.tryRunHooks(hooks.BEFORE_LAUNCH, pod, pair.Intent, logger)

	logger.NoFields().Infoln("Setting up new runit services and running the enable hook")

	ok, err := pod.Launch(pair.Intent)
	if err != nil {
		logger.WithError(err).
			Errorln("Launch failed")
	} else {
		duration, err := p.store.SetPod(kp.REALITY_TREE, p.node, pair.Intent)
		if err != nil {
			logger.WithErrorAndFields(err, logrus.Fields{
				"duration": duration}).
				Errorln("Could not set pod in reality store")
		}
		p.tryRunHooks(hooks.AFTER_LAUNCH, pod, pair.Intent, logger)

		pod.Prune(p.maxLaunchableDiskUsage, pair.Intent) // errors are logged internally
	}
	return err == nil && ok
}

// MonitorPodHealth is meant to be a long-running goroutine.
// MonitorPodHealth reads from a consul store to determine which
// services should be running on the host. MonitorPodHealth
// runs a CheckHealth routine to monitor the health of each
// service and kills routines for services that should no
// longer be running.
func MonitorPodHealth(config *preparer.PreparerConfig, logger *logging.Logger, shutdownCh chan struct{}) {
	client, err := config.GetConsulClient()
	if err != nil {
		// A bad config should have already produced a nice, user-friendly error message.
		logger.WithError(err).Fatalln("error creating health monitor KV client")
	}
	store := kp.NewConsulStore(client)
	healthManager := store.NewHealthManager(config.NodeName, *logger)

	node := config.NodeName
	pods := []PodWatch{}

	watchQuitCh := make(chan struct{})
	watchErrCh := make(chan error)
	watchPodCh := make(chan []kp.ManifestResult)
	go store.WatchPods(
		kp.REALITY_TREE,
		node,
		watchQuitCh,
		watchErrCh,
		watchPodCh,
	)

	// if GetClient fails it means the certfile/keyfile/cafile were
	// invalid or did not exist. It makes sense to throw a fatal error.
	secureClient, err := config.GetClient(time.Duration(*HEALTHCHECK_TIMEOUT) * time.Second)
	if err != nil {
		logger.WithError(err).Fatalln("failed to get http client for this preparer")
	}
	insecureClient, err := config.GetInsecureClient(time.Duration(*HEALTHCHECK_TIMEOUT) * time.Second)
	if err != nil {
		logger.WithError(err).Fatalln("failed to get http client for this preparer")
	}

	for {
		select {
		case results := <-watchPodCh:
			// check if pods have been added or removed
			// starts monitor routine for new pods
			// kills monitor routine for removed pods
			pods = updatePods(healthManager, secureClient, insecureClient, pods, results, node, logger)
		case err := <-watchErrCh:
			logger.WithError(err).Errorln("there was an error reading reality manifests for health monitor")
		case <-shutdownCh:
			for _, pod := range pods {
				pod.shutdownCh <- true
			}
			close(watchQuitCh)
			healthManager.Close()
			return
		}
	}
}

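// PodWatch is not defined in this excerpt. From its use above, it is assumed
// to carry at least a shutdown channel per monitored service (field names are
// guesses):
//
//	type PodWatch struct {
//		manifest   manifest.Manifest
//		shutdownCh chan bool // signalled when monitoring should stop
//	}
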
// Validates that the rolling update is capable of being processed. If not, an
// error is returned.
// The following conditions make an RU invalid:
// 1) New RC does not exist
// 2) Old RC does not exist
func (rlf *Farm) validateRoll(update roll_fields.Update, logger logging.Logger) error {
	_, err := rlf.rcs.Get(update.NewRC)
	if err == rcstore.NoReplicationController {
		return fmt.Errorf("RU '%s' is invalid, new RC '%s' did not exist", update.ID(), update.NewRC)
	} else if err != nil {
		// There was a potentially transient consul error; we don't necessarily want to delete the RU
		logger.WithError(err).Errorln("Could not fetch new RC to validate RU, assuming it's valid")
	}

	_, err = rlf.rcs.Get(update.OldRC)
	if err == rcstore.NoReplicationController {
		return fmt.Errorf("RU '%s' is invalid, old RC '%s' did not exist", update.ID(), update.OldRC)
	} else if err != nil {
		// There was a potentially transient consul error; we don't necessarily want to delete the RU
		logger.WithError(err).Errorln("Could not fetch old RC in order to validate RU, assuming it's valid")
	}

	return nil
}

func (r *replication) shouldScheduleForNode(node types.NodeName, logger logging.Logger) bool {
	nodeReality, err := r.queryReality(node)
	// Check for the "no manifest" sentinel before the general error case;
	// otherwise this branch would be unreachable.
	if err == pods.NoCurrentManifest {
		logger.Infoln("Nothing installed on this node yet.")
		return true
	}
	if err != nil {
		logger.WithError(err).Errorln("Could not read Reality for this node. Will proceed to schedule onto it.")
		return true
	}

	if nodeReality != nil {
		nodeRealitySHA, err := nodeReality.SHA()
		if err != nil {
			logger.WithError(err).Errorln("Unable to compute manifest SHA for this node. Attempting to schedule anyway")
			return true
		}
		replicationRealitySHA, err := r.manifest.SHA()
		if err != nil {
			logger.WithError(err).Errorln("Unable to compute manifest SHA for this daemon set. Attempting to schedule anyway")
			return true
		}

		if nodeRealitySHA == replicationRealitySHA {
			logger.Info("Reality for this node matches this DS. No action required.")
			return false
		}
	}

	return true
}

func (p *Preparer) installAndLaunchPod(pair ManifestPair, pod Pod, logger logging.Logger) bool {
	p.tryRunHooks(hooks.BEFORE_INSTALL, pod, pair.Intent, logger)

	err := pod.Install(pair.Intent)
	if err != nil {
		// install failed, abort and retry
		logger.WithError(err).Errorln("Install failed")
		return false
	}

	err = pod.Verify(pair.Intent, p.authPolicy)
	if err != nil {
		logger.WithError(err).
			Errorln("Pod digest verification failed")
		p.tryRunHooks(hooks.AFTER_AUTH_FAIL, pod, pair.Intent, logger)
		return false
	}

	p.tryRunHooks(hooks.AFTER_INSTALL, pod, pair.Intent, logger)

	if pair.Reality != nil {
		success, err := pod.Halt(pair.Reality)
		if err != nil {
			logger.WithError(err).
				Errorln("Pod halt failed")
		} else if !success {
			logger.NoFields().Warnln("One or more launchables did not halt successfully")
		}
	}

	p.tryRunHooks(hooks.BEFORE_LAUNCH, pod, pair.Intent, logger)

	ok, err := pod.Launch(pair.Intent)
	if err != nil {
		logger.WithError(err).
			Errorln("Launch failed")
	} else {
		duration, err := p.store.SetPod(kp.RealityPath(p.node, pair.ID), pair.Intent)
		if err != nil {
			logger.WithErrorAndFields(err, logrus.Fields{
				"duration": duration}).
				Errorln("Could not set pod in reality store")
		}
		p.tryRunHooks(hooks.AFTER_LAUNCH, pod, pair.Intent, logger)
	}
	return err == nil && ok
}

func (p *Preparer) stopAndUninstallPod(pair ManifestPair, pod Pod, logger logging.Logger) bool {
	success, err := pod.Halt(pair.Reality)
	if err != nil {
		logger.WithError(err).Errorln("Pod halt failed")
	} else if !success {
		logger.NoFields().Warnln("One or more launchables did not halt successfully")
	}

	p.tryRunHooks(hooks.BEFORE_UNINSTALL, pod, pair.Reality, logger)

	err = pod.Uninstall()
	if err != nil {
		logger.WithError(err).Errorln("Uninstall failed")
		return false
	}
	logger.NoFields().Infoln("Successfully uninstalled")

	dur, err := p.store.DeletePod(kp.REALITY_TREE, p.node, pair.ID)
	if err != nil {
		logger.WithErrorAndFields(err, logrus.Fields{"duration": dur}).
			Errorln("Could not delete pod from reality store")
	}
	return true
}

func (p *Preparer) stopAndUninstallPod(pair ManifestPair, pod Pod, logger logging.Logger) bool {
	success, err := pod.Halt(pair.Reality)
	if err != nil {
		logger.WithError(err).Errorln("Pod halt failed")
	} else if !success {
		logger.NoFields().Warnln("One or more launchables did not halt successfully")
	}

	p.tryRunHooks(hooks.BEFORE_UNINSTALL, pod, pair.Reality, logger)

	err = pod.Uninstall()
	if err != nil {
		logger.WithError(err).Errorln("Uninstall failed")
		return false
	}
	logger.NoFields().Infoln("Successfully uninstalled")

	if pair.PodUniqueKey == "" {
		dur, err := p.store.DeletePod(kp.REALITY_TREE, p.node, pair.ID)
		if err != nil {
			logger.WithErrorAndFields(err, logrus.Fields{"duration": dur}).
				Errorln("Could not delete pod from reality store")
		}
	} else {
		// We don't delete so that the exit status of the pod's
		// processes can be viewed for some time after installation.
		// It is the responsibility of external systems to delete pod
		// status entries when they are no longer needed.
		err := p.podStatusStore.MutateStatus(pair.PodUniqueKey, func(podStatus podstatus.PodStatus) (podstatus.PodStatus, error) {
			podStatus.PodStatus = podstatus.PodRemoved
			return podStatus, nil
		})
		if err != nil {
			logger.WithError(err).
				Errorln("Could not update pod status to reflect removal")
		}

		err = p.podStore.DeleteRealityIndex(pair.PodUniqueKey, p.node)
		if err != nil {
			logger.WithError(err).
				Errorln("Could not remove reality index for uninstalled pod")
		}
	}
	return true
}

func (p *Preparer) installAndLaunchPod(pair ManifestPair, pod Pod, logger logging.Logger) bool {
	p.tryRunHooks(hooks.BEFORE_INSTALL, pod, pair.Intent, logger)

	logger.NoFields().Infoln("Installing pod and launchables")

	err := pod.Install(pair.Intent, p.artifactVerifier, p.artifactRegistry)
	if err != nil {
		// install failed, abort and retry
		logger.WithError(err).Errorln("Install failed")
		return false
	}

	err = pod.Verify(pair.Intent, p.authPolicy)
	if err != nil {
		logger.WithError(err).
			Errorln("Pod digest verification failed")
		p.tryRunHooks(hooks.AFTER_AUTH_FAIL, pod, pair.Intent, logger)
		return false
	}

	p.tryRunHooks(hooks.AFTER_INSTALL, pod, pair.Intent, logger)

	if pair.Reality != nil {
		logger.NoFields().Infoln("Invoking the disable hook and halting runit services")
		success, err := pod.Halt(pair.Reality)
		if err != nil {
			logger.WithError(err).
				Errorln("Pod halt failed")
		} else if !success {
			logger.NoFields().Warnln("One or more launchables did not halt successfully")
		}
	}

	p.tryRunHooks(hooks.BEFORE_LAUNCH, pod, pair.Intent, logger)

	logger.NoFields().Infoln("Setting up new runit services and running the enable hook")

	ok, err := pod.Launch(pair.Intent)
	if err != nil {
		logger.WithError(err).
			Errorln("Launch failed")
	} else {
		if pair.PodUniqueKey == "" {
			// legacy pod, write the manifest back to reality tree
			duration, err := p.store.SetPod(kp.REALITY_TREE, p.node, pair.Intent)
			if err != nil {
				logger.WithErrorAndFields(err, logrus.Fields{
					"duration": duration}).
					Errorln("Could not set pod in reality store")
			}
		} else {
			// TODO: do this in a transaction
			err = p.podStore.WriteRealityIndex(pair.PodUniqueKey, p.node)
			if err != nil {
				logger.WithError(err).
					Errorln("Could not write uuid index to reality store")
			}

			// uuid pod, write the manifest to the pod status tree.
			mutator := func(ps podstatus.PodStatus) (podstatus.PodStatus, error) {
				manifestBytes, err := pair.Intent.Marshal()
				if err != nil {
					return ps, util.Errorf("Could not convert manifest to string to update pod status")
				}

				ps.PodStatus = podstatus.PodLaunched
				ps.Manifest = string(manifestBytes)
				return ps, nil
			}
			err := p.podStatusStore.MutateStatus(pair.PodUniqueKey, mutator)
			if err != nil {
				logger.WithError(err).Errorln("Could not update manifest in pod status")
			}
		}

		p.tryRunHooks(hooks.AFTER_LAUNCH, pod, pair.Intent, logger)

		pod.Prune(p.maxLaunchableDiskUsage, pair.Intent) // errors are logged internally
	}
	return err == nil && ok
}

// no return value, no output channels. This should do everything it needs to do
// without outside intervention (other than being signalled to quit)
func (p *Preparer) handlePods(podChan <-chan ManifestPair, quit <-chan struct{}) {
	// install new launchables
	var nextLaunch ManifestPair

	// used to track if we have work to do (i.e. pod manifest came through channel
	// and we have yet to operate on it)
	working := false
	var manifestLogger logging.Logger
	for {
		select {
		case <-quit:
			return
		case nextLaunch = <-podChan:
			var sha string
			if nextLaunch.Intent != nil {
				sha, _ = nextLaunch.Intent.SHA()
			} else {
				sha, _ = nextLaunch.Reality.SHA()
			}
			manifestLogger = p.Logger.SubLogger(logrus.Fields{
				"pod": nextLaunch.ID,
				"sha": sha,
			})
			manifestLogger.NoFields().Debugln("New manifest received")

			if nextLaunch.Intent == nil {
				// if intent=nil then reality!=nil and we need to delete the pod
				// therefore we must set working=true here
				working = true
			} else {
				// non-nil intent manifests need to be authorized first
				working = p.authorize(nextLaunch.Intent, manifestLogger)
				if !working {
					p.tryRunHooks(
						hooks.AFTER_AUTH_FAIL,
						p.podFactory.NewPod(nextLaunch.ID),
						nextLaunch.Intent,
						manifestLogger,
					)
				}
			}
		case <-time.After(1 * time.Second):
			if working {
				pod := p.podFactory.NewPod(nextLaunch.ID)

				// TODO better solution: force the preparer to have a 0s default timeout, prevent KILLs
				if pod.Id == POD_ID {
					pod.DefaultTimeout = time.Duration(0)
				}

				effectiveLogBridgeExec := p.logExec
				// pods that are in the blacklist for this preparer shall not use the
				// preparer's log exec. Instead, they will use the default svlogd logexec.
				for _, podID := range p.logBridgeBlacklist {
					if pod.Id.String() == podID {
						effectiveLogBridgeExec = svlogdExec
						break
					}
				}
				pod.SetLogBridgeExec(effectiveLogBridgeExec)
				pod.SetFinishExec(p.finishExec)

				// podChan is being fed values gathered from a kp.Watch() in
				// WatchForPodManifestsForNode(). If the watch returns a new pair of
				// intent/reality values before the previous change has finished
				// processing in resolvePair(), the reality value will be stale. This
				// leads to a bug where the preparer will appear to update a package
				// and when that is finished, "update" it again.
				//
				// The correct solution probably involves watching reality and intent
				// and feeding updated pairs to a control loop.
				//
				// This is a quick fix to ensure that the reality value being used is
				// up-to-date. The de-bouncing logic in this method should ensure that the
				// intent value is fresh (to the extent that Consul is timely). Fetching
				// the reality value again ensures its freshness too.
				reality, _, err := p.store.Pod(kp.REALITY_TREE, p.node, nextLaunch.ID)
				if err == pods.NoCurrentManifest {
					nextLaunch.Reality = nil
				} else if err != nil {
					manifestLogger.WithError(err).Errorln("Error getting reality manifest")
					break
				} else {
					nextLaunch.Reality = reality
				}

				ok := p.resolvePair(nextLaunch, pod, manifestLogger)
				if ok {
					nextLaunch = ManifestPair{}
					working = false
				}
			}
		}
	}
}

func verifyProcessExit(errCh chan error, tempDir string, logger logging.Logger) {
	defer close(errCh)

	// Schedule a uuid pod
	podUniqueKey, err := createHelloUUIDPod(tempDir, 43772, logger)
	if err != nil {
		errCh <- fmt.Errorf("Could not schedule UUID hello pod: %s", err)
		return
	}
	logger = logger.SubLogger(logrus.Fields{
		"pod_unique_key": podUniqueKey,
	})
	logger.Infoln("Scheduled hello instance on port 43772")

	err = verifyHelloRunning(podUniqueKey, logger)
	if err != nil {
		errCh <- fmt.Errorf("Couldn't get hello running as a uuid pod: %s", err)
		return
	}
	logger.Infoln("Hello instance launched")

	time.Sleep(3 * time.Second)

	logger.Infoln("Waiting for hello instance to listen on 43772")
	// now wait for the hello server to start running
	timeout := time.After(30 * time.Second)
	for {
		resp, err := http.Get("http://localhost:43772/")
		if err == nil {
			resp.Body.Close()
			break
		}

		select {
		case <-timeout:
			errCh <- fmt.Errorf("Hello didn't come up listening on 43772: %s", err)
			return
		default:
		}
		time.Sleep(1 * time.Second)
	}

	exitCode := rand.Intn(100) + 1
	logger.Infof("Causing hello on 43772 to exit with status %d", exitCode)
	// Make an http request to hello to make it exit with exitCode. We expect the http request to fail due
	// to the server exiting, so don't check for http errors.
	_, err = http.Get(fmt.Sprintf("http://localhost:43772/exit/%d", exitCode))
	if err == nil {
		// This is bad, it means the hello server didn't die and kill our request
		// in the middle
		errCh <- util.Errorf("Couldn't kill hello server with http request")
		return
	}

	// An EOF is good: it means the server died in the middle of our request,
	// which is what we asked it to do.
	urlError, ok := err.(*url.Error)
	if !ok || urlError.Err != io.EOF {
		errCh <- fmt.Errorf("Couldn't tell hello to die over http: %s", err)
		return
	}

	logger.Infoln("Checking for exit code in SQL database")
	finishService, err := podprocess.NewSQLiteFinishService(sqliteFinishDatabasePath, logging.DefaultLogger)
	if err != nil {
		errCh <- err
		return
	}

	var finishResult podprocess.FinishOutput
	timeout = time.After(30 * time.Second)
	for {
		finishResult, err = finishService.LastFinishForPodUniqueKey(podUniqueKey)
		if err == nil {
			break
		}

		select {
		case <-timeout:
			// Try to manually run the finish script in order to make debugging the test failure easier.
			// Use a separate error variable so we don't clobber the finish-service error reported below.
			output, debugErr := exec.Command("sudo", fmt.Sprintf("/var/service/hello-%s__hello__launch/finish", podUniqueKey), "1", "2").CombinedOutput()
			if debugErr != nil {
				logger.WithError(debugErr).Infoln("DEBUG: Debug attempt to run finish script failed")
			}
			logger.Infof("DEBUG: Output of direct execution of finish script: %s", string(output))

			errCh <- fmt.Errorf("Did not find a finish row by the deadline: %s", err)
			return
		default:
		}
	}

	if finishResult.PodUniqueKey != podUniqueKey {
		errCh <- fmt.Errorf("Expected finish result for '%s' but it was for '%s'", podUniqueKey, finishResult.PodUniqueKey)
		return
	}

	if finishResult.ExitCode != exitCode {
		errCh <- fmt.Errorf("Exit code for '%s' in the sqlite database was expected to be %d but was %d", podUniqueKey, exitCode, finishResult.ExitCode)
		return
	}

	logger.Infoln("Checking for exit code in consul")
	timeout = time.After(30 * time.Second)
	podStatusStore := podstatus.NewConsul(statusstore.NewConsul(kp.NewConsulClient(kp.Options{})), kp.PreparerPodStatusNamespace)
	for {
		podStatus, _, err := podStatusStore.Get(podUniqueKey)
		if err != nil {
			errCh <- err
			return
		}

		found := false
		for _, processStatus := range podStatus.ProcessStatuses {
			if processStatus.LaunchableID == "hello" && processStatus.EntryPoint == "launch" {
				found = true
				if processStatus.LastExit == nil {
					errCh <- fmt.Errorf("Found no last exit in consul pod status for %s", podUniqueKey)
					return
				}

				if processStatus.LastExit.ExitCode != exitCode {
					errCh <- fmt.Errorf("Exit code for '%s' in consul was expected to be %d but was %d", podUniqueKey, exitCode, processStatus.LastExit.ExitCode)
					return
				}
			}
		}

		if found {
			logger.Infoln("Successful!")
			break
		}

		select {
		case <-timeout:
			errCh <- fmt.Errorf("There was no pod process for hello/launch for %s in consul", podUniqueKey)
			return
		default:
		}
	}
}

// Tries to delete the given RU every second until it succeeds.
func (rlf *Farm) mustDeleteRU(id roll_fields.ID, logger logging.Logger) {
	for err := rlf.rls.Delete(id); err != nil; err = rlf.rls.Delete(id) {
		logger.WithError(err).Errorln("Could not delete update")
		time.Sleep(1 * time.Second)
	}
}

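// The loop above is the same retry-forever pattern as RetryOrQuit (shown
// earlier), minus the quit channel. If the farm had one (a hypothetical
// "rlf.quit" field), the method could be expressed as:
//
//	RetryOrQuit(func() error { return rlf.rls.Delete(id) }, rlf.quit, logger, "Could not delete update")
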
// no return value, no output channels. This should do everything it needs to do
// without outside intervention (other than being signalled to quit)
func (p *Preparer) handlePods(podChan <-chan ManifestPair, quit <-chan struct{}) {
	// install new launchables
	var nextLaunch ManifestPair

	// used to track if we have work to do (i.e. pod manifest came through channel
	// and we have yet to operate on it)
	working := false
	var manifestLogger logging.Logger

	// The design of p2-preparer is to continuously retry installation
	// failures, for example downloading of the launchable. An exponential
	// backoff is important to avoid putting undue load on the artifact
	// server, for example.
	backoffTime := minimumBackoffTime
	for {
		select {
		case <-quit:
			return
		case nextLaunch = <-podChan:
			backoffTime = minimumBackoffTime
			var sha string

			// TODO: handle errors appropriately from SHA().
			if nextLaunch.Intent != nil {
				sha, _ = nextLaunch.Intent.SHA()
			} else {
				sha, _ = nextLaunch.Reality.SHA()
			}
			manifestLogger = p.Logger.SubLogger(logrus.Fields{
				"pod":            nextLaunch.ID,
				"sha":            sha,
				"pod_unique_key": nextLaunch.PodUniqueKey,
			})
			manifestLogger.NoFields().Debugln("New manifest received")

			working = true
		case <-time.After(backoffTime):
			if working {
				var pod *pods.Pod
				var err error
				if nextLaunch.PodUniqueKey == "" {
					pod = p.podFactory.NewLegacyPod(nextLaunch.ID)
				} else {
					pod, err = p.podFactory.NewUUIDPod(nextLaunch.ID, nextLaunch.PodUniqueKey)
					if err != nil {
						manifestLogger.WithError(err).Errorln("Could not initialize pod")
						break
					}
				}

				// TODO better solution: force the preparer to have a 0s default timeout, prevent KILLs
				if pod.Id == constants.PreparerPodID {
					pod.DefaultTimeout = time.Duration(0)
				}

				effectiveLogBridgeExec := p.logExec
				// pods that are in the blacklist for this preparer shall not use the
				// preparer's log exec. Instead, they will use the default svlogd logexec.
				for _, podID := range p.logBridgeBlacklist {
					if pod.Id.String() == podID {
						effectiveLogBridgeExec = svlogdExec
						break
					}
				}
				pod.SetLogBridgeExec(effectiveLogBridgeExec)
				pod.SetFinishExec(p.finishExec)

				// podChan is being fed values gathered from a kp.Watch() in
				// WatchForPodManifestsForNode(). If the watch returns a new pair of
				// intent/reality values before the previous change has finished
				// processing in resolvePair(), the reality value will be stale. This
				// leads to a bug where the preparer will appear to update a package
				// and when that is finished, "update" it again.
				//
				// Example ordering of bad events:
				// 1) update to /intent for pod A comes in, /reality is read and
				//    resolvePair() handles it
				// 2) before resolvePair() finishes, another /intent update comes in,
				//    and /reality is read but hasn't been changed. This update cannot
				//    be processed until the previous resolvePair() call finishes, and
				//    updates /reality. Now the reality value used here is stale. We
				//    want to refresh our /reality read so we don't restart the pod if
				//    intent didn't change between updates.
				//
				// The correct solution probably involves watching reality and intent
				// and feeding updated pairs to a control loop.
				//
				// This is a quick fix to ensure that the reality value being used is
				// up-to-date. The de-bouncing logic in this method should ensure that the
				// intent value is fresh (to the extent that Consul is timely). Fetching
				// the reality value again ensures its freshness too.
if nextLaunch.PodUniqueKey == "" { // legacy pod, get reality manifest from reality tree reality, _, err := p.store.Pod(kp.REALITY_TREE, p.node, nextLaunch.ID) if err == pods.NoCurrentManifest { nextLaunch.Reality = nil } else if err != nil { manifestLogger.WithError(err).Errorln("Error getting reality manifest") break } else { nextLaunch.Reality = reality } } else { // uuid pod, get reality manifest from pod status status, _, err := p.podStatusStore.Get(nextLaunch.PodUniqueKey) switch { case err != nil && !statusstore.IsNoStatus(err): manifestLogger.WithError(err).Errorln("Error getting reality manifest from pod status") break case statusstore.IsNoStatus(err): nextLaunch.Reality = nil default: manifest, err := manifest.FromBytes([]byte(status.Manifest)) if err != nil { manifestLogger.WithError(err).Errorln("Error parsing reality manifest from pod status") break } nextLaunch.Reality = manifest } } ok := p.resolvePair(nextLaunch, pod, manifestLogger) if ok { nextLaunch = ManifestPair{} working = false // Reset the backoff time backoffTime = minimumBackoffTime } else { // Double the backoff time with a maximum of 1 minute backoffTime = backoffTime * 2 if backoffTime > 1*time.Minute { backoffTime = 1 * time.Minute } } } } } }
// SessionManager continually creates and maintains Consul sessions. It is intended to be
// run in its own goroutine. If one session expires, a new one will be created. As
// sessions come and go, the session ID (or "" for an expired session) will be sent on the
// output channel.
//
// Parameters:
//   config: Configuration passed to Consul when creating a new session.
//   client: The Consul client to use.
//   output: The channel used for exposing Consul session IDs. This method takes
//           ownership of this channel and will close it once no new IDs will be created.
//   done:   Close this channel to close the current session (if any) and stop creating
//           new sessions.
//   logger: Errors will be logged to this logger.
func SessionManager(
	config api.SessionEntry,
	client ConsulClient,
	output chan<- string,
	done chan struct{},
	logger logging.Logger,
) {
	prng := randseed.NewRand()
	baseDelay := time.Duration(*SessionRetrySeconds) * time.Second
	maxDelay := time.Duration(*SessionMaxRetrySeconds) * time.Second
	useDelay := false
	var delay time.Duration

	logger.NoFields().Info("session manager: starting up")
	for {
		if useDelay {
			// Normalize timeout range
			if delay < baseDelay {
				delay = baseDelay
			} else if delay > maxDelay {
				delay = maxDelay
			}
			select {
			case <-time.After(time.Duration(prng.Int63n(int64(delay)))):
			case <-done:
			}
		} else {
			// Skip the delay on the first loop iteration
			useDelay = true
		}

		// Check for exit signal
		select {
		case <-done:
			logger.NoFields().Info("session manager: shutting down")
			close(output)
			return
		default:
		}

		// Establish a new session
		id, _, err := client.Session().CreateNoChecks(&config, nil)
		if err != nil {
			logger.WithError(err).Error("session manager: error creating Consul session")
			// Exponential backoff
			delay = delay * 2
			continue
		}
		successTime := time.Now()
		delay = baseDelay
		sessionLogger := logger.SubLogger(logrus.Fields{
			"session": id,
		})
		sessionLogger.NoFields().Info("session manager: new Consul session")

		select {
		case output <- id:
			// Maintain the session
			err = client.Session().RenewPeriodic(config.TTL, id, nil, done)
			if err != nil {
				sessionLogger.WithError(err).Error("session manager: lost session")
				// Session loss is an indicator that Consul is very congested and we must
				// back off. However, it isn't clear how long to wait for. As a heuristic,
				// just ensure that "maxDelay" time has passed since the last successful
				// session creation. A session that doesn't survive long gets delayed a
				// lot; an infrequent loss gets a low delay.
				delay = maxDelay - time.Since(successTime)
			} else {
				sessionLogger.NoFields().Info("session manager: released session")
			}
			select {
			case output <- "":
			case <-done:
			}
		case <-done:
			// Don't bother reporting the new session if exiting
			_, _ = client.Session().Destroy(id, nil)
			sessionLogger.NoFields().Info("session manager: released session")
		}
	}
}
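
// Worked example of the "maxDelay - time.Since(successTime)" heuristic above
// (illustrative; assumes baseDelay = 5s and maxDelay = 300s): a session lost
// 250s after creation yields delay = 50s, while one lost after 10 minutes
// yields a negative delay that the normalization step at the top of the loop
// clamps back up to baseDelay. Short-lived sessions therefore wait out nearly
// the full maxDelay, while infrequent losses retry almost immediately. The
// actual sleep is then drawn uniformly from [0, delay), so concurrent
// SessionManager instances also avoid retrying in lockstep.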