func (r *replication) ensureInReality(
	node types.NodeName,
	timeoutCh <-chan struct{},
	nodeLogger logging.Logger,
	targetSHA string,
) error {
	for {
		select {
		case <-r.quitCh:
			return errQuit
		case <-timeoutCh:
			return errTimeout
		case <-r.replicationCancelledCh:
			return errCancelled
		case <-time.After(5 * time.Second):
			man, err := r.queryReality(node)
			if err == pods.NoCurrentManifest {
				// if the pod key doesn't exist yet, that's okay, just wait longer
			} else if err != nil {
				nodeLogger.WithErrorAndFields(err, logrus.Fields{
					"node": node,
				}).Errorln("Could not read reality for pod manifest")
			} else {
				receivedSHA, _ := man.SHA()
				if receivedSHA == targetSHA {
					nodeLogger.NoFields().Infoln("Node is current")
					return nil
				} else {
					nodeLogger.WithFields(logrus.Fields{"current": receivedSHA, "target": targetSHA}).Infoln("Waiting for current")
				}
			}
		}
	}
}
// no return value, no output channels. This should do everything it needs to do
// without outside intervention (other than being signalled to quit)
func (p *Preparer) handlePods(podChan <-chan ManifestPair, quit <-chan struct{}) {
	// install new launchables
	var nextLaunch ManifestPair

	// used to track if we have work to do (i.e. pod manifest came through channel
	// and we have yet to operate on it)
	working := false
	var manifestLogger logging.Logger
	for {
		select {
		case <-quit:
			return
		case nextLaunch = <-podChan:
			var sha string
			if nextLaunch.Intent != nil {
				sha, _ = nextLaunch.Intent.SHA()
			} else {
				sha, _ = nextLaunch.Reality.SHA()
			}
			manifestLogger = p.Logger.SubLogger(logrus.Fields{
				"pod": nextLaunch.ID,
				"sha": sha,
			})
			manifestLogger.NoFields().Debugln("New manifest received")

			if nextLaunch.Intent == nil {
				// if intent=nil then reality!=nil and we need to delete the pod,
				// therefore we must set working=true here
				working = true
			} else {
				// non-nil intent manifests need to be authorized first
				working = p.authorize(nextLaunch.Intent, manifestLogger)
				if !working {
					p.tryRunHooks(hooks.AFTER_AUTH_FAIL, pods.NewPod(nextLaunch.ID, pods.PodPath(p.podRoot, nextLaunch.ID)), nextLaunch.Intent, manifestLogger)
				}
			}
		case <-time.After(1 * time.Second):
			if working {
				pod := pods.NewPod(nextLaunch.ID, pods.PodPath(p.podRoot, nextLaunch.ID))

				// TODO better solution: force the preparer to have a 0s default timeout, prevent KILLs
				if pod.Id == POD_ID {
					pod.DefaultTimeout = time.Duration(0)
				}

				ok := p.resolvePair(nextLaunch, pod, manifestLogger)
				if ok {
					nextLaunch = ManifestPair{}
					working = false
				}
			}
		}
	}
}
// check if a manifest satisfies the authorization requirement of this preparer
func (p *Preparer) authorize(manifest manifest.Manifest, logger logging.Logger) bool {
	err := p.authPolicy.AuthorizeApp(manifest, logger)
	if err != nil {
		if err, ok := err.(auth.Error); ok {
			logger.WithFields(err.Fields).Errorln(err)
		} else {
			logger.NoFields().Errorln(err)
		}
		return false
	}
	return true
}
// SessionManager continually creates and maintains Consul sessions. It is intended to be
// run in its own goroutine. If one session expires, a new one will be created. As
// sessions come and go, the session ID (or "" for an expired session) will be sent on the
// output channel.
//
// Parameters:
//   config:  Configuration passed to Consul when creating a new session.
//   client:  The Consul client to use.
//   output:  The channel used for exposing Consul session IDs. This method takes
//            ownership of this channel and will close it once no new IDs will be created.
//   done:    Close this channel to close the current session (if any) and stop creating
//            new sessions.
//   logger:  Errors will be logged to this logger.
func SessionManager(
	config api.SessionEntry,
	client ConsulClient,
	output chan<- string,
	done chan struct{},
	logger logging.Logger,
) {
	logger.NoFields().Info("session manager: starting up")
	for {
		// Check for exit signal
		select {
		case <-done:
			logger.NoFields().Info("session manager: shutting down")
			close(output)
			return
		default:
		}

		// Establish a new session
		id, _, err := client.Session().CreateNoChecks(&config, nil)
		if err != nil {
			logger.WithError(err).Error("session manager: error creating Consul session")
			time.Sleep(time.Duration(*SessionRetrySeconds) * time.Second)
			continue
		}
		sessionLogger := logger.SubLogger(logrus.Fields{
			"session": id,
		})
		sessionLogger.NoFields().Info("session manager: new Consul session")

		select {
		case output <- id:
			// Maintain the session
			err = client.Session().RenewPeriodic(config.TTL, id, nil, done)
			if err != nil {
				sessionLogger.WithError(err).Error("session manager: lost session")
			} else {
				sessionLogger.NoFields().Info("session manager: released session")
			}
			select {
			case output <- "":
			case <-done:
			}
		case <-done:
			// Don't bother reporting the new session if exiting
			_, _ = client.Session().Destroy(id, nil)
			sessionLogger.NoFields().Info("session manager: released session")
		}
	}
}
func (p *Preparer) installAndLaunchPod(pair ManifestPair, pod Pod, logger logging.Logger) bool {
	p.tryRunHooks(hooks.BEFORE_INSTALL, pod, pair.Intent, logger)

	err := pod.Install(pair.Intent)
	if err != nil {
		// install failed, abort and retry
		logger.WithError(err).Errorln("Install failed")
		return false
	}

	err = pod.Verify(pair.Intent, p.authPolicy)
	if err != nil {
		logger.WithError(err).Errorln("Pod digest verification failed")
		p.tryRunHooks(hooks.AFTER_AUTH_FAIL, pod, pair.Intent, logger)
		return false
	}

	p.tryRunHooks(hooks.AFTER_INSTALL, pod, pair.Intent, logger)

	if pair.Reality != nil {
		success, err := pod.Halt(pair.Reality)
		if err != nil {
			logger.WithError(err).Errorln("Pod halt failed")
		} else if !success {
			logger.NoFields().Warnln("One or more launchables did not halt successfully")
		}
	}

	p.tryRunHooks(hooks.BEFORE_LAUNCH, pod, pair.Intent, logger)

	ok, err := pod.Launch(pair.Intent)
	if err != nil {
		logger.WithError(err).Errorln("Launch failed")
	} else {
		duration, err := p.store.SetPod(kp.RealityPath(p.node, pair.ID), pair.Intent)
		if err != nil {
			logger.WithErrorAndFields(err, logrus.Fields{
				"duration": duration,
			}).Errorln("Could not set pod in reality store")
		}
		p.tryRunHooks(hooks.AFTER_LAUNCH, pod, pair.Intent, logger)
	}
	return err == nil && ok
}
func (p *Preparer) installAndLaunchPod(pair ManifestPair, pod Pod, logger logging.Logger) bool {
	p.tryRunHooks(hooks.BEFORE_INSTALL, pod, pair.Intent, logger)

	logger.NoFields().Infoln("Installing pod and launchables")
	err := pod.Install(pair.Intent, p.artifactVerifier, p.artifactRegistry)
	if err != nil {
		// install failed, abort and retry
		logger.WithError(err).Errorln("Install failed")
		return false
	}

	err = pod.Verify(pair.Intent, p.authPolicy)
	if err != nil {
		logger.WithError(err).Errorln("Pod digest verification failed")
		p.tryRunHooks(hooks.AFTER_AUTH_FAIL, pod, pair.Intent, logger)
		return false
	}

	p.tryRunHooks(hooks.AFTER_INSTALL, pod, pair.Intent, logger)

	if pair.Reality != nil {
		logger.NoFields().Infoln("Invoking the disable hook and halting runit services")
		success, err := pod.Halt(pair.Reality)
		if err != nil {
			logger.WithError(err).Errorln("Pod halt failed")
		} else if !success {
			logger.NoFields().Warnln("One or more launchables did not halt successfully")
		}
	}

	p.tryRunHooks(hooks.BEFORE_LAUNCH, pod, pair.Intent, logger)

	logger.NoFields().Infoln("Setting up new runit services and running the enable hook")
	ok, err := pod.Launch(pair.Intent)
	if err != nil {
		logger.WithError(err).Errorln("Launch failed")
	} else {
		duration, err := p.store.SetPod(kp.REALITY_TREE, p.node, pair.Intent)
		if err != nil {
			logger.WithErrorAndFields(err, logrus.Fields{
				"duration": duration,
			}).Errorln("Could not set pod in reality store")
		}
		p.tryRunHooks(hooks.AFTER_LAUNCH, pod, pair.Intent, logger)

		pod.Prune(p.maxLaunchableDiskUsage, pair.Intent) // errors are logged internally
	}
	return err == nil && ok
}
func (p *Preparer) stopAndUninstallPod(pair ManifestPair, pod Pod, logger logging.Logger) bool {
	success, err := pod.Halt(pair.Reality)
	if err != nil {
		logger.WithError(err).Errorln("Pod halt failed")
	} else if !success {
		logger.NoFields().Warnln("One or more launchables did not halt successfully")
	}

	p.tryRunHooks(hooks.BEFORE_UNINSTALL, pod, pair.Reality, logger)

	err = pod.Uninstall()
	if err != nil {
		logger.WithError(err).Errorln("Uninstall failed")
		return false
	}
	logger.NoFields().Infoln("Successfully uninstalled")

	if pair.PodUniqueKey == "" {
		dur, err := p.store.DeletePod(kp.REALITY_TREE, p.node, pair.ID)
		if err != nil {
			logger.WithErrorAndFields(err, logrus.Fields{"duration": dur}).
				Errorln("Could not delete pod from reality store")
		}
	} else {
		// We don't delete so that the exit status of the pod's
		// processes can be viewed for some time after installation.
		// It is the responsibility of external systems to delete pod
		// status entries when they are no longer needed.
		err := p.podStatusStore.MutateStatus(pair.PodUniqueKey, func(podStatus podstatus.PodStatus) (podstatus.PodStatus, error) {
			podStatus.PodStatus = podstatus.PodRemoved
			return podStatus, nil
		})
		if err != nil {
			logger.WithError(err).Errorln("Could not update pod status to reflect removal")
		}

		err = p.podStore.DeleteRealityIndex(pair.PodUniqueKey, p.node)
		if err != nil {
			logger.WithError(err).Errorln("Could not remove reality index for uninstalled pod")
		}
	}
	return true
}
func (p *Preparer) resolvePair(pair ManifestPair, pod Pod, logger logging.Logger) bool {
	// do not remove the logger argument, it's not the same as p.Logger
	var oldSHA, newSHA string
	if pair.Reality != nil {
		oldSHA, _ = pair.Reality.SHA()
	}
	if pair.Intent != nil {
		newSHA, _ = pair.Intent.SHA()
	}

	if oldSHA == "" {
		logger.NoFields().Infoln("manifest is new, will update")
		return p.installAndLaunchPod(pair, pod, logger)
	}

	if newSHA == "" {
		logger.NoFields().Infoln("manifest was deleted from intent, will remove")
		return p.stopAndUninstallPod(pair, pod, logger)
	}

	if oldSHA == newSHA {
		logger.NoFields().Debugln("manifest is unchanged, no action required")
		return true
	}

	logger.WithField("old_sha", oldSHA).Infoln("manifest SHA has changed, will update")
	return p.installAndLaunchPod(pair, pod, logger)
}
// no return value, no output channels. This should do everything it needs to do
// without outside intervention (other than being signalled to quit)
func (p *Preparer) handlePods(podChan <-chan pods.Manifest, quit <-chan struct{}) {
	// install new launchables
	var manifestToLaunch pods.Manifest

	// used to track if we have work to do (i.e. pod manifest came through channel
	// and we have yet to operate on it)
	working := false
	var manifestLogger logging.Logger
	for {
		select {
		case <-quit:
			return
		case manifestToLaunch = <-podChan:
			sha, err := manifestToLaunch.SHA()
			manifestLogger = p.Logger.SubLogger(logrus.Fields{
				"pod":     manifestToLaunch.ID(),
				"sha":     sha,
				"sha_err": err,
			})
			manifestLogger.NoFields().Debugln("New manifest received")

			working = p.authorize(manifestToLaunch, manifestLogger)
			if !working {
				p.tryRunHooks(hooks.AFTER_AUTH_FAIL, pods.NewPod(manifestToLaunch.ID(), pods.PodPath(p.podRoot, manifestToLaunch.ID())), &manifestToLaunch, manifestLogger)
			}
		case <-time.After(1 * time.Second):
			if working {
				pod := pods.NewPod(manifestToLaunch.ID(), pods.PodPath(p.podRoot, manifestToLaunch.ID()))
				ok := p.installAndLaunchPod(&manifestToLaunch, pod, manifestLogger)
				if ok {
					manifestToLaunch = pods.Manifest{}
					working = false
				}
			}
		}
	}
}
func (p *Preparer) stopAndUninstallPod(pair ManifestPair, pod Pod, logger logging.Logger) bool {
	success, err := pod.Halt(pair.Reality)
	if err != nil {
		logger.WithError(err).Errorln("Pod halt failed")
	} else if !success {
		logger.NoFields().Warnln("One or more launchables did not halt successfully")
	}

	p.tryRunHooks(hooks.BEFORE_UNINSTALL, pod, pair.Reality, logger)

	err = pod.Uninstall()
	if err != nil {
		logger.WithError(err).Errorln("Uninstall failed")
		return false
	}
	logger.NoFields().Infoln("Successfully uninstalled")

	dur, err := p.store.DeletePod(kp.REALITY_TREE, p.node, pair.ID)
	if err != nil {
		logger.WithErrorAndFields(err, logrus.Fields{"duration": dur}).
			Errorln("Could not delete pod from reality store")
	}
	return true
}
func (p *Preparer) resolvePair(pair ManifestPair, pod Pod, logger logging.Logger) bool {
	// do not remove the logger argument, it's not the same as p.Logger
	var oldSHA, newSHA string
	if pair.Reality != nil {
		oldSHA, _ = pair.Reality.SHA()
	}
	if pair.Intent != nil {
		newSHA, _ = pair.Intent.SHA()
	}

	if oldSHA == "" && newSHA != "" {
		logger.NoFields().Infoln("manifest is new, will update")
		authorized := p.authorize(pair.Intent, logger)
		if !authorized {
			p.tryRunHooks(
				hooks.AFTER_AUTH_FAIL,
				pod,
				pair.Intent,
				logger,
			)
			// prevent future unnecessary loops, we don't need to check again.
			return true
		}
		return p.installAndLaunchPod(pair, pod, logger)
	}

	if newSHA == "" {
		logger.NoFields().Infoln("manifest was deleted from intent, will remove")
		return p.stopAndUninstallPod(pair, pod, logger)
	}

	if oldSHA == newSHA {
		logger.NoFields().Debugln("manifest is unchanged, no action required")
		return true
	}

	authorized := p.authorize(pair.Intent, logger)
	if !authorized {
		p.tryRunHooks(
			hooks.AFTER_AUTH_FAIL,
			pod,
			pair.Intent,
			logger,
		)
		// prevent future unnecessary loops, we don't need to check again.
		return true
	}

	logger.WithField("old_sha", oldSHA).Infoln("manifest SHA has changed, will update")
	return p.installAndLaunchPod(pair, pod, logger)
}
// no return value, no output channels. This should do everything it needs to do
// without outside intervention (other than being signalled to quit)
func (p *Preparer) handlePods(podChan <-chan ManifestPair, quit <-chan struct{}) {
	// install new launchables
	var nextLaunch ManifestPair

	// used to track if we have work to do (i.e. pod manifest came through channel
	// and we have yet to operate on it)
	working := false
	var manifestLogger logging.Logger
	for {
		select {
		case <-quit:
			return
		case nextLaunch = <-podChan:
			var sha string
			if nextLaunch.Intent != nil {
				sha, _ = nextLaunch.Intent.SHA()
			} else {
				sha, _ = nextLaunch.Reality.SHA()
			}
			manifestLogger = p.Logger.SubLogger(logrus.Fields{
				"pod": nextLaunch.ID,
				"sha": sha,
			})
			manifestLogger.NoFields().Debugln("New manifest received")

			if nextLaunch.Intent == nil {
				// if intent=nil then reality!=nil and we need to delete the pod,
				// therefore we must set working=true here
				working = true
			} else {
				// non-nil intent manifests need to be authorized first
				working = p.authorize(nextLaunch.Intent, manifestLogger)
				if !working {
					p.tryRunHooks(
						hooks.AFTER_AUTH_FAIL,
						p.podFactory.NewPod(nextLaunch.ID),
						nextLaunch.Intent,
						manifestLogger,
					)
				}
			}
		case <-time.After(1 * time.Second):
			if working {
				pod := p.podFactory.NewPod(nextLaunch.ID)

				// TODO better solution: force the preparer to have a 0s default timeout, prevent KILLs
				if pod.Id == POD_ID {
					pod.DefaultTimeout = time.Duration(0)
				}

				effectiveLogBridgeExec := p.logExec
				// pods that are in the blacklist for this preparer shall not use the
				// preparer's log exec. Instead, they will use the default svlogd logexec.
				for _, podID := range p.logBridgeBlacklist {
					if pod.Id.String() == podID {
						effectiveLogBridgeExec = svlogdExec
						break
					}
				}
				pod.SetLogBridgeExec(effectiveLogBridgeExec)
				pod.SetFinishExec(p.finishExec)

				// podChan is being fed values gathered from a kp.Watch() in
				// WatchForPodManifestsForNode(). If the watch returns a new pair of
				// intent/reality values before the previous change has finished
				// processing in resolvePair(), the reality value will be stale. This
				// leads to a bug where the preparer will appear to update a package
				// and when that is finished, "update" it again.
				//
				// The correct solution probably involves watching reality and intent
				// and feeding updated pairs to a control loop.
				//
				// This is a quick fix to ensure that the reality value being used is
				// up-to-date. The de-bouncing logic in this method should ensure that the
				// intent value is fresh (to the extent that Consul is timely). Fetching
				// the reality value again ensures its freshness too.
				reality, _, err := p.store.Pod(kp.REALITY_TREE, p.node, nextLaunch.ID)
				if err == pods.NoCurrentManifest {
					nextLaunch.Reality = nil
				} else if err != nil {
					manifestLogger.WithError(err).Errorln("Error getting reality manifest")
					break
				} else {
					nextLaunch.Reality = reality
				}

				ok := p.resolvePair(nextLaunch, pod, manifestLogger)
				if ok {
					nextLaunch = ManifestPair{}
					working = false
				}
			}
		}
	}
}
// processHealthUpdater() runs in a goroutine to keep Consul in sync with the local health
// state. It is written as a non-blocking finite state machine: events arrive and update
// internal state, and after each event, the internal state is examined to see if an
// asynchronous action needs to be taken.
//
// Events come from three different sources:
//   1. App monitors send their periodic health check results here. When the service is no
//      longer being checked, the monitor must close this channel.
//   2. The session manager sends notifications whenever the current Consul session
//      expires or is renewed. When the manager exits, it must close this channel.
//   3. Writes to Consul are performed in a separate goroutine, and when each finishes, it
//      notifies the updater of what it just wrote.
//
// In response to these events, two actions can be taken:
//   A. Exit, once the app monitor has exited and the health check in Consul has been
//      removed.
//   B. Write the recent service state to Consul. At most one outstanding write will be
//      in-flight at any time.
func processHealthUpdater(
	client consulutil.ConsulClient,
	checksStream <-chan WatchResult,
	sessionsStream <-chan string,
	logger logging.Logger,
) {
	var localHealth *WatchResult  // Health last reported by checker
	var remoteHealth *WatchResult // Health last written to Consul
	var session string            // Current session

	var write <-chan writeResult  // Future result of an in-flight write
	var throttle <-chan time.Time // If set, writes are throttled

	// Track and limit all writes to avoid crushing Consul
	bucketRefreshRate := time.Minute / time.Duration(*HealthWritesPerMinute)
	rateLimiter, err := limit.NewTokenBucket(
		*HealthMaxBucketSize,
		*HealthMaxBucketSize,
		bucketRefreshRate,
	)
	if err != nil {
		panic("invalid token bucket parameters")
	}

	logger.NoFields().Debug("starting update loop")
	for {
		// Receive event notification; update internal FSM state
		select {
		case h, ok := <-checksStream:
			// The local health checker sent a new result
			if ok {
				logger.NoFields().Debug("new health status: ", h.Status)
				if !healthEquiv(localHealth, &h) {
					msg := fmt.Sprintf("Service %s is now %s", h.Service, h.Status)
					if health.Passing.Is(h.Status) {
						logger.NoFields().Infoln(msg)
					} else {
						logger.NoFields().Warnln(msg)
					}
				}
				localHealth = &h
			} else {
				logger.NoFields().Debug("check stream closed")
				checksStream = nil
				localHealth = nil
			}
		case s, ok := <-sessionsStream:
			// The active Consul session changed
			if ok {
				logger.NoFields().Debug("new session: ", s)
			} else {
				logger.NoFields().Debug("session stream closed")
				sessionsStream = nil
			}
			session = s
			// The old health result is deleted when its session expires
			remoteHealth = nil
		case result := <-write:
			// The in-flight write completed
			logger.NoFields().Debug("write completed: ", result.OK)
			write = nil
			if result.OK {
				remoteHealth = result.Health
				if result.Throttle && throttle == nil {
					throttle = time.After(time.Duration(*HealthResumeLimit) * bucketRefreshRate)
					logger.NoFields().Warningf("Service %s health is flapping; throttling updates", result.Health.Service)
				}
			}
		case <-throttle:
			throttle = nil
			logger.NoFields().Warning("health is stable; resuming updates")
		}

		// Exit
		if checksStream == nil && remoteHealth == nil && write == nil {
			logger.NoFields().Debug("exiting update loop")
			return
		}

		// Send update to Consul
		if !healthEquiv(localHealth, remoteHealth) && session != "" && write == nil && throttle == nil {
			writeLogger := logger.SubLogger(logrus.Fields{
				"session": session,
			})
			w := make(chan writeResult, 1)
			if localHealth == nil {
				// Don't wait on the rate limiter when removing the health status
				rateLimiter.TryUse(1)
				logger.NoFields().Debug("deleting remote health")
				key := HealthPath(remoteHealth.Service, remoteHealth.Node)
				go sendHealthUpdate(writeLogger, w, nil, false, func() error {
					_, err := client.KV().Delete(key, nil)
					if err != nil {
						return consulutil.NewKVError("delete", key, err)
					}
					return nil
				})
			} else {
				writeHealth := localHealth
				doThrottle := false
				if count, _ := rateLimiter.TryUse(1); count <= 1 {
					// This is the last update before the throttle will be engaged. Write a special
					// message.
					logger.NoFields().Debug("writing throttled health")
					writeHealth = toThrottled(localHealth)
					doThrottle = true
				} else {
					logger.NoFields().Debug("writing remote health")
				}
				kv, err := healthToKV(*writeHealth, session)
				if err != nil {
					// Practically, this should never happen.
					logger.WithErrorAndFields(err, logrus.Fields{
						"health": *writeHealth,
					}).Error("could not serialize health update")
					localHealth = nil
					continue
				}
				if remoteHealth == nil {
					go sendHealthUpdate(writeLogger, w, localHealth, doThrottle, func() error {
						ok, _, err := client.KV().Acquire(kv, nil)
						if err != nil {
							return consulutil.NewKVError("acquire", kv.Key, err)
						}
						if !ok {
							return fmt.Errorf("write denied")
						}
						return nil
					})
				} else {
					go sendHealthUpdate(writeLogger, w, localHealth, doThrottle, func() error {
						_, err := client.KV().Put(kv, nil)
						if err != nil {
							return consulutil.NewKVError("put", kv.Key, err)
						}
						return nil
					})
				}
			}
			write = w
		}
	}
}
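// The updater above relies on a writeResult type and a sendHealthUpdate helper that are
// not shown in this excerpt. The sketch below is inferred only from the call sites (the
// fields OK, Health, and Throttle, and the argument order of sendHealthUpdate); the
// actual definitions in the package may differ.
type writeResult struct {
	Health   *WatchResult // health record that was written (nil for a delete)
	OK       bool         // whether the write to Consul succeeded
	Throttle bool         // whether the updater should start throttling further writes
}

// sendHealthUpdate runs one write operation and reports its outcome on w (sketch only).
func sendHealthUpdate(
	logger logging.Logger,
	w chan<- writeResult,
	health *WatchResult,
	doThrottle bool,
	sender func() error,
) {
	if err := sender(); err != nil {
		logger.WithError(err).Error("error writing health")
		w <- writeResult{OK: false}
		return
	}
	w <- writeResult{Health: health, OK: true, Throttle: doThrottle}
}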
func (p *Preparer) installAndLaunchPod(newManifest *pods.Manifest, pod Pod, logger logging.Logger) bool {
	// do not remove the logger argument, it's not the same as p.Logger

	// get currently running pod to compare with the new pod
	realityPath := kp.RealityPath(p.node, newManifest.ID())
	currentManifest, _, err := p.store.Pod(realityPath)
	currentSHA := ""
	if currentManifest != nil {
		currentSHA, _ = currentManifest.SHA()
	}
	newSHA, _ := newManifest.SHA()

	// if new or the manifest is different, launch
	newOrDifferent := (err == pods.NoCurrentManifest) || (currentSHA != newSHA)
	if newOrDifferent {
		logger.WithFields(logrus.Fields{
			"old_sha": currentSHA,
			"sha":     newSHA,
			"pod":     newManifest.ID(),
		}).Infoln("SHA is new or different from old, will update")
	}

	// if the old manifest is corrupted somehow, re-launch since we don't know if this is an update.
	problemReadingCurrentManifest := (err != nil && err != pods.NoCurrentManifest)
	if problemReadingCurrentManifest {
		logger.WithFields(logrus.Fields{
			"sha":       newSHA,
			"inner_err": err,
		}).Errorln("Current manifest not readable, will relaunch")
	}

	if newOrDifferent || problemReadingCurrentManifest {
		p.tryRunHooks(hooks.BEFORE_INSTALL, pod, newManifest, logger)

		err = pod.Install(newManifest)
		if err != nil {
			// install failed, abort and retry
			logger.WithFields(logrus.Fields{
				"err": err,
			}).Errorln("Install failed")
			return false
		}

		err = pod.Verify(newManifest, p.authPolicy)
		if err != nil {
			logger.WithField("err", err).Errorln("Pod digest verification failed")
			p.tryRunHooks(hooks.AFTER_AUTH_FAIL, pod, newManifest, logger)
			return false
		}

		p.tryRunHooks(hooks.AFTER_INSTALL, pod, newManifest, logger)

		err = p.store.RegisterService(*newManifest, p.caPath)
		if err != nil {
			logger.WithField("err", err).Errorln("Service registration failed")
			return false
		}

		if currentManifest != nil {
			success, err := pod.Halt(currentManifest)
			if err != nil {
				logger.WithField("err", err).Errorln("Pod halt failed")
			} else if !success {
				logger.NoFields().Warnln("One or more launchables did not halt successfully")
			}
		}

		ok, err := pod.Launch(newManifest)
		if err != nil {
			logger.WithFields(logrus.Fields{
				"err": err,
			}).Errorln("Launch failed")
		} else {
			duration, err := p.store.SetPod(realityPath, *newManifest)
			if err != nil {
				logger.WithFields(logrus.Fields{
					"err":      err,
					"duration": duration,
				}).Errorln("Could not set pod in reality store")
			}
			p.tryRunHooks(hooks.AFTER_LAUNCH, pod, newManifest, logger)
		}
		return err == nil && ok
	}

	// TODO: shut down removed launchables between pod versions.
	return true
}
// no return value, no output channels. This should do everything it needs to do
// without outside intervention (other than being signalled to quit)
func (p *Preparer) handlePods(podChan <-chan ManifestPair, quit <-chan struct{}) {
	// install new launchables
	var nextLaunch ManifestPair

	// used to track if we have work to do (i.e. pod manifest came through channel
	// and we have yet to operate on it)
	working := false
	var manifestLogger logging.Logger

	// The design of p2-preparer is to continuously retry installation
	// failures, for example downloading of the launchable. An exponential
	// backoff is important to avoid putting undue load on the artifact
	// server, for example.
	backoffTime := minimumBackoffTime
	for {
		select {
		case <-quit:
			return
		case nextLaunch = <-podChan:
			backoffTime = minimumBackoffTime
			var sha string

			// TODO: handle errors appropriately from SHA().
			if nextLaunch.Intent != nil {
				sha, _ = nextLaunch.Intent.SHA()
			} else {
				sha, _ = nextLaunch.Reality.SHA()
			}
			manifestLogger = p.Logger.SubLogger(logrus.Fields{
				"pod":            nextLaunch.ID,
				"sha":            sha,
				"pod_unique_key": nextLaunch.PodUniqueKey,
			})
			manifestLogger.NoFields().Debugln("New manifest received")

			working = true
		case <-time.After(backoffTime):
			if working {
				var pod *pods.Pod
				var err error
				if nextLaunch.PodUniqueKey == "" {
					pod = p.podFactory.NewLegacyPod(nextLaunch.ID)
				} else {
					pod, err = p.podFactory.NewUUIDPod(nextLaunch.ID, nextLaunch.PodUniqueKey)
					if err != nil {
						manifestLogger.WithError(err).Errorln("Could not initialize pod")
						break
					}
				}

				// TODO better solution: force the preparer to have a 0s default timeout, prevent KILLs
				if pod.Id == constants.PreparerPodID {
					pod.DefaultTimeout = time.Duration(0)
				}

				effectiveLogBridgeExec := p.logExec
				// pods that are in the blacklist for this preparer shall not use the
				// preparer's log exec. Instead, they will use the default svlogd logexec.
				for _, podID := range p.logBridgeBlacklist {
					if pod.Id.String() == podID {
						effectiveLogBridgeExec = svlogdExec
						break
					}
				}
				pod.SetLogBridgeExec(effectiveLogBridgeExec)
				pod.SetFinishExec(p.finishExec)

				// podChan is being fed values gathered from a kp.Watch() in
				// WatchForPodManifestsForNode(). If the watch returns a new pair of
				// intent/reality values before the previous change has finished
				// processing in resolvePair(), the reality value will be stale. This
				// leads to a bug where the preparer will appear to update a package
				// and when that is finished, "update" it again.
				//
				// Example ordering of bad events:
				// 1) update to /intent for pod A comes in, /reality is read and
				//    resolvePair() handles it
				// 2) before resolvePair() finishes, another /intent update comes in,
				//    and /reality is read but hasn't been changed. This update cannot
				//    be processed until the previous resolvePair() call finishes, and
				//    updates /reality. Now the reality value used here is stale. We
				//    want to refresh our /reality read so we don't restart the pod if
				//    intent didn't change between updates.
				//
				// The correct solution probably involves watching reality and intent
				// and feeding updated pairs to a control loop.
				//
				// This is a quick fix to ensure that the reality value being used is
				// up-to-date. The de-bouncing logic in this method should ensure that the
				// intent value is fresh (to the extent that Consul is timely). Fetching
				// the reality value again ensures its freshness too.
				if nextLaunch.PodUniqueKey == "" {
					// legacy pod, get reality manifest from reality tree
					reality, _, err := p.store.Pod(kp.REALITY_TREE, p.node, nextLaunch.ID)
					if err == pods.NoCurrentManifest {
						nextLaunch.Reality = nil
					} else if err != nil {
						manifestLogger.WithError(err).Errorln("Error getting reality manifest")
						break
					} else {
						nextLaunch.Reality = reality
					}
				} else {
					// uuid pod, get reality manifest from pod status
					status, _, err := p.podStatusStore.Get(nextLaunch.PodUniqueKey)
					if err != nil && !statusstore.IsNoStatus(err) {
						manifestLogger.WithError(err).Errorln("Error getting reality manifest from pod status")
						// break out of this select case and retry on the next backoff tick
						break
					} else if statusstore.IsNoStatus(err) {
						nextLaunch.Reality = nil
					} else {
						realityManifest, err := manifest.FromBytes([]byte(status.Manifest))
						if err != nil {
							manifestLogger.WithError(err).Errorln("Error parsing reality manifest from pod status")
							break
						}
						nextLaunch.Reality = realityManifest
					}
				}

				ok := p.resolvePair(nextLaunch, pod, manifestLogger)
				if ok {
					nextLaunch = ManifestPair{}
					working = false

					// Reset the backoff time
					backoffTime = minimumBackoffTime
				} else {
					// Double the backoff time with a maximum of 1 minute
					backoffTime = backoffTime * 2
					if backoffTime > 1*time.Minute {
						backoffTime = 1 * time.Minute
					}
				}
			}
		}
	}
}
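// The retry policy in handlePods above doubles the wait after every failed resolvePair
// call and caps it at one minute. Expressed as a standalone helper for illustration only
// (nextBackoff is not part of the package; minimumBackoffTime is the package-level floor
// used above):
func nextBackoff(current time.Duration) time.Duration {
	next := current * 2
	if next > 1*time.Minute {
		next = 1 * time.Minute
	}
	if next < minimumBackoffTime {
		next = minimumBackoffTime
	}
	return next
}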
// SessionManager continually creates and maintains Consul sessions. It is intended to be
// run in its own goroutine. If one session expires, a new one will be created. As
// sessions come and go, the session ID (or "" for an expired session) will be sent on the
// output channel.
//
// Parameters:
//   config:  Configuration passed to Consul when creating a new session.
//   client:  The Consul client to use.
//   output:  The channel used for exposing Consul session IDs. This method takes
//            ownership of this channel and will close it once no new IDs will be created.
//   done:    Close this channel to close the current session (if any) and stop creating
//            new sessions.
//   logger:  Errors will be logged to this logger.
func SessionManager(
	config api.SessionEntry,
	client ConsulClient,
	output chan<- string,
	done chan struct{},
	logger logging.Logger,
) {
	prng := randseed.NewRand()
	baseDelay := time.Duration(*SessionRetrySeconds) * time.Second
	maxDelay := time.Duration(*SessionMaxRetrySeconds) * time.Second
	useDelay := false
	var delay time.Duration

	logger.NoFields().Info("session manager: starting up")
	for {
		if useDelay {
			// Normalize timeout range
			if delay < baseDelay {
				delay = baseDelay
			} else if delay > maxDelay {
				delay = maxDelay
			}
			select {
			case <-time.After(time.Duration(prng.Int63n(int64(delay)))):
			case <-done:
			}
		} else {
			// Skip the delay on the first loop iteration
			useDelay = true
		}

		// Check for exit signal
		select {
		case <-done:
			logger.NoFields().Info("session manager: shutting down")
			close(output)
			return
		default:
		}

		// Establish a new session
		id, _, err := client.Session().CreateNoChecks(&config, nil)
		if err != nil {
			logger.WithError(err).Error("session manager: error creating Consul session")
			// Exponential backoff
			delay = delay * 2
			continue
		}
		successTime := time.Now()
		delay = baseDelay
		sessionLogger := logger.SubLogger(logrus.Fields{
			"session": id,
		})
		sessionLogger.NoFields().Info("session manager: new Consul session")

		select {
		case output <- id:
			// Maintain the session
			err = client.Session().RenewPeriodic(config.TTL, id, nil, done)
			if err != nil {
				sessionLogger.WithError(err).Error("session manager: lost session")
				// Session loss is an indicator that Consul is very congested and we must
				// back off. However, it isn't clear how long to wait for. As a heuristic,
				// just ensure that "maxDelay" time has passed since the last successful
				// session creation. A session that doesn't survive long gets delayed a
				// lot; an infrequent loss gets a low delay.
				delay = maxDelay - time.Since(successTime)
			} else {
				sessionLogger.NoFields().Info("session manager: released session")
			}
			select {
			case output <- "":
			case <-done:
			}
		case <-done:
			// Don't bother reporting the new session if exiting
			_, _ = client.Session().Destroy(id, nil)
			sessionLogger.NoFields().Info("session manager: released session")
		}
	}
}
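// A minimal usage sketch for SessionManager (not part of the package). The Consul client
// and logger are supplied by the caller; the TTL and session behavior shown here are
// illustrative assumptions from the Consul API, not values mandated by this code.
func exampleSessionManagerUsage(client ConsulClient, logger logging.Logger) {
	sessions := make(chan string)
	done := make(chan struct{})
	go SessionManager(api.SessionEntry{
		TTL:      "15s",
		Behavior: api.SessionBehaviorDelete,
	}, client, sessions, done, logger)

	for id := range sessions {
		if id == "" {
			// The previous session expired; any locks or session-bound keys are gone.
			continue
		}
		// A fresh session ID arrived; it is now safe to acquire locks under it.
		_ = id
	}
	// The range loop ends when SessionManager closes the output channel, which happens
	// after done is closed.
}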
func (p *Preparer) installAndLaunchPod(pair ManifestPair, pod Pod, logger logging.Logger) bool {
	p.tryRunHooks(hooks.BEFORE_INSTALL, pod, pair.Intent, logger)

	logger.NoFields().Infoln("Installing pod and launchables")
	err := pod.Install(pair.Intent, p.artifactVerifier, p.artifactRegistry)
	if err != nil {
		// install failed, abort and retry
		logger.WithError(err).Errorln("Install failed")
		return false
	}

	err = pod.Verify(pair.Intent, p.authPolicy)
	if err != nil {
		logger.WithError(err).Errorln("Pod digest verification failed")
		p.tryRunHooks(hooks.AFTER_AUTH_FAIL, pod, pair.Intent, logger)
		return false
	}

	p.tryRunHooks(hooks.AFTER_INSTALL, pod, pair.Intent, logger)

	if pair.Reality != nil {
		logger.NoFields().Infoln("Invoking the disable hook and halting runit services")
		success, err := pod.Halt(pair.Reality)
		if err != nil {
			logger.WithError(err).Errorln("Pod halt failed")
		} else if !success {
			logger.NoFields().Warnln("One or more launchables did not halt successfully")
		}
	}

	p.tryRunHooks(hooks.BEFORE_LAUNCH, pod, pair.Intent, logger)

	logger.NoFields().Infoln("Setting up new runit services and running the enable hook")
	ok, err := pod.Launch(pair.Intent)
	if err != nil {
		logger.WithError(err).Errorln("Launch failed")
	} else {
		if pair.PodUniqueKey == "" {
			// legacy pod, write the manifest back to the reality tree
			duration, err := p.store.SetPod(kp.REALITY_TREE, p.node, pair.Intent)
			if err != nil {
				logger.WithErrorAndFields(err, logrus.Fields{
					"duration": duration,
				}).Errorln("Could not set pod in reality store")
			}
		} else {
			// TODO: do this in a transaction
			err = p.podStore.WriteRealityIndex(pair.PodUniqueKey, p.node)
			if err != nil {
				logger.WithError(err).Errorln("Could not write uuid index to reality store")
			}

			// uuid pod, write the manifest to the pod status tree.
			mutator := func(ps podstatus.PodStatus) (podstatus.PodStatus, error) {
				manifestBytes, err := pair.Intent.Marshal()
				if err != nil {
					return ps, util.Errorf("Could not convert manifest to string to update pod status")
				}

				ps.PodStatus = podstatus.PodLaunched
				ps.Manifest = string(manifestBytes)
				return ps, nil
			}
			err := p.podStatusStore.MutateStatus(pair.PodUniqueKey, mutator)
			if err != nil {
				logger.WithError(err).Errorln("Could not update manifest in pod status")
			}
		}

		p.tryRunHooks(hooks.AFTER_LAUNCH, pod, pair.Intent, logger)

		pod.Prune(p.maxLaunchableDiskUsage, pair.Intent) // errors are logged internally
	}
	return err == nil && ok
}