// NewUpdate creates a new Update. The kp.Store, rcstore.Store, rc.Labeler, and
// scheduler.Scheduler arguments should be the same as those of the RCs themselves. The
// session must be valid for the lifetime of the Update; maintaining this is the
// responsibility of the caller.
func NewUpdate(
	f fields.Update,
	kps kp.Store,
	rcs rcstore.Store,
	hcheck checker.ConsulHealthChecker,
	labeler rc.Labeler,
	sched scheduler.Scheduler,
	logger logging.Logger,
	session kp.Session,
	alerter alerting.Alerter,
) Update {
	if alerter == nil {
		alerter = alerting.NewNop()
	}
	logger = logger.SubLogger(logrus.Fields{
		"desired_replicas": f.DesiredReplicas,
		"minimum_replicas": f.MinimumReplicas,
	})
	return &update{
		Update:  f,
		kps:     kps,
		rcs:     rcs,
		hcheck:  hcheck,
		labeler: labeler,
		sched:   sched,
		logger:  logger,
		session: session,
		alerter: alerter,
	}
}
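// Usage sketch (assumption, not part of the original source): wiring NewUpdate up from a
// caller that already holds the stores, health checker, labeler, scheduler, and session.
// Passing a nil alerter substitutes a no-op alerter, as the constructor above shows.
func newUpdateSketch(
	f fields.Update,
	kps kp.Store,
	rcs rcstore.Store,
	hcheck checker.ConsulHealthChecker,
	labeler rc.Labeler,
	sched scheduler.Scheduler,
	session kp.Session,
) Update {
	return NewUpdate(f, kps, rcs, hcheck, labeler, sched, logging.DefaultLogger, session, nil)
}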
// SessionManager continually creates and maintains Consul sessions. It is intended to be
// run in its own goroutine. If one session expires, a new one will be created. As
// sessions come and go, the session ID (or "" for an expired session) will be sent on the
// output channel.
//
// Parameters:
//   config:  Configuration passed to Consul when creating a new session.
//   client:  The Consul client to use.
//   output:  The channel used for exposing Consul session IDs. This function takes
//            ownership of the channel and will close it once no new IDs will be created.
//   done:    Close this channel to close the current session (if any) and stop creating
//            new sessions.
//   logger:  Errors will be logged to this logger.
func SessionManager(
	config api.SessionEntry,
	client ConsulClient,
	output chan<- string,
	done chan struct{},
	logger logging.Logger,
) {
	logger.NoFields().Info("session manager: starting up")
	for {
		// Check for exit signal
		select {
		case <-done:
			logger.NoFields().Info("session manager: shutting down")
			close(output)
			return
		default:
		}

		// Establish a new session
		id, _, err := client.Session().CreateNoChecks(&config, nil)
		if err != nil {
			logger.WithError(err).Error("session manager: error creating Consul session")
			time.Sleep(time.Duration(*SessionRetrySeconds) * time.Second)
			continue
		}
		sessionLogger := logger.SubLogger(logrus.Fields{
			"session": id,
		})
		sessionLogger.NoFields().Info("session manager: new Consul session")

		select {
		case output <- id:
			// Maintain the session
			err = client.Session().RenewPeriodic(config.TTL, id, nil, done)
			if err != nil {
				sessionLogger.WithError(err).Error("session manager: lost session")
			} else {
				sessionLogger.NoFields().Info("session manager: released session")
			}
			select {
			case output <- "":
			case <-done:
			}
		case <-done:
			// Don't bother reporting the new session if exiting
			_, _ = client.Session().Destroy(id, nil)
			sessionLogger.NoFields().Info("session manager: released session")
		}
	}
}
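// Consumer sketch (assumption, not part of the original source): how a caller might run
// SessionManager in a goroutine and read the ID stream it produces. An empty string
// signals that the previous session expired; closing done shuts the manager down.
func consumeSessionsSketch(config api.SessionEntry, client ConsulClient, logger logging.Logger) {
	sessions := make(chan string)
	done := make(chan struct{})
	go SessionManager(config, client, sessions, done, logger)

	for id := range sessions {
		if id == "" {
			// The previous session expired; anything it held is gone.
			continue
		}
		// A fresh session ID is available for acquiring locks and keys.
		_ = id
	}
	_ = done // close(done) to stop creating sessions and release the current one
}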
// processHealthUpdater() runs in a goroutine to keep Consul in sync with the local health
// state. It is written as a non-blocking finite state machine: events arrive and update
// internal state, and after each event, the internal state is examined to see if an
// asynchronous action needs to be taken.
//
// Events come from three different sources:
//  1. App monitors send their periodic health check results here. When the service is no
//     longer being checked, the monitor must close this channel.
//  2. The session manager sends notifications whenever the current Consul session
//     expires or is renewed. When the manager exits, it must close this channel.
//  3. Writes to Consul are performed in a separate goroutine, and when each finishes, it
//     notifies the updater of what it just wrote.
//
// In response to these events, two actions can be taken:
//  A. Exit, once the app monitor has exited and the health check in Consul has been
//     removed.
//  B. Write the recent service state to Consul. At most one outstanding write will be
//     in-flight at any time.
func processHealthUpdater(
	client consulutil.ConsulClient,
	checksStream <-chan WatchResult,
	sessionsStream <-chan string,
	logger logging.Logger,
) {
	var localHealth *WatchResult  // Health last reported by checker
	var remoteHealth *WatchResult // Health last written to Consul
	var session string            // Current session
	var write <-chan writeResult  // Future result of an in-flight write
	var throttle <-chan time.Time // If set, writes are throttled

	// Track and limit all writes to avoid crushing Consul
	bucketRefreshRate := time.Minute / time.Duration(*HealthWritesPerMinute)
	rateLimiter, err := limit.NewTokenBucket(
		*HealthMaxBucketSize,
		*HealthMaxBucketSize,
		bucketRefreshRate,
	)
	if err != nil {
		panic("invalid token bucket parameters")
	}

	logger.NoFields().Debug("starting update loop")
	for {
		// Receive event notification; update internal FSM state
		select {
		case h, ok := <-checksStream:
			// The local health checker sent a new result
			if ok {
				logger.NoFields().Debug("new health status: ", h.Status)
				if !healthEquiv(localHealth, &h) {
					msg := fmt.Sprintf("Service %s is now %s", h.Service, h.Status)
					if health.Passing.Is(h.Status) {
						logger.NoFields().Infoln(msg)
					} else {
						logger.NoFields().Warnln(msg)
					}
				}
				localHealth = &h
			} else {
				logger.NoFields().Debug("check stream closed")
				checksStream = nil
				localHealth = nil
			}
		case s, ok := <-sessionsStream:
			// The active Consul session changed
			if ok {
				logger.NoFields().Debug("new session: ", s)
			} else {
				logger.NoFields().Debug("session stream closed")
				sessionsStream = nil
			}
			session = s
			// The old health result is deleted when its session expires
			remoteHealth = nil
		case result := <-write:
			// The in-flight write completed
			logger.NoFields().Debug("write completed: ", result.OK)
			write = nil
			if result.OK {
				remoteHealth = result.Health
				if result.Throttle && throttle == nil {
					throttle = time.After(time.Duration(*HealthResumeLimit) * bucketRefreshRate)
					logger.NoFields().Warningf("Service %s health is flapping; throttling updates", result.Health.Service)
				}
			}
		case <-throttle:
			throttle = nil
			logger.NoFields().Warning("health is stable; resuming updates")
		}

		// Exit
		if checksStream == nil && remoteHealth == nil && write == nil {
			logger.NoFields().Debug("exiting update loop")
			return
		}

		// Send update to Consul
		if !healthEquiv(localHealth, remoteHealth) && session != "" && write == nil && throttle == nil {
			writeLogger := logger.SubLogger(logrus.Fields{
				"session": session,
			})
			w := make(chan writeResult, 1)
			if localHealth == nil {
				// Don't wait on the rate limiter when removing the health status
				rateLimiter.TryUse(1)
				logger.NoFields().Debug("deleting remote health")
				key := HealthPath(remoteHealth.Service, remoteHealth.Node)
				go sendHealthUpdate(writeLogger, w, nil, false, func() error {
					_, err := client.KV().Delete(key, nil)
					if err != nil {
						return consulutil.NewKVError("delete", key, err)
					}
					return nil
				})
			} else {
				writeHealth := localHealth
				doThrottle := false
				if count, _ := rateLimiter.TryUse(1); count <= 1 {
					// This is the last update before the throttle will be engaged. Write a
					// special message.
					logger.NoFields().Debug("writing throttled health")
					writeHealth = toThrottled(localHealth)
					doThrottle = true
				} else {
					logger.NoFields().Debug("writing remote health")
				}
				kv, err := healthToKV(*writeHealth, session)
				if err != nil {
					// Practically, this should never happen.
					logger.WithErrorAndFields(err, logrus.Fields{
						"health": *writeHealth,
					}).Error("could not serialize health update")
					localHealth = nil
					continue
				}
				if remoteHealth == nil {
					go sendHealthUpdate(writeLogger, w, localHealth, doThrottle, func() error {
						ok, _, err := client.KV().Acquire(kv, nil)
						if err != nil {
							return consulutil.NewKVError("acquire", kv.Key, err)
						}
						if !ok {
							return fmt.Errorf("write denied")
						}
						return nil
					})
				} else {
					go sendHealthUpdate(writeLogger, w, localHealth, doThrottle, func() error {
						_, err := client.KV().Put(kv, nil)
						if err != nil {
							return consulutil.NewKVError("put", kv.Key, err)
						}
						return nil
					})
				}
			}
			write = w
		}
	}
}
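// Wiring sketch (assumption, not part of the original source): the three event sources
// described in the doc comment above, made concrete. The monitor and session-feed
// callbacks here are hypothetical stand-ins; in practice the sessions channel would be
// fed by SessionManager or a wrapper around it.
func runHealthUpdaterSketch(
	client consulutil.ConsulClient,
	monitor func(chan<- WatchResult), // hypothetical: app health monitor; closes its channel when checking stops
	sessionFeed func(chan<- string), // hypothetical: forwards session IDs ("" on expiry); closes on exit
	logger logging.Logger,
) {
	checks := make(chan WatchResult)
	sessions := make(chan string)
	go monitor(checks)
	go sessionFeed(sessions)
	processHealthUpdater(client, checks, sessions, logger)
}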
func New(preparerConfig *PreparerConfig, logger logging.Logger) (*Preparer, error) {
	addHooks(preparerConfig, logger)

	if preparerConfig.ConsulAddress == "" {
		return nil, util.Errorf("No Consul address given to the preparer")
	}

	if preparerConfig.PodRoot == "" {
		return nil, util.Errorf("No pod root given to the preparer")
	}

	if preparerConfig.LogLevel != "" {
		lv, err := logrus.ParseLevel(preparerConfig.LogLevel)
		if err != nil {
			return nil, util.Errorf("Received invalid log level %q", preparerConfig.LogLevel)
		}
		logger.Logger.Level = lv
	}

	authPolicy, err := getDeployerAuth(preparerConfig)
	if err != nil {
		return nil, err
	}

	artifactVerifier, err := getArtifactVerifier(preparerConfig, &logger)
	if err != nil {
		return nil, err
	}

	artifactRegistry, err := getArtifactRegistry(preparerConfig)
	if err != nil {
		return nil, err
	}

	client, err := preparerConfig.GetConsulClient()
	if err != nil {
		return nil, err
	}

	statusStore := statusstore.NewConsul(client)
	podStatusStore := podstatus.NewConsul(statusStore, kp.PreparerPodStatusNamespace)
	podStore := podstore.NewConsul(client.KV())

	store := kp.NewConsulStore(client)

	maxLaunchableDiskUsage := launch.DefaultAllowableDiskUsage
	if preparerConfig.MaxLaunchableDiskUsage != "" {
		maxLaunchableDiskUsage, err = size.Parse(preparerConfig.MaxLaunchableDiskUsage)
		if err != nil {
			return nil, util.Errorf("Unparseable value for max_launchable_disk_usage %v, %v", preparerConfig.MaxLaunchableDiskUsage, err)
		}
	}

	err = os.MkdirAll(preparerConfig.PodRoot, 0755)
	if err != nil {
		return nil, util.Errorf("Could not create preparer pod directory: %s", err)
	}

	var logExec []string
	if len(preparerConfig.LogExec) > 0 {
		logExec = preparerConfig.LogExec
	} else {
		logExec = runit.DefaultLogExec()
	}

	finishExec := pods.NopFinishExec
	var podProcessReporter *podprocess.Reporter
	if preparerConfig.PodProcessReporterConfig.FullyConfigured() {
		podProcessReporterLogger := logger.SubLogger(logrus.Fields{
			"component": "PodProcessReporter",
		})

		podProcessReporter, err = podprocess.New(preparerConfig.PodProcessReporterConfig, podProcessReporterLogger, podStatusStore)
		if err != nil {
			return nil, err
		}

		finishExec = preparerConfig.PodProcessReporterConfig.FinishExec()
	}

	var hooksManifest manifest.Manifest
	var hooksPod *pods.Pod
	if preparerConfig.HooksManifest != NoHooksSentinelValue {
		if preparerConfig.HooksManifest == "" {
			return nil, util.Errorf("Must provide a hooks_manifest or sentinel value %q to indicate that there are no hooks", NoHooksSentinelValue)
		}

		hooksManifest, err = manifest.FromBytes([]byte(preparerConfig.HooksManifest))
		if err != nil {
			return nil, util.Errorf("Could not parse configured hooks manifest: %s", err)
		}

		hooksPodFactory := pods.NewHookFactory(filepath.Join(preparerConfig.PodRoot, "hooks"), preparerConfig.NodeName)
		hooksPod = hooksPodFactory.NewHookPod(hooksManifest.ID())
	}

	return &Preparer{
		node:                   preparerConfig.NodeName,
		store:                  store,
		hooks:                  hooks.Hooks(preparerConfig.HooksDirectory, preparerConfig.PodRoot, &logger),
		podStatusStore:         podStatusStore,
		podStore:               podStore,
		Logger:                 logger,
		podFactory:             pods.NewFactory(preparerConfig.PodRoot, preparerConfig.NodeName),
		authPolicy:             authPolicy,
		maxLaunchableDiskUsage: maxLaunchableDiskUsage,
		finishExec:             finishExec,
		logExec:                logExec,
		logBridgeBlacklist:     preparerConfig.LogBridgeBlacklist,
		artifactVerifier:       artifactVerifier,
		artifactRegistry:       artifactRegistry,
		PodProcessReporter:     podProcessReporter,
		hooksManifest:          hooksManifest,
		hooksPod:               hooksPod,
		hooksExecDir:           preparerConfig.HooksDirectory,
	}, nil
}
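// Configuration sketch (assumption, not part of the original source): the minimum fields
// that New validates directly. ConsulAddress and PodRoot are required, and HooksManifest
// must be either a manifest or the NoHooksSentinelValue. In practice these values are
// parsed from the preparer's config file rather than set literally; the node name below
// is a hypothetical placeholder.
func newPreparerSketch(logger logging.Logger) (*Preparer, error) {
	cfg := &PreparerConfig{
		NodeName:      "node1.example.com", // hypothetical node name
		ConsulAddress: "127.0.0.1:8500",
		PodRoot:       "/data/pods",
		HooksManifest: NoHooksSentinelValue, // explicitly opt out of hooks
	}
	return New(cfg, logger)
}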
// SessionManager continually creates and maintains Consul sessions. It is intended to be
// run in its own goroutine. If one session expires, a new one will be created. As
// sessions come and go, the session ID (or "" for an expired session) will be sent on the
// output channel.
//
// Parameters:
//   config:  Configuration passed to Consul when creating a new session.
//   client:  The Consul client to use.
//   output:  The channel used for exposing Consul session IDs. This function takes
//            ownership of the channel and will close it once no new IDs will be created.
//   done:    Close this channel to close the current session (if any) and stop creating
//            new sessions.
//   logger:  Errors will be logged to this logger.
func SessionManager(
	config api.SessionEntry,
	client ConsulClient,
	output chan<- string,
	done chan struct{},
	logger logging.Logger,
) {
	prng := randseed.NewRand()
	baseDelay := time.Duration(*SessionRetrySeconds) * time.Second
	maxDelay := time.Duration(*SessionMaxRetrySeconds) * time.Second
	useDelay := false
	var delay time.Duration

	logger.NoFields().Info("session manager: starting up")
	for {
		if useDelay {
			// Normalize timeout range
			if delay < baseDelay {
				delay = baseDelay
			} else if delay > maxDelay {
				delay = maxDelay
			}
			select {
			case <-time.After(time.Duration(prng.Int63n(int64(delay)))):
			case <-done:
			}
		} else {
			// Skip the delay on the first loop iteration
			useDelay = true
		}

		// Check for exit signal
		select {
		case <-done:
			logger.NoFields().Info("session manager: shutting down")
			close(output)
			return
		default:
		}

		// Establish a new session
		id, _, err := client.Session().CreateNoChecks(&config, nil)
		if err != nil {
			logger.WithError(err).Error("session manager: error creating Consul session")
			// Exponential backoff
			delay = delay * 2
			continue
		}
		successTime := time.Now()
		delay = baseDelay
		sessionLogger := logger.SubLogger(logrus.Fields{
			"session": id,
		})
		sessionLogger.NoFields().Info("session manager: new Consul session")

		select {
		case output <- id:
			// Maintain the session
			err = client.Session().RenewPeriodic(config.TTL, id, nil, done)
			if err != nil {
				sessionLogger.WithError(err).Error("session manager: lost session")
				// Session loss is an indicator that Consul is very congested and we must
				// back off. However, it isn't clear how long to wait. As a heuristic, just
				// ensure that "maxDelay" time has passed since the last successful session
				// creation. A session that doesn't survive long gets delayed a lot; an
				// infrequent loss gets a short delay.
				delay = maxDelay - time.Since(successTime)
			} else {
				sessionLogger.NoFields().Info("session manager: released session")
			}
			select {
			case output <- "":
			case <-done:
			}
		case <-done:
			// Don't bother reporting the new session if exiting
			_, _ = client.Session().Destroy(id, nil)
			sessionLogger.NoFields().Info("session manager: released session")
		}
	}
}
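// Timing sketch (assumption, not part of the original source): the jitter-and-clamp
// delay computation used above, in isolation. base and max stand in for the values
// derived from the SessionRetrySeconds and SessionMaxRetrySeconds flags; the returned
// slice shows the random wait chosen for each consecutive failure.
func backoffScheduleSketch(base, max time.Duration, failures int) []time.Duration {
	prng := randseed.NewRand()
	waits := make([]time.Duration, 0, failures)
	delay := base
	for i := 0; i < failures; i++ {
		// Clamp the window to [base, max], then wait a uniformly random fraction of it.
		if delay < base {
			delay = base
		} else if delay > max {
			delay = max
		}
		waits = append(waits, time.Duration(prng.Int63n(int64(delay))))
		delay *= 2 // exponential backoff while session creation keeps failing
	}
	return waits
}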
func verifyProcessExit(errCh chan error, tempDir string, logger logging.Logger) {
	defer close(errCh)

	// Schedule a uuid pod
	podUniqueKey, err := createHelloUUIDPod(tempDir, 43772, logger)
	if err != nil {
		errCh <- fmt.Errorf("Could not schedule UUID hello pod: %s", err)
		return
	}
	logger = logger.SubLogger(logrus.Fields{
		"pod_unique_key": podUniqueKey,
	})
	logger.Infoln("Scheduled hello instance on port 43772")

	err = verifyHelloRunning(podUniqueKey, logger)
	if err != nil {
		errCh <- fmt.Errorf("Couldn't get hello running as a uuid pod: %s", err)
		return
	}
	logger.Infoln("Hello instance launched")

	time.Sleep(3 * time.Second)

	logger.Infoln("Waiting for hello instance to listen on 43772")
	// now wait for the hello server to start running
	timeout := time.After(30 * time.Second)
	for {
		resp, err := http.Get("http://localhost:43772/")
		if err == nil {
			resp.Body.Close()
			break
		}

		select {
		case <-timeout:
			errCh <- fmt.Errorf("Hello didn't come up listening on 43772: %s", err)
			return
		default:
		}
		time.Sleep(1 * time.Second)
	}

	exitCode := rand.Intn(100) + 1
	logger.Infof("Causing hello on 43772 to exit with status %d", exitCode)
	// Make an http request to hello to make it exit with exitCode. We expect the http
	// request to fail due to the server exiting, so don't check for http errors.
	_, err = http.Get(fmt.Sprintf("http://localhost:43772/exit/%d", exitCode))
	if err == nil {
		// This is bad, it means the hello server didn't die and kill our request in the
		// middle
		errCh <- util.Errorf("Couldn't kill hello server with http request")
		return
	}
	urlError, ok := err.(*url.Error)
	if !ok || urlError.Err != io.EOF {
		errCh <- fmt.Errorf("Couldn't tell hello to die over http: %s", err)
		return
	}
	// An EOF url.Error is expected here: it means the server died mid-request.

	logger.Infoln("Checking for exit code in SQL database")
	finishService, err := podprocess.NewSQLiteFinishService(sqliteFinishDatabasePath, logging.DefaultLogger)
	if err != nil {
		errCh <- err
		return
	}

	var finishResult podprocess.FinishOutput
	timeout = time.After(30 * time.Second)
	for {
		finishResult, err = finishService.LastFinishForPodUniqueKey(podUniqueKey)
		if err == nil {
			break
		}

		select {
		case <-timeout:
			// Try to manually run the finish script in order to make debugging the test
			// failure easier
			debugOutput, debugErr := exec.Command("sudo", fmt.Sprintf("/var/service/hello-%s__hello__launch/finish", podUniqueKey), "1", "2").CombinedOutput()
			if debugErr != nil {
				logger.WithError(debugErr).Infoln("DEBUG: Debug attempt to run finish script failed")
			}

			logger.Infof("DEBUG: Output of direct execution of finish script: %s", string(debugOutput))
			errCh <- fmt.Errorf("Did not find a finish row by the deadline: %s", err)
			return
		default:
		}
	}

	if finishResult.PodUniqueKey != podUniqueKey {
		errCh <- fmt.Errorf("Expected finish result for '%s' but it was for '%s'", podUniqueKey, finishResult.PodUniqueKey)
		return
	}

	if finishResult.ExitCode != exitCode {
		errCh <- fmt.Errorf("Exit code for '%s' in the sqlite database was expected to be %d but was %d", podUniqueKey, exitCode, finishResult.ExitCode)
		return
	}

	logger.Infoln("Checking for exit code in consul")
	timeout = time.After(30 * time.Second)
	podStatusStore := podstatus.NewConsul(statusstore.NewConsul(kp.NewConsulClient(kp.Options{})), kp.PreparerPodStatusNamespace)
	for {
		podStatus, _, err := podStatusStore.Get(podUniqueKey)
		if err != nil {
			errCh <- err
			return
		}

		found := false
		for _, processStatus := range podStatus.ProcessStatuses {
			if processStatus.LaunchableID == "hello" && processStatus.EntryPoint == "launch" {
				found = true
				if processStatus.LastExit == nil {
					errCh <- fmt.Errorf("Found no last exit in consul pod status for %s", podUniqueKey)
					return
				}

				if processStatus.LastExit.ExitCode != exitCode {
					errCh <- fmt.Errorf("Exit code for '%s' in consul was expected to be %d but was %d", podUniqueKey, exitCode, processStatus.LastExit.ExitCode)
					return
				}
			}
		}

		if found {
			logger.Infoln("Successful!")
			break
		}

		select {
		case <-timeout:
			errCh <- fmt.Errorf("There was no pod process for hello/launch for %s in consul", podUniqueKey)
			return
		default:
		}
	}
}
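// Driver sketch (assumption, not part of the original source): how a test harness might
// run the verification above. Errors arrive on errCh and the channel is closed when the
// check finishes, so ranging over it collects every reported failure.
func runVerifyProcessExitSketch(tempDir string, logger logging.Logger) []error {
	errCh := make(chan error)
	go verifyProcessExit(errCh, tempDir, logger)

	var failures []error
	for err := range errCh {
		failures = append(failures, err)
	}
	return failures
}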