Esempio n. 1
0
// Create a new Update. The kp.Store, rcstore.Store, labels.Applicator and
// scheduler.Scheduler arguments should be the same as those of the RCs themselves. The
// session must be valid for the lifetime of the Update; maintaining this is the
// responsibility of the caller.
func NewUpdate(
	f fields.Update,
	kps kp.Store,
	rcs rcstore.Store,
	hcheck checker.ConsulHealthChecker,
	labeler rc.Labeler,
	sched scheduler.Scheduler,
	logger logging.Logger,
	session kp.Session,
	alerter alerting.Alerter,
) Update {
	if alerter == nil {
		alerter = alerting.NewNop()
	}

	logger = logger.SubLogger(logrus.Fields{
		"desired_replicas": f.DesiredReplicas,
		"minimum_replicas": f.MinimumReplicas,
	})
	return &update{
		Update:  f,
		kps:     kps,
		rcs:     rcs,
		hcheck:  hcheck,
		labeler: labeler,
		sched:   sched,
		logger:  logger,
		session: session,
		alerter: alerter,
	}
}
Esempio n. 2
0
// SessionManager continually creates and maintains Consul sessions. It is intended to be
// run in its own goroutine. If one session expires, a new one will be created. As
// sessions come and go, the session ID (or "" for an expired session) will be sent on the
// output channel.
//
// Parameters:
//   config:  Configuration passed to Consul when creating a new session.
//   client:  The Consul client to use.
//   output:  The channel used for exposing Consul session IDs. This method takes
//            ownership of this channel and will close it once no new IDs will be created.
//   done:    Close this channel to close the current session (if any) and stop creating
//            new sessions.
//   logger:  Errors will be logged to this logger.
func SessionManager(
	config api.SessionEntry,
	client ConsulClient,
	output chan<- string,
	done chan struct{},
	logger logging.Logger,
) {
	logger.NoFields().Info("session manager: starting up")
	for {
		// Check for exit signal
		select {
		case <-done:
			logger.NoFields().Info("session manager: shutting down")
			close(output)
			return
		default:
		}
		// Establish a new session
		id, _, err := client.Session().CreateNoChecks(&config, nil)
		if err != nil {
			logger.WithError(err).Error("session manager: error creating Consul session")
			time.Sleep(time.Duration(*SessionRetrySeconds) * time.Second)
			continue
		}
		sessionLogger := logger.SubLogger(logrus.Fields{
			"session": id,
		})
		sessionLogger.NoFields().Info("session manager: new Consul session")
		select {
		case output <- id:
			// Maintain the session
			err = client.Session().RenewPeriodic(config.TTL, id, nil, done)
			if err != nil {
				sessionLogger.WithError(err).Error("session manager: lost session")
			} else {
				sessionLogger.NoFields().Info("session manager: released session")
			}
			select {
			case output <- "":
			case <-done:
			}
		case <-done:
			// Don't bother reporting the new session if exiting
			_, _ = client.Session().Destroy(id, nil)
			sessionLogger.NoFields().Info("session manager: released session")
		}
	}
}
Esempio n. 3
0
// processHealthUpdater() runs in a goroutine to keep Consul in sync with the local health
// state. It is written as a non-blocking finite state machine: events arrive and update
// internal state, and after each event, the internal state is examined to see if an
// asynchronous action needs to be taken.
//
// Events come from three different sources:
//   1. App monitors send their periodic health check results here. When the service is no
//      longer being checked, the monitor must close this channel.
//   2. The session manager sends notifications whenever the current Consul session
//      expires or is renewed. When the manager exits, it must close this channel.
//   3. Writes to Consul are performed in a separate goroutine, and when each finishes, it
//      notifies the updater of what it just wrote.
//
// In response to these events, two actions can be taken:
//   A. Exit, once the app monitor has exited and the health check in Consul has been
//      removed.
//   B. Write the recent service state to Consul. At most one outstanding write will be
//      in-flight at any time.
func processHealthUpdater(
	client consulutil.ConsulClient,
	checksStream <-chan WatchResult,
	sessionsStream <-chan string,
	logger logging.Logger,
) {
	var localHealth *WatchResult  // Health last reported by checker
	var remoteHealth *WatchResult // Health last written to Consul
	var session string            // Current session

	var write <-chan writeResult  // Future result of an in-flight write
	var throttle <-chan time.Time // If set, writes are throttled

	// Track and limit all writes to avoid crushing Consul
	bucketRefreshRate := time.Minute / time.Duration(*HealthWritesPerMinute)
	rateLimiter, err := limit.NewTokenBucket(
		*HealthMaxBucketSize,
		*HealthMaxBucketSize,
		bucketRefreshRate,
	)
	if err != nil {
		panic("invalid token bucket parameters")
	}

	logger.NoFields().Debug("starting update loop")
	for {
		// Receive event notification; update internal FSM state
		select {
		case h, ok := <-checksStream:
			// The local health checker sent a new result
			if ok {
				logger.NoFields().Debug("new health status: ", h.Status)
				if !healthEquiv(localHealth, &h) {
					msg := fmt.Sprintf("Service %s is now %s", h.Service, h.Status)
					if health.Passing.Is(h.Status) {
						logger.NoFields().Infoln(msg)
					} else {
						logger.NoFields().Warnln(msg)
					}
				}
				localHealth = &h
			} else {
				logger.NoFields().Debug("check stream closed")
				checksStream = nil
				localHealth = nil
			}
		case s, ok := <-sessionsStream:
			// The active Consul session changed
			if ok {
				logger.NoFields().Debug("new session: ", s)
			} else {
				logger.NoFields().Debug("session stream closed")
				sessionsStream = nil
			}
			session = s
			// The old health result is deleted when its session expires
			remoteHealth = nil
		case result := <-write:
			// The in-flight write completed
			logger.NoFields().Debug("write completed: ", result.OK)
			write = nil
			if result.OK {
				remoteHealth = result.Health
				if result.Throttle && throttle == nil {
					throttle = time.After(time.Duration(*HealthResumeLimit) * bucketRefreshRate)
					logger.NoFields().Warningf("Service %s health is flapping; throttling updates", result.Health.Service)
				}
			}
		case <-throttle:
			throttle = nil
			logger.NoFields().Warning("health is stable; resuming updates")
		}

		// Exit
		if checksStream == nil && remoteHealth == nil && write == nil {
			logger.NoFields().Debug("exiting update loop")
			return
		}

		// Send update to Consul
		if !healthEquiv(localHealth, remoteHealth) && session != "" && write == nil &&
			throttle == nil {
			writeLogger := logger.SubLogger(logrus.Fields{
				"session": session,
			})
			w := make(chan writeResult, 1)
			if localHealth == nil {
				// Don't wait on the rate limiter when removing the health status
				rateLimiter.TryUse(1)
				logger.NoFields().Debug("deleting remote health")
				key := HealthPath(remoteHealth.Service, remoteHealth.Node)
				go sendHealthUpdate(writeLogger, w, nil, false, func() error {
					_, err := client.KV().Delete(key, nil)
					if err != nil {
						return consulutil.NewKVError("delete", key, err)
					}
					return nil
				})
			} else {
				writeHealth := localHealth
				doThrottle := false
				if count, _ := rateLimiter.TryUse(1); count <= 1 {
					// This is the last update before the throttle will be engaged. Write a special
					// message.
					logger.NoFields().Debug("writing throttled health")
					writeHealth = toThrottled(localHealth)
					doThrottle = true
				} else {
					logger.NoFields().Debug("writing remote health")
				}
				kv, err := healthToKV(*writeHealth, session)
				if err != nil {
					// Practically, this should never happen.
					logger.WithErrorAndFields(err, logrus.Fields{
						"health": *writeHealth,
					}).Error("could not serialize health update")
					localHealth = nil
					continue
				}
				if remoteHealth == nil {
					go sendHealthUpdate(writeLogger, w, localHealth, doThrottle, func() error {
						ok, _, err := client.KV().Acquire(kv, nil)
						if err != nil {
							return consulutil.NewKVError("acquire", kv.Key, err)
						}
						if !ok {
							return fmt.Errorf("write denied")
						}
						return nil
					})
				} else {
					go sendHealthUpdate(writeLogger, w, localHealth, doThrottle, func() error {
						_, err := client.KV().Put(kv, nil)
						if err != nil {
							return consulutil.NewKVError("put", kv.Key, err)
						}
						return nil
					})
				}
			}
			write = w
		}
	}
}
Esempio n. 4
0
func New(preparerConfig *PreparerConfig, logger logging.Logger) (*Preparer, error) {
	addHooks(preparerConfig, logger)

	if preparerConfig.ConsulAddress == "" {
		return nil, util.Errorf("No Consul address given to the preparer")
	}
	if preparerConfig.PodRoot == "" {
		return nil, util.Errorf("No pod root given to the preparer")
	}

	if preparerConfig.LogLevel != "" {
		lv, err := logrus.ParseLevel(preparerConfig.LogLevel)
		if err != nil {
			return nil, util.Errorf("Received invalid log level %q", preparerConfig.LogLevel)
		}
		logger.Logger.Level = lv
	}

	authPolicy, err := getDeployerAuth(preparerConfig)
	if err != nil {
		return nil, err
	}

	artifactVerifier, err := getArtifactVerifier(preparerConfig, &logger)
	if err != nil {
		return nil, err
	}

	artifactRegistry, err := getArtifactRegistry(preparerConfig)
	if err != nil {
		return nil, err
	}

	client, err := preparerConfig.GetConsulClient()
	if err != nil {
		return nil, err
	}

	statusStore := statusstore.NewConsul(client)
	podStatusStore := podstatus.NewConsul(statusStore, kp.PreparerPodStatusNamespace)
	podStore := podstore.NewConsul(client.KV())

	store := kp.NewConsulStore(client)

	maxLaunchableDiskUsage := launch.DefaultAllowableDiskUsage
	if preparerConfig.MaxLaunchableDiskUsage != "" {
		maxLaunchableDiskUsage, err = size.Parse(preparerConfig.MaxLaunchableDiskUsage)
		if err != nil {
			return nil, util.Errorf("Unparseable value for max_launchable_disk_usage %v, %v", preparerConfig.MaxLaunchableDiskUsage, err)
		}
	}

	err = os.MkdirAll(preparerConfig.PodRoot, 0755)
	if err != nil {
		return nil, util.Errorf("Could not create preparer pod directory: %s", err)
	}

	var logExec []string
	if len(preparerConfig.LogExec) > 0 {
		logExec = preparerConfig.LogExec
	} else {
		logExec = runit.DefaultLogExec()
	}

	finishExec := pods.NopFinishExec
	var podProcessReporter *podprocess.Reporter
	if preparerConfig.PodProcessReporterConfig.FullyConfigured() {
		podProcessReporterLogger := logger.SubLogger(logrus.Fields{
			"component": "PodProcessReporter",
		})

		podProcessReporter, err = podprocess.New(preparerConfig.PodProcessReporterConfig, podProcessReporterLogger, podStatusStore)
		if err != nil {
			return nil, err
		}

		finishExec = preparerConfig.PodProcessReporterConfig.FinishExec()
	}

	var hooksManifest manifest.Manifest
	var hooksPod *pods.Pod
	if preparerConfig.HooksManifest != NoHooksSentinelValue {
		if preparerConfig.HooksManifest == "" {
			return nil, util.Errorf("Most provide a hooks_manifest or sentinel value %q to indicate that there are no hooks", NoHooksSentinelValue)
		}

		hooksManifest, err = manifest.FromBytes([]byte(preparerConfig.HooksManifest))
		if err != nil {
			return nil, util.Errorf("Could not parse configured hooks manifest: %s", err)
		}
		hooksPodFactory := pods.NewHookFactory(filepath.Join(preparerConfig.PodRoot, "hooks"), preparerConfig.NodeName)
		hooksPod = hooksPodFactory.NewHookPod(hooksManifest.ID())
	}
	return &Preparer{
		node:                   preparerConfig.NodeName,
		store:                  store,
		hooks:                  hooks.Hooks(preparerConfig.HooksDirectory, preparerConfig.PodRoot, &logger),
		podStatusStore:         podStatusStore,
		podStore:               podStore,
		Logger:                 logger,
		podFactory:             pods.NewFactory(preparerConfig.PodRoot, preparerConfig.NodeName),
		authPolicy:             authPolicy,
		maxLaunchableDiskUsage: maxLaunchableDiskUsage,
		finishExec:             finishExec,
		logExec:                logExec,
		logBridgeBlacklist:     preparerConfig.LogBridgeBlacklist,
		artifactVerifier:       artifactVerifier,
		artifactRegistry:       artifactRegistry,
		PodProcessReporter:     podProcessReporter,
		hooksManifest:          hooksManifest,
		hooksPod:               hooksPod,
		hooksExecDir:           preparerConfig.HooksDirectory,
	}, nil
}
Esempio n. 5
0
// SessionManager continually creates and maintains Consul sessions. It is intended to be
// run in its own goroutine. If one session expires, a new one will be created. As
// sessions come and go, the session ID (or "" for an expired session) will be sent on the
// output channel.
//
// Parameters:
//   config:  Configuration passed to Consul when creating a new session.
//   client:  The Consul client to use.
//   output:  The channel used for exposing Consul session IDs. This method takes
//            ownership of this channel and will close it once no new IDs will be created.
//   done:    Close this channel to close the current session (if any) and stop creating
//            new sessions.
//   logger:  Errors will be logged to this logger.
func SessionManager(
	config api.SessionEntry,
	client ConsulClient,
	output chan<- string,
	done chan struct{},
	logger logging.Logger,
) {
	prng := randseed.NewRand()
	baseDelay := time.Duration(*SessionRetrySeconds) * time.Second
	maxDelay := time.Duration(*SessionMaxRetrySeconds) * time.Second
	useDelay := false
	var delay time.Duration

	logger.NoFields().Info("session manager: starting up")
	for {
		if useDelay {
			// Normalize timeout range
			if delay < baseDelay {
				delay = baseDelay
			} else if delay > maxDelay {
				delay = maxDelay
			}
			select {
			case <-time.After(time.Duration(prng.Int63n(int64(delay)))):
			case <-done:
			}
		} else {
			// Skip the delay on the first loop iteration
			useDelay = true
		}
		// Check for exit signal
		select {
		case <-done:
			logger.NoFields().Info("session manager: shutting down")
			close(output)
			return
		default:
		}
		// Establish a new session
		id, _, err := client.Session().CreateNoChecks(&config, nil)
		if err != nil {
			logger.WithError(err).Error("session manager: error creating Consul session")
			// Exponential backoff
			delay = delay * 2
			continue
		}
		successTime := time.Now()
		delay = baseDelay
		sessionLogger := logger.SubLogger(logrus.Fields{
			"session": id,
		})
		sessionLogger.NoFields().Info("session manager: new Consul session")
		select {
		case output <- id:
			// Maintain the session
			err = client.Session().RenewPeriodic(config.TTL, id, nil, done)
			if err != nil {
				sessionLogger.WithError(err).Error("session manager: lost session")
				// Session loss is an indicator that Consul is very congested and we must
				// back off. However, it isn't clear how long to wait for. As a heuristic,
				// just ensure that "maxDelay" time has passed since the last successful
				// session creation. A session that doesn't survive long gets delayed a
				// lot; an infrequent loss gets a low delay.
				delay = maxDelay - time.Since(successTime)
			} else {
				sessionLogger.NoFields().Info("session manager: released session")
			}
			select {
			case output <- "":
			case <-done:
			}
		case <-done:
			// Don't bother reporting the new session if exiting
			_, _ = client.Session().Destroy(id, nil)
			sessionLogger.NoFields().Info("session manager: released session")
		}
	}
}
Esempio n. 6
0
func verifyProcessExit(errCh chan error, tempDir string, logger logging.Logger) {
	defer close(errCh)

	// Schedule a uuid pod
	podUniqueKey, err := createHelloUUIDPod(tempDir, 43772, logger)
	if err != nil {
		errCh <- fmt.Errorf("Could not schedule UUID hello pod: %s", err)
		return
	}

	logger = logger.SubLogger(logrus.Fields{
		"pod_unique_key": podUniqueKey,
	})
	logger.Infoln("Scheduled hello instance on port 43772")

	err = verifyHelloRunning(podUniqueKey, logger)
	if err != nil {
		errCh <- fmt.Errorf("Couldn't get hello running as a uuid pod: %s", err)
		return
	}
	logger.Infoln("Hello instance launched")

	time.Sleep(3 * time.Second)

	logger.Infoln("Waiting for hello instance to listen on 43772")
	// now wait for the hello server to start running
	timeout := time.After(30 * time.Second)
	for {
		resp, err := http.Get("http://localhost:43772/")
		if err == nil {
			resp.Body.Close()
			break
		}

		select {
		case <-timeout:
			errCh <- fmt.Errorf("Hello didn't come up listening on 43772: %s", err)
			return
		default:
		}

		time.Sleep(1 * time.Second)
	}

	exitCode := rand.Intn(100) + 1
	logger.Infof("Causing hello on 43772 to exit with status %d", exitCode)
	// Make an http request to hello to make it exit with exitCode. We expect the http request to fail due
	// to the server exiting, so don't check for http errors.
	_, err = http.Get(fmt.Sprintf("http://localhost:43772/exit/%d", exitCode))
	if err == nil {
		// This is bad, it means the hello server didn't die and kill our request
		// in the middle
		errCh <- util.Errorf("Couldn't kill hello server with http request")
		return
	}

	urlError, ok := err.(*url.Error)
	if ok && urlError.Err == io.EOF {
		// This is good, it means the server died
	} else {
		errCh <- fmt.Errorf("Couldn't tell hello to die over http: %s", err)
		return
	}

	logger.Infoln("Checking for exit code in SQL database")
	finishService, err := podprocess.NewSQLiteFinishService(sqliteFinishDatabasePath, logging.DefaultLogger)
	if err != nil {
		errCh <- err
		return
	}

	var finishResult podprocess.FinishOutput
	timeout = time.After(30 * time.Second)
	for {
		finishResult, err = finishService.LastFinishForPodUniqueKey(podUniqueKey)
		if err == nil {
			break
		}

		select {
		case <-timeout:
			// Try to manually run the finish script in order to make debugging the test failure easier
			output, err := exec.Command("sudo", fmt.Sprintf("/var/service/hello-%s__hello__launch/finish", podUniqueKey), "1", "2").CombinedOutput()
			if err != nil {
				logger.WithError(err).Infoln("DEBUG: Debug attempt to run finish script failed")
			}

			logger.Infof("DEBUG: Output of direct execution of finish script: %s", string(output))

			errCh <- fmt.Errorf("Did not find a finish row by the deadline: %s", err)
			return
		default:
		}
	}

	if finishResult.PodUniqueKey != podUniqueKey {
		errCh <- fmt.Errorf("Expected finish result for '%s' but it was for '%s'", podUniqueKey, finishResult.PodUniqueKey)
		return
	}

	if finishResult.ExitCode != exitCode {
		errCh <- fmt.Errorf("Exit code for '%s' in the sqlite database was expected to be %d but was %d", podUniqueKey, exitCode, finishResult.ExitCode)
		return
	}

	logger.Infoln("Checking for exit code in consul")
	timeout = time.After(30 * time.Second)
	podStatusStore := podstatus.NewConsul(statusstore.NewConsul(kp.NewConsulClient(kp.Options{})), kp.PreparerPodStatusNamespace)
	for {

		podStatus, _, err := podStatusStore.Get(podUniqueKey)
		if err != nil {
			errCh <- err
			return
		}

		found := false
		for _, processStatus := range podStatus.ProcessStatuses {
			if processStatus.LaunchableID == "hello" && processStatus.EntryPoint == "launch" {
				found = true
				if processStatus.LastExit == nil {
					errCh <- fmt.Errorf("Found no last exit in consul pod status for %s", podUniqueKey)
					return
				}

				if processStatus.LastExit.ExitCode != exitCode {
					errCh <- fmt.Errorf("Exit code for '%s' in consul was expected to be %d but was %d", podUniqueKey, exitCode, finishResult.ExitCode)
					return
				}
			}
		}

		if found {
			logger.Infoln("Successful!")
			break
		}

		select {
		case <-timeout:
			errCh <- fmt.Errorf("There was no pod process for hello/launch for %s in consul", podUniqueKey)
			return
		default:
		}
	}
}