Example #1
func (r *replication) ensureInReality(
	node types.NodeName,
	timeoutCh <-chan struct{},
	nodeLogger logging.Logger,
	targetSHA string,
) error {
	for {
		select {
		case <-r.quitCh:
			return errQuit
		case <-timeoutCh:
			return errTimeout
		case <-r.replicationCancelledCh:
			return errCancelled
		case <-time.After(5 * time.Second):
			man, err := r.queryReality(node)
			if err == pods.NoCurrentManifest {
				// if the pod key doesn't exist yet, that's okay; just wait longer
			} else if err != nil {
				nodeLogger.WithErrorAndFields(err, logrus.Fields{
					"node": node,
				}).Errorln("Could not read reality for pod manifest")
			} else {
				receivedSHA, _ := man.SHA()
				if receivedSHA == targetSHA {
					nodeLogger.NoFields().Infoln("Node is current")
					return nil
				} else {
					nodeLogger.WithFields(logrus.Fields{"current": receivedSHA, "target": targetSHA}).Infoln("Waiting for current")
				}
			}
		}
	}
}
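
The loop above is an instance of a common poll-until-condition pattern: block on quit/cancel/timeout channels, otherwise re-check state on a timer. A minimal, standard-library-only sketch of that pattern (all names below are illustrative, not part of the package above) might look like this:

// pollUntil re-runs check every interval until it reports done, or until
// quit or timeout is closed. Requires only the "errors" and "time" imports.
func pollUntil(
	quit <-chan struct{},
	timeout <-chan struct{},
	interval time.Duration,
	check func() (done bool, err error),
) error {
	for {
		select {
		case <-quit:
			return errors.New("quit requested")
		case <-timeout:
			return errors.New("timed out")
		case <-time.After(interval):
			done, err := check()
			if err != nil {
				continue // transient error: log or ignore, then poll again
			}
			if done {
				return nil
			}
		}
	}
}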
Example #2
// no return value, no output channels. This should do everything it needs to do
// without outside intervention (other than being signalled to quit)
func (p *Preparer) handlePods(podChan <-chan ManifestPair, quit <-chan struct{}) {
	// install new launchables
	var nextLaunch ManifestPair

	// used to track if we have work to do (i.e. pod manifest came through channel
	// and we have yet to operate on it)
	working := false
	var manifestLogger logging.Logger
	for {
		select {
		case <-quit:
			return
		case nextLaunch = <-podChan:
			var sha string
			if nextLaunch.Intent != nil {
				sha, _ = nextLaunch.Intent.SHA()
			} else {
				sha, _ = nextLaunch.Reality.SHA()
			}
			manifestLogger = p.Logger.SubLogger(logrus.Fields{
				"pod": nextLaunch.ID,
				"sha": sha,
			})
			manifestLogger.NoFields().Debugln("New manifest received")

			if nextLaunch.Intent == nil {
				// if intent=nil then reality!=nil and we need to delete the pod
				// therefore we must set working=true here
				working = true
			} else {
				// non-nil intent manifests need to be authorized first
				working = p.authorize(nextLaunch.Intent, manifestLogger)
				if !working {
					p.tryRunHooks(hooks.AFTER_AUTH_FAIL, pods.NewPod(nextLaunch.ID, pods.PodPath(p.podRoot, nextLaunch.ID)), nextLaunch.Intent, manifestLogger)
				}
			}
		case <-time.After(1 * time.Second):
			if working {
				pod := pods.NewPod(nextLaunch.ID, pods.PodPath(p.podRoot, nextLaunch.ID))

				// TODO better solution: force the preparer to have a 0s default timeout, prevent KILLs
				if pod.Id == POD_ID {
					pod.DefaultTimeout = time.Duration(0)
				}

				ok := p.resolvePair(nextLaunch, pod, manifestLogger)
				if ok {
					nextLaunch = ManifestPair{}
					working = false
				}
			}
		}
	}
}
Example #3
// check if a manifest satisfies the authorization requirement of this preparer
func (p *Preparer) authorize(manifest manifest.Manifest, logger logging.Logger) bool {
	err := p.authPolicy.AuthorizeApp(manifest, logger)
	if err != nil {
		if err, ok := err.(auth.Error); ok {
			logger.WithFields(err.Fields).Errorln(err)
		} else {
			logger.NoFields().Errorln(err)
		}
		return false
	}
	return true
}
Example #4
// SessionManager continually creates and maintains Consul sessions. It is intended to be
// run in its own goroutine. If one session expires, a new one will be created. As
// sessions come and go, the session ID (or "" for an expired session) will be sent on the
// output channel.
//
// Parameters:
//   config:  Configuration passed to Consul when creating a new session.
//   client:  The Consul client to use.
//   output:  The channel used for exposing Consul session IDs. This method takes
//            ownership of this channel and will close it once no new IDs will be created.
//   done:    Close this channel to close the current session (if any) and stop creating
//            new sessions.
//   logger:  Errors will be logged to this logger.
func SessionManager(
	config api.SessionEntry,
	client ConsulClient,
	output chan<- string,
	done chan struct{},
	logger logging.Logger,
) {
	logger.NoFields().Info("session manager: starting up")
	for {
		// Check for exit signal
		select {
		case <-done:
			logger.NoFields().Info("session manager: shutting down")
			close(output)
			return
		default:
		}
		// Establish a new session
		id, _, err := client.Session().CreateNoChecks(&config, nil)
		if err != nil {
			logger.WithError(err).Error("session manager: error creating Consul session")
			time.Sleep(time.Duration(*SessionRetrySeconds) * time.Second)
			continue
		}
		sessionLogger := logger.SubLogger(logrus.Fields{
			"session": id,
		})
		sessionLogger.NoFields().Info("session manager: new Consul session")
		select {
		case output <- id:
			// Maintain the session
			err = client.Session().RenewPeriodic(config.TTL, id, nil, done)
			if err != nil {
				sessionLogger.WithError(err).Error("session manager: lost session")
			} else {
				sessionLogger.NoFields().Info("session manager: released session")
			}
			select {
			case output <- "":
			case <-done:
			}
		case <-done:
			// Don't bother reporting the new session if exiting
			_, _ = client.Session().Destroy(id, nil)
			sessionLogger.NoFields().Info("session manager: released session")
		}
	}
}
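
A minimal wiring sketch for the contract described in the doc comment above, assuming a ConsulClient and a logging.Logger have already been constructed elsewhere; only the channel ownership is shown (the caller closes done, SessionManager closes output):

// runSessions shows the intended wiring; client and logger are assumed to be
// constructed elsewhere. The TTL value is illustrative.
func runSessions(client ConsulClient, logger logging.Logger) (stop func()) {
	sessions := make(chan string)
	done := make(chan struct{})

	go SessionManager(api.SessionEntry{TTL: "15s"}, client, sessions, done, logger)

	go func() {
		// SessionManager closes sessions once done is closed, so a plain
		// range loop is sufficient on the consumer side.
		for id := range sessions {
			if id == "" {
				continue // previous session expired or was released
			}
			// use the live session ID, e.g. to acquire locks
			_ = id
		}
	}()

	// Closing done releases the current session (if any) and stops creating new ones.
	return func() { close(done) }
}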
Example #5
func (p *Preparer) installAndLaunchPod(pair ManifestPair, pod Pod, logger logging.Logger) bool {
	p.tryRunHooks(hooks.BEFORE_INSTALL, pod, pair.Intent, logger)

	err := pod.Install(pair.Intent)
	if err != nil {
		// install failed, abort and retry
		logger.WithError(err).Errorln("Install failed")
		return false
	}

	err = pod.Verify(pair.Intent, p.authPolicy)
	if err != nil {
		logger.WithError(err).
			Errorln("Pod digest verification failed")
		p.tryRunHooks(hooks.AFTER_AUTH_FAIL, pod, pair.Intent, logger)
		return false
	}

	p.tryRunHooks(hooks.AFTER_INSTALL, pod, pair.Intent, logger)

	if pair.Reality != nil {
		success, err := pod.Halt(pair.Reality)
		if err != nil {
			logger.WithError(err).
				Errorln("Pod halt failed")
		} else if !success {
			logger.NoFields().Warnln("One or more launchables did not halt successfully")
		}
	}

	p.tryRunHooks(hooks.BEFORE_LAUNCH, pod, pair.Intent, logger)

	ok, err := pod.Launch(pair.Intent)
	if err != nil {
		logger.WithError(err).
			Errorln("Launch failed")
	} else {
		duration, err := p.store.SetPod(kp.RealityPath(p.node, pair.ID), pair.Intent)
		if err != nil {
			logger.WithErrorAndFields(err, logrus.Fields{
				"duration": duration}).
				Errorln("Could not set pod in reality store")
		}

		p.tryRunHooks(hooks.AFTER_LAUNCH, pod, pair.Intent, logger)
	}
	return err == nil && ok
}
Example #6
func (p *Preparer) installAndLaunchPod(pair ManifestPair, pod Pod, logger logging.Logger) bool {
	p.tryRunHooks(hooks.BEFORE_INSTALL, pod, pair.Intent, logger)

	logger.NoFields().Infoln("Installing pod and launchables")

	err := pod.Install(pair.Intent, p.artifactVerifier, p.artifactRegistry)
	if err != nil {
		// install failed, abort and retry
		logger.WithError(err).Errorln("Install failed")
		return false
	}

	err = pod.Verify(pair.Intent, p.authPolicy)
	if err != nil {
		logger.WithError(err).
			Errorln("Pod digest verification failed")
		p.tryRunHooks(hooks.AFTER_AUTH_FAIL, pod, pair.Intent, logger)
		return false
	}

	p.tryRunHooks(hooks.AFTER_INSTALL, pod, pair.Intent, logger)

	if pair.Reality != nil {
		logger.NoFields().Infoln("Invoking the disable hook and halting runit services")
		success, err := pod.Halt(pair.Reality)
		if err != nil {
			logger.WithError(err).
				Errorln("Pod halt failed")
		} else if !success {
			logger.NoFields().Warnln("One or more launchables did not halt successfully")
		}
	}

	p.tryRunHooks(hooks.BEFORE_LAUNCH, pod, pair.Intent, logger)

	logger.NoFields().Infoln("Setting up new runit services and running the enable hook")

	ok, err := pod.Launch(pair.Intent)
	if err != nil {
		logger.WithError(err).
			Errorln("Launch failed")
	} else {
		duration, err := p.store.SetPod(kp.REALITY_TREE, p.node, pair.Intent)
		if err != nil {
			logger.WithErrorAndFields(err, logrus.Fields{
				"duration": duration}).
				Errorln("Could not set pod in reality store")
		}

		p.tryRunHooks(hooks.AFTER_LAUNCH, pod, pair.Intent, logger)

		pod.Prune(p.maxLaunchableDiskUsage, pair.Intent) // errors are logged internally
	}
	return err == nil && ok
}
Example #7
func (p *Preparer) stopAndUninstallPod(pair ManifestPair, pod Pod, logger logging.Logger) bool {
	success, err := pod.Halt(pair.Reality)
	if err != nil {
		logger.WithError(err).Errorln("Pod halt failed")
	} else if !success {
		logger.NoFields().Warnln("One or more launchables did not halt successfully")
	}

	p.tryRunHooks(hooks.BEFORE_UNINSTALL, pod, pair.Reality, logger)

	err = pod.Uninstall()
	if err != nil {
		logger.WithError(err).Errorln("Uninstall failed")
		return false
	}
	logger.NoFields().Infoln("Successfully uninstalled")

	if pair.PodUniqueKey == "" {
		dur, err := p.store.DeletePod(kp.REALITY_TREE, p.node, pair.ID)
		if err != nil {
			logger.WithErrorAndFields(err, logrus.Fields{"duration": dur}).
				Errorln("Could not delete pod from reality store")
		}
	} else {
		// We don't delete the status entry, so that the exit status of the
		// pod's processes can be viewed for some time after the pod is
		// uninstalled. It is the responsibility of external systems to delete
		// pod status entries when they are no longer needed.
		err := p.podStatusStore.MutateStatus(pair.PodUniqueKey, func(podStatus podstatus.PodStatus) (podstatus.PodStatus, error) {
			podStatus.PodStatus = podstatus.PodRemoved
			return podStatus, nil
		})
		if err != nil {
			logger.WithError(err).
				Errorln("Could not update pod status to reflect removal")
		}

		err = p.podStore.DeleteRealityIndex(pair.PodUniqueKey, p.node)
		if err != nil {
			logger.WithError(err).
				Errorln("Could not remove reality index for uninstalled pod")
		}
	}
	return true
}
Example #8
func (p *Preparer) resolvePair(pair ManifestPair, pod Pod, logger logging.Logger) bool {
	// do not remove the logger argument, it's not the same as p.Logger
	var oldSHA, newSHA string
	if pair.Reality != nil {
		oldSHA, _ = pair.Reality.SHA()
	}
	if pair.Intent != nil {
		newSHA, _ = pair.Intent.SHA()
	}

	if oldSHA == "" {
		logger.NoFields().Infoln("manifest is new, will update")
		return p.installAndLaunchPod(pair, pod, logger)
	}

	if newSHA == "" {
		logger.NoFields().Infoln("manifest was deleted from intent, will remove")
		return p.stopAndUninstallPod(pair, pod, logger)
	}

	if oldSHA == newSHA {
		logger.NoFields().Debugln("manifest is unchanged, no action required")
		return true
	}

	logger.WithField("old_sha", oldSHA).Infoln("manifest SHA has changed, will update")
	return p.installAndLaunchPod(pair, pod, logger)
}
Example #9
// no return value, no output channels. This should do everything it needs to do
// without outside intervention (other than being signalled to quit)
func (p *Preparer) handlePods(podChan <-chan pods.Manifest, quit <-chan struct{}) {
	// install new launchables
	var manifestToLaunch pods.Manifest

	// used to track if we have work to do (i.e. pod manifest came through channel
	// and we have yet to operate on it)
	working := false
	var manifestLogger logging.Logger
	for {
		select {
		case <-quit:
			return
		case manifestToLaunch = <-podChan:
			sha, err := manifestToLaunch.SHA()
			manifestLogger = p.Logger.SubLogger(logrus.Fields{
				"pod":     manifestToLaunch.ID(),
				"sha":     sha,
				"sha_err": err,
			})
			manifestLogger.NoFields().Debugln("New manifest received")

			working = p.authorize(manifestToLaunch, manifestLogger)
			if !working {
				p.tryRunHooks(hooks.AFTER_AUTH_FAIL, pods.NewPod(manifestToLaunch.ID(), pods.PodPath(p.podRoot, manifestToLaunch.ID())), &manifestToLaunch, manifestLogger)
			}
		case <-time.After(1 * time.Second):
			if working {
				pod := pods.NewPod(manifestToLaunch.ID(), pods.PodPath(p.podRoot, manifestToLaunch.ID()))

				ok := p.installAndLaunchPod(&manifestToLaunch, pod, manifestLogger)
				if ok {
					manifestToLaunch = pods.Manifest{}
					working = false
				}
			}
		}
	}
}
Example #10
func (p *Preparer) stopAndUninstallPod(pair ManifestPair, pod Pod, logger logging.Logger) bool {
	success, err := pod.Halt(pair.Reality)
	if err != nil {
		logger.WithError(err).Errorln("Pod halt failed")
	} else if !success {
		logger.NoFields().Warnln("One or more launchables did not halt successfully")
	}

	p.tryRunHooks(hooks.BEFORE_UNINSTALL, pod, pair.Reality, logger)

	err = pod.Uninstall()
	if err != nil {
		logger.WithError(err).Errorln("Uninstall failed")
		return false
	}
	logger.NoFields().Infoln("Successfully uninstalled")

	dur, err := p.store.DeletePod(kp.REALITY_TREE, p.node, pair.ID)
	if err != nil {
		logger.WithErrorAndFields(err, logrus.Fields{"duration": dur}).
			Errorln("Could not delete pod from reality store")
	}
	return true
}
Example #11
func (p *Preparer) resolvePair(pair ManifestPair, pod Pod, logger logging.Logger) bool {
	// do not remove the logger argument, it's not the same as p.Logger
	var oldSHA, newSHA string
	if pair.Reality != nil {
		oldSHA, _ = pair.Reality.SHA()
	}
	if pair.Intent != nil {
		newSHA, _ = pair.Intent.SHA()
	}

	if oldSHA == "" && newSHA != "" {
		logger.NoFields().Infoln("manifest is new, will update")
		authorized := p.authorize(pair.Intent, logger)
		if !authorized {
			p.tryRunHooks(
				hooks.AFTER_AUTH_FAIL,
				pod,
				pair.Intent,
				logger,
			)
			// prevent unnecessary future loops; we don't need to check again.
			return true
		}
		return p.installAndLaunchPod(pair, pod, logger)
	}

	if newSHA == "" {
		logger.NoFields().Infoln("manifest was deleted from intent, will remove")
		return p.stopAndUninstallPod(pair, pod, logger)
	}

	if oldSHA == newSHA {
		logger.NoFields().Debugln("manifest is unchanged, no action required")
		return true
	}

	authorized := p.authorize(pair.Intent, logger)
	if !authorized {
		p.tryRunHooks(
			hooks.AFTER_AUTH_FAIL,
			pod,
			pair.Intent,
			logger,
		)
		// prevent unnecessary future loops; we don't need to check again.
		return true
	}

	logger.WithField("old_sha", oldSHA).Infoln("manifest SHA has changed, will update")
	return p.installAndLaunchPod(pair, pod, logger)
}
Example #12
// no return value, no output channels. This should do everything it needs to do
// without outside intervention (other than being signalled to quit)
func (p *Preparer) handlePods(podChan <-chan ManifestPair, quit <-chan struct{}) {
	// install new launchables
	var nextLaunch ManifestPair

	// used to track if we have work to do (i.e. pod manifest came through channel
	// and we have yet to operate on it)
	working := false
	var manifestLogger logging.Logger
	for {
		select {
		case <-quit:
			return
		case nextLaunch = <-podChan:
			var sha string
			if nextLaunch.Intent != nil {
				sha, _ = nextLaunch.Intent.SHA()
			} else {
				sha, _ = nextLaunch.Reality.SHA()
			}
			manifestLogger = p.Logger.SubLogger(logrus.Fields{
				"pod": nextLaunch.ID,
				"sha": sha,
			})
			manifestLogger.NoFields().Debugln("New manifest received")

			if nextLaunch.Intent == nil {
				// if intent=nil then reality!=nil and we need to delete the pod
				// therefore we must set working=true here
				working = true
			} else {
				// non-nil intent manifests need to be authorized first
				working = p.authorize(nextLaunch.Intent, manifestLogger)
				if !working {
					p.tryRunHooks(
						hooks.AFTER_AUTH_FAIL,
						p.podFactory.NewPod(nextLaunch.ID),
						nextLaunch.Intent,
						manifestLogger,
					)
				}
			}
		case <-time.After(1 * time.Second):
			if working {
				pod := p.podFactory.NewPod(nextLaunch.ID)

				// TODO better solution: force the preparer to have a 0s default timeout, prevent KILLs
				if pod.Id == POD_ID {
					pod.DefaultTimeout = time.Duration(0)
				}

				effectiveLogBridgeExec := p.logExec
				// pods that are in the blacklist for this preparer shall not use the
				// preparer's log exec. Instead, they will use the default svlogd logexec.
				for _, podID := range p.logBridgeBlacklist {
					if pod.Id.String() == podID {
						effectiveLogBridgeExec = svlogdExec
						break
					}
				}
				pod.SetLogBridgeExec(effectiveLogBridgeExec)

				pod.SetFinishExec(p.finishExec)

				// podChan is being fed values gathered from a kp.Watch() in
				// WatchForPodManifestsForNode(). If the watch returns a new pair of
				// intent/reality values before the previous change has finished
				// processing in resolvePair(), the reality value will be stale. This
				// leads to a bug where the preparer will appear to update a package
				// and when that is finished, "update" it again.
				//
				// The correct solution probably involves watching reality and intent
				// and feeding updated pairs to a control loop.
				//
				// This is a quick fix to ensure that the reality value being used is
				// up-to-date. The de-bouncing logic in this method should ensure that the
				// intent value is fresh (to the extent that Consul is timely). Fetching
				// the reality value again ensures its freshness too.
				reality, _, err := p.store.Pod(kp.REALITY_TREE, p.node, nextLaunch.ID)
				if err == pods.NoCurrentManifest {
					nextLaunch.Reality = nil
				} else if err != nil {
					manifestLogger.WithError(err).Errorln("Error getting reality manifest")
					break
				} else {
					nextLaunch.Reality = reality
				}

				ok := p.resolvePair(nextLaunch, pod, manifestLogger)
				if ok {
					nextLaunch = ManifestPair{}
					working = false
				}
			}
		}
	}
}
Example #13
// processHealthUpdater() runs in a goroutine to keep Consul in sync with the local health
// state. It is written as a non-blocking finite state machine: events arrive and update
// internal state, and after each event, the internal state is examined to see if an
// asynchronous action needs to be taken.
//
// Events come from three different sources:
//   1. App monitors send their periodic health check results here. When the service is no
//      longer being checked, the monitor must close this channel.
//   2. The session manager sends notifications whenever the current Consul session
//      expires or is renewed. When the manager exits, it must close this channel.
//   3. Writes to Consul are performed in a separate goroutine, and when each finishes, it
//      notifies the updater of what it just wrote.
//
// In response to these events, two actions can be taken:
//   A. Exit, once the app monitor has exited and the health check in Consul has been
//      removed.
//   B. Write the recent service state to Consul. At most one outstanding write will be
//      in-flight at any time.
func processHealthUpdater(
	client consulutil.ConsulClient,
	checksStream <-chan WatchResult,
	sessionsStream <-chan string,
	logger logging.Logger,
) {
	var localHealth *WatchResult  // Health last reported by checker
	var remoteHealth *WatchResult // Health last written to Consul
	var session string            // Current session

	var write <-chan writeResult  // Future result of an in-flight write
	var throttle <-chan time.Time // If set, writes are throttled

	// Track and limit all writes to avoid crushing Consul
	bucketRefreshRate := time.Minute / time.Duration(*HealthWritesPerMinute)
	rateLimiter, err := limit.NewTokenBucket(
		*HealthMaxBucketSize,
		*HealthMaxBucketSize,
		bucketRefreshRate,
	)
	if err != nil {
		panic("invalid token bucket parameters")
	}

	logger.NoFields().Debug("starting update loop")
	for {
		// Receive event notification; update internal FSM state
		select {
		case h, ok := <-checksStream:
			// The local health checker sent a new result
			if ok {
				logger.NoFields().Debug("new health status: ", h.Status)
				if !healthEquiv(localHealth, &h) {
					msg := fmt.Sprintf("Service %s is now %s", h.Service, h.Status)
					if health.Passing.Is(h.Status) {
						logger.NoFields().Infoln(msg)
					} else {
						logger.NoFields().Warnln(msg)
					}
				}
				localHealth = &h
			} else {
				logger.NoFields().Debug("check stream closed")
				checksStream = nil
				localHealth = nil
			}
		case s, ok := <-sessionsStream:
			// The active Consul session changed
			if ok {
				logger.NoFields().Debug("new session: ", s)
			} else {
				logger.NoFields().Debug("session stream closed")
				sessionsStream = nil
			}
			session = s
			// The old health result is deleted when its session expires
			remoteHealth = nil
		case result := <-write:
			// The in-flight write completed
			logger.NoFields().Debug("write completed: ", result.OK)
			write = nil
			if result.OK {
				remoteHealth = result.Health
				if result.Throttle && throttle == nil {
					throttle = time.After(time.Duration(*HealthResumeLimit) * bucketRefreshRate)
					logger.NoFields().Warningf("Service %s health is flapping; throttling updates", result.Health.Service)
				}
			}
		case <-throttle:
			throttle = nil
			logger.NoFields().Warning("health is stable; resuming updates")
		}

		// Exit
		if checksStream == nil && remoteHealth == nil && write == nil {
			logger.NoFields().Debug("exiting update loop")
			return
		}

		// Send update to Consul
		if !healthEquiv(localHealth, remoteHealth) && session != "" && write == nil &&
			throttle == nil {
			writeLogger := logger.SubLogger(logrus.Fields{
				"session": session,
			})
			w := make(chan writeResult, 1)
			if localHealth == nil {
				// Don't wait on the rate limiter when removing the health status
				rateLimiter.TryUse(1)
				logger.NoFields().Debug("deleting remote health")
				key := HealthPath(remoteHealth.Service, remoteHealth.Node)
				go sendHealthUpdate(writeLogger, w, nil, false, func() error {
					_, err := client.KV().Delete(key, nil)
					if err != nil {
						return consulutil.NewKVError("delete", key, err)
					}
					return nil
				})
			} else {
				writeHealth := localHealth
				doThrottle := false
				if count, _ := rateLimiter.TryUse(1); count <= 1 {
					// This is the last update before the throttle will be engaged. Write a special
					// message.
					logger.NoFields().Debug("writing throttled health")
					writeHealth = toThrottled(localHealth)
					doThrottle = true
				} else {
					logger.NoFields().Debug("writing remote health")
				}
				kv, err := healthToKV(*writeHealth, session)
				if err != nil {
					// Practically, this should never happen.
					logger.WithErrorAndFields(err, logrus.Fields{
						"health": *writeHealth,
					}).Error("could not serialize health update")
					localHealth = nil
					continue
				}
				if remoteHealth == nil {
					go sendHealthUpdate(writeLogger, w, localHealth, doThrottle, func() error {
						ok, _, err := client.KV().Acquire(kv, nil)
						if err != nil {
							return consulutil.NewKVError("acquire", kv.Key, err)
						}
						if !ok {
							return fmt.Errorf("write denied")
						}
						return nil
					})
				} else {
					go sendHealthUpdate(writeLogger, w, localHealth, doThrottle, func() error {
						_, err := client.KV().Put(kv, nil)
						if err != nil {
							return consulutil.NewKVError("put", kv.Key, err)
						}
						return nil
					})
				}
			}
			write = w
		}
	}
}
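
The doc comment above describes a non-blocking finite state machine: consume exactly one event, update local state, then inspect that state to decide whether to exit or to start (at most) one asynchronous write. Stripped of the health-specific details, and with purely illustrative names, the shape is roughly:

// fsmLoop is a stripped-down skeleton of the event-loop shape used by
// processHealthUpdater: receive one event, update state, then decide whether
// to exit or to start (at most) one asynchronous write.
func fsmLoop(events <-chan string) {
	var pending string     // latest value received but not yet flushed
	var write <-chan error // non-nil while an async write is in flight

	for {
		// Receive exactly one event. A nil channel is never ready, which is
		// how exhausted sources drop out of the select.
		select {
		case e, ok := <-events:
			if !ok {
				events = nil // source closed
			} else {
				pending = e
			}
		case <-write:
			write = nil // in-flight write finished
		}

		// Exit once there is no more input and nothing left to flush.
		if events == nil && pending == "" && write == nil {
			return
		}

		// Start an asynchronous write, keeping at most one in flight.
		if pending != "" && write == nil {
			w := make(chan error, 1)
			go func(p string) {
				// ... persist p somewhere ...
				w <- nil
			}(pending)
			pending = ""
			write = w
		}
	}
}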
Example #14
func (p *Preparer) installAndLaunchPod(newManifest *pods.Manifest, pod Pod, logger logging.Logger) bool {
	// do not remove the logger argument, it's not the same as p.Logger

	// get currently running pod to compare with the new pod
	realityPath := kp.RealityPath(p.node, newManifest.ID())
	currentManifest, _, err := p.store.Pod(realityPath)
	currentSHA := ""
	if currentManifest != nil {
		currentSHA, _ = currentManifest.SHA()
	}
	newSHA, _ := newManifest.SHA()

	// if the manifest is new or different, launch it
	newOrDifferent := (err == pods.NoCurrentManifest) || (currentSHA != newSHA)
	if newOrDifferent {
		logger.WithFields(logrus.Fields{
			"old_sha": currentSHA,
			"sha":     newSHA,
			"pod":     newManifest.ID(),
		}).Infoln("SHA is new or different from old, will update")
	}

	// if the old manifest is corrupted somehow, re-launch since we don't know if this is an update.
	problemReadingCurrentManifest := (err != nil && err != pods.NoCurrentManifest)
	if problemReadingCurrentManifest {
		logger.WithFields(logrus.Fields{
			"sha":       newSHA,
			"inner_err": err,
		}).Errorln("Current manifest not readable, will relaunch")
	}

	if newOrDifferent || problemReadingCurrentManifest {
		p.tryRunHooks(hooks.BEFORE_INSTALL, pod, newManifest, logger)

		err = pod.Install(newManifest)
		if err != nil {
			// install failed, abort and retry
			logger.WithFields(logrus.Fields{
				"err": err,
			}).Errorln("Install failed")
			return false
		}

		err = pod.Verify(newManifest, p.authPolicy)
		if err != nil {
			logger.WithField("err", err).Errorln("Pod digest verification failed")
			p.tryRunHooks(hooks.AFTER_AUTH_FAIL, pod, newManifest, logger)
			return false
		}

		p.tryRunHooks(hooks.AFTER_INSTALL, pod, newManifest, logger)

		err = p.store.RegisterService(*newManifest, p.caPath)
		if err != nil {
			logger.WithField("err", err).Errorln("Service registration failed")
			return false
		}

		if currentManifest != nil {
			success, err := pod.Halt(currentManifest)
			if err != nil {
				logger.WithField("err", err).Errorln("Pod halt failed")
			} else if !success {
				logger.NoFields().Warnln("One or more launchables did not halt successfully")
			}
		}

		ok, err := pod.Launch(newManifest)
		if err != nil {
			logger.WithFields(logrus.Fields{
				"err": err,
			}).Errorln("Launch failed")
		} else {
			duration, err := p.store.SetPod(realityPath, *newManifest)
			if err != nil {
				logger.WithFields(logrus.Fields{
					"err":      err,
					"duration": duration,
				}).Errorln("Could not set pod in reality store")
			}

			p.tryRunHooks(hooks.AFTER_LAUNCH, pod, newManifest, logger)
		}
		return err == nil && ok
	}

	// TODO: shut down removed launchables between pod versions.
	return true
}
Example #15
// no return value, no output channels. This should do everything it needs to do
// without outside intervention (other than being signalled to quit)
func (p *Preparer) handlePods(podChan <-chan ManifestPair, quit <-chan struct{}) {
	// install new launchables
	var nextLaunch ManifestPair

	// used to track if we have work to do (i.e. pod manifest came through channel
	// and we have yet to operate on it)
	working := false
	var manifestLogger logging.Logger

	// The design of p2-preparer is to continuously retry installation
	// failures, for example downloading of the launchable. An exponential
	// backoff is important to avoid putting undue load on the artifact
	// server, for example.
	backoffTime := minimumBackoffTime
	for {
		select {
		case <-quit:
			return
		case nextLaunch = <-podChan:
			backoffTime = minimumBackoffTime
			var sha string

			// TODO: handle errors appropriately from SHA().
			if nextLaunch.Intent != nil {
				sha, _ = nextLaunch.Intent.SHA()
			} else {
				sha, _ = nextLaunch.Reality.SHA()
			}
			manifestLogger = p.Logger.SubLogger(logrus.Fields{
				"pod":            nextLaunch.ID,
				"sha":            sha,
				"pod_unique_key": nextLaunch.PodUniqueKey,
			})
			manifestLogger.NoFields().Debugln("New manifest received")

			working = true
		case <-time.After(backoffTime):
			if working {
				var pod *pods.Pod
				var err error
				if nextLaunch.PodUniqueKey == "" {
					pod = p.podFactory.NewLegacyPod(nextLaunch.ID)
				} else {
					pod, err = p.podFactory.NewUUIDPod(nextLaunch.ID, nextLaunch.PodUniqueKey)
					if err != nil {
						manifestLogger.WithError(err).Errorln("Could not initialize pod")
						break
					}
				}

				// TODO better solution: force the preparer to have a 0s default timeout, prevent KILLs
				if pod.Id == constants.PreparerPodID {
					pod.DefaultTimeout = time.Duration(0)
				}

				effectiveLogBridgeExec := p.logExec
				// pods that are in the blacklist for this preparer shall not use the
				// preparer's log exec. Instead, they will use the default svlogd logexec.
				for _, podID := range p.logBridgeBlacklist {
					if pod.Id.String() == podID {
						effectiveLogBridgeExec = svlogdExec
						break
					}
				}
				pod.SetLogBridgeExec(effectiveLogBridgeExec)

				pod.SetFinishExec(p.finishExec)

				// podChan is being fed values gathered from a kp.Watch() in
				// WatchForPodManifestsForNode(). If the watch returns a new pair of
				// intent/reality values before the previous change has finished
				// processing in resolvePair(), the reality value will be stale. This
				// leads to a bug where the preparer will appear to update a package
				// and when that is finished, "update" it again.
				//
				// Example ordering of bad events:
				// 1) update to /intent for pod A comes in, /reality is read and
				// resolvePair() handles it
				// 2) before resolvePair() finishes, another /intent update comes in,
				// and /reality is read but hasn't been changed. This update cannot
				// be processed until the previous resolvePair() call finishes, and
				// updates /reality. Now the reality value used here is stale. We
				// want to refresh our /reality read so we don't restart the pod if
				// intent didn't change between updates.
				//
				// The correct solution probably involves watching reality and intent
				// and feeding updated pairs to a control loop.
				//
				// This is a quick fix to ensure that the reality value being used is
				// up-to-date. The de-bouncing logic in this method should ensure that the
				// intent value is fresh (to the extent that Consul is timely). Fetching
				// the reality value again ensures its freshness too.
				if nextLaunch.PodUniqueKey == "" {
					// legacy pod, get reality manifest from reality tree
					reality, _, err := p.store.Pod(kp.REALITY_TREE, p.node, nextLaunch.ID)
					if err == pods.NoCurrentManifest {
						nextLaunch.Reality = nil
					} else if err != nil {
						manifestLogger.WithError(err).Errorln("Error getting reality manifest")
						break
					} else {
						nextLaunch.Reality = reality
					}
				} else {
					// uuid pod, get reality manifest from pod status
					status, _, err := p.podStatusStore.Get(nextLaunch.PodUniqueKey)
					if err != nil && !statusstore.IsNoStatus(err) {
						manifestLogger.WithError(err).Errorln("Error getting reality manifest from pod status")
						// break out of the select so resolvePair() is not run with a stale reality value
						break
					} else if statusstore.IsNoStatus(err) {
						nextLaunch.Reality = nil
					} else {
						manifest, err := manifest.FromBytes([]byte(status.Manifest))
						if err != nil {
							manifestLogger.WithError(err).Errorln("Error parsing reality manifest from pod status")
							break
						}
						nextLaunch.Reality = manifest
					}
				}

				ok := p.resolvePair(nextLaunch, pod, manifestLogger)
				if ok {
					nextLaunch = ManifestPair{}
					working = false

					// Reset the backoff time
					backoffTime = minimumBackoffTime
				} else {
					// Double the backoff time with a maximum of 1 minute
					backoffTime = backoffTime * 2
					if backoffTime > 1*time.Minute {
						backoffTime = 1 * time.Minute
					}
				}
			}
		}
	}
}
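
The retry policy in this version (reset to a minimum on success or on a fresh manifest, double on failure, cap at one minute) can be isolated into a small helper. A sketch with illustrative constant values (the real minimum comes from the package's minimumBackoffTime):

const (
	minBackoff = 1 * time.Second // illustrative value
	maxBackoff = 1 * time.Minute
)

// nextBackoff implements the policy above: reset to the minimum after a
// successful resolvePair (or a fresh manifest), otherwise double up to a cap.
func nextBackoff(current time.Duration, succeeded bool) time.Duration {
	if succeeded {
		return minBackoff
	}
	next := current * 2
	if next > maxBackoff {
		next = maxBackoff
	}
	return next
}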
Example #16
// SessionManager continually creates and maintains Consul sessions. It is intended to be
// run in its own goroutine. If one session expires, a new one will be created. As
// sessions come and go, the session ID (or "" for an expired session) will be sent on the
// output channel.
//
// Parameters:
//   config:  Configuration passed to Consul when creating a new session.
//   client:  The Consul client to use.
//   output:  The channel used for exposing Consul session IDs. This method takes
//            ownership of this channel and will close it once no new IDs will be created.
//   done:    Close this channel to close the current session (if any) and stop creating
//            new sessions.
//   logger:  Errors will be logged to this logger.
func SessionManager(
	config api.SessionEntry,
	client ConsulClient,
	output chan<- string,
	done chan struct{},
	logger logging.Logger,
) {
	prng := randseed.NewRand()
	baseDelay := time.Duration(*SessionRetrySeconds) * time.Second
	maxDelay := time.Duration(*SessionMaxRetrySeconds) * time.Second
	useDelay := false
	var delay time.Duration

	logger.NoFields().Info("session manager: starting up")
	for {
		if useDelay {
			// Normalize timeout range
			if delay < baseDelay {
				delay = baseDelay
			} else if delay > maxDelay {
				delay = maxDelay
			}
			select {
			case <-time.After(time.Duration(prng.Int63n(int64(delay)))):
			case <-done:
			}
		} else {
			// Skip the delay on the first loop iteration
			useDelay = true
		}
		// Check for exit signal
		select {
		case <-done:
			logger.NoFields().Info("session manager: shutting down")
			close(output)
			return
		default:
		}
		// Establish a new session
		id, _, err := client.Session().CreateNoChecks(&config, nil)
		if err != nil {
			logger.WithError(err).Error("session manager: error creating Consul session")
			// Exponential backoff
			delay = delay * 2
			continue
		}
		successTime := time.Now()
		delay = baseDelay
		sessionLogger := logger.SubLogger(logrus.Fields{
			"session": id,
		})
		sessionLogger.NoFields().Info("session manager: new Consul session")
		select {
		case output <- id:
			// Maintain the session
			err = client.Session().RenewPeriodic(config.TTL, id, nil, done)
			if err != nil {
				sessionLogger.WithError(err).Error("session manager: lost session")
				// Session loss is an indicator that Consul is very congested and we must
				// back off. However, it isn't clear how long to wait. As a heuristic,
				// just ensure that "maxDelay" time has passed since the last successful
				// session creation. A session that doesn't survive long gets delayed a
				// lot; an infrequent loss gets a short delay.
				delay = maxDelay - time.Since(successTime)
			} else {
				sessionLogger.NoFields().Info("session manager: released session")
			}
			select {
			case output <- "":
			case <-done:
			}
		case <-done:
			// Don't bother reporting the new session if exiting
			_, _ = client.Session().Destroy(id, nil)
			sessionLogger.NoFields().Info("session manager: released session")
		}
	}
}
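
In this later version the retry delay is clamped to [baseDelay, maxDelay] and then jittered by picking a uniformly random wait up to that bound. In isolation, the computation might look like the sketch below (hypothetical names, math/rand standing in for the package's randseed helper, and assuming baseDelay > 0):

// sleepWithJitter clamps delay into [baseDelay, maxDelay] and then waits a
// uniformly random duration up to that bound ("full jitter"), unless done is
// closed first. This mirrors the retry delay handling in SessionManager.
func sleepWithJitter(delay, baseDelay, maxDelay time.Duration, done <-chan struct{}) {
	if delay < baseDelay {
		delay = baseDelay
	} else if delay > maxDelay {
		delay = maxDelay
	}
	select {
	case <-time.After(time.Duration(rand.Int63n(int64(delay)))):
	case <-done:
	}
}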
Example #17
func (p *Preparer) installAndLaunchPod(pair ManifestPair, pod Pod, logger logging.Logger) bool {
	p.tryRunHooks(hooks.BEFORE_INSTALL, pod, pair.Intent, logger)

	logger.NoFields().Infoln("Installing pod and launchables")

	err := pod.Install(pair.Intent, p.artifactVerifier, p.artifactRegistry)
	if err != nil {
		// install failed, abort and retry
		logger.WithError(err).Errorln("Install failed")
		return false
	}

	err = pod.Verify(pair.Intent, p.authPolicy)
	if err != nil {
		logger.WithError(err).
			Errorln("Pod digest verification failed")
		p.tryRunHooks(hooks.AFTER_AUTH_FAIL, pod, pair.Intent, logger)
		return false
	}

	p.tryRunHooks(hooks.AFTER_INSTALL, pod, pair.Intent, logger)

	if pair.Reality != nil {
		logger.NoFields().Infoln("Invoking the disable hook and halting runit services")
		success, err := pod.Halt(pair.Reality)
		if err != nil {
			logger.WithError(err).
				Errorln("Pod halt failed")
		} else if !success {
			logger.NoFields().Warnln("One or more launchables did not halt successfully")
		}
	}

	p.tryRunHooks(hooks.BEFORE_LAUNCH, pod, pair.Intent, logger)

	logger.NoFields().Infoln("Setting up new runit services and running the enable hook")

	ok, err := pod.Launch(pair.Intent)
	if err != nil {
		logger.WithError(err).
			Errorln("Launch failed")
	} else {
		if pair.PodUniqueKey == "" {
			// legacy pod, write the manifest back to reality tree
			duration, err := p.store.SetPod(kp.REALITY_TREE, p.node, pair.Intent)
			if err != nil {
				logger.WithErrorAndFields(err, logrus.Fields{
					"duration": duration}).
					Errorln("Could not set pod in reality store")
			}
		} else {
			// TODO: do this in a transaction
			err = p.podStore.WriteRealityIndex(pair.PodUniqueKey, p.node)
			if err != nil {
				logger.WithError(err).
					Errorln("Could not write uuid index to reality store")
			}

			// uuid pod, write the manifest to the pod status tree.
			mutator := func(ps podstatus.PodStatus) (podstatus.PodStatus, error) {
				manifestBytes, err := pair.Intent.Marshal()
				if err != nil {
					return ps, util.Errorf("Could not convert manifest to string to update pod status")
				}

				ps.PodStatus = podstatus.PodLaunched
				ps.Manifest = string(manifestBytes)
				return ps, nil
			}
			err := p.podStatusStore.MutateStatus(pair.PodUniqueKey, mutator)
			if err != nil {
				logger.WithError(err).Errorln("Could not update manifest in pod status")
			}
		}

		p.tryRunHooks(hooks.AFTER_LAUNCH, pod, pair.Intent, logger)

		pod.Prune(p.maxLaunchableDiskUsage, pair.Intent) // errors are logged internally
	}
	return err == nil && ok
}