func (r *replication) ensureInReality(
	node types.NodeName,
	timeoutCh <-chan struct{},
	nodeLogger logging.Logger,
	targetSHA string,
) error {
	for {
		select {
		case <-r.quitCh:
			return errQuit
		case <-timeoutCh:
			return errTimeout
		case <-r.replicationCancelledCh:
			return errCancelled
		case <-time.After(5 * time.Second):
			man, err := r.queryReality(node)
			if err == pods.NoCurrentManifest {
				// if the pod key doesn't exist yet, that's okay; just wait longer
			} else if err != nil {
				nodeLogger.WithErrorAndFields(err, logrus.Fields{
					"node": node,
				}).Errorln("Could not read reality for pod manifest")
			} else {
				receivedSHA, _ := man.SHA()
				if receivedSHA == targetSHA {
					nodeLogger.NoFields().Infoln("Node is current")
					return nil
				} else {
					nodeLogger.WithFields(logrus.Fields{"current": receivedSHA, "target": targetSHA}).Infoln("Waiting for current")
				}
			}
		}
	}
}
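// waitForNodeCurrent is a minimal illustrative sketch, not part of the original
// source, of how ensureInReality might be driven by a caller. The method name and
// the five-minute deadline are hypothetical; the point is only that closing
// timeoutCh aborts the polling loop above.
func (r *replication) waitForNodeCurrent(node types.NodeName, nodeLogger logging.Logger, targetSHA string) error {
	timeoutCh := make(chan struct{})
	go func() {
		time.Sleep(5 * time.Minute) // hypothetical per-node deadline
		close(timeoutCh)
	}()
	return r.ensureInReality(node, timeoutCh, nodeLogger, targetSHA)
}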
func (p *Preparer) tryRunHooks(hookType hooks.HookType, pod hooks.Pod, manifest manifest.Manifest, logger logging.Logger) {
	err := p.hooks.RunHookType(hookType, pod, manifest)
	if err != nil {
		logger.WithErrorAndFields(err, logrus.Fields{
			"hooks": hookType}).Warnln("Could not run hooks")
	}
}
func (p *Preparer) installAndLaunchPod(pair ManifestPair, pod Pod, logger logging.Logger) bool {
	p.tryRunHooks(hooks.BEFORE_INSTALL, pod, pair.Intent, logger)

	logger.NoFields().Infoln("Installing pod and launchables")
	err := pod.Install(pair.Intent, p.artifactVerifier, p.artifactRegistry)
	if err != nil {
		// install failed, abort and retry
		logger.WithError(err).Errorln("Install failed")
		return false
	}

	err = pod.Verify(pair.Intent, p.authPolicy)
	if err != nil {
		logger.WithError(err).
			Errorln("Pod digest verification failed")
		p.tryRunHooks(hooks.AFTER_AUTH_FAIL, pod, pair.Intent, logger)
		return false
	}

	p.tryRunHooks(hooks.AFTER_INSTALL, pod, pair.Intent, logger)

	if pair.Reality != nil {
		logger.NoFields().Infoln("Invoking the disable hook and halting runit services")
		success, err := pod.Halt(pair.Reality)
		if err != nil {
			logger.WithError(err).
				Errorln("Pod halt failed")
		} else if !success {
			logger.NoFields().Warnln("One or more launchables did not halt successfully")
		}
	}

	p.tryRunHooks(hooks.BEFORE_LAUNCH, pod, pair.Intent, logger)

	logger.NoFields().Infoln("Setting up new runit services and running the enable hook")
	ok, err := pod.Launch(pair.Intent)
	if err != nil {
		logger.WithError(err).
			Errorln("Launch failed")
	} else {
		duration, err := p.store.SetPod(kp.REALITY_TREE, p.node, pair.Intent)
		if err != nil {
			logger.WithErrorAndFields(err, logrus.Fields{
				"duration": duration}).
				Errorln("Could not set pod in reality store")
		}
		p.tryRunHooks(hooks.AFTER_LAUNCH, pod, pair.Intent, logger)

		pod.Prune(p.maxLaunchableDiskUsage, pair.Intent) // errors are logged internally
	}

	return err == nil && ok
}
func (p *Preparer) installAndLaunchPod(pair ManifestPair, pod Pod, logger logging.Logger) bool {
	p.tryRunHooks(hooks.BEFORE_INSTALL, pod, pair.Intent, logger)

	err := pod.Install(pair.Intent)
	if err != nil {
		// install failed, abort and retry
		logger.WithError(err).Errorln("Install failed")
		return false
	}

	err = pod.Verify(pair.Intent, p.authPolicy)
	if err != nil {
		logger.WithError(err).
			Errorln("Pod digest verification failed")
		p.tryRunHooks(hooks.AFTER_AUTH_FAIL, pod, pair.Intent, logger)
		return false
	}

	p.tryRunHooks(hooks.AFTER_INSTALL, pod, pair.Intent, logger)

	if pair.Reality != nil {
		success, err := pod.Halt(pair.Reality)
		if err != nil {
			logger.WithError(err).
				Errorln("Pod halt failed")
		} else if !success {
			logger.NoFields().Warnln("One or more launchables did not halt successfully")
		}
	}

	p.tryRunHooks(hooks.BEFORE_LAUNCH, pod, pair.Intent, logger)

	ok, err := pod.Launch(pair.Intent)
	if err != nil {
		logger.WithError(err).
			Errorln("Launch failed")
	} else {
		duration, err := p.store.SetPod(kp.RealityPath(p.node, pair.ID), pair.Intent)
		if err != nil {
			logger.WithErrorAndFields(err, logrus.Fields{
				"duration": duration}).
				Errorln("Could not set pod in reality store")
		}
		p.tryRunHooks(hooks.AFTER_LAUNCH, pod, pair.Intent, logger)
	}

	return err == nil && ok
}
func (p *Preparer) stopAndUninstallPod(pair ManifestPair, pod Pod, logger logging.Logger) bool {
	success, err := pod.Halt(pair.Reality)
	if err != nil {
		logger.WithError(err).Errorln("Pod halt failed")
	} else if !success {
		logger.NoFields().Warnln("One or more launchables did not halt successfully")
	}

	p.tryRunHooks(hooks.BEFORE_UNINSTALL, pod, pair.Reality, logger)

	err = pod.Uninstall()
	if err != nil {
		logger.WithError(err).Errorln("Uninstall failed")
		return false
	}
	logger.NoFields().Infoln("Successfully uninstalled")

	if pair.PodUniqueKey == "" {
		dur, err := p.store.DeletePod(kp.REALITY_TREE, p.node, pair.ID)
		if err != nil {
			logger.WithErrorAndFields(err, logrus.Fields{"duration": dur}).
				Errorln("Could not delete pod from reality store")
		}
	} else {
		// We don't delete so that the exit status of the pod's
		// processes can be viewed for some time after installation.
		// It is the responsibility of external systems to delete pod
		// status entries when they are no longer needed.
		err := p.podStatusStore.MutateStatus(pair.PodUniqueKey, func(podStatus podstatus.PodStatus) (podstatus.PodStatus, error) {
			podStatus.PodStatus = podstatus.PodRemoved
			return podStatus, nil
		})
		if err != nil {
			logger.WithError(err).
				Errorln("Could not update pod status to reflect removal")
		}

		err = p.podStore.DeleteRealityIndex(pair.PodUniqueKey, p.node)
		if err != nil {
			logger.WithError(err).
				Errorln("Could not remove reality index for uninstalled pod")
		}
	}
	return true
}
// runDirectory executes all executable files in a given directory path.
func runDirectory(dirpath string, environment []string, logger logging.Logger) error {
	entries, err := ioutil.ReadDir(dirpath)
	if os.IsNotExist(err) {
		logger.WithField("dir", dirpath).Debugln("Hooks not set up")
		return nil
	}
	if err != nil {
		return err
	}

	for _, f := range entries {
		fullpath := path.Join(dirpath, f.Name())
		executable := (f.Mode() & 0111) != 0
		if !executable {
			logger.WithField("path", fullpath).Warnln("Hook is not executable")
			continue
		}
		if f.IsDir() {
			continue
		}

		h := Hook{fullpath, f.Name(), DefaultTimeout, environment, logger}
		err := h.RunWithTimeout()
		if htErr, ok := err.(HookTimeoutError); ok {
			logger.WithErrorAndFields(htErr, logrus.Fields{
				"path":      h.Path,
				"hook_name": h.Name,
				"timeout":   h.Timeout,
			}).Warnln(htErr.Error())
			// we intentionally swallow HookTimeoutErrors
		} else if err != nil {
			logger.WithErrorAndFields(err, logrus.Fields{
				"path":      h.Path,
				"hook_name": h.Name,
			}).Warningf("Unknown error in hook %s: %s", h.Name, err)
		}
	}

	return nil
}
func runDirectory(dirpath string, environment []string, logger logging.Logger) error {
	entries, err := ioutil.ReadDir(dirpath)
	if os.IsNotExist(err) {
		logger.WithField("dir", dirpath).Debugln("Hooks not set up")
		return nil
	}
	if err != nil {
		return err
	}

	for _, f := range entries {
		fullpath := path.Join(dirpath, f.Name())
		executable := (f.Mode() & 0111) != 0
		if !executable {
			logger.WithField("path", fullpath).Warnln("Hook is not executable")
			continue
		}
		if f.IsDir() {
			continue
		}

		cmd := exec.Command(fullpath)
		hookOut := &bytes.Buffer{}
		cmd.Stdout = hookOut
		cmd.Stderr = hookOut
		cmd.Env = environment
		err := cmd.Run()
		if err != nil {
			logger.WithErrorAndFields(err, logrus.Fields{
				"path":   fullpath,
				"output": hookOut.String(),
			}).Warnf("Could not execute hook %s", f.Name())
		} else {
			logger.WithFields(logrus.Fields{
				"path":   fullpath,
				"output": hookOut.String(),
			}).Debugln("Executed hook")
		}
	}

	return nil
}
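// runExampleHookDir is a minimal illustrative sketch, not from the original source,
// of invoking runDirectory for a single hook event. The directory path and the
// environment variable name/value below are hypothetical placeholders; the real
// hook environment is assembled by runHooks below.
func runExampleHookDir(logger logging.Logger) error {
	env := append(os.Environ(),
		"HOOK_EVENT=before_install", // hypothetical variable name and value
	)
	return runDirectory("/etc/p2/hooks/before_install", env, logger) // hypothetical hook directory
}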
func (h *HookDir) runHooks(dirpath string, hType HookType, pod Pod, podManifest manifest.Manifest, logger logging.Logger) error {
	configFileName, err := podManifest.ConfigFileName()
	if err != nil {
		return err
	}

	// Write manifest to a file so hooks can read it.
	tmpManifestFile, err := ioutil.TempFile("", fmt.Sprintf("%s-manifest.yaml", podManifest.ID()))
	if err != nil {
		logger.WithErrorAndFields(err, logrus.Fields{
			"dir": dirpath,
		}).Warnln("Unable to open manifest file for hooks")
		return err
	}
	defer os.Remove(tmpManifestFile.Name())

	err = podManifest.Write(tmpManifestFile)
	if err != nil {
		logger.WithErrorAndFields(err, logrus.Fields{
			"dir": dirpath,
		}).Warnln("Unable to write manifest file for hooks")
		return err
	}

	hookEnvironment := []string{
		fmt.Sprintf("%s=%s", HOOK_ENV_VAR, path.Base(dirpath)),
		fmt.Sprintf("%s=%s", HOOK_EVENT_ENV_VAR, hType.String()),
		fmt.Sprintf("%s=%s", HOOKED_NODE_ENV_VAR, pod.Node()),
		fmt.Sprintf("%s=%s", HOOKED_POD_ID_ENV_VAR, podManifest.ID()),
		fmt.Sprintf("%s=%s", HOOKED_POD_HOME_ENV_VAR, pod.Home()),
		fmt.Sprintf("%s=%s", HOOKED_POD_MANIFEST_ENV_VAR, tmpManifestFile.Name()),
		fmt.Sprintf("%s=%s", HOOKED_CONFIG_PATH_ENV_VAR, path.Join(pod.ConfigDir(), configFileName)),
		fmt.Sprintf("%s=%s", HOOKED_ENV_PATH_ENV_VAR, pod.EnvDir()),
		fmt.Sprintf("%s=%s", HOOKED_CONFIG_DIR_PATH_ENV_VAR, pod.ConfigDir()),
		fmt.Sprintf("%s=%s", HOOKED_SYSTEM_POD_ROOT_ENV_VAR, h.podRoot),
	}

	return runDirectory(dirpath, hookEnvironment, logger)
}
func (p *Preparer) stopAndUninstallPod(pair ManifestPair, pod Pod, logger logging.Logger) bool {
	success, err := pod.Halt(pair.Reality)
	if err != nil {
		logger.WithError(err).Errorln("Pod halt failed")
	} else if !success {
		logger.NoFields().Warnln("One or more launchables did not halt successfully")
	}

	p.tryRunHooks(hooks.BEFORE_UNINSTALL, pod, pair.Reality, logger)

	err = pod.Uninstall()
	if err != nil {
		logger.WithError(err).Errorln("Uninstall failed")
		return false
	}
	logger.NoFields().Infoln("Successfully uninstalled")

	dur, err := p.store.DeletePod(kp.REALITY_TREE, p.node, pair.ID)
	if err != nil {
		logger.WithErrorAndFields(err, logrus.Fields{"duration": dur}).
			Errorln("Could not delete pod from reality store")
	}
	return true
}
// processHealthUpdater() runs in a goroutine to keep Consul in sync with the local health
// state. It is written as a non-blocking finite state machine: events arrive and update
// internal state, and after each event, the internal state is examined to see if an
// asynchronous action needs to be taken.
//
// Events come from three different sources:
//   1. App monitors send their periodic health check results here. When the service is no
//      longer being checked, the monitor must close this channel.
//   2. The session manager sends notifications whenever the current Consul session
//      expires or is renewed. When the manager exits, it must close this channel.
//   3. Writes to Consul are performed in a separate goroutine, and when each finishes, it
//      notifies the updater of what it just wrote.
//
// In response to these events, two actions can be taken:
//   A. Exit, once the app monitor has exited and the health check in Consul has been
//      removed.
//   B. Write the recent service state to Consul. At most one outstanding write will be
//      in-flight at any time.
func processHealthUpdater(
	client consulutil.ConsulClient,
	checksStream <-chan WatchResult,
	sessionsStream <-chan string,
	logger logging.Logger,
) {
	var localHealth *WatchResult  // Health last reported by checker
	var remoteHealth *WatchResult // Health last written to Consul
	var session string            // Current session
	var write <-chan writeResult  // Future result of an in-flight write
	var throttle <-chan time.Time // If set, writes are throttled

	// Track and limit all writes to avoid crushing Consul
	bucketRefreshRate := time.Minute / time.Duration(*HealthWritesPerMinute)
	rateLimiter, err := limit.NewTokenBucket(
		*HealthMaxBucketSize,
		*HealthMaxBucketSize,
		bucketRefreshRate,
	)
	if err != nil {
		panic("invalid token bucket parameters")
	}

	logger.NoFields().Debug("starting update loop")
	for {
		// Receive event notification; update internal FSM state
		select {
		case h, ok := <-checksStream:
			// The local health checker sent a new result
			if ok {
				logger.NoFields().Debug("new health status: ", h.Status)
				if !healthEquiv(localHealth, &h) {
					msg := fmt.Sprintf("Service %s is now %s", h.Service, h.Status)
					if health.Passing.Is(h.Status) {
						logger.NoFields().Infoln(msg)
					} else {
						logger.NoFields().Warnln(msg)
					}
				}
				localHealth = &h
			} else {
				logger.NoFields().Debug("check stream closed")
				checksStream = nil
				localHealth = nil
			}
		case s, ok := <-sessionsStream:
			// The active Consul session changed
			if ok {
				logger.NoFields().Debug("new session: ", s)
			} else {
				logger.NoFields().Debug("session stream closed")
				sessionsStream = nil
			}
			session = s
			// The old health result is deleted when its session expires
			remoteHealth = nil
		case result := <-write:
			// The in-flight write completed
			logger.NoFields().Debug("write completed: ", result.OK)
			write = nil
			if result.OK {
				remoteHealth = result.Health
				if result.Throttle && throttle == nil {
					throttle = time.After(time.Duration(*HealthResumeLimit) * bucketRefreshRate)
					logger.NoFields().Warningf("Service %s health is flapping; throttling updates", result.Health.Service)
				}
			}
		case <-throttle:
			throttle = nil
			logger.NoFields().Warning("health is stable; resuming updates")
		}

		// Exit
		if checksStream == nil && remoteHealth == nil && write == nil {
			logger.NoFields().Debug("exiting update loop")
			return
		}

		// Send update to Consul
		if !healthEquiv(localHealth, remoteHealth) && session != "" && write == nil && throttle == nil {
			writeLogger := logger.SubLogger(logrus.Fields{
				"session": session,
			})
			w := make(chan writeResult, 1)
			if localHealth == nil {
				// Don't wait on the rate limiter when removing the health status
				rateLimiter.TryUse(1)
				logger.NoFields().Debug("deleting remote health")
				key := HealthPath(remoteHealth.Service, remoteHealth.Node)
				go sendHealthUpdate(writeLogger, w, nil, false, func() error {
					_, err := client.KV().Delete(key, nil)
					if err != nil {
						return consulutil.NewKVError("delete", key, err)
					}
					return nil
				})
			} else {
				writeHealth := localHealth
				doThrottle := false
				if count, _ := rateLimiter.TryUse(1); count <= 1 {
					// This is the last update before the throttle will be engaged. Write a special
					// message.
					logger.NoFields().Debug("writing throttled health")
					writeHealth = toThrottled(localHealth)
					doThrottle = true
				} else {
					logger.NoFields().Debug("writing remote health")
				}
				kv, err := healthToKV(*writeHealth, session)
				if err != nil {
					// Practically, this should never happen.
					logger.WithErrorAndFields(err, logrus.Fields{
						"health": *writeHealth,
					}).Error("could not serialize health update")
					localHealth = nil
					continue
				}
				if remoteHealth == nil {
					go sendHealthUpdate(writeLogger, w, localHealth, doThrottle, func() error {
						ok, _, err := client.KV().Acquire(kv, nil)
						if err != nil {
							return consulutil.NewKVError("acquire", kv.Key, err)
						}
						if !ok {
							return fmt.Errorf("write denied")
						}
						return nil
					})
				} else {
					go sendHealthUpdate(writeLogger, w, localHealth, doThrottle, func() error {
						_, err := client.KV().Put(kv, nil)
						if err != nil {
							return consulutil.NewKVError("put", kv.Key, err)
						}
						return nil
					})
				}
			}
			write = w
		}
	}
}
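// healthEquiv is referenced above but not shown in this excerpt. The following is a
// minimal sketch of the kind of equivalence check the updater could use; it is an
// assumption, not the actual implementation, and is named healthEquivSketch to make
// that clear. Two results are treated as equivalent when both are absent, or when
// they describe the same service with the same status, so identical states never
// trigger a write.
func healthEquivSketch(a, b *WatchResult) bool {
	if a == nil || b == nil {
		return a == nil && b == nil
	}
	return a.Service == b.Service && a.Status == b.Status
}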
// Populates the given directory with executor scripts for each launch script of
// the given pod, which must be installed. Any orphaned executor scripts (from a
// past install, but no longer present in this pod) will be cleaned out.
func InstallHookScripts(dir string, hookPod *pods.Pod, manifest pods.Manifest, logger logging.Logger) error {
	err := os.MkdirAll(dir, 0755)
	if err != nil {
		return err
	}

	// TODO: globbing based on the structure of the name is gross; each hook pod
	// should have its own dir of scripts, and running hooks should iterate over
	// the directories
	rmPattern := filepath.Join(dir, fmt.Sprintf("%s__*", hookPod.Id))
	matches, err := filepath.Glob(rmPattern)
	if err != nil {
		// error return from filepath.Glob is guaranteed to be ErrBadPattern
		return util.Errorf("error while removing old hook scripts: pattern %q is malformed", rmPattern)
	}
	for _, match := range matches {
		err = os.Remove(match)
		if err != nil {
			logger.WithErrorAndFields(err, logrus.Fields{"script_path": match}).Errorln("Could not remove old hook script")
		}
	}

	launchables, err := hookPod.Launchables(manifest)
	if err != nil {
		return err
	}
	for _, launchable := range launchables {
		if launchable.Type() != "hoist" {
			logger.WithFields(logrus.Fields{
				"id":   launchable.ID(),
				"type": launchable.Type(),
			}).Errorln("hook disabled: unsupported launchable type")
			continue
		}
		executables, err := launchable.Executables(runit.DefaultBuilder)
		if err != nil {
			return err
		}

		for _, executable := range executables {
			// Write a script to the event directory that executes the pod's executables
			// with the correct environment for that pod.
			scriptPath := filepath.Join(dir, executable.Service.Name)
			file, err := os.OpenFile(scriptPath, os.O_TRUNC|os.O_CREATE|os.O_WRONLY, 0744)
			if err != nil {
				// check the error before using the file handle, and skip this
				// executable rather than writing to a nil file
				logger.WithErrorAndFields(err, logrus.Fields{"script_path": scriptPath}).Errorln("Could not open new hook script")
				continue
			}
			err = executable.WriteExecutor(file)
			if err != nil {
				logger.WithErrorAndFields(err, logrus.Fields{"script_path": scriptPath}).Errorln("Could not write new hook script")
			}
			// close explicitly instead of deferring inside the loop so file
			// descriptors are released as soon as each script is written
			_ = file.Close()
		}

		// for convenience, as we do with regular launchables, make these ones
		// current under the launchable directory
		err = launchable.MakeCurrent()
		if err != nil {
			logger.WithErrorAndFields(err, logrus.Fields{"launchable_id": launchable.ID()}).
				Errorln("Could not set hook launchable to current")
		}
	}
	return nil
}
func (p *Preparer) installAndLaunchPod(pair ManifestPair, pod Pod, logger logging.Logger) bool {
	p.tryRunHooks(hooks.BEFORE_INSTALL, pod, pair.Intent, logger)

	logger.NoFields().Infoln("Installing pod and launchables")
	err := pod.Install(pair.Intent, p.artifactVerifier, p.artifactRegistry)
	if err != nil {
		// install failed, abort and retry
		logger.WithError(err).Errorln("Install failed")
		return false
	}

	err = pod.Verify(pair.Intent, p.authPolicy)
	if err != nil {
		logger.WithError(err).
			Errorln("Pod digest verification failed")
		p.tryRunHooks(hooks.AFTER_AUTH_FAIL, pod, pair.Intent, logger)
		return false
	}

	p.tryRunHooks(hooks.AFTER_INSTALL, pod, pair.Intent, logger)

	if pair.Reality != nil {
		logger.NoFields().Infoln("Invoking the disable hook and halting runit services")
		success, err := pod.Halt(pair.Reality)
		if err != nil {
			logger.WithError(err).
				Errorln("Pod halt failed")
		} else if !success {
			logger.NoFields().Warnln("One or more launchables did not halt successfully")
		}
	}

	p.tryRunHooks(hooks.BEFORE_LAUNCH, pod, pair.Intent, logger)

	logger.NoFields().Infoln("Setting up new runit services and running the enable hook")
	ok, err := pod.Launch(pair.Intent)
	if err != nil {
		logger.WithError(err).
			Errorln("Launch failed")
	} else {
		if pair.PodUniqueKey == "" {
			// legacy pod, write the manifest back to reality tree
			duration, err := p.store.SetPod(kp.REALITY_TREE, p.node, pair.Intent)
			if err != nil {
				logger.WithErrorAndFields(err, logrus.Fields{
					"duration": duration}).
					Errorln("Could not set pod in reality store")
			}
		} else {
			// TODO: do this in a transaction
			err = p.podStore.WriteRealityIndex(pair.PodUniqueKey, p.node)
			if err != nil {
				logger.WithError(err).
					Errorln("Could not write uuid index to reality store")
			}

			// uuid pod, write the manifest to the pod status tree.
			mutator := func(ps podstatus.PodStatus) (podstatus.PodStatus, error) {
				manifestBytes, err := pair.Intent.Marshal()
				if err != nil {
					return ps, util.Errorf("Could not convert manifest to string to update pod status")
				}

				ps.PodStatus = podstatus.PodLaunched
				ps.Manifest = string(manifestBytes)
				return ps, nil
			}
			err := p.podStatusStore.MutateStatus(pair.PodUniqueKey, mutator)
			if err != nil {
				logger.WithError(err).Errorln("Could not update manifest in pod status")
			}
		}
		p.tryRunHooks(hooks.AFTER_LAUNCH, pod, pair.Intent, logger)

		pod.Prune(p.maxLaunchableDiskUsage, pair.Intent) // errors are logged internally
	}

	return err == nil && ok
}