// VerifyReality polls the reality store until both the consul pod and the
// preparer pod appear under this host, or until waitTime elapses.
func VerifyReality(waitTime time.Duration, consulID, agentID string) error {
	quit := make(chan struct{})
	defer close(quit)
	store := kp.NewConsulStore(kp.Options{
		Token: *consulToken,
	})
	hostname, _ := os.Hostname()
	waitChan := time.After(waitTime)
	for {
		select {
		case <-waitChan:
			return util.Errorf("Consul and/or Preparer weren't in the reality store within %s", waitTime)
		case <-time.After(100 * time.Millisecond):
			hasConsul := false
			hasPreparer := false
			results, _, err := store.ListPods(kp.RealityPath(hostname))
			if err != nil {
				log.Printf("Error looking for pods: %s\n", err)
				continue
			}
			for _, res := range results {
				if res.Manifest.ID() == consulID {
					hasConsul = true
				} else if res.Manifest.ID() == agentID {
					hasPreparer = true
				}
			}
			if hasConsul && hasPreparer {
				return nil
			}
		}
	}
}
// countHealthy reports, for the replication controller identified by id, how
// many nodes are desired, currently scheduled, running the target manifest
// SHA ("real"), and both real and passing their health checks.
func (u update) countHealthy(id rcf.ID, checks map[string]health.Result) (rcNodeCounts, error) {
	ret := rcNodeCounts{}
	rcFields, err := u.rcs.Get(id)
	if err != nil {
		return ret, err
	}
	ret.Desired = rcFields.ReplicasDesired

	nodes, err := rc.New(rcFields, u.kps, u.rcs, u.sched, u.labeler, u.logger).CurrentNodes()
	if err != nil {
		return ret, err
	}
	ret.Current = len(nodes)

	for _, node := range nodes {
		// TODO: is reality checking an rc-layer concern?
		realManifest, _, err := u.kps.Pod(kp.RealityPath(node, rcFields.Manifest.ID()))
		if err != nil {
			return ret, err
		}
		realSHA, _ := realManifest.SHA()
		targetSHA, _ := rcFields.Manifest.SHA()
		if targetSHA == realSHA {
			ret.Real++
		} else {
			// don't check health if the update isn't even done there yet
			continue
		}
		if hres, ok := checks[node]; ok && hres.Status == health.Passing {
			ret.Healthy++
		}
	}
	return ret, err
}
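The fields touched above outline the shape of the counts struct countHealthy returns; a minimal, illustrative sketch inferred from that usage (the real rcNodeCounts is defined elsewhere in this package) might look like this:

// Illustrative only: field names and types are inferred from how countHealthy
// fills the struct in; the actual definition lives elsewhere in the package.
type rcNodeCounts struct {
	Desired int // replicas the RC wants (ReplicasDesired)
	Current int // nodes currently scheduled by the RC
	Real    int // nodes whose reality manifest matches the target SHA
	Healthy int // real nodes whose health check is passing
}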
// Checks that the preparer is running on every host being deployed to.
func (r Replicator) CheckPreparers() error {
	for _, host := range r.Nodes {
		_, _, err := r.Store.Pod(kp.RealityPath(host, preparer.POD_ID))
		if err != nil {
			return util.Errorf("Host %q does not have a preparer", host)
		}
	}
	return nil
}
// Checks that the preparer is running on every host being deployed to.
func (r replicator) checkPreparers() error {
	for _, host := range r.nodes {
		_, _, err := r.store.Pod(kp.RealityPath(host, preparer.POD_ID))
		if err != nil {
			return util.Errorf("Could not verify %v state on %q: %v", preparer.POD_ID, host, err)
		}
	}
	return nil
}
// updateHealthMonitors refreshes the set of watched pods from the node's
// reality store and returns the updated list of pod watches.
func updateHealthMonitors(store kp.Store, watchedPods []PodWatch, node string, logger *logging.Logger) []PodWatch {
	path := kp.RealityPath(node)
	reality, _, err := store.ListPods(path)
	if err != nil {
		logger.WithField("inner_err", err).Warningln("failed to get pods from reality store")
	}
	return updatePods(watchedPods, reality, logger, store, node)
}
// installAndLaunchPod installs and verifies the intent manifest, halts the
// reality manifest if one exists, launches the intent manifest, and records
// the result in the reality store. It returns true on success.
func (p *Preparer) installAndLaunchPod(pair ManifestPair, pod Pod, logger logging.Logger) bool {
	p.tryRunHooks(hooks.BEFORE_INSTALL, pod, pair.Intent, logger)

	err := pod.Install(pair.Intent)
	if err != nil {
		// install failed, abort and retry
		logger.WithError(err).Errorln("Install failed")
		return false
	}

	err = pod.Verify(pair.Intent, p.authPolicy)
	if err != nil {
		logger.WithError(err).
			Errorln("Pod digest verification failed")
		p.tryRunHooks(hooks.AFTER_AUTH_FAIL, pod, pair.Intent, logger)
		return false
	}

	p.tryRunHooks(hooks.AFTER_INSTALL, pod, pair.Intent, logger)

	if pair.Reality != nil {
		success, err := pod.Halt(pair.Reality)
		if err != nil {
			logger.WithError(err).
				Errorln("Pod halt failed")
		} else if !success {
			logger.NoFields().Warnln("One or more launchables did not halt successfully")
		}
	}

	p.tryRunHooks(hooks.BEFORE_LAUNCH, pod, pair.Intent, logger)

	ok, err := pod.Launch(pair.Intent)
	if err != nil {
		logger.WithError(err).
			Errorln("Launch failed")
	} else {
		duration, err := p.store.SetPod(kp.RealityPath(p.node, pair.ID), pair.Intent)
		if err != nil {
			logger.WithErrorAndFields(err, logrus.Fields{
				"duration": duration,
			}).Errorln("Could not set pod in reality store")
		}
		p.tryRunHooks(hooks.AFTER_LAUNCH, pod, pair.Intent, logger)
	}
	return err == nil && ok
}
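The pair argument above couples a pod's intent manifest with whatever is currently recorded in reality. A minimal, assumed sketch of that type, inferred only from its use here and in stopAndUninstallPod and WatchForPodManifestsForNode below (the concrete manifest types are an assumption, not the actual definition):

// Illustrative only: field names follow their use in this section (pair.ID,
// pair.Intent, pair.Reality); the real ManifestPair is defined in the preparer
// package and its manifest field types may differ.
type ManifestPair struct {
	ID      string         // pod ID shared by both manifests
	Intent  *pods.Manifest // desired manifest from the intent store (nil if deleted)
	Reality *pods.Manifest // manifest currently in the reality store (nil if never launched)
}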
func main() {
	kingpin.Version(version.VERSION)
	kingpin.Parse()
	store := kp.NewConsulStore(kp.Options{
		Address: *consulAddress,
		Token:   *consulToken,
		Client:  net.NewHeaderClient(*headers, http.DefaultTransport),
		HTTPS:   *https,
	})

	if *nodeName == "" {
		hostname, err := os.Hostname()
		if err != nil {
			log.Fatalf("Could not get the hostname to do scheduling: %s", err)
		}
		*nodeName = hostname
	}

	path := kp.IntentPath(*nodeName)
	if *watchReality {
		path = kp.RealityPath(*nodeName)
	} else if *hookTypeName != "" {
		hookType, err := hooks.AsHookType(*hookTypeName)
		if err != nil {
			log.Fatalln(err)
		}
		path = kp.HookPath(hookType, *nodeName)
	}

	log.Printf("Watching manifests at %s\n", path)

	quit := make(chan struct{})
	errChan := make(chan error)
	podCh := make(chan kp.ManifestResult)
	go store.WatchPods(path, quit, errChan, podCh)

	for {
		select {
		case result := <-podCh:
			fmt.Println("")
			result.Manifest.Write(os.Stdout)
		case err := <-errChan:
			log.Fatalf("Error occurred while listening to pods: %s", err)
		}
	}
}
// ScheduleForThisHost writes the manifest to this host's entry in the intent
// store and, when alsoReality is true, mirrors it into the reality store.
func ScheduleForThisHost(manifest pods.Manifest, alsoReality bool) error {
	store := kp.NewConsulStore(kp.NewConsulClient(kp.Options{
		Token: *consulToken,
	}))
	hostname, err := os.Hostname()
	if err != nil {
		return err
	}

	_, err = store.SetPod(kp.IntentPath(hostname, manifest.ID()), manifest)
	if err != nil {
		return err
	}

	if alsoReality {
		_, err = store.SetPod(kp.RealityPath(hostname, manifest.ID()), manifest)
		return err
	}
	return nil
}
func main() {
	kingpin.Version(version.VERSION)
	_, opts := flags.ParseWithConsulOptions()
	client := kp.NewConsulClient(opts)
	store := kp.NewConsulStore(client)

	if *nodeName == "" {
		hostname, err := os.Hostname()
		if err != nil {
			log.Fatalf("Could not get the hostname to do scheduling: %s", err)
		}
		*nodeName = hostname
	}

	path := kp.IntentPath(*nodeName)
	if *watchReality {
		path = kp.RealityPath(*nodeName)
	} else if *hooks {
		path = kp.HookPath()
	}

	log.Printf("Watching manifests at %s\n", path)

	quit := make(chan struct{})
	errChan := make(chan error)
	podCh := make(chan []kp.ManifestResult)
	go store.WatchPods(path, quit, errChan, podCh)

	for {
		select {
		case results := <-podCh:
			if len(results) == 0 {
				fmt.Printf("No manifest exists at key %s (it may have been deleted)\n", path)
			} else {
				for _, result := range results {
					fmt.Println("")
					result.Manifest.Write(os.Stdout)
				}
			}
		case err := <-errChan:
			log.Fatalf("Error occurred while listening to pods: %s", err)
		}
	}
}
// MonitorPodHealth is meant to be run as a long-running goroutine. It reads
// from the consul store to determine which services should be running on the
// host, runs a CheckHealth routine to monitor the health of each service, and
// kills the routines for services that should no longer be running.
func MonitorPodHealth(config *preparer.PreparerConfig, logger *logging.Logger, shutdownCh chan struct{}) {
	store, err := config.GetStore()
	if err != nil {
		// A bad config should have already produced a nice, user-friendly error message.
		logger.WithError(err).Fatalln("error creating health monitor KV store")
	}
	healthManager := store.NewHealthManager(config.NodeName, *logger)

	// If GetClient fails it means the certfile/keyfile/cafile were invalid or
	// did not exist, so a fatal error is appropriate.
	client, err := config.GetClient()
	if err != nil {
		logger.WithError(err).Fatalln("failed to get http client for this preparer")
	}

	node := config.NodeName
	pods := []PodWatch{}

	watchQuitCh := make(chan struct{})
	watchErrCh := make(chan error)
	watchPodCh := make(chan []kp.ManifestResult)
	go store.WatchPods(kp.RealityPath(node), watchQuitCh, watchErrCh, watchPodCh)

	for {
		select {
		case results := <-watchPodCh:
			// Check whether pods have been added or removed: start a monitor
			// routine for each new pod, kill the routine for each removed pod.
			pods = updatePods(healthManager, client, pods, results, node, logger)
		case err := <-watchErrCh:
			logger.WithError(err).Errorln("there was an error reading reality manifests for health monitor")
		case <-shutdownCh:
			for _, pod := range pods {
				pod.shutdownCh <- true
			}
			healthManager.Close()
			close(watchQuitCh)
			return
		}
	}
}
// stopAndUninstallPod halts the pod's reality manifest, uninstalls the pod,
// and deletes its entry from the reality store. It returns true on success.
func (p *Preparer) stopAndUninstallPod(pair ManifestPair, pod Pod, logger logging.Logger) bool {
	success, err := pod.Halt(pair.Reality)
	if err != nil {
		logger.WithError(err).Errorln("Pod halt failed")
	} else if !success {
		logger.NoFields().Warnln("One or more launchables did not halt successfully")
	}

	p.tryRunHooks(hooks.BEFORE_UNINSTALL, pod, pair.Reality, logger)

	err = pod.Uninstall()
	if err != nil {
		logger.WithError(err).Errorln("Uninstall failed")
		return false
	}
	logger.NoFields().Infoln("Successfully uninstalled")

	dur, err := p.store.DeletePod(kp.RealityPath(p.node, pair.ID))
	if err != nil {
		logger.WithErrorAndFields(err, logrus.Fields{"duration": dur}).
			Errorln("Could not delete pod from reality store")
	}
	return true
}
// note: logging should be delegated somehow
func (r Replicator) updateOne(node string, done chan<- string, errCh chan<- error, quitCh <-chan struct{}) {
	targetSHA, _ := r.Manifest.SHA()
	nodeLogger := r.Logger.SubLogger(logrus.Fields{"node": node})
	nodeLogger.WithField("sha", targetSHA).Infoln("Updating node")

	_, err := r.Store.SetPod(kp.IntentPath(node, r.Manifest.ID()), r.Manifest)
	for err != nil {
		nodeLogger.WithField("err", err).Errorln("Could not write intent store")
		errCh <- err
		time.Sleep(1 * time.Second)
		_, err = r.Store.SetPod(kp.IntentPath(node, r.Manifest.ID()), r.Manifest)
	}

	realityResults := make(chan kp.ManifestResult)
	realityErr := make(chan error)
	realityQuit := make(chan struct{})
	defer close(realityQuit)
	go r.Store.WatchPods(kp.RealityPath(node, r.Manifest.ID()), realityQuit, realityErr, realityResults)

REALITY_LOOP:
	for {
		select {
		case <-quitCh:
			return
		case err := <-realityErr:
			nodeLogger.WithField("err", err).Errorln("Could not read reality store")
			errCh <- err
		case mResult := <-realityResults:
			receivedSHA, _ := mResult.Manifest.SHA()
			if receivedSHA == targetSHA {
				break REALITY_LOOP
			} else {
				nodeLogger.WithFields(logrus.Fields{"current": receivedSHA, "target": targetSHA}).Infoln("Waiting for current")
			}
		}
	}
	nodeLogger.NoFields().Infoln("Node is current")

	healthResults := make(chan []health.Result)
	healthErr := make(chan error)
	healthQuit := make(chan struct{})
	defer close(healthQuit)
	go r.Health.WatchNodeService(node, r.Manifest.ID(), healthResults, healthErr, healthQuit)

HEALTH_LOOP:
	for {
		select {
		case <-quitCh:
			return
		case err := <-healthErr:
			nodeLogger.WithField("err", err).Errorln("Could not read health check")
			errCh <- err
		case res := <-healthResults:
			id, status := health.FindWorst(res)
			// treat an empty threshold as "passing"
			threshold := health.Passing
			if r.Threshold != "" {
				threshold = r.Threshold
			}
			// is this status less than the threshold?
			if health.Compare(status, threshold) < 0 {
				nodeLogger.WithFields(logrus.Fields{"check": id, "health": status}).Infoln("Node is not healthy")
			} else {
				break HEALTH_LOOP
			}
		}
	}
	r.Logger.WithField("node", node).Infoln("Node is current and healthy")

	select {
	case done <- node:
	case <-quitCh:
	}
}
func (p *Preparer) installAndLaunchPod(newManifest *pods.Manifest, pod Pod, logger logging.Logger) bool {
	// do not remove the logger argument, it's not the same as p.Logger

	// get currently running pod to compare with the new pod
	realityPath := kp.RealityPath(p.node, newManifest.ID())
	currentManifest, _, err := p.store.Pod(realityPath)
	currentSHA := ""
	if currentManifest != nil {
		currentSHA, _ = currentManifest.SHA()
	}
	newSHA, _ := newManifest.SHA()

	// if new or the manifest is different, launch
	newOrDifferent := (err == pods.NoCurrentManifest) || (currentSHA != newSHA)
	if newOrDifferent {
		logger.WithFields(logrus.Fields{
			"old_sha": currentSHA,
			"sha":     newSHA,
			"pod":     newManifest.ID(),
		}).Infoln("SHA is new or different from old, will update")
	}

	// if the old manifest is corrupted somehow, re-launch since we don't know if this is an update.
	problemReadingCurrentManifest := (err != nil && err != pods.NoCurrentManifest)
	if problemReadingCurrentManifest {
		logger.WithFields(logrus.Fields{
			"sha":       newSHA,
			"inner_err": err,
		}).Errorln("Current manifest not readable, will relaunch")
	}

	if newOrDifferent || problemReadingCurrentManifest {
		p.tryRunHooks(hooks.BEFORE_INSTALL, pod, newManifest, logger)

		err = pod.Install(newManifest)
		if err != nil {
			// install failed, abort and retry
			logger.WithFields(logrus.Fields{
				"err": err,
			}).Errorln("Install failed")
			return false
		}

		err = pod.Verify(newManifest, p.authPolicy)
		if err != nil {
			logger.WithField("err", err).Errorln("Pod digest verification failed")
			p.tryRunHooks(hooks.AFTER_AUTH_FAIL, pod, newManifest, logger)
			return false
		}

		p.tryRunHooks(hooks.AFTER_INSTALL, pod, newManifest, logger)

		err = p.store.RegisterService(*newManifest, p.caPath)
		if err != nil {
			logger.WithField("err", err).Errorln("Service registration failed")
			return false
		}

		if currentManifest != nil {
			success, err := pod.Halt(currentManifest)
			if err != nil {
				logger.WithField("err", err).Errorln("Pod halt failed")
			} else if !success {
				logger.NoFields().Warnln("One or more launchables did not halt successfully")
			}
		}

		ok, err := pod.Launch(newManifest)
		if err != nil {
			logger.WithFields(logrus.Fields{
				"err": err,
			}).Errorln("Launch failed")
		} else {
			duration, err := p.store.SetPod(realityPath, *newManifest)
			if err != nil {
				logger.WithFields(logrus.Fields{
					"err":      err,
					"duration": duration,
				}).Errorln("Could not set pod in reality store")
			}
			p.tryRunHooks(hooks.AFTER_LAUNCH, pod, newManifest, logger)
		}
		return err == nil && ok
	}

	// TODO: shut down removed launchables between pod versions.
	return true
}
// WatchForPodManifestsForNode watches this node's intent store entries and
// dispatches each intent/reality manifest pair to a long-running per-pod
// goroutine, shutting those goroutines down when quitAndAck is signaled.
func (p *Preparer) WatchForPodManifestsForNode(quitAndAck chan struct{}) {
	pods.Log = p.Logger
	path := kp.IntentPath(p.node)

	// This allows us to signal the goroutine watching consul to quit
	quitChan := make(chan struct{})
	errChan := make(chan error)
	podChan := make(chan []kp.ManifestResult)

	go p.store.WatchPods(path, quitChan, errChan, podChan)

	// we will have one long running goroutine for each app installed on this
	// host. We keep a map of podId => podChan so we can send the new manifests
	// that come in to the appropriate goroutine
	podChanMap := make(map[string]chan ManifestPair)

	// we can't use a shared quit channel for all the goroutines - otherwise,
	// we would exit the program before the goroutines actually accepted the
	// quit signal. to be sure that each goroutine is done, we have to block and
	// wait for it to receive the signal
	quitChanMap := make(map[string]chan struct{})

	for {
		select {
		case err := <-errChan:
			p.Logger.WithError(err).
				Errorln("there was an error reading the manifest")
		case intentResults := <-podChan:
			realityResults, _, err := p.store.ListPods(kp.RealityPath(p.node))
			if err != nil {
				p.Logger.WithError(err).Errorln("Could not check reality")
			} else {
				// if the preparer's own ID is missing from the intent set, we
				// assume it was damaged and discard it
				if !checkResultsForID(intentResults, POD_ID) {
					p.Logger.NoFields().Errorln("Intent results set did not contain p2-preparer pod ID, consul data may be corrupted")
				} else {
					resultPairs := ZipResultSets(intentResults, realityResults)
					for _, pair := range resultPairs {
						if _, ok := podChanMap[pair.ID]; !ok {
							// spin up a goroutine for this pod
							podChanMap[pair.ID] = make(chan ManifestPair)
							quitChanMap[pair.ID] = make(chan struct{})
							go p.handlePods(podChanMap[pair.ID], quitChanMap[pair.ID])
						}
						podChanMap[pair.ID] <- pair
					}
				}
			}
		case <-quitAndAck:
			for podToQuit, quitCh := range quitChanMap {
				p.Logger.WithField("pod", podToQuit).Infoln("Quitting...")
				quitCh <- struct{}{}
			}
			close(quitChan)
			p.Logger.NoFields().Infoln("Done, acknowledging quit")
			quitAndAck <- struct{}{} // acknowledge quit
			return
		}
	}
}
// note: logging should be delegated somehow
func (r replication) updateOne(node string, done chan<- string, quitCh <-chan struct{}) {
	targetSHA, _ := r.manifest.SHA()
	nodeLogger := r.logger.SubLogger(logrus.Fields{"node": node})
	nodeLogger.WithField("sha", targetSHA).Infoln("Updating node")

	_, err := r.store.SetPod(kp.IntentPath(node, r.manifest.ID()), r.manifest)
	for err != nil {
		nodeLogger.WithError(err).Errorln("Could not write intent store")
		r.errCh <- err
		time.Sleep(1 * time.Second)
		_, err = r.store.SetPod(kp.IntentPath(node, r.manifest.ID()), r.manifest)
	}

	realityResults := make(chan []kp.ManifestResult)
	realityErr := make(chan error)
	realityQuit := make(chan struct{})
	defer close(realityQuit)
	go r.store.WatchPods(kp.RealityPath(node, r.manifest.ID()), realityQuit, realityErr, realityResults)

REALITY_LOOP:
	for {
		select {
		case <-quitCh:
			return
		case err := <-realityErr:
			nodeLogger.WithError(err).Errorln("Could not read reality store")
			select {
			case r.errCh <- err:
			case <-quitCh:
			}
		case mResult := <-realityResults:
			// We expect len(mResult) == 0 if the pod key doesn't exist yet;
			// that's okay, just wait longer.
			if len(mResult) == 1 {
				receivedSHA, _ := mResult[0].Manifest.SHA()
				if receivedSHA == targetSHA {
					break REALITY_LOOP
				} else {
					nodeLogger.WithFields(logrus.Fields{"current": receivedSHA, "target": targetSHA}).Infoln("Waiting for current")
				}
			} else if len(mResult) > 1 {
				nodeLogger.WithField("n", len(mResult)).Errorf("Got %d results from reality but was expecting only 1", len(mResult))
			}
		}
	}
	nodeLogger.NoFields().Infoln("Node is current")

	healthResults := make(chan health.Result)
	healthErr := make(chan error)
	healthQuit := make(chan struct{})
	defer close(healthQuit)
	go r.health.WatchNodeService(node, r.manifest.ID(), healthResults, healthErr, healthQuit)

HEALTH_LOOP:
	for {
		select {
		case <-quitCh:
			return
		case err := <-healthErr:
			nodeLogger.WithError(err).Errorln("Could not read health check")
			select {
			case r.errCh <- err:
			case <-quitCh:
			}
		case res := <-healthResults:
			id := res.ID
			status := res.Status
			// treat an empty threshold as "passing"
			threshold := health.Passing
			if r.threshold != "" {
				threshold = r.threshold
			}
			// is this status less than the threshold?
			if health.Compare(status, threshold) < 0 {
				nodeLogger.WithFields(logrus.Fields{"check": id, "health": status}).Infoln("Node is not healthy")
			} else {
				break HEALTH_LOOP
			}
		}
	}
	r.logger.WithField("node", node).Infoln("Node is current and healthy")

	select {
	case done <- node:
	case <-quitCh:
	}
}