// MonitorPodHealth is meant to be a long-running goroutine.
// MonitorPodHealth reads from a consul store to determine which
// services should be running on the host. MonitorPodHealth
// runs a CheckHealth routine to monitor the health of each
// service and kills routines for services that should no
// longer be running.
func MonitorPodHealth(config *preparer.PreparerConfig, logger *logging.Logger, shutdownCh chan struct{}) {
	store, err := config.GetStore()
	if err != nil {
		// A bad config should have already produced a nice, user-friendly error message.
		logger.WithError(err).Fatalln("error creating health monitor KV store")
	}
	healthManager := store.NewHealthManager(config.NodeName, *logger)

	node := config.NodeName
	pods := []PodWatch{}
	watchQuitCh := make(chan struct{})
	watchErrCh := make(chan error)
	watchPodCh := make(chan []kp.ManifestResult)
	go store.WatchPods(
		kp.REALITY_TREE,
		node,
		watchQuitCh,
		watchErrCh,
		watchPodCh,
	)

	// If GetClient fails it means the certfile/keyfile/cafile were
	// invalid or did not exist. It makes sense to throw a fatal error.
	secureClient, err := config.GetClient(time.Duration(*HEALTHCHECK_TIMEOUT) * time.Second)
	if err != nil {
		logger.WithError(err).Fatalln("failed to get http client for this preparer")
	}
	insecureClient, err := config.GetInsecureClient(time.Duration(*HEALTHCHECK_TIMEOUT) * time.Second)
	if err != nil {
		logger.WithError(err).Fatalln("failed to get http client for this preparer")
	}

	for {
		select {
		case results := <-watchPodCh:
			// Check whether pods have been added or removed:
			// starts a monitor routine for each new pod and
			// kills the monitor routine for each removed pod.
			pods = updatePods(healthManager, secureClient, insecureClient, pods, results, node, logger)
		case err := <-watchErrCh:
			logger.WithError(err).Errorln("there was an error reading reality manifests for health monitor")
		case <-shutdownCh:
			for _, pod := range pods {
				pod.shutdownCh <- true
			}
			healthManager.Close()
			close(watchQuitCh)
			return
		}
	}
}
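
// A minimal sketch of how MonitorPodHealth might be wired up by a
// caller (hypothetical; the actual invocation lives in the preparer's
// startup path, and preparerConfig/logger here are assumed to be
// already constructed):
//
//	shutdownCh := make(chan struct{})
//	go MonitorPodHealth(preparerConfig, &logger, shutdownCh)
//
//	// ... later, on process shutdown, stop all per-pod monitors
//	// and the reality-tree watch:
//	close(shutdownCh)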
// verifyHealthChecks reads health results back out of the consul KV
// store for each of the given services on this node, confirming that
// each service passed its health check and that GetHealth and
// GetServiceHealth agree on what was written.
func verifyHealthChecks(config *preparer.PreparerConfig, services []string) error {
	store, err := config.GetStore()
	if err != nil {
		return err
	}
	time.Sleep(5 * time.Second)

	// Check consul for health information for each app.
	name, err := os.Hostname()
	if err != nil {
		return err
	}
	node := types.NodeName(name)
	for _, sv := range services {
		res, err := store.GetHealth(sv, node)
		if err != nil {
			return err
		} else if (res == kp.WatchResult{}) {
			return fmt.Errorf("no results for %s: \n\n %s", sv, targetLogs())
		} else if res.Status != string(health.Passing) {
			return fmt.Errorf("%s did not pass health check: \n\n %s", sv, targetLogs())
		} else {
			fmt.Println(res)
		}
	}

	for _, sv := range services {
		res, err := store.GetServiceHealth(sv)
		if err != nil {
			return err
		}
		// The GetHealth error is deliberately ignored here; a missing
		// entry surfaces as a field mismatch in the comparison below.
		getres, _ := store.GetHealth(sv, node)
		val := res[kp.HealthPath(sv, node)]
		if getres.Id != val.Id || getres.Service != val.Service || getres.Status != val.Status {
			return fmt.Errorf("GetServiceHealth failed %+v: \n\n%s", res, targetLogs())
		}
	}

	// If it reaches here it means health checks
	// are being written to the KV store properly.
	return nil
}
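
// A hypothetical sketch of how verifyHealthChecks might be invoked
// from an integration check (config and the service name here are
// illustrative assumptions, not part of this package):
//
//	// config is a *preparer.PreparerConfig obtained from the
//	// preparer's own config loading; the slice lists the apps
//	// whose recorded health should be verified.
//	if err := verifyHealthChecks(config, []string{"helloservice"}); err != nil {
//		log.Fatalln(err)
//	}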