func (r *replication) ensureHealthy( node types.NodeName, timeoutCh <-chan struct{}, nodeLogger logging.Logger, aggregateHealth *podHealth, ) error { for { select { case <-r.quitCh: r.logger.Infoln("Caught quit signal during ensureHealthy") return errQuit case <-timeoutCh: r.logger.Infoln("Caught node timeout signal during ensureHealthy") return errTimeout case <-r.replicationCancelledCh: r.logger.Infoln("Caught cancellation signal during ensureHealthy") return errCancelled case <-time.After(time.Duration(*ensureHealthyPeriodMillis) * time.Millisecond): res, ok := aggregateHealth.GetHealth(node) if !ok { nodeLogger.WithFields(logrus.Fields{ "node": node, }).Errorln("Could not get health, retrying") // Zero res should be treated like "critical" } id := res.ID status := res.Status // treat an empty threshold as "passing" threshold := health.Passing if r.threshold != "" { threshold = r.threshold } // is this status less than the threshold? if health.Compare(status, threshold) < 0 { nodeLogger.WithFields(logrus.Fields{"check": id, "health": status}).Infoln("Node is not healthy") } else { r.logger.WithField("node", node).Infoln("Node is current and healthy") return nil } } } }
// note: logging should be delegated somehow func (r Replicator) updateOne(node string, done chan<- string, errCh chan<- error, quitCh <-chan struct{}) { targetSHA, _ := r.Manifest.SHA() nodeLogger := r.Logger.SubLogger(logrus.Fields{"node": node}) nodeLogger.WithField("sha", targetSHA).Infoln("Updating node") _, err := r.Store.SetPod(kp.IntentPath(node, r.Manifest.ID()), r.Manifest) for err != nil { nodeLogger.WithField("err", err).Errorln("Could not write intent store") errCh <- err time.Sleep(1 * time.Second) _, err = r.Store.SetPod(kp.IntentPath(node, r.Manifest.ID()), r.Manifest) } realityResults := make(chan kp.ManifestResult) realityErr := make(chan error) realityQuit := make(chan struct{}) defer close(realityQuit) go r.Store.WatchPods(kp.RealityPath(node, r.Manifest.ID()), realityQuit, realityErr, realityResults) REALITY_LOOP: for { select { case <-quitCh: return case err := <-realityErr: nodeLogger.WithField("err", err).Errorln("Could not read reality store") errCh <- err case mResult := <-realityResults: receivedSHA, _ := mResult.Manifest.SHA() if receivedSHA == targetSHA { break REALITY_LOOP } else { nodeLogger.WithFields(logrus.Fields{"current": receivedSHA, "target": targetSHA}).Infoln("Waiting for current") } } } nodeLogger.NoFields().Infoln("Node is current") healthResults := make(chan []health.Result) healthErr := make(chan error) healthQuit := make(chan struct{}) defer close(healthQuit) go r.Health.WatchNodeService(node, r.Manifest.ID(), healthResults, healthErr, healthQuit) HEALTH_LOOP: for { select { case <-quitCh: return case err := <-healthErr: nodeLogger.WithField("err", err).Errorln("Could not read health check") errCh <- err case res := <-healthResults: id, status := health.FindWorst(res) // treat an empty threshold as "passing" threshold := health.Passing if r.Threshold != "" { threshold = r.Threshold } // is this status less than the threshold? if health.Compare(status, threshold) < 0 { nodeLogger.WithFields(logrus.Fields{"check": id, "health": status}).Infoln("Node is not healthy") } else { break HEALTH_LOOP } } } r.Logger.WithField("node", node).Infoln("Node is current and healthy") select { case done <- node: case <-quitCh: } }
// note: logging should be delegated somehow func (r replication) updateOne(node string, done chan<- string, quitCh <-chan struct{}) { targetSHA, _ := r.manifest.SHA() nodeLogger := r.logger.SubLogger(logrus.Fields{"node": node}) nodeLogger.WithField("sha", targetSHA).Infoln("Updating node") _, err := r.store.SetPod(kp.IntentPath(node, r.manifest.ID()), r.manifest) for err != nil { nodeLogger.WithError(err).Errorln("Could not write intent store") r.errCh <- err time.Sleep(1 * time.Second) _, err = r.store.SetPod(kp.IntentPath(node, r.manifest.ID()), r.manifest) } realityResults := make(chan []kp.ManifestResult) realityErr := make(chan error) realityQuit := make(chan struct{}) defer close(realityQuit) go r.store.WatchPods(kp.RealityPath(node, r.manifest.ID()), realityQuit, realityErr, realityResults) REALITY_LOOP: for { select { case <-quitCh: return case err := <-realityErr: nodeLogger.WithError(err).Errorln("Could not read reality store") select { case r.errCh <- err: case <-quitCh: } case mResult := <-realityResults: // We expect len(mResult) == 0 if the pod key doesn't // exist yet, that's okay just wait longer if len(mResult) == 1 { receivedSHA, _ := mResult[0].Manifest.SHA() if receivedSHA == targetSHA { break REALITY_LOOP } else { nodeLogger.WithFields(logrus.Fields{"current": receivedSHA, "target": targetSHA}).Infoln("Waiting for current") } } else if len(mResult) > 1 { nodeLogger.WithField("n", len(mResult)).Errorf("Got %d results from reality but was expecting only 1", len(mResult)) } } } nodeLogger.NoFields().Infoln("Node is current") healthResults := make(chan health.Result) healthErr := make(chan error) healthQuit := make(chan struct{}) defer close(healthQuit) go r.health.WatchNodeService(node, r.manifest.ID(), healthResults, healthErr, healthQuit) HEALTH_LOOP: for { select { case <-quitCh: return case err := <-healthErr: nodeLogger.WithError(err).Errorln("Could not read health check") select { case r.errCh <- err: case <-quitCh: } case res := <-healthResults: id := res.ID status := res.Status // treat an empty threshold as "passing" threshold := health.Passing if r.threshold != "" { threshold = r.threshold } // is this status less than the threshold? if health.Compare(status, threshold) < 0 { nodeLogger.WithFields(logrus.Fields{"check": id, "health": status}).Infoln("Node is not healthy") } else { break HEALTH_LOOP } } } r.logger.WithField("node", node).Infoln("Node is current and healthy") select { case done <- node: case <-quitCh: } }