Example #1
0
func (r *replication) ensureHealthy(
	node types.NodeName,
	timeoutCh <-chan struct{},
	nodeLogger logging.Logger,
	aggregateHealth *podHealth,
) error {
	for {
		select {
		case <-r.quitCh:
			r.logger.Infoln("Caught quit signal during ensureHealthy")
			return errQuit
		case <-timeoutCh:
			r.logger.Infoln("Caught node timeout signal during ensureHealthy")
			return errTimeout
		case <-r.replicationCancelledCh:
			r.logger.Infoln("Caught cancellation signal during ensureHealthy")
			return errCancelled
		case <-time.After(time.Duration(*ensureHealthyPeriodMillis) * time.Millisecond):
			res, ok := aggregateHealth.GetHealth(node)
			if !ok {
				nodeLogger.WithFields(logrus.Fields{
					"node": node,
				}).Errorln("Could not get health, retrying")
				// Zero res should be treated like "critical"
			}
			id := res.ID
			status := res.Status
			// treat an empty threshold as "passing"
			threshold := health.Passing
			if r.threshold != "" {
				threshold = r.threshold
			}
			// is this status less than the threshold?
			if health.Compare(status, threshold) < 0 {
				nodeLogger.WithFields(logrus.Fields{"check": id, "health": status}).Infoln("Node is not healthy")
			} else {
				r.logger.WithField("node", node).Infoln("Node is current and healthy")
				return nil
			}
		}
	}
}
Example #2
0
// note: logging should be delegated somehow
func (r Replicator) updateOne(node string, done chan<- string, errCh chan<- error, quitCh <-chan struct{}) {
	targetSHA, _ := r.Manifest.SHA()
	nodeLogger := r.Logger.SubLogger(logrus.Fields{"node": node})
	nodeLogger.WithField("sha", targetSHA).Infoln("Updating node")

	_, err := r.Store.SetPod(kp.IntentPath(node, r.Manifest.ID()), r.Manifest)
	for err != nil {
		nodeLogger.WithField("err", err).Errorln("Could not write intent store")
		errCh <- err
		time.Sleep(1 * time.Second)
		_, err = r.Store.SetPod(kp.IntentPath(node, r.Manifest.ID()), r.Manifest)
	}

	realityResults := make(chan kp.ManifestResult)
	realityErr := make(chan error)
	realityQuit := make(chan struct{})
	defer close(realityQuit)
	go r.Store.WatchPods(kp.RealityPath(node, r.Manifest.ID()), realityQuit, realityErr, realityResults)
REALITY_LOOP:
	for {
		select {
		case <-quitCh:
			return
		case err := <-realityErr:
			nodeLogger.WithField("err", err).Errorln("Could not read reality store")
			errCh <- err
		case mResult := <-realityResults:
			receivedSHA, _ := mResult.Manifest.SHA()
			if receivedSHA == targetSHA {
				break REALITY_LOOP
			} else {
				nodeLogger.WithFields(logrus.Fields{"current": receivedSHA, "target": targetSHA}).Infoln("Waiting for current")
			}
		}
	}
	nodeLogger.NoFields().Infoln("Node is current")

	healthResults := make(chan []health.Result)
	healthErr := make(chan error)
	healthQuit := make(chan struct{})
	defer close(healthQuit)
	go r.Health.WatchNodeService(node, r.Manifest.ID(), healthResults, healthErr, healthQuit)
HEALTH_LOOP:
	for {
		select {
		case <-quitCh:
			return
		case err := <-healthErr:
			nodeLogger.WithField("err", err).Errorln("Could not read health check")
			errCh <- err
		case res := <-healthResults:
			id, status := health.FindWorst(res)
			// treat an empty threshold as "passing"
			threshold := health.Passing
			if r.Threshold != "" {
				threshold = r.Threshold
			}
			// is this status less than the threshold?
			if health.Compare(status, threshold) < 0 {
				nodeLogger.WithFields(logrus.Fields{"check": id, "health": status}).Infoln("Node is not healthy")
			} else {
				break HEALTH_LOOP
			}
		}
	}
	r.Logger.WithField("node", node).Infoln("Node is current and healthy")

	select {
	case done <- node:
	case <-quitCh:
	}
}
Example #3
0
// note: logging should be delegated somehow
func (r replication) updateOne(node string, done chan<- string, quitCh <-chan struct{}) {
	targetSHA, _ := r.manifest.SHA()
	nodeLogger := r.logger.SubLogger(logrus.Fields{"node": node})
	nodeLogger.WithField("sha", targetSHA).Infoln("Updating node")

	_, err := r.store.SetPod(kp.IntentPath(node, r.manifest.ID()), r.manifest)
	for err != nil {
		nodeLogger.WithError(err).Errorln("Could not write intent store")
		r.errCh <- err
		time.Sleep(1 * time.Second)
		_, err = r.store.SetPod(kp.IntentPath(node, r.manifest.ID()), r.manifest)
	}

	realityResults := make(chan []kp.ManifestResult)
	realityErr := make(chan error)
	realityQuit := make(chan struct{})
	defer close(realityQuit)
	go r.store.WatchPods(kp.RealityPath(node, r.manifest.ID()), realityQuit, realityErr, realityResults)
REALITY_LOOP:
	for {
		select {
		case <-quitCh:
			return
		case err := <-realityErr:
			nodeLogger.WithError(err).Errorln("Could not read reality store")
			select {
			case r.errCh <- err:
			case <-quitCh:
			}
		case mResult := <-realityResults:
			// We expect len(mResult) == 0 if the pod key doesn't
			// exist yet, that's okay just wait longer
			if len(mResult) == 1 {
				receivedSHA, _ := mResult[0].Manifest.SHA()
				if receivedSHA == targetSHA {
					break REALITY_LOOP
				} else {
					nodeLogger.WithFields(logrus.Fields{"current": receivedSHA, "target": targetSHA}).Infoln("Waiting for current")
				}
			} else if len(mResult) > 1 {
				nodeLogger.WithField("n", len(mResult)).Errorf("Got %d results from reality but was expecting only 1", len(mResult))
			}
		}
	}
	nodeLogger.NoFields().Infoln("Node is current")

	healthResults := make(chan health.Result)
	healthErr := make(chan error)
	healthQuit := make(chan struct{})
	defer close(healthQuit)
	go r.health.WatchNodeService(node, r.manifest.ID(), healthResults, healthErr, healthQuit)
HEALTH_LOOP:
	for {
		select {
		case <-quitCh:
			return
		case err := <-healthErr:
			nodeLogger.WithError(err).Errorln("Could not read health check")
			select {
			case r.errCh <- err:
			case <-quitCh:
			}
		case res := <-healthResults:
			id := res.ID
			status := res.Status
			// treat an empty threshold as "passing"
			threshold := health.Passing
			if r.threshold != "" {
				threshold = r.threshold
			}
			// is this status less than the threshold?
			if health.Compare(status, threshold) < 0 {
				nodeLogger.WithFields(logrus.Fields{"check": id, "health": status}).Infoln("Node is not healthy")
			} else {
				break HEALTH_LOOP
			}
		}
	}
	r.logger.WithField("node", node).Infoln("Node is current and healthy")

	select {
	case done <- node:
	case <-quitCh:
	}
}