Example #1
File: main.go Project: tomzhang/p2
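Schedules pod manifests from the command line: each manifest file is written to the intent store at kp.IntentPath(*nodeName, manifest.ID()), or to kp.HookPath(manifest.ID()) when *hookGlobal is set. The node name defaults to the local hostname.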
func main() {
	kingpin.Version(version.VERSION)
	_, opts := flags.ParseWithConsulOptions()
	client := kp.NewConsulClient(opts)
	store := kp.NewConsulStore(client)

	if *nodeName == "" {
		hostname, err := os.Hostname()
		if err != nil {
			log.Fatalf("Could not get the hostname to do scheduling: %s", err)
		}
		*nodeName = hostname
	}

	if len(*manifests) == 0 {
		kingpin.Usage()
		log.Fatalln("No manifests given")
	}

	for _, manifestPath := range *manifests {
		manifest, err := pods.ManifestFromPath(manifestPath)
		if err != nil {
			log.Fatalf("Could not read manifest at %s: %s\n", manifestPath, err)
		}
		path := kp.IntentPath(*nodeName, manifest.ID())
		if *hookGlobal {
			path = kp.HookPath(manifest.ID())
		}
		duration, err := store.SetPod(path, manifest)
		if err != nil {
			log.Fatalf("Could not write manifest %s to intent store: %s\n", manifest.ID(), err)
		}
		log.Printf("Scheduling %s took %s\n", manifest.ID(), duration)
	}
}
Example #2
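Tests that InitializeReplication(true) can override a competing lock: a rival lock is claimed on the intent path of the first test node, and initialization is expected to succeed anyway.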
func TestInitializeReplicationCanOverrideLocks(t *testing.T) {
	replicator, store, server := testReplicatorAndServer(t)
	defer server.Stop()

	// This makes it look like the preparers are installed on the hosts
	// we're deploying to
	for _, node := range testNodes {
		key := fmt.Sprintf("reality/%s/p2-preparer", node)
		server.SetKV(key, []byte(testPreparerManifest))
	}

	// Claim a lock on a host and verify that InitializeReplication overrides it
	lock, _, err := store.NewLock("competing lock", nil)
	if err != nil {
		t.Fatalf("Unable to set up competing lock: %s", err)
	}
	defer lock.Destroy()
	lockPath := kp.LockPath(kp.IntentPath(testNodes[0], testPodId))
	err = lock.Lock(lockPath)
	if err != nil {
		t.Fatalf("Unable to set up competing lock: %s", err)
	}

	replication, _, err := replicator.InitializeReplication(true)
	if err != nil {
		t.Fatalf("Expected InitializeReplication to override competing lock, but error occured: %s", err)
	}
	replication.Cancel()
}
Example #3
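Tests the happy path of InitializeReplication: after it returns, every test node's intent path should be locked under kp.LockPath, with the expected lock message as holder.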
func TestInitializeReplication(t *testing.T) {
	replicator, store, server := testReplicatorAndServer(t)
	defer server.Stop()

	// Make the kv store look like preparer is installed on test nodes
	setupPreparers(server)

	// err being nil ensures that checking preparers and locking the hosts
	// succeeded
	replication, _, err := replicator.InitializeReplication(false)
	if err != nil {
		t.Fatalf("Error initializing replication: %s", err)
	}
	defer replication.Cancel()

	// Confirm that the appropriate kv keys have been locked
	for _, node := range testNodes {
		lockPath := kp.LockPath(kp.IntentPath(node, testPodId))
		lockHolder, _, err := store.LockHolder(lockPath)
		if err != nil {
			t.Fatalf("Unexpected error checking for lock holder: %s", err)
		}

		if lockHolder != testLockMessage {
			t.Errorf("Expected lock holder for key '%s' to be '%s', was '%s'", lockPath, testLockMessage, lockHolder)
		}
	}
}
Example #4
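A replication controller's schedule step: the manifest is written to the node's intent path, then pod labels are applied via the pod applicator.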
func (rc *replicationController) schedule(node string) error {
	// First, schedule the new pod.
	intentPath := kp.IntentPath(node, rc.Manifest.ID())
	rc.logger.NoFields().Infof("Scheduling on %s", intentPath)
	_, err := rc.kpStore.SetPod(intentPath, rc.Manifest)
	if err != nil {
		return err
	}

	// Then we set the labels.
	// TODO: It could be the case that these SetLabel calls fail.
	// In that case, we've scheduled a pod but currentNodes() will never be aware of it.
	// We could consider cleaning the now-orphaned pod up?
	return rc.forEachLabel(node, func(id, k, v string) error {
		return rc.podApplicator.SetLabel(labels.POD, id, k, v)
	})
}
Example #5
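A watch utility: depending on the flags, it watches the node's intent path, reality path, or a typed hook path, and prints each manifest received from store.WatchPods to stdout.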
func main() {
	kingpin.Version(version.VERSION)
	kingpin.Parse()

	store := kp.NewConsulStore(kp.Options{
		Address: *consulAddress,
		Token:   *consulToken,
		Client:  net.NewHeaderClient(*headers, http.DefaultTransport),
		HTTPS:   *https,
	})

	if *nodeName == "" {
		hostname, err := os.Hostname()
		if err != nil {
			log.Fatalf("Could not get the hostname to do scheduling: %s", err)
		}
		*nodeName = hostname
	}

	path := kp.IntentPath(*nodeName)
	if *watchReality {
		path = kp.RealityPath(*nodeName)
	} else if *hookTypeName != "" {
		hookType, err := hooks.AsHookType(*hookTypeName)
		if err != nil {
			log.Fatalln(err)
		}
		path = kp.HookPath(hookType, *nodeName)
	}
	log.Printf("Watching manifests at %s\n", path)

	quit := make(chan struct{})
	errChan := make(chan error)
	podCh := make(chan kp.ManifestResult)
	go store.WatchPods(path, quit, errChan, podCh)
	for {
		select {
		case result := <-podCh:
			fmt.Println("")
			result.Manifest.Write(os.Stdout)
		case err := <-errChan:
			log.Fatalf("Error occurred while listening to pods: %s", err)
		}
	}
}
Example #6
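The inverse of the schedule step above: pod labels are removed first, then the manifest is deleted from the node's intent path.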
func (rc *replicationController) unschedule(node string) error {
	intentPath := kp.IntentPath(node, rc.Manifest.ID())
	rc.logger.NoFields().Infof("Uncheduling from %s", intentPath)

	// TODO: As above in schedule, it could be the case that RemoveLabel fails.
	// This again means that currentNodes() is no longer aware of a pod.
	// But the pod is still running (we haven't called DeletePod yet)
	// So, the pod gets orphaned. Any way to clean it up?
	err := rc.forEachLabel(node, func(id, k, _ string) error {
		return rc.podApplicator.RemoveLabel(labels.POD, id, k)
	})
	if err != nil {
		return err
	}

	_, err = rc.kpStore.DeletePod(intentPath)
	return err
}
Example #7
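Claims a replication lock at kp.LockPath(kp.IntentPath(host, podID)) for every target node, then handles lock renewal errors in a background goroutine.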
// Attempts to claim a lock on every host being deployed to.
// If overrideLock is true, this will destroy any session holding any of the
// keys we wish to lock.
func (r replication) lockHosts(overrideLock bool, lockMessage string) error {
	lock, renewalErrCh, err := r.store.NewLock(lockMessage, nil)
	if err != nil {
		return err
	}

	for _, host := range r.nodes {
		lockPath := kp.LockPath(kp.IntentPath(host, r.manifest.ID()))
		err := r.lock(lock, lockPath, overrideLock)

		if err != nil {
			return err
		}
	}
	go r.handleRenewalErrors(lock, renewalErrCh)

	return nil
}
Example #8
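The preparer's watch loop: manifests arriving on the node's intent path are fanned out to one long-running goroutine per pod ID.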
func (p *Preparer) WatchForPodManifestsForNode(quitAndAck chan struct{}) {
	pods.Log = p.Logger
	path := kp.IntentPath(p.node)

	// This allows us to signal the goroutine watching consul to quit
	watcherQuit := make(chan struct{})
	errChan := make(chan error)
	podChan := make(chan kp.ManifestResult)

	go p.store.WatchPods(path, watcherQuit, errChan, podChan)

	// we will have one long running goroutine for each app installed on this
	// host. We keep a map of podId => podChan so we can send the new manifests
	// that come in to the appropriate goroutine
	podChanMap := make(map[string]chan pods.Manifest)
	quitChanMap := make(map[string]chan struct{})

	for {
		select {
		case err := <-errChan:
			p.Logger.WithFields(logrus.Fields{
				"inner_err": err,
			}).Errorln("there was an error reading the manifest")
		case result := <-podChan:
			podId := result.Manifest.ID()
			if podChanMap[podId] == nil {
				// No goroutine is servicing this app currently, let's start one
				podChanMap[podId] = make(chan pods.Manifest)
				quitChanMap[podId] = make(chan struct{})
				go p.handlePods(podChanMap[podId], quitChanMap[podId])
			}
			podChanMap[podId] <- result.Manifest
		case <-quitAndAck:
			for podToQuit, quitCh := range quitChanMap {
				p.Logger.WithField("pod", podToQuit).Infoln("Quitting...")
				quitCh <- struct{}{}
			}
			close(watcherQuit) // stop the consul watch goroutine as well
			p.Logger.NoFields().Infoln("Done, acknowledging quit")
			quitAndAck <- struct{}{} // acknowledge quit
			return
		}

	}
}
Example #9
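A variant of Example #1 that routes hook manifests by hook type (hooks.AsHookType) instead of a single global hook path.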
func main() {
	kingpin.Version(version.VERSION)
	kingpin.Parse()

	store := kp.NewConsulStore(kp.Options{
		Address: *consulAddress,
		Token:   *consulToken,
		Client:  net.NewHeaderClient(*headers, http.DefaultTransport),
		HTTPS:   *https,
	})

	if *nodeName == "" {
		hostname, err := os.Hostname()
		if err != nil {
			log.Fatalf("Could not get the hostname to do scheduling: %s", err)
		}
		*nodeName = hostname
	}

	if len(*manifests) == 0 {
		kingpin.Usage()
		log.Fatalln("No manifests given")
	}

	for _, manifestPath := range *manifests {
		manifest, err := pods.ManifestFromPath(manifestPath)
		if err != nil {
			log.Fatalf("Could not read manifest at %s: %s\n", manifestPath, err)
		}
		path := kp.IntentPath(*nodeName, manifest.ID())
		if *hookTypeName != "" {
			hookType, err := hooks.AsHookType(*hookTypeName)
			if err != nil {
				log.Fatalln(err)
			}
			path = kp.HookPath(hookType, manifest.ID())
		}
		duration, err := store.SetPod(path, *manifest)
		if err != nil {
			log.Fatalf("Could not write manifest %s to intent store: %s\n", manifest.ID(), err)
		}
		log.Printf("Scheduling %s took %s\n", manifest.ID(), duration)
	}
}
Example #10
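A helper that schedules a manifest for the local host by writing its intent path, optionally mirroring it to the reality path as well.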
func ScheduleForThisHost(manifest pods.Manifest, alsoReality bool) error {
	store := kp.NewConsulStore(kp.NewConsulClient(kp.Options{
		Token: *consulToken,
	}))
	hostname, err := os.Hostname()
	if err != nil {
		return err
	}
	_, err = store.SetPod(kp.IntentPath(hostname, manifest.ID()), manifest)
	if err != nil {
		return err
	}

	if alsoReality {
		_, err = store.SetPod(kp.RealityPath(hostname, manifest.ID()), manifest)
		return err
	}
	return nil
}
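The helper above composes with the manifest loading seen in Example #1. A minimal sketch of calling it, assuming ManifestFromPath returns a *pods.Manifest (as the dereference in Example #9 suggests); the wrapper name and manifest path are hypothetical:

// Hypothetical wrapper: load a manifest from disk and schedule it on the
// local host, writing both the intent and reality paths.
func scheduleLocal(manifestPath string) error {
	manifest, err := pods.ManifestFromPath(manifestPath)
	if err != nil {
		return err
	}
	return ScheduleForThisHost(*manifest, true)
}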
Example #11
File: main.go Project: tomzhang/p2
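Apparently a later revision of Example #5: WatchPods now delivers a slice of results, and an empty slice indicates the key no longer exists.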
func main() {
	kingpin.Version(version.VERSION)
	_, opts := flags.ParseWithConsulOptions()
	client := kp.NewConsulClient(opts)
	store := kp.NewConsulStore(client)

	if *nodeName == "" {
		hostname, err := os.Hostname()
		if err != nil {
			log.Fatalf("Could not get the hostname to do scheduling: %s", err)
		}
		*nodeName = hostname
	}

	path := kp.IntentPath(*nodeName)
	if *watchReality {
		path = kp.RealityPath(*nodeName)
	} else if *hooks {
		path = kp.HookPath()
	}
	log.Printf("Watching manifests at %s\n", path)

	quit := make(chan struct{})
	errChan := make(chan error)
	podCh := make(chan []kp.ManifestResult)
	go store.WatchPods(path, quit, errChan, podCh)
	for {
		select {
		case results := <-podCh:
			if len(results) == 0 {
				fmt.Printf("No manifest exists at key %s (it may have been deleted)\n", path)
			} else {
				for _, result := range results {
					fmt.Println("")
					result.Manifest.Write(os.Stdout)
				}
			}
		case err := <-errChan:
			log.Fatalf("Error occurred while listening to pods: %s", err)
		}
	}
}
Example #12
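The counterpart of Example #2: with overrideLock set to false, InitializeReplication must fail against the competing lock and report that the lock is already held.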
func TestInitializeReplicationFailsIfLockExists(t *testing.T) {
	replicator, store, server := testReplicatorAndServer(t)
	defer server.Stop()

	// This makes it look like the preparers are installed on the hosts
	// we're deploying to
	for _, node := range testNodes {
		key := fmt.Sprintf("reality/%s/p2-preparer", node)
		server.SetKV(key, []byte(testPreparerManifest))
	}

	// Claim a lock on a host and verify that InitializeReplication fails
	lock, _, err := store.NewLock("competing lock", nil)
	if err != nil {
		t.Fatalf("Unable to set up competing lock: %s", err)
	}
	defer lock.Destroy()
	lockPath := kp.LockPath(kp.IntentPath(testNodes[0], testPodId))
	err = lock.Lock(lockPath)
	if err != nil {
		t.Fatalf("Unable to set up competing lock: %s", err)
	}

	_, _, err = replicator.InitializeReplication(false)
	if err == nil {
		t.Fatalf("Expected error due to competing lock, but no error occurred")
	}

	matched, err := regexp.MatchString("already held", err.Error())
	if err != nil {
		t.Fatalf("Unable to compare error message to expected string")
	}

	if !matched {
		t.Fatalf("Expected error message to be related to a lock already being held, but was %s", err.Error())
	}
}
Example #13
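Drives a single node through a deploy: write the manifest to the intent path, watch the reality path until the target SHA appears, then watch health until the node meets the threshold.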
// note: logging should be delegated somehow
func (r Replicator) updateOne(node string, done chan<- string, errCh chan<- error, quitCh <-chan struct{}) {
	targetSHA, _ := r.Manifest.SHA()
	nodeLogger := r.Logger.SubLogger(logrus.Fields{"node": node})
	nodeLogger.WithField("sha", targetSHA).Infoln("Updating node")

	_, err := r.Store.SetPod(kp.IntentPath(node, r.Manifest.ID()), r.Manifest)
	for err != nil {
		nodeLogger.WithField("err", err).Errorln("Could not write intent store")
		errCh <- err
		time.Sleep(1 * time.Second)
		_, err = r.Store.SetPod(kp.IntentPath(node, r.Manifest.ID()), r.Manifest)
	}

	realityResults := make(chan kp.ManifestResult)
	realityErr := make(chan error)
	realityQuit := make(chan struct{})
	defer close(realityQuit)
	go r.Store.WatchPods(kp.RealityPath(node, r.Manifest.ID()), realityQuit, realityErr, realityResults)
REALITY_LOOP:
	for {
		select {
		case <-quitCh:
			return
		case err := <-realityErr:
			nodeLogger.WithField("err", err).Errorln("Could not read reality store")
			errCh <- err
		case mResult := <-realityResults:
			receivedSHA, _ := mResult.Manifest.SHA()
			if receivedSHA == targetSHA {
				break REALITY_LOOP
			} else {
				nodeLogger.WithFields(logrus.Fields{"current": receivedSHA, "target": targetSHA}).Infoln("Waiting for current")
			}
		}
	}
	nodeLogger.NoFields().Infoln("Node is current")

	healthResults := make(chan []health.Result)
	healthErr := make(chan error)
	healthQuit := make(chan struct{})
	defer close(healthQuit)
	go r.Health.WatchNodeService(node, r.Manifest.ID(), healthResults, healthErr, healthQuit)
HEALTH_LOOP:
	for {
		select {
		case <-quitCh:
			return
		case err := <-healthErr:
			nodeLogger.WithField("err", err).Errorln("Could not read health check")
			errCh <- err
		case res := <-healthResults:
			id, status := health.FindWorst(res)
			// treat an empty threshold as "passing"
			threshold := health.Passing
			if r.Threshold != "" {
				threshold = r.Threshold
			}
			// is this status less than the threshold?
			if health.Compare(status, threshold) < 0 {
				nodeLogger.WithFields(logrus.Fields{"check": id, "health": status}).Infoln("Node is not healthy")
			} else {
				break HEALTH_LOOP
			}
		}
	}
	r.Logger.WithField("node", node).Infoln("Node is current and healthy")

	select {
	case done <- node:
	case <-quitCh:
	}
}
Example #14
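Tests that replication stops once its lock is destroyed: renewals succeed while the lock exists, and the first failed renewal after the holder is destroyed must end replication. Only the first node (active == 1, and the only node reporting healthy) should have been deployed.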
func TestStopsIfLockDestroyed(t *testing.T) {
	active := 1
	store, server := makeStore(t)
	defer server.Stop()

	healthChecker, resultsCh := channelHealthChecker(testNodes, t)
	threshold := health.Passing
	manifest := basicManifest()

	// Make the kv store look like preparer is installed on test nodes
	setupPreparers(server)

	// Create the replication manually for this test so we can trigger lock
	// renewals on a faster interval (to keep test short)
	errCh := make(chan error)
	replication := &replication{
		active:    active,
		nodes:     testNodes,
		store:     store,
		manifest:  manifest,
		health:    healthChecker,
		threshold: threshold,
		logger:    basicLogger(),
		errCh:     errCh,
		replicationCancelledCh: make(chan struct{}),
		replicationDoneCh:      make(chan struct{}),
		quitCh:                 make(chan struct{}),
	}

	triggerRenewalCh := make(chan time.Time)
	lock, renewalErrCh, err := store.NewLock(testLockMessage, triggerRenewalCh)
	if err != nil {
		t.Fatalf("Unable to create initial replication lock: %s", err)
	}

	for _, host := range testNodes {
		lockPath := kp.LockPath(kp.IntentPath(host, manifest.ID()))
		err := replication.lock(lock, lockPath, false)

		if err != nil {
			t.Fatalf("Unable to perform initial replication lock: %s", err)
		}
	}
	go replication.handleRenewalErrors(lock, renewalErrCh)

	doneCh := make(chan struct{})

	go func() {
		select {
		case err := <-errCh:
			if err == nil || !IsFatalError(err) {
				t.Fatalf("Should have seen a fatal lock renewal error before replication finished")
			}
		case <-time.After(5 * time.Second):
			t.Fatalf("Did not get expected lock renewal error within timeout")
		}
	}()
	imitatePreparers(server, doneCh)

	go func() {
		replication.Enact()
		close(doneCh)
	}()

	// Report healthy for one node, and unhealthy for the rest so
	// replication cannot finish without interruption
	for i, node := range testNodes {
		if i == 0 {
			go func(node string) {
				for {
					select {
					case resultsCh[node] <- health.Result{
						ID:     testPodId,
						Status: health.Passing,
					}:
					case <-doneCh:
						return
					}
					time.Sleep(500 * time.Millisecond)
				}
			}(node)
		} else {
			go func(node string) {
				for {
					select {
					case resultsCh[node] <- health.Result{
						ID:     testPodId,
						Status: health.Critical,
					}:
					case <-doneCh:
						return
					}
					time.Sleep(500 * time.Millisecond)
				}
			}(node)
		}
	}

	// Wait for the first node to be deployed
	firstNodeDeployed := make(chan struct{})
	manifestBytes, err := manifest.Marshal()
	if err != nil {
		t.Fatalf("Unable to get bytes from manifest: %s", err)
	}
	go func() {
		realityKey := fmt.Sprintf("reality/%s/%s", testNodes[0], testPodId)
		for range time.Tick(10 * time.Millisecond) {
			if bytes.Equal(server.GetKV(realityKey), manifestBytes) {
				close(firstNodeDeployed)
				return
			}
		}
	}()

	select {
	case <-time.After(5 * time.Second):
		t.Fatalf("Took too long for first node to be deployed")
	case <-firstNodeDeployed:
	}

	// Trigger some lock renewals, confirm that replication is still going (doneCh not closed)
	for i := 0; i < 3; i++ {
		select {
		case triggerRenewalCh <- time.Now():
		case <-doneCh:
			t.Fatalf("Replication ended prematurely (lock couldn't be renewed but wasn't destroyed yet)")
		case <-time.After(1 * time.Second):
			t.Fatalf("Test timed out triggering a lock renewal")
		}
	}

	// Destroy lock holder so the next renewal will fail
	lockPath := kp.LockPath(kp.IntentPath(testNodes[0], manifest.ID()))
	_, id, err := store.LockHolder(lockPath)
	if err != nil {
		t.Fatalf("Unable to determine lock holder in order to destroy the lock: %s", err)
	}

	err = store.DestroyLockHolder(id)
	if err != nil {
		t.Fatalf("Unable to destroy lock holder")
	}

	// Trigger one more renewal which should cause replication to stop
	select {
	case triggerRenewalCh <- time.Now():
	case <-time.After(1 * time.Second):
		t.Fatalf("Test timed out triggering a lock renewal")
	case <-doneCh:
		t.Fatalf("Replication ended prematurely")
	}

	select {
	case <-time.After(5 * time.Second):
		t.Fatalf("Took too long for replication to end after lock cancellation")
	case <-doneCh:
	}

	// One node should have been updated because active == 1, the other
	// should not have been because health never passed
	realityBytes := server.GetKV(fmt.Sprintf("reality/%s/%s", testNodes[0], testPodId))

	if !bytes.Equal(realityBytes, manifestBytes) {
		t.Fatalf("Expected reality for %s to be %s: was %s", testNodes[0], string(manifestBytes), string(realityBytes))
	}

	realityBytes = server.GetKV(fmt.Sprintf("reality/%s/%s", testNodes[1], testPodId))
	if bytes.Equal(realityBytes, manifestBytes) {
		t.Fatalf("The second node shouldn't have been deployed to but it was")
	}
}
Example #15
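Apparently a later revision of Example #8: intent results are zipped with the reality list, the preparer's own pod ID is sanity-checked, and each intent/reality pair is dispatched to a per-pod goroutine.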
func (p *Preparer) WatchForPodManifestsForNode(quitAndAck chan struct{}) {
	pods.Log = p.Logger
	path := kp.IntentPath(p.node)

	// This allows us to signal the goroutine watching consul to quit
	quitChan := make(chan struct{})
	errChan := make(chan error)
	podChan := make(chan []kp.ManifestResult)

	go p.store.WatchPods(path, quitChan, errChan, podChan)

	// we will have one long running goroutine for each app installed on this
	// host. We keep a map of podId => podChan so we can send the new manifests
	// that come in to the appropriate goroutine
	podChanMap := make(map[string]chan ManifestPair)
	// we can't use a shared quit channel for all the goroutines - otherwise,
	// we would exit the program before the goroutines actually accepted the
	// quit signal. to be sure that each goroutine is done, we have to block and
	// wait for it to receive the signal
	quitChanMap := make(map[string]chan struct{})

	for {
		select {
		case err := <-errChan:
			p.Logger.WithError(err).
				Errorln("there was an error reading the manifest")
		case intentResults := <-podChan:
			realityResults, _, err := p.store.ListPods(kp.RealityPath(p.node))
			if err != nil {
				p.Logger.WithError(err).Errorln("Could not check reality")
			} else {
				// if the preparer's own ID is missing from the intent set, we
				// assume it was damaged and discard it
				if !checkResultsForID(intentResults, POD_ID) {
					p.Logger.NoFields().Errorln("Intent results set did not contain p2-preparer pod ID, consul data may be corrupted")
				} else {
					resultPairs := ZipResultSets(intentResults, realityResults)
					for _, pair := range resultPairs {
						if _, ok := podChanMap[pair.ID]; !ok {
							// spin goroutine for this pod
							podChanMap[pair.ID] = make(chan ManifestPair)
							quitChanMap[pair.ID] = make(chan struct{})
							go p.handlePods(podChanMap[pair.ID], quitChanMap[pair.ID])
						}
						podChanMap[pair.ID] <- pair
					}
				}
			}
		case <-quitAndAck:
			for podToQuit, quitCh := range quitChanMap {
				p.Logger.WithField("pod", podToQuit).Infoln("Quitting...")
				quitCh <- struct{}{}
			}
			close(quitChan)
			p.Logger.NoFields().Infoln("Done, acknowledging quit")
			quitAndAck <- struct{}{} // acknowledge quit
			return
		}

	}
}
Example #16
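Apparently a later revision of Example #13: reality results now arrive as a slice, error sends no longer block past a quit signal, and health results are consumed one at a time.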
// note: logging should be delegated somehow
func (r replication) updateOne(node string, done chan<- string, quitCh <-chan struct{}) {
	targetSHA, _ := r.manifest.SHA()
	nodeLogger := r.logger.SubLogger(logrus.Fields{"node": node})
	nodeLogger.WithField("sha", targetSHA).Infoln("Updating node")

	_, err := r.store.SetPod(kp.IntentPath(node, r.manifest.ID()), r.manifest)
	for err != nil {
		nodeLogger.WithError(err).Errorln("Could not write intent store")
		r.errCh <- err
		time.Sleep(1 * time.Second)
		_, err = r.store.SetPod(kp.IntentPath(node, r.manifest.ID()), r.manifest)
	}

	realityResults := make(chan []kp.ManifestResult)
	realityErr := make(chan error)
	realityQuit := make(chan struct{})
	defer close(realityQuit)
	go r.store.WatchPods(kp.RealityPath(node, r.manifest.ID()), realityQuit, realityErr, realityResults)
REALITY_LOOP:
	for {
		select {
		case <-quitCh:
			return
		case err := <-realityErr:
			nodeLogger.WithError(err).Errorln("Could not read reality store")
			select {
			case r.errCh <- err:
			case <-quitCh:
			}
		case mResult := <-realityResults:
			// We expect len(mResult) == 0 if the pod key doesn't
			// exist yet; that's okay, just keep waiting
			if len(mResult) == 1 {
				receivedSHA, _ := mResult[0].Manifest.SHA()
				if receivedSHA == targetSHA {
					break REALITY_LOOP
				} else {
					nodeLogger.WithFields(logrus.Fields{"current": receivedSHA, "target": targetSHA}).Infoln("Waiting for current")
				}
			} else if len(mResult) > 1 {
				nodeLogger.WithField("n", len(mResult)).Errorf("Got %d results from reality but was expecting only 1", len(mResult))
			}
		}
	}
	nodeLogger.NoFields().Infoln("Node is current")

	healthResults := make(chan health.Result)
	healthErr := make(chan error)
	healthQuit := make(chan struct{})
	defer close(healthQuit)
	go r.health.WatchNodeService(node, r.manifest.ID(), healthResults, healthErr, healthQuit)
HEALTH_LOOP:
	for {
		select {
		case <-quitCh:
			return
		case err := <-healthErr:
			nodeLogger.WithError(err).Errorln("Could not read health check")
			select {
			case r.errCh <- err:
			case <-quitCh:
			}
		case res := <-healthResults:
			id := res.ID
			status := res.Status
			// treat an empty threshold as "passing"
			threshold := health.Passing
			if r.threshold != "" {
				threshold = r.threshold
			}
			// is this status less than the threshold?
			if health.Compare(status, threshold) < 0 {
				nodeLogger.WithFields(logrus.Fields{"check": id, "health": status}).Infoln("Node is not healthy")
			} else {
				break HEALTH_LOOP
			}
		}
	}
	r.logger.WithField("node", node).Infoln("Node is current and healthy")

	select {
	case done <- node:
	case <-quitCh:
	}
}