Esempio n. 1
// VerifyReality polls the reality path of the local host until pods whose
// manifest IDs equal consulID and agentID have both been listed, or until
// waitTime has elapsed.
//
// It returns nil as soon as one listing contains both IDs, and an error built
// with util.Errorf when the waitTime deadline fires first.
//
// NOTE(review): this listing appears to have lost lines during extraction
// (closing braces and the end of the kp.Options literal are absent), so it
// does not compile as shown.
func VerifyReality(waitTime time.Duration, consulID, agentID string) error {
	// quit is closed when the function returns; no other visible line uses it.
	quit := make(chan struct{})
	defer close(quit)
	store := kp.NewConsulStore(kp.Options{
		Token: *consulToken,
	// The error from os.Hostname is discarded here.
	hostname, _ := os.Hostname()
	// Overall deadline: created once, outside the loop, so polling never resets it.
	waitChan := time.After(waitTime)
	for {
		select {
		case <-waitChan:
			return util.Errorf("Consul and/or Preparer weren't in the reality store within %s", waitTime)
		// Poll interval: a fresh 100ms timer is created on every pass through the select.
		case <-time.After(100 * time.Millisecond):
			// Both flags start false on every poll, so both pods must show up
			// in the same listing for the function to succeed.
			hasConsul := false
			hasPreparer := false
			results, _, err := store.ListPods(kp.RealityPath(hostname))
			if err != nil {
				log.Printf("Error looking for pods: %s\n", err)
			for _, res := range results {
				if res.Manifest.ID() == consulID {
					hasConsul = true
				} else if res.Manifest.ID() == agentID {
					hasPreparer = true
			if hasConsul && hasPreparer {
				return nil
Esempio n. 2
// countHealthy builds an rcNodeCounts for the replication controller id.
//
// It loads the controller's fields from u.rcs, stores ReplicasDesired in
// ret.Desired, stores the number of nodes returned by CurrentNodes in
// ret.Current, and then, for each node, compares the SHA of the manifest found
// at that node's reality path with the SHA of the controller's manifest.
// checks is looked up per node and tested for health.Passing.
//
// Any error from u.rcs.Get, CurrentNodes or u.kps.Pod is returned together
// with the counts gathered so far.
//
// NOTE(review): the bodies of the SHA-match branch, its else branch and the
// health branch are missing from this listing (as are all closing braces), so
// which counters those branches update cannot be determined from here.
func (u update) countHealthy(id rcf.ID, checks map[string]health.Result) (rcNodeCounts, error) {
	ret := rcNodeCounts{}
	rcFields, err := u.rcs.Get(id)
	if err != nil {
		return ret, err
	ret.Desired = rcFields.ReplicasDesired

	nodes, err := rc.New(rcFields, u.kps, u.rcs, u.sched, u.labeler, u.logger).CurrentNodes()
	if err != nil {
		return ret, err
	ret.Current = len(nodes)

	for _, node := range nodes {
		// TODO: is reality checking an rc-layer concern?
		realManifest, _, err := u.kps.Pod(kp.RealityPath(node, rcFields.Manifest.ID()))
		if err != nil {
			return ret, err
		// Errors from both SHA() calls are discarded.
		realSHA, _ := realManifest.SHA()
		targetSHA, _ := rcFields.Manifest.SHA()
		if targetSHA == realSHA {
		} else {
			// don't check health if the update isn't even done there yet
		// Only nodes present in checks with a Passing status satisfy this condition.
		if hres, ok := checks[node]; ok && hres.Status == health.Passing {
	return ret, err
Esempio n. 3
// CheckPreparers checks that the preparer is running on every host being
// deployed to.
//
// For each host in r.Nodes it looks up the pod preparer.POD_ID under that
// host's reality path via r.Store.Pod. The first failed lookup yields an error
// naming the host; the underlying lookup error is not included in the message.
// It returns nil when every lookup succeeds.
//
// NOTE(review): closing braces are missing from this listing.
func (r Replicator) CheckPreparers() error {
	for _, host := range r.Nodes {
		// Only the error result is used; the manifest and second return value are discarded.
		_, _, err := r.Store.Pod(kp.RealityPath(host, preparer.POD_ID))
		if err != nil {
			return util.Errorf("Host %q does not have a preparer", host)
	return nil
Esempio n. 4
// checkPreparers checks that the preparer is running on every host being
// deployed to.
//
// It iterates over r.nodes and returns an error on the first host whose check
// fails; the message carries preparer.POD_ID, the host and the underlying
// error. It returns nil when no check fails.
//
// NOTE(review): the call on the right-hand side of the assignment below was
// truncated during extraction (only ", preparer.POD_ID))" survives), and the
// closing braces are missing. Presumably it is a pod lookup on the host's
// reality path — confirm against the upstream source.
func (r replicator) checkPreparers() error {
	for _, host := range r.nodes {
		_, _, err :=, preparer.POD_ID))
		if err != nil {
			return util.Errorf("Could not verify %v state on %q: %v", preparer.POD_ID, host, err)
	return nil
Esempio n. 5
// updateHealthMonitors lists the pods stored under node's reality path and
// passes them, together with watchedPods, to updatePods, returning whatever
// updatePods returns.
//
// A listing failure is logged as a warning with the error attached under the
// "inner_err" field.
//
// NOTE(review): closing braces are missing from this listing, so it cannot be
// told from here whether the error branch also returns early.
func updateHealthMonitors(store kp.Store, watchedPods []PodWatch, node string, logger *logging.Logger) []PodWatch {
	path := kp.RealityPath(node)
	reality, _, err := store.ListPods(path)
	if err != nil {
		logger.WithField("inner_err", err).Warningln("failed to get pods from reality store")

	return updatePods(watchedPods, reality, logger, store, node)
Esempio n. 6
// installAndLaunchPod installs, verifies and launches pair.Intent on pod,
// running the matching hooks around each stage.
//
// Visible sequence:
//  1. BEFORE_INSTALL hooks, then pod.Install; an install error is logged and
//     the function returns false.
//  2. pod.Verify against p.authPolicy; on failure the AFTER_AUTH_FAIL hooks run
//     and the function returns false.
//  3. AFTER_INSTALL hooks.
//  4. If pair.Reality is non-nil, pod.Halt is called on it; a failed or
//     unsuccessful halt is only logged.
//  5. BEFORE_LAUNCH hooks, then pod.Launch.
//  6. When Launch returns no error, a store call is made (its failure is
//     logged as "Could not set pod in reality store") and the AFTER_LAUNCH
//     hooks run.
//
// The final result is err == nil && ok, using the values from pod.Launch.
//
// NOTE(review): this listing lost text during extraction: the logger receiver
// lines before several Errorln calls, the call on the right-hand side of the
// duration assignment, and every closing brace are absent.
func (p *Preparer) installAndLaunchPod(pair ManifestPair, pod Pod, logger logging.Logger) bool {
	p.tryRunHooks(hooks.BEFORE_INSTALL, pod, pair.Intent, logger)

	err := pod.Install(pair.Intent)
	if err != nil {
		// install failed, abort and retry
		logger.WithError(err).Errorln("Install failed")
		return false

	err = pod.Verify(pair.Intent, p.authPolicy)
	if err != nil {
			Errorln("Pod digest verification failed")
		p.tryRunHooks(hooks.AFTER_AUTH_FAIL, pod, pair.Intent, logger)
		return false

	p.tryRunHooks(hooks.AFTER_INSTALL, pod, pair.Intent, logger)

	// Halt whatever is currently recorded as running before launching the new intent.
	if pair.Reality != nil {
		success, err := pod.Halt(pair.Reality)
		if err != nil {
				Errorln("Pod halt failed")
		} else if !success {
			logger.NoFields().Warnln("One or more launchables did not halt successfully")

	p.tryRunHooks(hooks.BEFORE_LAUNCH, pod, pair.Intent, logger)

	ok, err := pod.Launch(pair.Intent)
	if err != nil {
			Errorln("Launch failed")
	} else {
		// This err is declared with := inside the else branch, so it shadows
		// the Launch error and does not affect the final return value.
		duration, err :=, pair.ID), pair.Intent)
		if err != nil {
			logger.WithErrorAndFields(err, logrus.Fields{
				"duration": duration}).
				Errorln("Could not set pod in reality store")

		p.tryRunHooks(hooks.AFTER_LAUNCH, pod, pair.Intent, logger)
	return err == nil && ok
Esempio n. 7
// main watches a single path in the Consul-backed store and reacts to the
// manifests and errors delivered for it.
//
// The watched path is the intent path of *nodeName by default, the reality
// path when *watchReality is set, or otherwise the hook path for
// *hookTypeName when that flag is non-empty. An empty *nodeName is replaced by
// the local hostname; failing to obtain the hostname is fatal.
//
// NOTE(review): this listing lost lines during extraction: the end of the
// kp.Options literal, the body of the hook-type error check, the body of the
// podCh case and every closing brace are absent.
func main() {

	store := kp.NewConsulStore(kp.Options{
		Address: *consulAddress,
		Token:   *consulToken,
		Client:  net.NewHeaderClient(*headers, http.DefaultTransport),
		HTTPS:   *https,

	// Default the node name to this machine's hostname.
	if *nodeName == "" {
		hostname, err := os.Hostname()
		if err != nil {
			log.Fatalf("Could not get the hostname to do scheduling: %s", err)
		*nodeName = hostname

	// Pick what to watch: intent (default), reality, or a hook path.
	path := kp.IntentPath(*nodeName)
	if *watchReality {
		path = kp.RealityPath(*nodeName)
	} else if *hookTypeName != "" {
		hookType, err := hooks.AsHookType(*hookTypeName)
		if err != nil {
		path = kp.HookPath(hookType, *nodeName)
	log.Printf("Watching manifests at %s\n", path)

	// quit is handed to WatchPods but never closed or sent on in the visible code.
	quit := make(chan struct{})
	errChan := make(chan error)
	podCh := make(chan kp.ManifestResult)
	go store.WatchPods(path, quit, errChan, podCh)
	for {
		select {
		case result := <-podCh:
		// Any watch error terminates the program.
		case err := <-errChan:
			log.Fatalf("Error occurred while listening to pods: %s", err)
Esempio n. 8
// ScheduleForThisHost writes manifest to the intent path of the local host,
// keyed by manifest.ID(). When alsoReality is true it additionally writes the
// same manifest to the host's reality path.
//
// It returns the first error encountered (hostname lookup, intent write, or
// reality write) and nil otherwise.
//
// NOTE(review): closing braces and the end of the kp.Options literal are
// missing from this listing.
func ScheduleForThisHost(manifest pods.Manifest, alsoReality bool) error {
	store := kp.NewConsulStore(kp.NewConsulClient(kp.Options{
		Token: *consulToken,
	hostname, err := os.Hostname()
	if err != nil {
		return err
	// The first return value of SetPod is discarded in both writes.
	_, err = store.SetPod(kp.IntentPath(hostname, manifest.ID()), manifest)
	if err != nil {
		return err

	if alsoReality {
		_, err = store.SetPod(kp.RealityPath(hostname, manifest.ID()), manifest)
		return err
	return nil
Esempio n. 9
File: main.go Progetto: tomzhang/p2
// main watches a single path in the Consul-backed store and reports on the
// manifest result sets delivered for it.
//
// Consul options come from flags.ParseWithConsulOptions. The watched path is
// the intent path of *nodeName by default, the reality path when *watchReality
// is set, or otherwise kp.HookPath() when *hooks is set. An empty *nodeName is
// replaced by the local hostname; failing to obtain the hostname is fatal.
//
// NOTE(review): the body of the inner range loop and every closing brace are
// missing from this listing.
func main() {
	_, opts := flags.ParseWithConsulOptions()
	client := kp.NewConsulClient(opts)
	store := kp.NewConsulStore(client)

	// Default the node name to this machine's hostname.
	if *nodeName == "" {
		hostname, err := os.Hostname()
		if err != nil {
			log.Fatalf("Could not get the hostname to do scheduling: %s", err)
		*nodeName = hostname

	// Pick what to watch: intent (default), reality, or the hook path.
	path := kp.IntentPath(*nodeName)
	if *watchReality {
		path = kp.RealityPath(*nodeName)
	} else if *hooks {
		path = kp.HookPath()
	log.Printf("Watching manifests at %s\n", path)

	// quit is handed to WatchPods but never closed or sent on in the visible code.
	quit := make(chan struct{})
	errChan := make(chan error)
	podCh := make(chan []kp.ManifestResult)
	go store.WatchPods(path, quit, errChan, podCh)
	for {
		select {
		case results := <-podCh:
			// An empty result set is reported as a missing (possibly deleted) key.
			if len(results) == 0 {
				fmt.Println(fmt.Sprintf("No manifest exists at key %s (it may have been deleted)", path))
			} else {
				for _, result := range results {
		// Any watch error terminates the program.
		case err := <-errChan:
			log.Fatalf("Error occurred while listening to pods: %s", err)
Esempio n. 10
// MonitorPodHealth is meant to be a long running go routine.
// MonitorPodHealth reads from a consul store to determine which
// services should be running on the host. MonitorPodHealth
// runs a CheckHealth routine to monitor the health of each
// service and kills routines for services that should no
// longer be running.
//
// config supplies the store, the HTTP client and the node name; logger is
// used for all reporting; a receive on shutdownCh makes the function send
// true on the shutdownCh of every pod it is currently tracking.
//
// Failing to build either the store or the client is fatal.
//
// NOTE(review): closing braces are missing from this listing, so what happens
// after the per-pod shutdown sends (e.g. whether the function returns) cannot
// be determined from here.
func MonitorPodHealth(config *preparer.PreparerConfig, logger *logging.Logger, shutdownCh chan struct{}) {
	store, err := config.GetStore()
	if err != nil {
		// A bad config should have already produced a nice, user-friendly error message.
		logger.WithError(err).Fatalln("error creating health monitor KV store")
	healthManager := store.NewHealthManager(config.NodeName, *logger)
	// if GetClient fails it means the certfile/keyfile/cafile were
	// invalid or did not exist. It makes sense to throw a fatal error
	client, err := config.GetClient()
	if err != nil {
		logger.WithError(err).Fatalln("failed to get http client for this preparer")

	node := config.NodeName
	pods := []PodWatch{}

	// Watch this node's reality path; watchQuitCh is handed to WatchPods but
	// is not closed or sent on anywhere in the visible code.
	watchQuitCh := make(chan struct{})
	watchErrCh := make(chan error)
	watchPodCh := make(chan []kp.ManifestResult)
	go store.WatchPods(kp.RealityPath(node), watchQuitCh, watchErrCh, watchPodCh)

	for {
		select {
		case results := <-watchPodCh:
			// check if pods have been added or removed
			// starts monitor routine for new pods
			// kills monitor routine for removed pods
			pods = updatePods(healthManager, client, pods, results, node, logger)
		// Watch errors are logged only.
		case err := <-watchErrCh:
			logger.WithError(err).Errorln("there was an error reading reality manifests for health monitor")
		case <-shutdownCh:
			// Signal every tracked pod monitor to stop.
			for _, pod := range pods {
				pod.shutdownCh <- true
Esempio n. 11
// stopAndUninstallPod halts pair.Reality on pod, runs the BEFORE_UNINSTALL
// hooks, uninstalls the pod and then makes a store call whose failure is
// logged as "Could not delete pod from reality store".
//
// It returns false only when pod.Uninstall fails. A halt error, an
// unsuccessful halt and a failure of the final store call are logged but the
// function still returns true.
//
// NOTE(review): the call on the right-hand side of the dur assignment was
// truncated during extraction, and closing braces are missing.
func (p *Preparer) stopAndUninstallPod(pair ManifestPair, pod Pod, logger logging.Logger) bool {
	success, err := pod.Halt(pair.Reality)
	if err != nil {
		logger.WithError(err).Errorln("Pod halt failed")
	} else if !success {
		logger.NoFields().Warnln("One or more launchables did not halt successfully")

	p.tryRunHooks(hooks.BEFORE_UNINSTALL, pod, pair.Reality, logger)

	err = pod.Uninstall()
	if err != nil {
		logger.WithError(err).Errorln("Uninstall failed")
		return false
	logger.NoFields().Infoln("Successfully uninstalled")

	dur, err :=, pair.ID))
	if err != nil {
		logger.WithErrorAndFields(err, logrus.Fields{"duration": dur}).
			Errorln("Could not delete pod from reality store")
	return true
Esempio n. 12
// updateOne rolls r.Manifest out to a single node and reports the node on done
// once it is both current and healthy.
//
// Stages visible in the code:
//  1. Write the manifest to the node's intent path, retrying once per second
//     until SetPod succeeds; every failure is logged and sent on errCh.
//  2. Watch the node's reality path until a result arrives whose manifest SHA
//     equals the target SHA.
//  3. Watch the node's health for the manifest's service until the worst check
//     status is not below the threshold (health.Passing when r.Threshold is
//     empty).
//  4. Send node on done, unless quitCh becomes ready first.
//
// Errors read from the reality and health watches are logged and forwarded on
// errCh. Both watch quit channels are closed when the function returns.
//
// NOTE(review): this listing lost lines during extraction: the REALITY_LOOP
// and HEALTH_LOOP labels targeted by the break statements, the bodies of the
// quitCh cases and every closing brace are absent.
//
// note: logging should be delegated somehow
func (r Replicator) updateOne(node string, done chan<- string, errCh chan<- error, quitCh <-chan struct{}) {
	// The error from SHA() is discarded.
	targetSHA, _ := r.Manifest.SHA()
	nodeLogger := r.Logger.SubLogger(logrus.Fields{"node": node})
	nodeLogger.WithField("sha", targetSHA).Infoln("Updating node")

	_, err := r.Store.SetPod(kp.IntentPath(node, r.Manifest.ID()), r.Manifest)
	// Retry loop: there is no visible quitCh check here, and the errCh send is
	// a plain (non-select) send.
	for err != nil {
		nodeLogger.WithField("err", err).Errorln("Could not write intent store")
		errCh <- err
		time.Sleep(1 * time.Second)
		_, err = r.Store.SetPod(kp.IntentPath(node, r.Manifest.ID()), r.Manifest)

	realityResults := make(chan kp.ManifestResult)
	realityErr := make(chan error)
	realityQuit := make(chan struct{})
	defer close(realityQuit)
	go r.Store.WatchPods(kp.RealityPath(node, r.Manifest.ID()), realityQuit, realityErr, realityResults)
	for {
		select {
		case <-quitCh:
		case err := <-realityErr:
			nodeLogger.WithField("err", err).Errorln("Could not read reality store")
			errCh <- err
		case mResult := <-realityResults:
			receivedSHA, _ := mResult.Manifest.SHA()
			// The node is current once reality reports the SHA we wrote to intent.
			if receivedSHA == targetSHA {
				break REALITY_LOOP
			} else {
				nodeLogger.WithFields(logrus.Fields{"current": receivedSHA, "target": targetSHA}).Infoln("Waiting for current")
	nodeLogger.NoFields().Infoln("Node is current")

	healthResults := make(chan []health.Result)
	healthErr := make(chan error)
	healthQuit := make(chan struct{})
	defer close(healthQuit)
	go r.Health.WatchNodeService(node, r.Manifest.ID(), healthResults, healthErr, healthQuit)
	for {
		select {
		case <-quitCh:
		case err := <-healthErr:
			nodeLogger.WithField("err", err).Errorln("Could not read health check")
			errCh <- err
		case res := <-healthResults:
			// Judge the node by its worst-reporting check.
			id, status := health.FindWorst(res)
			// treat an empty threshold as "passing"
			threshold := health.Passing
			if r.Threshold != "" {
				threshold = r.Threshold
			// is this status less than the threshold?
			if health.Compare(status, threshold) < 0 {
				nodeLogger.WithFields(logrus.Fields{"check": id, "health": status}).Infoln("Node is not healthy")
			} else {
				break HEALTH_LOOP
	r.Logger.WithField("node", node).Infoln("Node is current and healthy")

	// Report completion, but do not block on done if a quit has been requested.
	select {
	case done <- node:
	case <-quitCh:
Esempio n. 13
// installAndLaunchPod compares newManifest with the manifest currently
// recorded for the pod and, when an update is warranted, installs, verifies,
// registers and launches the new manifest.
//
// An update is performed when either
//   - the current manifest lookup reported pods.NoCurrentManifest or the
//     current and new SHAs differ, or
//   - the current manifest lookup failed with any other error.
//
// In the update path the function returns false when Install, Verify or the
// service-registration step fails; otherwise it returns err == nil && ok from
// pod.Launch. When no update is needed it returns true.
//
// NOTE(review): this listing lost text during extraction: the right-hand side
// of the currentManifest assignment, the service-registration call, the
// reality-store call after Launch, the logger receiver lines ahead of several
// field maps, and every closing brace are absent.
func (p *Preparer) installAndLaunchPod(newManifest *pods.Manifest, pod Pod, logger logging.Logger) bool {
	// do not remove the logger argument, it's not the same as p.Logger

	// get currently running pod to compare with the new pod
	realityPath := kp.RealityPath(p.node, newManifest.ID())
	currentManifest, _, err :=
	// currentSHA stays empty when there is no current manifest.
	currentSHA := ""
	if currentManifest != nil {
		currentSHA, _ = currentManifest.SHA()
	newSHA, _ := newManifest.SHA()

	// if new or the manifest is different, launch
	newOrDifferent := (err == pods.NoCurrentManifest) || (currentSHA != newSHA)
	if newOrDifferent {
			"old_sha": currentSHA,
			"sha":     newSHA,
			"pod":     newManifest.ID(),
		}).Infoln("SHA is new or different from old, will update")

	// if the old manifest is corrupted somehow, re-launch since we don't know if this is an update.
	problemReadingCurrentManifest := (err != nil && err != pods.NoCurrentManifest)
	if problemReadingCurrentManifest {
			"sha":       newSHA,
			"inner_err": err,
		}).Errorln("Current manifest not readable, will relaunch")

	if newOrDifferent || problemReadingCurrentManifest {
		p.tryRunHooks(hooks.BEFORE_INSTALL, pod, newManifest, logger)

		err = pod.Install(newManifest)
		if err != nil {
			// install failed, abort and retry
				"err": err,
			}).Errorln("Install failed")
			return false

		err = pod.Verify(newManifest, p.authPolicy)
		if err != nil {
			logger.WithField("err", err).Errorln("Pod digest verification failed")
			p.tryRunHooks(hooks.AFTER_AUTH_FAIL, pod, newManifest, logger)
			return false

		p.tryRunHooks(hooks.AFTER_INSTALL, pod, newManifest, logger)

		err =*newManifest, p.caPath)
		if err != nil {
			logger.WithField("err", err).Errorln("Service registration failed")
			return false

		// Halt the previously recorded manifest, if any, before launching the new one.
		if currentManifest != nil {
			success, err := pod.Halt(currentManifest)
			if err != nil {
				logger.WithField("err", err).Errorln("Pod halt failed")
			} else if !success {
				logger.NoFields().Warnln("One or more launchables did not halt successfully")

		ok, err := pod.Launch(newManifest)
		if err != nil {
				"err": err,
			}).Errorln("Launch failed")
		} else {
			// This err is declared with := inside the else branch, so it
			// shadows the Launch error and does not affect the return below.
			duration, err :=, *newManifest)
			if err != nil {
					"err":      err,
					"duration": duration,
				}).Errorln("Could not set pod in reality store")

			p.tryRunHooks(hooks.AFTER_LAUNCH, pod, newManifest, logger)
		return err == nil && ok

	// TODO: shut down removed launchables between pod versions.
	return true
Esempio n. 14
// WatchForPodManifestsForNode watches the intent path of p.node and fans each
// incoming intent/reality pair out to a per-pod goroutine.
//
// For every pair ID seen for the first time it creates a ManifestPair channel
// and a quit channel and starts p.handlePods on them; each pair is then sent
// on the channel belonging to its ID.
//
// quitAndAck is used in both directions: a receive on it triggers shutdown
// (one value is sent on every per-pod quit channel), after which the function
// sends one value back on quitAndAck as the acknowledgement.
//
// An intent result set that does not contain POD_ID is discarded with an
// error log.
//
// NOTE(review): this listing lost text during extraction: the watch call
// after "go", the right-hand side of the realityResults assignment, the
// logger receiver ahead of the first Errorln, and every closing brace are
// absent.
func (p *Preparer) WatchForPodManifestsForNode(quitAndAck chan struct{}) {
	// Point the pods package's logger at the preparer's logger.
	pods.Log = p.Logger
	path := kp.IntentPath(p.node)

	// This allows us to signal the goroutine watching consul to quit
	quitChan := make(chan struct{})
	errChan := make(chan error)
	podChan := make(chan []kp.ManifestResult)

	go, quitChan, errChan, podChan)

	// we will have one long running goroutine for each app installed on this
	// host. We keep a map of podId => podChan so we can send the new manifests
	// that come in to the appropriate goroutine
	podChanMap := make(map[string]chan ManifestPair)
	// we can't use a shared quit channel for all the goroutines - otherwise,
	// we would exit the program before the goroutines actually accepted the
	// quit signal. to be sure that each goroutine is done, we have to block and
	// wait for it to receive the signal
	quitChanMap := make(map[string]chan struct{})

	for {
		select {
		case err := <-errChan:
				Errorln("there was an error reading the manifest")
		case intentResults := <-podChan:
			realityResults, _, err :=
			if err != nil {
				p.Logger.WithError(err).Errorln("Could not check reality")
			} else {
				// if the preparer's own ID is missing from the intent set, we
				// assume it was damaged and discard it
				if !checkResultsForID(intentResults, POD_ID) {
					p.Logger.NoFields().Errorln("Intent results set did not contain p2-preparer pod ID, consul data may be corrupted")
				} else {
					resultPairs := ZipResultSets(intentResults, realityResults)
					for _, pair := range resultPairs {
						if _, ok := podChanMap[pair.ID]; !ok {
							// spin goroutine for this pod
							podChanMap[pair.ID] = make(chan ManifestPair)
							quitChanMap[pair.ID] = make(chan struct{})
							go p.handlePods(podChanMap[pair.ID], quitChanMap[pair.ID])
						podChanMap[pair.ID] <- pair
		case <-quitAndAck:
			// Each send is on an unbuffered channel, so it completes only once
			// that pod's goroutine has received the quit signal.
			for podToQuit, quitCh := range quitChanMap {
				p.Logger.WithField("pod", podToQuit).Infoln("Quitting...")
				quitCh <- struct{}{}
			p.Logger.NoFields().Infoln("Done, acknowledging quit")
			quitAndAck <- struct{}{} // acknowledge quit

Esempio n. 15
// updateOne rolls r.manifest out to a single node and reports the node on done
// once it is both current and healthy.
//
// Stages visible in the code:
//  1. An initial store call is retried once per second while it returns an
//     error; every failure is logged as "Could not write intent store" and
//     sent on r.errCh.
//  2. Reality result sets are read until one contains exactly one result whose
//     manifest SHA equals the target SHA. Empty sets are ignored; sets with
//     more than one result are logged as errors.
//  3. Health results are read until one has a status that is not below the
//     threshold (health.Passing when r.threshold is empty).
//  4. node is sent on done, unless quitCh becomes ready first.
//
// Errors read from the reality and health watches are logged and forwarded on
// r.errCh, with a nested select so the forward is abandoned if quitCh becomes
// ready. Both watch quit channels are closed when the function returns.
//
// NOTE(review): this listing lost text during extraction: the store and watch
// call expressions (only their trailing arguments survive), the REALITY_LOOP
// and HEALTH_LOOP labels targeted by the break statements, the bodies of the
// quitCh cases and every closing brace are absent.
//
// note: logging should be delegated somehow
func (r replication) updateOne(node string, done chan<- string, quitCh <-chan struct{}) {
	// The error from SHA() is discarded.
	targetSHA, _ := r.manifest.SHA()
	nodeLogger := r.logger.SubLogger(logrus.Fields{"node": node})
	nodeLogger.WithField("sha", targetSHA).Infoln("Updating node")

	_, err :=, r.manifest.ID()), r.manifest)
	// Retry loop: unlike the watch loops below, the r.errCh send here is a
	// plain send with no quitCh alternative.
	for err != nil {
		nodeLogger.WithError(err).Errorln("Could not write intent store")
		r.errCh <- err
		time.Sleep(1 * time.Second)
		_, err =, r.manifest.ID()), r.manifest)

	realityResults := make(chan []kp.ManifestResult)
	realityErr := make(chan error)
	realityQuit := make(chan struct{})
	defer close(realityQuit)
	go, r.manifest.ID()), realityQuit, realityErr, realityResults)
	for {
		select {
		case <-quitCh:
		case err := <-realityErr:
			nodeLogger.WithError(err).Errorln("Could not read reality store")
			// Forward the error without blocking past a quit request.
			select {
			case r.errCh <- err:
			case <-quitCh:
		case mResult := <-realityResults:
			// We expect len(mResult) == 0 if the pod key doesn't
			// exist yet, that's okay just wait longer
			if len(mResult) == 1 {
				receivedSHA, _ := mResult[0].Manifest.SHA()
				if receivedSHA == targetSHA {
					break REALITY_LOOP
				} else {
					nodeLogger.WithFields(logrus.Fields{"current": receivedSHA, "target": targetSHA}).Infoln("Waiting for current")
			} else if len(mResult) > 1 {
				nodeLogger.WithField("n", len(mResult)).Errorf("Got %d results from reality but was expecting only 1", len(mResult))
	nodeLogger.NoFields().Infoln("Node is current")

	healthResults := make(chan health.Result)
	healthErr := make(chan error)
	healthQuit := make(chan struct{})
	defer close(healthQuit)
	go, r.manifest.ID(), healthResults, healthErr, healthQuit)
	for {
		select {
		case <-quitCh:
		case err := <-healthErr:
			nodeLogger.WithError(err).Errorln("Could not read health check")
			// Forward the error without blocking past a quit request.
			select {
			case r.errCh <- err:
			case <-quitCh:
		case res := <-healthResults:
			id := res.ID
			status := res.Status
			// treat an empty threshold as "passing"
			threshold := health.Passing
			if r.threshold != "" {
				threshold = r.threshold
			// is this status less than the threshold?
			if health.Compare(status, threshold) < 0 {
				nodeLogger.WithFields(logrus.Fields{"check": id, "health": status}).Infoln("Node is not healthy")
			} else {
				break HEALTH_LOOP
	r.logger.WithField("node", node).Infoln("Node is current and healthy")

	// Report completion, but do not block on done if a quit has been requested.
	select {
	case done <- node:
	case <-quitCh: