Example #1
func (u *update) countHealthy(id rcf.ID, checks map[types.NodeName]health.Result) (rcNodeCounts, error) {
	ret := rcNodeCounts{}
	rcFields, err := u.rcs.Get(id)
	if rcstore.IsNotExist(err) {
		err := util.Errorf("RC %s did not exist", id)
		return ret, err
	} else if err != nil {
		return ret, err
	}

	ret.Desired = rcFields.ReplicasDesired

	currentPods, err := rc.New(rcFields, u.kps, u.rcs, u.sched, u.labeler, u.logger, u.alerter).CurrentPods()
	if err != nil {
		return ret, err
	}
	ret.Current = len(currentPods)

	if ret.Desired > ret.Current {
		// This implies that the RC hasn't yet scheduled pods that it desires to have.
		// We consider their health to be unknown in this case.
		// Note that the below loop over `range currentPods` may also increase `ret.Unknown`.
		ret.Unknown = ret.Desired - ret.Current
	}

	for _, pod := range currentPods {
		node := pod.Node
		// TODO: is reality checking an rc-layer concern?
		realManifest, _, err := u.kps.Pod(kp.REALITY_TREE, node, rcFields.Manifest.ID())
		if err != nil && err != pods.NoCurrentManifest {
			return ret, err
		}

		// if realManifest is nil, leave realSHA as the empty string; it won't match
		// the RC manifest's SHA below, so the node is not counted as Real
		var realSHA string
		if realManifest != nil {
			realSHA, _ = realManifest.SHA()
		}
		targetSHA, _ := rcFields.Manifest.SHA()
		if targetSHA == realSHA {
			ret.Real++
		} else {
			// don't check health if the update isn't even done there yet
			continue
		}
		if hres, ok := checks[node]; ok {
			if hres.Status == health.Passing {
				ret.Healthy++
			} else if hres.Status == health.Unknown {
				ret.Unknown++
			} else {
				ret.Unhealthy++
			}
		} else {
			ret.Unknown++
		}
	}
	// any error other than the tolerated pods.NoCurrentManifest was returned above
	return ret, nil
}
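For reference, the classification above can be exercised on its own. The following is a minimal, self-contained sketch, not p2 code: the nodeCounts and status types, plain string node names, and the assumption that every current node already runs the target manifest (the reality-store SHA comparison is elided) are all simplifications. It only shows how desired-vs-current and per-node health results feed the counters.

package main

import "fmt"

// Hypothetical stand-ins for p2's health statuses and rcNodeCounts.
type status string

const (
	passing  status = "passing"
	unknown  status = "unknown"
	critical status = "critical"
)

type nodeCounts struct {
	Desired, Current, Real, Healthy, Unhealthy, Unknown int
}

// countHealthySketch mirrors the classification above: nodes the RC has not
// scheduled yet count as Unknown, every current node is assumed to already run
// the target manifest (so it counts as Real), and each current node is then
// classified by its health check result; missing results count as Unknown.
func countHealthySketch(desired int, currentNodes []string, checks map[string]status) nodeCounts {
	ret := nodeCounts{Desired: desired, Current: len(currentNodes)}
	if ret.Desired > ret.Current {
		ret.Unknown = ret.Desired - ret.Current
	}
	for _, node := range currentNodes {
		ret.Real++
		switch st, ok := checks[node]; {
		case !ok, st == unknown:
			ret.Unknown++
		case st == passing:
			ret.Healthy++
		default:
			ret.Unhealthy++
		}
	}
	return ret
}

func main() {
	checks := map[string]status{"node1": passing, "node2": critical}
	fmt.Printf("%+v\n", countHealthySketch(3, []string{"node1", "node2"}, checks))
	// {Desired:3 Current:2 Real:2 Healthy:1 Unhealthy:1 Unknown:1}
}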
Example #2
func (u *update) Run(quit <-chan struct{}) (ret bool) {
	u.logger.NoFields().Debugln("Locking")
	// TODO: implement API for blocking locks and use that instead of retrying
	if !RetryOrQuit(
		func() error { return u.lockRCs(quit) },
		quit,
		u.logger,
		"Could not lock rcs",
	) {
		return
	}
	defer u.unlockRCs(quit)

	u.logger.NoFields().Debugln("Enabling")
	if !RetryOrQuit(u.enable, quit, u.logger, "Could not enable/disable RCs") {
		return
	}

	u.logger.NoFields().Debugln("Launching health watch")
	var newFields rcf.RC
	var err error
	if !RetryOrQuit(func() error {
		newFields, err = u.rcs.Get(u.NewRC)
		if rcstore.IsNotExist(err) {
			return util.Errorf("Replication controller %s is unexpectedly empty", u.NewRC)
		} else if err != nil {
			return err
		}

		return nil
	}, quit, u.logger, "Could not read new RC") {
		return
	}

	hChecks := make(chan map[types.NodeName]health.Result)
	hErrs := make(chan error)
	hQuit := make(chan struct{})
	defer close(hQuit)
	go u.hcheck.WatchService(string(newFields.Manifest.ID()), hChecks, hErrs, hQuit)

	if updateSucceeded := u.rollLoop(newFields.Manifest.ID(), hChecks, hErrs, quit); !updateSucceeded {
		// We were asked to quit. Do so without cleaning up the old RC.
		return false
	}

	// rollout complete, clean up old RC if told to do so
	if !u.LeaveOld {
		u.logger.NoFields().Infoln("Cleaning up old RC")
		if !RetryOrQuit(func() error { return u.rcs.SetDesiredReplicas(u.OldRC, 0) }, quit, u.logger, "Could not zero old replica count") {
			return
		}
		if !RetryOrQuit(func() error { return u.rcs.Enable(u.OldRC) }, quit, u.logger, "Could not enable old RC") {
			return
		}
		if !RetryOrQuit(func() error { return u.rcs.Delete(u.OldRC, false) }, quit, u.logger, "Could not delete old RC") {
			return
		}
	}
	return true // rollout complete, and the old RC was cleaned up if requested
}
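Every step of Run goes through RetryOrQuit: retry until the step succeeds, or give up as soon as the caller closes quit. Below is a minimal sketch of that pattern, independent of p2's actual helper; the function name, the plain fmt logging, and the one-second retry interval are assumptions for illustration.

package main

import (
	"fmt"
	"time"
)

// retryOrQuit keeps calling f until it succeeds, returning true. It returns
// false as soon as quit is closed. The real p2 helper also takes a logger
// and a message field; fmt stands in for that here.
func retryOrQuit(f func() error, quit <-chan struct{}, msg string) bool {
	for {
		err := f()
		if err == nil {
			return true
		}
		fmt.Printf("%s: %v (will retry)\n", msg, err)
		select {
		case <-quit:
			return false
		case <-time.After(time.Second):
			// fall through and retry
		}
	}
}

func main() {
	quit := make(chan struct{})
	attempts := 0
	ok := retryOrQuit(func() error {
		attempts++
		if attempts < 3 {
			return fmt.Errorf("transient failure %d", attempts)
		}
		return nil
	}, quit, "Could not lock rcs")
	fmt.Println("locked:", ok, "after", attempts, "attempts")
}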
Example #3
File: farm.go Project: rudle/p2
func (rlf *Farm) mainLoop(quit <-chan struct{}) {
	subQuit := make(chan struct{})
	defer close(subQuit)
	rlWatch, rlErr := rlf.rls.Watch(subQuit)

START_LOOP:
	for {
		select {
		case <-quit:
			rlf.logger.NoFields().Infoln("Session expired, releasing updates")
			rlf.session = nil
			rlf.releaseChildren()
			return
		case err := <-rlErr:
			rlf.logger.WithError(err).Errorln("Could not read consul updates")
		case rlFields := <-rlWatch:
			rlf.logger.WithField("n", len(rlFields)).Debugln("Received update update")
			countHistogram := metrics.GetOrRegisterHistogram("ru_count", p2metrics.Registry, metrics.NewExpDecaySample(1028, 0.015))
			countHistogram.Update(int64(len(rlFields)))

			// track which children were found in the returned set
			foundChildren := make(map[roll_fields.ID]struct{})
			for _, rlField := range rlFields {

				rlLogger := rlf.logger.SubLogger(logrus.Fields{
					"ru": rlField.ID(),
				})
				rcField, err := rlf.rcs.Get(rlField.NewRC)
				if rcstore.IsNotExist(err) {
					err := util.Errorf("Expected RC %s to exist", rlField.NewRC)
					rlLogger.WithError(err).Errorln()
					continue
				} else if err != nil {
					rlLogger.WithError(err).Errorln("Could not read new RC")
					continue
				}

				rlLogger = rlLogger.SubLogger(logrus.Fields{
					"pod": rcField.Manifest.ID(),
				})
				if _, ok := rlf.children[rlField.ID()]; ok {
					// this one is already ours, skip
					rlLogger.NoFields().Debugln("Got update already owned by self")
					foundChildren[rlField.ID()] = struct{}{}
					continue
				}

				shouldWorkOnOld, err := rlf.shouldWorkOn(rlField.OldRC)
				if err != nil {
					rlLogger.WithError(err).Errorf("Could not determine if should work on RC %s, skipping", rlField.OldRC)
					continue
				}
				if !shouldWorkOnOld {
					rlLogger.WithField("old_rc", rlField.OldRC).Infof("Ignoring roll for old RC %s, not meant for this farm", rlField.OldRC)
					continue
				}

				shouldWorkOnNew, err := rlf.shouldWorkOn(rlField.NewRC)
				if err != nil {
					rlLogger.WithError(err).Errorf("Could not determine if should work on RC %s, skipping", rlField.ID())
					continue
				}
				if !shouldWorkOnNew {
					rlLogger.WithField("new_rc", rlField.ID()).Infof("Ignoring roll for new RC %s, not meant for this farm", rlField.ID())
					continue
				}

				lockPath, err := rollstore.RollLockPath(rlField.ID())
				if err != nil {
					rlLogger.WithError(err).Errorln("Unable to compute roll lock path, skipping")
					continue
				}

				unlocker, err := rlf.session.Lock(lockPath)
				if _, ok := err.(consulutil.AlreadyLockedError); ok {
					// someone else must have gotten it first - log and move to
					// the next one
					rlLogger.NoFields().Debugln("Lock on update was denied")
					continue
				} else if err != nil {
					rlLogger.WithError(err).Errorln("Got error while locking update - session may be expired")
					// stop processing this update and go back to the select;
					// chances are this error is a network problem or session
					// expiry, in which case the remaining updates would fail too
					continue START_LOOP
				}

				// at this point the ru is ours, time to spin it up
				rlLogger.WithField("new_rc", rlField.ID()).Infof("Acquired lock on update %s -> %s, spawning", rlField.OldRC, rlField.ID())

				newChild := rlf.factory.New(rlField, rlLogger, rlf.session, rlf.alerter)
				childQuit := make(chan struct{})
				rlf.children[rlField.ID()] = childRU{
					ru:       newChild,
					quit:     childQuit,
					unlocker: unlocker,
				}
				foundChildren[rlField.ID()] = struct{}{}

				err = rlf.validateRoll(rlField, rlLogger)
				if err != nil {
					rlLogger.WithError(err).Errorln("RU was invalid, deleting")

					// Just delete the RU, the farm will clean up the lock when releaseDeletedChildren() is called
					rlf.mustDeleteRU(rlField.ID(), rlLogger)
					continue
				}

				newRC := rlField.NewRC
				go func(id roll_fields.ID) {
					defer func() {
						if r := recover(); r != nil {
							err := util.Errorf("Caught panic in roll farm: %s", r)
							rlLogger.WithError(err).
								WithField("new_rc", newRC).
								Errorln("Caught panic in roll farm")

							// Release the child so that another farm can reattempt
							rlf.childMu.Lock()
							defer rlf.childMu.Unlock()
							if _, ok := rlf.children[id]; ok {
								rlf.releaseChild(id)
							}
						}
					}()
					if !newChild.Run(childQuit) {
						// returned false, farm must have asked us to quit
						return
					}

					// Block until the RU is deleted: the farm does not release
					// our lock on this RU until it detects the deletion
					rlf.mustDeleteRU(id, rlLogger)
				}(rlField.ID()) // do not close over rlField, it's a loop variable
			}

			// now remove any children that were not found in the result set
			rlf.releaseDeletedChildren(foundChildren)
		}
	}
}
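Each watch iteration ends by reconciling the farm's children against the set it just received: any child whose roll update no longer appears in the result is released. The following is a minimal sketch of that bookkeeping only, with string IDs and a per-child quit channel standing in for p2's childRU map, unlocker, and releaseChild logic.

package main

import "fmt"

// farm sketches the state the reconciliation step operates on: children are
// keyed by roll update ID, and each child owns a quit channel for its goroutine.
type farm struct {
	children map[string]chan struct{} // RU ID -> quit channel of the child's goroutine
}

// releaseDeletedChildren releases every child whose RU is absent from the set
// returned by the latest watch: its goroutine is told to stop and the entry is
// removed. (In p2 this is also where the child's lock gets cleaned up.)
func (f *farm) releaseDeletedChildren(found map[string]struct{}) {
	for id, quit := range f.children {
		if _, ok := found[id]; !ok {
			fmt.Println("releasing child whose RU disappeared:", id)
			close(quit)            // tell the child's goroutine to stop
			delete(f.children, id) // deleting during range is safe in Go
		}
	}
}

func main() {
	f := &farm{children: map[string]chan struct{}{
		"ru-a": make(chan struct{}),
		"ru-b": make(chan struct{}),
	}}
	// The latest watch result only contains ru-a, so ru-b gets released.
	f.releaseDeletedChildren(map[string]struct{}{"ru-a": {}})
	fmt.Println("remaining children:", len(f.children))
}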