func (u *update) countHealthy(id rcf.ID, checks map[types.NodeName]health.Result) (rcNodeCounts, error) {
	ret := rcNodeCounts{}
	rcFields, err := u.rcs.Get(id)
	if rcstore.IsNotExist(err) {
		err := util.Errorf("RC %s did not exist", id)
		return ret, err
	} else if err != nil {
		return ret, err
	}

	ret.Desired = rcFields.ReplicasDesired

	currentPods, err := rc.New(rcFields, u.kps, u.rcs, u.sched, u.labeler, u.logger, u.alerter).CurrentPods()
	if err != nil {
		return ret, err
	}
	ret.Current = len(currentPods)

	if ret.Desired > ret.Current {
		// This implies that the RC hasn't yet scheduled pods that it desires to have.
		// We consider their health to be unknown in this case.
		// Note that the below loop over `range currentPods` may also increase `ret.Unknown`.
		ret.Unknown = ret.Desired - ret.Current
	}

	for _, pod := range currentPods {
		node := pod.Node
		// TODO: is reality checking an rc-layer concern?
		realManifest, _, err := u.kps.Pod(kp.REALITY_TREE, node, rcFields.Manifest.ID())
		if err != nil && err != pods.NoCurrentManifest {
			return ret, err
		}

		// If realManifest is nil, use an empty string when comparing against the RC
		// manifest's SHA; that is sufficient for this check.
		var realSHA string
		if realManifest != nil {
			realSHA, _ = realManifest.SHA()
		}
		targetSHA, _ := rcFields.Manifest.SHA()
		if targetSHA == realSHA {
			ret.Real++
		} else {
			// don't check health if the update isn't even done there yet
			continue
		}

		if hres, ok := checks[node]; ok {
			if hres.Status == health.Passing {
				ret.Healthy++
			} else if hres.Status == health.Unknown {
				ret.Unknown++
			} else {
				ret.Unhealthy++
			}
		} else {
			ret.Unknown++
		}
	}
	return ret, err
}
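// The rcNodeCounts value returned above is defined elsewhere in the package; the
// sketch below is inferred purely from the fields countHealthy populates and is an
// assumption, not the actual definition.
type rcNodeCounts struct {
	Desired   int // replicas the RC wants (rcFields.ReplicasDesired)
	Current   int // pods the RC has actually scheduled
	Real      int // current pods whose reality manifest SHA matches the RC's target manifest
	Healthy   int // real pods whose health check is passing
	Unhealthy int // real pods whose health check is failing
	Unknown   int // pods with no/unknown health result, plus desired-but-unscheduled pods
}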
func (u *update) Run(quit <-chan struct{}) (ret bool) {
	u.logger.NoFields().Debugln("Locking")
	// TODO: implement API for blocking locks and use that instead of retrying
	if !RetryOrQuit(
		func() error { return u.lockRCs(quit) },
		quit,
		u.logger,
		"Could not lock rcs",
	) {
		return
	}
	defer u.unlockRCs(quit)

	u.logger.NoFields().Debugln("Enabling")
	if !RetryOrQuit(u.enable, quit, u.logger, "Could not enable/disable RCs") {
		return
	}

	u.logger.NoFields().Debugln("Launching health watch")
	var newFields rcf.RC
	var err error
	if !RetryOrQuit(func() error {
		newFields, err = u.rcs.Get(u.NewRC)
		if rcstore.IsNotExist(err) {
			return util.Errorf("Replication controller %s is unexpectedly empty", u.NewRC)
		} else if err != nil {
			return err
		}
		return nil
	}, quit, u.logger, "Could not read new RC") {
		return
	}

	hChecks := make(chan map[types.NodeName]health.Result)
	hErrs := make(chan error)
	hQuit := make(chan struct{})
	defer close(hQuit)
	go u.hcheck.WatchService(string(newFields.Manifest.ID()), hChecks, hErrs, hQuit)

	if updateSucceeded := u.rollLoop(newFields.Manifest.ID(), hChecks, hErrs, quit); !updateSucceeded {
		// We were asked to quit. Do so without cleaning up the old RC.
		return false
	}

	// Rollout complete; clean up the old RC unless asked to leave it.
	if !u.LeaveOld {
		u.logger.NoFields().Infoln("Cleaning up old RC")
		if !RetryOrQuit(func() error { return u.rcs.SetDesiredReplicas(u.OldRC, 0) }, quit, u.logger, "Could not zero old replica count") {
			return
		}
		if !RetryOrQuit(func() error { return u.rcs.Enable(u.OldRC) }, quit, u.logger, "Could not enable old RC") {
			return
		}
		if !RetryOrQuit(func() error { return u.rcs.Delete(u.OldRC, false) }, quit, u.logger, "Could not delete old RC") {
			return
		}
	}

	return true
}
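// RetryOrQuit, used throughout Run above, appears to retry f until it succeeds,
// logging errText on each failure, and to give up (returning false) once quit is
// closed. The sketch below is inferred from the call sites only: the name
// retryOrQuitSketch, the logging.Logger type, and the 5-second retry interval are
// assumptions, not the package's actual implementation.
func retryOrQuitSketch(f func() error, quit <-chan struct{}, logger logging.Logger, errText string) bool {
	for {
		err := f()
		if err == nil {
			return true
		}
		logger.WithError(err).Errorln(errText)
		select {
		case <-quit:
			return false
		case <-time.After(5 * time.Second): // assumed retry interval; requires the "time" import
		}
	}
}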
func (rlf *Farm) mainLoop(quit <-chan struct{}) {
	subQuit := make(chan struct{})
	defer close(subQuit)
	rlWatch, rlErr := rlf.rls.Watch(subQuit)

START_LOOP:
	for {
		select {
		case <-quit:
			rlf.logger.NoFields().Infoln("Session expired, releasing updates")
			rlf.session = nil
			rlf.releaseChildren()
			return
		case err := <-rlErr:
			rlf.logger.WithError(err).Errorln("Could not read consul updates")
		case rlFields := <-rlWatch:
			rlf.logger.WithField("n", len(rlFields)).Debugln("Received updates")
			countHistogram := metrics.GetOrRegisterHistogram("ru_count", p2metrics.Registry, metrics.NewExpDecaySample(1028, 0.015))
			countHistogram.Update(int64(len(rlFields)))

			// track which children were found in the returned set
			foundChildren := make(map[roll_fields.ID]struct{})
			for _, rlField := range rlFields {
				rlLogger := rlf.logger.SubLogger(logrus.Fields{
					"ru": rlField.ID(),
				})
				rcField, err := rlf.rcs.Get(rlField.NewRC)
				if rcstore.IsNotExist(err) {
					err := util.Errorf("Expected RC %s to exist", rlField.NewRC)
					rlLogger.WithError(err).Errorln()
					continue
				} else if err != nil {
					rlLogger.WithError(err).Errorln("Could not read new RC")
					continue
				}

				rlLogger = rlLogger.SubLogger(logrus.Fields{
					"pod": rcField.Manifest.ID(),
				})
				if _, ok := rlf.children[rlField.ID()]; ok {
					// this one is already ours, skip
					rlLogger.NoFields().Debugln("Got update already owned by self")
					foundChildren[rlField.ID()] = struct{}{}
					continue
				}

				shouldWorkOnOld, err := rlf.shouldWorkOn(rlField.OldRC)
				if err != nil {
					rlLogger.WithError(err).Errorf("Could not determine whether to work on RC %s, skipping", rlField.OldRC)
					continue
				}
				if !shouldWorkOnOld {
					rlLogger.WithField("old_rc", rlField.OldRC).Infof("Ignoring roll for old RC %s, not meant for this farm", rlField.OldRC)
					continue
				}

				shouldWorkOnNew, err := rlf.shouldWorkOn(rlField.NewRC)
				if err != nil {
					rlLogger.WithError(err).Errorf("Could not determine whether to work on RC %s, skipping", rlField.ID())
					continue
				}
				if !shouldWorkOnNew {
					rlLogger.WithField("new_rc", rlField.ID()).Infof("Ignoring roll for new RC %s, not meant for this farm", rlField.ID())
					continue
				}

				lockPath, err := rollstore.RollLockPath(rlField.ID())
				if err != nil {
					rlLogger.WithError(err).Errorln("Unable to compute roll lock path")
				}

				unlocker, err := rlf.session.Lock(lockPath)
				if _, ok := err.(consulutil.AlreadyLockedError); ok {
					// someone else must have gotten it first - log and move to
					// the next one
					rlLogger.NoFields().Debugln("Lock on update was denied")
					continue
				} else if err != nil {
					rlLogger.WithError(err).Errorln("Got error while locking update - session may be expired")
					// Stop processing this update and go back to the select;
					// chances are this error is a network problem or session
					// expiry, and the rest of the updates in this batch would
					// fail too.
					continue START_LOOP
				}

				// at this point the ru is ours, time to spin it up
				rlLogger.WithField("new_rc", rlField.ID()).Infof("Acquired lock on update %s -> %s, spawning", rlField.OldRC, rlField.ID())

				newChild := rlf.factory.New(rlField, rlLogger, rlf.session, rlf.alerter)
				childQuit := make(chan struct{})
				rlf.children[rlField.ID()] = childRU{
					ru:       newChild,
					quit:     childQuit,
					unlocker: unlocker,
				}
				foundChildren[rlField.ID()] = struct{}{}

				err = rlf.validateRoll(rlField, rlLogger)
				if err != nil {
					rlLogger.WithError(err).Errorln("RU was invalid, deleting")
					// Just delete the RU; the farm will clean up the lock when
					// releaseDeletedChildren() is called.
					rlf.mustDeleteRU(rlField.ID(), rlLogger)
					continue
				}

				newRC := rlField.NewRC
				go func(id roll_fields.ID) {
					defer func() {
						if r := recover(); r != nil {
							err := util.Errorf("Caught panic in roll farm: %s", r)
							rlLogger.WithError(err).
								WithField("new_rc", newRC).
								Errorln("Caught panic in roll farm")

							// Release the child so that another farm can reattempt
							rlf.childMu.Lock()
							defer rlf.childMu.Unlock()
							if _, ok := rlf.children[id]; ok {
								rlf.releaseChild(id)
							}
						}
					}()
					if !newChild.Run(childQuit) {
						// returned false, farm must have asked us to quit
						return
					}
					// Block until the RU is deleted; the farm won't release our lock
					// on this RU until it detects the deletion.
					rlf.mustDeleteRU(id, rlLogger)
				}(rlField.ID()) // do not close over rlField, it's a loop variable
			}

			// now remove any children that were not found in the result set
			rlf.releaseDeletedChildren(foundChildren)
		}
	}
}
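// childRU, as populated in mainLoop above, appears to track one rolling update owned
// by this farm. The sketch below is inferred from the fields assigned above; the
// Update interface and the consulutil.Unlocker type are assumptions about the
// surrounding package, not its actual definitions.
type childRU struct {
	ru       Update              // the running update; Run(quit) returns false if the farm asked it to quit
	quit     chan struct{}       // closed to stop the child's goroutine
	unlocker consulutil.Unlocker // releases the session lock held on this RU's lock path
}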