func (test crashInfoBackoffTest) Test() { Context(fmt.Sprintf("when the crashCount is %d and the wait time is %s", test.CrashCount, test.WaitTime), func() { It("should NOT restart before the expected wait time", func() { calc := models.NewDefaultRestartCalculator() currentTimestamp := test.GetSince() + test.WaitTime.Nanoseconds() - time.Second.Nanoseconds() Expect(test.ShouldRestartCrash(time.Unix(0, currentTimestamp), calc)).To(BeFalse()) }) It("should restart after the expected wait time", func() { calc := models.NewDefaultRestartCalculator() currentTimestamp := test.GetSince() + test.WaitTime.Nanoseconds() Expect(test.ShouldRestartCrash(time.Unix(0, currentTimestamp), calc)).To(BeTrue()) }) }) }
func (db *ETCDDB) ConvergeLRPs(logger lager.Logger, cellSet models.CellSet) ([]*auctioneer.LRPStartRequest, []*models.ActualLRPKeyWithSchedulingInfo, []*models.ActualLRPKey) { convergeStart := db.clock.Now() convergeLRPRunsCounter.Increment() logger = logger.Session("etcd") logger.Info("starting-convergence") defer logger.Info("finished-convergence") defer func() { err := convergeLRPDuration.Send(time.Since(convergeStart)) if err != nil { logger.Error("failed-sending-converge-lrp-duration-metric", err) } }() logger.Debug("gathering-convergence-input") input, err := db.GatherAndPruneLRPs(logger, cellSet) if err != nil { logger.Error("failed-gathering-convergence-input", err) return nil, nil, nil } logger.Debug("succeeded-gathering-convergence-input") changes := CalculateConvergence(logger, db.clock, models.NewDefaultRestartCalculator(), input) return db.ResolveConvergence(logger, input.DesiredLRPs, changes) }
func (db *ETCDDB) CrashActualLRP(logger lager.Logger, key *models.ActualLRPKey, instanceKey *models.ActualLRPInstanceKey, errorMessage string) (*models.ActualLRPGroup, *models.ActualLRPGroup, bool, error) { logger = logger.WithData(lager.Data{"actual_lrp_key": key, "actual_lrp_instance_key": instanceKey}) logger.Info("starting") lrp, prevIndex, err := db.rawActualLRPByProcessGuidAndIndex(logger, key.ProcessGuid, key.Index) if err != nil { logger.Error("failed-to-get-actual-lrp", err) return nil, nil, false, err } beforeActualLRP := *lrp latestChangeTime := time.Duration(db.clock.Now().UnixNano() - lrp.Since) var newCrashCount int32 if latestChangeTime > models.CrashResetTimeout && lrp.State == models.ActualLRPStateRunning { newCrashCount = 1 } else { newCrashCount = lrp.CrashCount + 1 } logger.Debug("retrieved-lrp") if !lrp.AllowsTransitionTo(key, instanceKey, models.ActualLRPStateCrashed) { logger.Error("failed-to-transition-to-crashed", nil, lager.Data{"from_state": lrp.State, "same_instance_key": lrp.ActualLRPInstanceKey.Equal(instanceKey)}) return nil, nil, false, models.ErrActualLRPCannotBeCrashed } lrp.State = models.ActualLRPStateCrashed lrp.Since = db.clock.Now().UnixNano() lrp.CrashCount = newCrashCount lrp.ActualLRPInstanceKey = models.ActualLRPInstanceKey{} lrp.ActualLRPNetInfo = models.EmptyActualLRPNetInfo() lrp.ModificationTag.Increment() lrp.CrashReason = errorMessage var immediateRestart bool if lrp.ShouldRestartImmediately(models.NewDefaultRestartCalculator()) { lrp.State = models.ActualLRPStateUnclaimed immediateRestart = true } lrpData, serializeErr := db.serializeModel(logger, lrp) if serializeErr != nil { return nil, nil, false, serializeErr } _, err = db.client.CompareAndSwap(ActualLRPSchemaPath(key.ProcessGuid, key.Index), lrpData, 0, prevIndex) if err != nil { logger.Error("failed", err) return nil, nil, false, models.ErrActualLRPCannotBeCrashed } logger.Info("succeeded") return &models.ActualLRPGroup{Instance: &beforeActualLRP}, &models.ActualLRPGroup{Instance: lrp}, immediateRestart, nil }
func (test crashInfoAlwaysStartTest) Test() { Context(fmt.Sprintf("when the crashCount is %d", test.CrashCount), func() { It("should restart regardless of the wait time", func() { calc := models.NewDefaultRestartCalculator() theFuture := test.GetSince() + time.Hour.Nanoseconds() Expect(test.ShouldRestartCrash(time.Unix(0, 0), calc)).To(BeTrue()) Expect(test.ShouldRestartCrash(time.Unix(0, test.GetSince()), calc)).To(BeTrue()) Expect(test.ShouldRestartCrash(time.Unix(0, theFuture), calc)).To(BeTrue()) }) }) }
func (db *ETCDDB) ConvergeLRPs(logger lager.Logger) { convergeStart := db.clock.Now() convergeLRPRunsCounter.Increment() logger = logger.Session("converge-lrps") logger.Info("starting-convergence") defer logger.Info("finished-convergence") defer func() { convergeLRPDuration.Send(time.Since(convergeStart)) }() logger.Debug("gathering-convergence-input") input, err := db.GatherAndPruneLRPs(logger) if err != nil { logger.Error("failed-gathering-convergence-input", err) return } logger.Debug("succeeded-gathering-convergence-input") changes := CalculateConvergence(logger, db.clock, models.NewDefaultRestartCalculator(), input) db.ResolveConvergence(logger, input.DesiredLRPs, changes) }
Expect(calc.ShouldRestart(0, 0, 3)).To(BeFalse()) Expect(calc.ShouldRestart(nanoseconds(30), 0, 3)).To(BeTrue()) Expect(calc.ShouldRestart(nanoseconds(30), 0, 4)).To(BeFalse()) Expect(calc.ShouldRestart(nanoseconds(59), 0, 4)).To(BeFalse()) Expect(calc.ShouldRestart(nanoseconds(60), 0, 4)).To(BeTrue()) Expect(calc.ShouldRestart(nanoseconds(60), 0, 5)).To(BeFalse()) Expect(calc.ShouldRestart(nanoseconds(118), 0, 5)).To(BeFalse()) Expect(calc.ShouldRestart(nanoseconds(119), 0, 5)).To(BeTrue()) }) }) Describe("Validate", func() { It("the default values are valid", func() { calc := models.NewDefaultRestartCalculator() Expect(calc.Validate()).NotTo(HaveOccurred()) }) It("invalid when MaxBackoffDuration is lower than the CrashBackoffMinDuration", func() { calc := models.NewRestartCalculator(models.DefaultImmediateRestarts, models.CrashBackoffMinDuration-time.Second, models.DefaultMaxRestarts) Expect(calc.Validate()).To(HaveOccurred()) }) }) }) var _ = Describe("ActualLRP", func() { Describe("ShouldRestartCrash", func() { Context("when the lpr is CRASHED", func() { const maxWaitTime = 16 * time.Minute var now = time.Now().UnixNano()
func (db *ETCDDB) CrashActualLRP(logger lager.Logger, request *models.CrashActualLRPRequest) *models.Error { key := request.ActualLrpKey instanceKey := request.ActualLrpInstanceKey errorMessage := request.ErrorMessage logger.Info("starting") lrp, prevIndex, bbsErr := db.rawActuaLLRPByProcessGuidAndIndex(logger, key.ProcessGuid, key.Index) if bbsErr != nil { logger.Error("failed-to-get-actual-lrp", bbsErr) return bbsErr } latestChangeTime := time.Duration(db.clock.Now().UnixNano() - lrp.Since) var newCrashCount int32 if latestChangeTime > models.CrashResetTimeout && lrp.State == models.ActualLRPStateRunning { newCrashCount = 1 } else { newCrashCount = lrp.CrashCount + 1 } logger.Debug("retrieved-lrp") if !lrp.AllowsTransitionTo(key, instanceKey, models.ActualLRPStateCrashed) { err := fmt.Errorf("cannot transition crashed lrp from state %s to state %s", lrp.State, models.ActualLRPStateCrashed) logger.Error("failed-to-transition-actual", err) return models.ErrActualLRPCannotBeCrashed } if lrp.State == models.ActualLRPStateUnclaimed || lrp.State == models.ActualLRPStateCrashed || ((lrp.State == models.ActualLRPStateClaimed || lrp.State == models.ActualLRPStateRunning) && !lrp.ActualLRPInstanceKey.Equal(instanceKey)) { return models.ErrActualLRPCannotBeCrashed } lrp.State = models.ActualLRPStateCrashed lrp.Since = db.clock.Now().UnixNano() lrp.CrashCount = newCrashCount lrp.ActualLRPInstanceKey = models.ActualLRPInstanceKey{} lrp.ActualLRPNetInfo = models.EmptyActualLRPNetInfo() lrp.ModificationTag.Increment() lrp.CrashReason = errorMessage var immediateRestart bool if lrp.ShouldRestartImmediately(models.NewDefaultRestartCalculator()) { lrp.State = models.ActualLRPStateUnclaimed immediateRestart = true } lrpRawJSON, err := json.Marshal(lrp) if err != nil { return models.ErrSerializeJSON } _, err = db.client.CompareAndSwap(ActualLRPSchemaPath(key.ProcessGuid, key.Index), string(lrpRawJSON), 0, "", prevIndex) if err != nil { logger.Error("failed", err) return models.ErrActualLRPCannotBeCrashed } if immediateRestart { auctionErr := db.requestLRPAuctionForLRPKey(logger, key) if err != nil { return auctionErr } } logger.Info("succeeded") return nil }
func (db *ETCDDB) CrashActualLRP(logger lager.Logger, key *models.ActualLRPKey, instanceKey *models.ActualLRPInstanceKey, errorMessage string) error { logger = logger.Session("crash-actual-lrp", lager.Data{"actual_lrp_key": key, "actual_lrp_instance_key": instanceKey}) logger.Info("starting") lrp, prevIndex, err := db.rawActuaLLRPByProcessGuidAndIndex(logger, key.ProcessGuid, key.Index) if err != nil { logger.Error("failed-to-get-actual-lrp", err) return err } latestChangeTime := time.Duration(db.clock.Now().UnixNano() - lrp.Since) var newCrashCount int32 if latestChangeTime > models.CrashResetTimeout && lrp.State == models.ActualLRPStateRunning { newCrashCount = 1 } else { newCrashCount = lrp.CrashCount + 1 } logger.Debug("retrieved-lrp") if !lrp.AllowsTransitionTo(key, instanceKey, models.ActualLRPStateCrashed) { err := fmt.Errorf("cannot transition crashed lrp from state %s to state %s", lrp.State, models.ActualLRPStateCrashed) logger.Error("failed-to-transition-actual", err) return models.ErrActualLRPCannotBeCrashed } if lrp.State == models.ActualLRPStateUnclaimed || lrp.State == models.ActualLRPStateCrashed || ((lrp.State == models.ActualLRPStateClaimed || lrp.State == models.ActualLRPStateRunning) && !lrp.ActualLRPInstanceKey.Equal(instanceKey)) { logger.Debug("cannot-be-crashed", lager.Data{"state": lrp.State, "same-instance-key": lrp.ActualLRPInstanceKey.Equal(instanceKey)}) return models.ErrActualLRPCannotBeCrashed } lrp.State = models.ActualLRPStateCrashed lrp.Since = db.clock.Now().UnixNano() lrp.CrashCount = newCrashCount lrp.ActualLRPInstanceKey = models.ActualLRPInstanceKey{} lrp.ActualLRPNetInfo = models.EmptyActualLRPNetInfo() lrp.ModificationTag.Increment() lrp.CrashReason = errorMessage var immediateRestart bool if lrp.ShouldRestartImmediately(models.NewDefaultRestartCalculator()) { lrp.State = models.ActualLRPStateUnclaimed immediateRestart = true } lrpData, serializeErr := db.serializeModel(logger, lrp) if serializeErr != nil { return serializeErr } _, err = db.client.CompareAndSwap(ActualLRPSchemaPath(key.ProcessGuid, key.Index), lrpData, 0, prevIndex) if err != nil { logger.Error("failed", err) return models.ErrActualLRPCannotBeCrashed } if immediateRestart { auctionErr := db.requestLRPAuctionForLRPKey(logger, key) if auctionErr != nil { return auctionErr } } logger.Info("succeeded") return nil }