func (test crashInfoBackoffTest) Test() {
	Context(fmt.Sprintf("when the crashCount is %d and the wait time is %s", test.CrashCount, test.WaitTime), func() {
		It("should NOT restart before the expected wait time", func() {
			calc := models.NewDefaultRestartCalculator()
			currentTimestamp := test.GetSince() + test.WaitTime.Nanoseconds() - time.Second.Nanoseconds()
			Expect(test.ShouldRestartCrash(time.Unix(0, currentTimestamp), calc)).To(BeFalse())
		})

		It("should restart after the expected wait time", func() {
			calc := models.NewDefaultRestartCalculator()
			currentTimestamp := test.GetSince() + test.WaitTime.Nanoseconds()
			Expect(test.ShouldRestartCrash(time.Unix(0, currentTimestamp), calc)).To(BeTrue())
		})
	})
}
func (db *ETCDDB) ConvergeLRPs(logger lager.Logger, cellSet models.CellSet) ([]*auctioneer.LRPStartRequest, []*models.ActualLRPKeyWithSchedulingInfo, []*models.ActualLRPKey) {
	convergeStart := db.clock.Now()
	convergeLRPRunsCounter.Increment()
	logger = logger.Session("etcd")
	logger.Info("starting-convergence")
	defer logger.Info("finished-convergence")

	defer func() {
		err := convergeLRPDuration.Send(time.Since(convergeStart))
		if err != nil {
			logger.Error("failed-sending-converge-lrp-duration-metric", err)
		}
	}()

	logger.Debug("gathering-convergence-input")
	input, err := db.GatherAndPruneLRPs(logger, cellSet)
	if err != nil {
		logger.Error("failed-gathering-convergence-input", err)
		return nil, nil, nil
	}
	logger.Debug("succeeded-gathering-convergence-input")

	changes := CalculateConvergence(logger, db.clock, models.NewDefaultRestartCalculator(), input)

	return db.ResolveConvergence(logger, input.DesiredLRPs, changes)
}
func (db *ETCDDB) CrashActualLRP(logger lager.Logger, key *models.ActualLRPKey, instanceKey *models.ActualLRPInstanceKey, errorMessage string) (*models.ActualLRPGroup, *models.ActualLRPGroup, bool, error) {
	logger = logger.WithData(lager.Data{"actual_lrp_key": key, "actual_lrp_instance_key": instanceKey})
	logger.Info("starting")

	lrp, prevIndex, err := db.rawActualLRPByProcessGuidAndIndex(logger, key.ProcessGuid, key.Index)
	if err != nil {
		logger.Error("failed-to-get-actual-lrp", err)
		return nil, nil, false, err
	}
	beforeActualLRP := *lrp

	latestChangeTime := time.Duration(db.clock.Now().UnixNano() - lrp.Since)

	// Reset the crash count if the LRP was RUNNING for longer than
	// CrashResetTimeout before this crash; otherwise keep counting up.
	var newCrashCount int32
	if latestChangeTime > models.CrashResetTimeout && lrp.State == models.ActualLRPStateRunning {
		newCrashCount = 1
	} else {
		newCrashCount = lrp.CrashCount + 1
	}

	logger.Debug("retrieved-lrp")
	if !lrp.AllowsTransitionTo(key, instanceKey, models.ActualLRPStateCrashed) {
		logger.Error("failed-to-transition-to-crashed", nil, lager.Data{"from_state": lrp.State, "same_instance_key": lrp.ActualLRPInstanceKey.Equal(instanceKey)})
		return nil, nil, false, models.ErrActualLRPCannotBeCrashed
	}

	lrp.State = models.ActualLRPStateCrashed
	lrp.Since = db.clock.Now().UnixNano()
	lrp.CrashCount = newCrashCount
	lrp.ActualLRPInstanceKey = models.ActualLRPInstanceKey{}
	lrp.ActualLRPNetInfo = models.EmptyActualLRPNetInfo()
	lrp.ModificationTag.Increment()
	lrp.CrashReason = errorMessage

	// If the restart calculator allows an immediate restart, transition
	// straight back to UNCLAIMED so the LRP can be rescheduled right away.
	var immediateRestart bool
	if lrp.ShouldRestartImmediately(models.NewDefaultRestartCalculator()) {
		lrp.State = models.ActualLRPStateUnclaimed
		immediateRestart = true
	}

	lrpData, serializeErr := db.serializeModel(logger, lrp)
	if serializeErr != nil {
		return nil, nil, false, serializeErr
	}

	_, err = db.client.CompareAndSwap(ActualLRPSchemaPath(key.ProcessGuid, key.Index), lrpData, 0, prevIndex)
	if err != nil {
		logger.Error("failed", err)
		return nil, nil, false, models.ErrActualLRPCannotBeCrashed
	}

	logger.Info("succeeded")
	return &models.ActualLRPGroup{Instance: &beforeActualLRP}, &models.ActualLRPGroup{Instance: lrp}, immediateRestart, nil
}
func (test crashInfoAlwaysStartTest) Test() {
	Context(fmt.Sprintf("when the crashCount is %d", test.CrashCount), func() {
		It("should restart regardless of the wait time", func() {
			calc := models.NewDefaultRestartCalculator()
			theFuture := test.GetSince() + time.Hour.Nanoseconds()
			Expect(test.ShouldRestartCrash(time.Unix(0, 0), calc)).To(BeTrue())
			Expect(test.ShouldRestartCrash(time.Unix(0, test.GetSince()), calc)).To(BeTrue())
			Expect(test.ShouldRestartCrash(time.Unix(0, theFuture), calc)).To(BeTrue())
		})
	})
}
// Adds CRASHED Actual LRPs that can be restarted to the list of start requests
// and transitions them to UNCLAIMED.
func (c *convergence) crashedActualLRPs(logger lager.Logger, now time.Time) {
	logger = logger.Session("crashed-actual-lrps")
	restartCalculator := models.NewDefaultRestartCalculator()

	rows, err := c.selectCrashedLRPs(logger, c.db)
	if err != nil {
		logger.Error("failed-query", err)
		return
	}

	for rows.Next() {
		var index int
		actual := &models.ActualLRP{}

		schedulingInfo, err := c.fetchDesiredLRPSchedulingInfoAndMore(logger, rows, &index, &actual.Since, &actual.CrashCount)
		if err != nil {
			continue
		}

		actual.ActualLRPKey = models.NewActualLRPKey(schedulingInfo.ProcessGuid, int32(index), schedulingInfo.Domain)
		actual.State = models.ActualLRPStateCrashed

		if actual.ShouldRestartCrash(now, restartCalculator) {
			c.submit(func() {
				_, _, err = c.UnclaimActualLRP(logger, &actual.ActualLRPKey)
				if err != nil {
					logger.Error("failed-unclaiming-actual-lrp", err)
					return
				}

				c.addStartRequestFromSchedulingInfo(logger, schedulingInfo, index)
			})
		}
	}

	if rows.Err() != nil {
		logger.Error("failed-getting-next-row", rows.Err())
	}

	return
}
func (db *SQLDB) CrashActualLRP(logger lager.Logger, key *models.ActualLRPKey, instanceKey *models.ActualLRPInstanceKey, crashReason string) (*models.ActualLRPGroup, *models.ActualLRPGroup, bool, error) {
	logger = logger.WithData(lager.Data{"key": key, "instance_key": instanceKey, "crash_reason": crashReason})
	logger.Info("starting")
	defer logger.Info("complete")

	var immediateRestart = false
	var beforeActualLRP models.ActualLRP
	var actualLRP *models.ActualLRP

	err := db.transact(logger, func(logger lager.Logger, tx *sql.Tx) error {
		var err error
		actualLRP, err = db.fetchActualLRPForUpdate(logger, key.ProcessGuid, key.Index, false, tx)
		if err != nil {
			logger.Error("failed-to-get-actual-lrp", err)
			return err
		}
		beforeActualLRP = *actualLRP

		latestChangeTime := time.Duration(db.clock.Now().UnixNano() - actualLRP.Since)

		// Reset the crash count if the instance was RUNNING for longer than
		// CrashResetTimeout before this crash; otherwise keep counting up.
		var newCrashCount int32
		if latestChangeTime > models.CrashResetTimeout && actualLRP.State == models.ActualLRPStateRunning {
			newCrashCount = 1
		} else {
			newCrashCount = actualLRP.CrashCount + 1
		}

		if !actualLRP.AllowsTransitionTo(&actualLRP.ActualLRPKey, instanceKey, models.ActualLRPStateCrashed) {
			logger.Error("failed-to-transition-to-crashed", nil, lager.Data{"from_state": actualLRP.State, "same_instance_key": actualLRP.ActualLRPInstanceKey.Equal(instanceKey)})
			return models.ErrActualLRPCannotBeCrashed
		}

		actualLRP.ModificationTag.Increment()
		actualLRP.State = models.ActualLRPStateCrashed
		actualLRP.ActualLRPInstanceKey.InstanceGuid = ""
		actualLRP.ActualLRPInstanceKey.CellId = ""
		actualLRP.ActualLRPNetInfo = models.ActualLRPNetInfo{}
		actualLRP.CrashCount = newCrashCount
		actualLRP.CrashReason = crashReason

		evacuating := false

		// If the restart calculator allows an immediate restart, transition
		// straight back to UNCLAIMED so the LRP is rescheduled right away.
		if actualLRP.ShouldRestartImmediately(models.NewDefaultRestartCalculator()) {
			actualLRP.State = models.ActualLRPStateUnclaimed
			immediateRestart = true
		}

		now := db.clock.Now().UnixNano()
		actualLRP.Since = now

		_, err = db.update(logger, tx, actualLRPsTable,
			SQLAttributes{
				"state":                  actualLRP.State,
				"cell_id":                actualLRP.CellId,
				"instance_guid":          actualLRP.InstanceGuid,
				"modification_tag_index": actualLRP.ModificationTag.Index,
				"crash_count":            actualLRP.CrashCount,
				"crash_reason":           truncateString(actualLRP.CrashReason, 1024),
				"since":                  actualLRP.Since,
				"net_info":               []byte{},
			},
			"process_guid = ? AND instance_index = ? AND evacuating = ?",
			key.ProcessGuid, key.Index, evacuating,
		)
		if err != nil {
			logger.Error("failed-to-crash-actual-lrp", err)
			return db.convertSQLError(err)
		}

		return nil
	})

	return &models.ActualLRPGroup{Instance: &beforeActualLRP}, &models.ActualLRPGroup{Instance: actualLRP}, immediateRestart, err
}
			Expect(calc.ShouldRestart(0, 0, 3)).To(BeFalse())
			Expect(calc.ShouldRestart(nanoseconds(30), 0, 3)).To(BeTrue())

			Expect(calc.ShouldRestart(nanoseconds(30), 0, 4)).To(BeFalse())
			Expect(calc.ShouldRestart(nanoseconds(59), 0, 4)).To(BeFalse())
			Expect(calc.ShouldRestart(nanoseconds(60), 0, 4)).To(BeTrue())

			Expect(calc.ShouldRestart(nanoseconds(60), 0, 5)).To(BeFalse())
			Expect(calc.ShouldRestart(nanoseconds(118), 0, 5)).To(BeFalse())
			Expect(calc.ShouldRestart(nanoseconds(119), 0, 5)).To(BeTrue())
		})
	})

	Describe("Validate", func() {
		It("the default values are valid", func() {
			calc := models.NewDefaultRestartCalculator()
			Expect(calc.Validate()).NotTo(HaveOccurred())
		})

		It("is invalid when MaxBackoffDuration is lower than CrashBackoffMinDuration", func() {
			calc := models.NewRestartCalculator(models.DefaultImmediateRestarts, models.CrashBackoffMinDuration-time.Second, models.DefaultMaxRestarts)
			Expect(calc.Validate()).To(HaveOccurred())
		})
	})
})

var _ = Describe("ActualLRP", func() {
	Describe("ShouldRestartCrash", func() {
		Context("when the lrp is CRASHED", func() {
			const maxWaitTime = 16 * time.Minute
			var now = time.Now().UnixNano()