예제 #1
0
// Test registers a Context containing two specs: one asserting that the crash
// is NOT restarted one second before the wait time has elapsed, and one
// asserting that it IS restarted once the full wait time has elapsed. Both
// instants are measured from test.GetSince().
func (test crashInfoBackoffTest) Test() {
	description := fmt.Sprintf("when the crashCount is %d and the wait time is %s", test.CrashCount, test.WaitTime)

	Context(description, func() {
		It("should NOT restart before the expected wait time", func() {
			restartCalc := models.NewDefaultRestartCalculator()
			// Probe one second short of the full wait time.
			elapsed := test.WaitTime.Nanoseconds() - time.Second.Nanoseconds()
			probe := time.Unix(0, test.GetSince()+elapsed)
			Expect(test.ShouldRestartCrash(probe, restartCalc)).To(BeFalse())
		})

		It("should restart after the expected wait time", func() {
			restartCalc := models.NewDefaultRestartCalculator()
			// Probe exactly at the end of the wait time.
			probe := time.Unix(0, test.GetSince()+test.WaitTime.Nanoseconds())
			Expect(test.ShouldRestartCrash(probe, restartCalc)).To(BeTrue())
		})
	})
}
예제 #2
0
// ConvergeLRPs runs one convergence pass for the given cell set.
//
// It gathers (and prunes) the convergence input, calculates the required
// changes using the default restart calculator, and returns whatever
// ResolveConvergence produces for those changes. If gathering the input fails,
// the error is logged and all three results are nil.
//
// The run counter is incremented on every call, and the duration of the pass
// is sent as a metric when the function returns; a failure to send that metric
// is only logged.
func (db *ETCDDB) ConvergeLRPs(logger lager.Logger, cellSet models.CellSet) ([]*auctioneer.LRPStartRequest, []*models.ActualLRPKeyWithSchedulingInfo, []*models.ActualLRPKey) {
	convergeStart := db.clock.Now()
	convergeLRPRunsCounter.Increment()
	logger = logger.Session("etcd")
	logger.Info("starting-convergence")
	defer logger.Info("finished-convergence")

	defer func() {
		// Measure with the same clock that produced convergeStart. Mixing
		// db.clock with the wall clock (time.Since) yields a meaningless
		// duration whenever db.clock is not backed by the wall clock.
		elapsed := db.clock.Now().Sub(convergeStart)
		if err := convergeLRPDuration.Send(elapsed); err != nil {
			logger.Error("failed-sending-converge-lrp-duration-metric", err)
		}
	}()

	logger.Debug("gathering-convergence-input")
	input, err := db.GatherAndPruneLRPs(logger, cellSet)
	if err != nil {
		logger.Error("failed-gathering-convergence-input", err)
		return nil, nil, nil
	}
	logger.Debug("succeeded-gathering-convergence-input")

	changes := CalculateConvergence(logger, db.clock, models.NewDefaultRestartCalculator(), input)

	return db.ResolveConvergence(logger, input.DesiredLRPs, changes)
}
예제 #3
0
// CrashActualLRP marks the ActualLRP identified by key as CRASHED.
//
// The stored LRP is loaded, checked for an allowed transition to the CRASHED
// state for the given key and instance key, and then rewritten with an updated
// crash count, a fresh Since timestamp, the crash reason, and cleared instance
// key and net info. The crash count restarts at 1 when the LRP was RUNNING and
// its last change is older than models.CrashResetTimeout; otherwise it is
// incremented. If ShouldRestartImmediately reports true, the LRP is written as
// UNCLAIMED instead and the returned bool is true.
//
// It returns the LRP group as it was before the change, the group after the
// change, and the immediate-restart decision. On any failure both groups are
// nil and the bool is false: lookup and serialization errors are returned
// as-is, while a disallowed transition or a failed compare-and-swap yields
// models.ErrActualLRPCannotBeCrashed.
func (db *ETCDDB) CrashActualLRP(logger lager.Logger, key *models.ActualLRPKey, instanceKey *models.ActualLRPInstanceKey, errorMessage string) (*models.ActualLRPGroup, *models.ActualLRPGroup, bool, error) {
	logger = logger.WithData(lager.Data{"actual_lrp_key": key, "actual_lrp_instance_key": instanceKey})
	logger.Info("starting")

	actual, storedIndex, err := db.rawActualLRPByProcessGuidAndIndex(logger, key.ProcessGuid, key.Index)
	if err != nil {
		logger.Error("failed-to-get-actual-lrp", err)
		return nil, nil, false, err
	}
	// Snapshot the LRP before mutating it so the caller can see the prior state.
	before := *actual

	sinceLastChange := time.Duration(db.clock.Now().UnixNano() - actual.Since)

	// Increment by default; start over at 1 only for an LRP that had been
	// RUNNING for longer than the reset timeout.
	crashCount := actual.CrashCount + 1
	if actual.State == models.ActualLRPStateRunning && sinceLastChange > models.CrashResetTimeout {
		crashCount = 1
	}

	logger.Debug("retrieved-lrp")
	if !actual.AllowsTransitionTo(key, instanceKey, models.ActualLRPStateCrashed) {
		details := lager.Data{"from_state": actual.State, "same_instance_key": actual.ActualLRPInstanceKey.Equal(instanceKey)}
		logger.Error("failed-to-transition-to-crashed", nil, details)
		return nil, nil, false, models.ErrActualLRPCannotBeCrashed
	}

	actual.State = models.ActualLRPStateCrashed
	actual.Since = db.clock.Now().UnixNano()
	actual.CrashCount = crashCount
	actual.CrashReason = errorMessage
	actual.ActualLRPInstanceKey = models.ActualLRPInstanceKey{}
	actual.ActualLRPNetInfo = models.EmptyActualLRPNetInfo()
	actual.ModificationTag.Increment()

	// Decided after the crash fields above have been updated.
	immediateRestart := actual.ShouldRestartImmediately(models.NewDefaultRestartCalculator())
	if immediateRestart {
		actual.State = models.ActualLRPStateUnclaimed
	}

	payload, serializeErr := db.serializeModel(logger, actual)
	if serializeErr != nil {
		return nil, nil, false, serializeErr
	}

	// Only write if the stored node still has the index that was read above.
	_, err = db.client.CompareAndSwap(ActualLRPSchemaPath(key.ProcessGuid, key.Index), payload, 0, storedIndex)
	if err != nil {
		logger.Error("failed", err)
		return nil, nil, false, models.ErrActualLRPCannotBeCrashed
	}

	logger.Info("succeeded")
	return &models.ActualLRPGroup{Instance: &before}, &models.ActualLRPGroup{Instance: actual}, immediateRestart, nil
}
예제 #4
0
// Test registers a Context with a single spec asserting that the crash is
// restarted no matter which instant is used as the current time: the Unix
// epoch, the value of test.GetSince(), and one hour after test.GetSince().
func (test crashInfoAlwaysStartTest) Test() {
	description := fmt.Sprintf("when the crashCount is %d", test.CrashCount)

	Context(description, func() {
		It("should restart regardless of the wait time", func() {
			restartCalc := models.NewDefaultRestartCalculator()
			anHourLater := test.GetSince() + time.Hour.Nanoseconds()

			// Each entry is a nanosecond Unix timestamp used as "now".
			for _, nanos := range []int64{0, test.GetSince(), anHourLater} {
				Expect(test.ShouldRestartCrash(time.Unix(0, nanos), restartCalc)).To(BeTrue())
			}
		})
	})
}
예제 #5
0
// crashedActualLRPs adds CRASHED ActualLRPs that can be restarted to the list
// of start requests and transitions them to UNCLAIMED.
//
// Each row from selectCrashedLRPs is turned into a minimal ActualLRP (key,
// CRASHED state, Since and CrashCount). Rows for which ShouldRestartCrash
// reports true at time now are handed to c.submit as a function that unclaims
// the LRP and, on success, records a start request from the scheduling info.
// Query, unclaim and row-iteration errors are logged; nothing is returned.
func (c *convergence) crashedActualLRPs(logger lager.Logger, now time.Time) {
	logger = logger.Session("crashed-actual-lrps")
	restartCalculator := models.NewDefaultRestartCalculator()

	rows, err := c.selectCrashedLRPs(logger, c.db)
	if err != nil {
		logger.Error("failed-query", err)
		return
	}
	// Release the result set on every exit path instead of relying on the
	// iteration below running to completion.
	defer rows.Close()

	for rows.Next() {
		var index int
		actual := &models.ActualLRP{}

		schedulingInfo, err := c.fetchDesiredLRPSchedulingInfoAndMore(logger, rows, &index, &actual.Since, &actual.CrashCount)
		if err != nil {
			// NOTE(review): the error is dropped here; presumably the helper
			// logs it itself — confirm.
			continue
		}

		actual.ActualLRPKey = models.NewActualLRPKey(schedulingInfo.ProcessGuid, int32(index), schedulingInfo.Domain)
		actual.State = models.ActualLRPStateCrashed

		if !actual.ShouldRestartCrash(now, restartCalculator) {
			continue
		}

		c.submit(func() {
			// Keep the error local to the submitted function rather than
			// writing to the variable captured from the loop body.
			if _, _, err := c.UnclaimActualLRP(logger, &actual.ActualLRPKey); err != nil {
				logger.Error("failed-unclaiming-actual-lrp", err)
				return
			}

			c.addStartRequestFromSchedulingInfo(logger, schedulingInfo, index)
		})
	}

	if err := rows.Err(); err != nil {
		logger.Error("failed-getting-next-row", err)
	}
}
예제 #6
0
// CrashActualLRP marks the non-evacuating ActualLRP identified by key as
// CRASHED inside a single transaction.
//
// The row is fetched for update, checked for an allowed transition to the
// CRASHED state, and rewritten with an incremented modification tag, an
// updated crash count, the crash reason, a fresh Since timestamp, and cleared
// cell id, instance guid and net info. The crash count restarts at 1 when the
// LRP was RUNNING and its last change is older than models.CrashResetTimeout;
// otherwise it is incremented. If ShouldRestartImmediately reports true, the
// LRP is written as UNCLAIMED instead and the returned bool is true.
//
// On success it returns the LRP group as it was before the change, the group
// after the change, and the immediate-restart decision. On failure it returns
// nil, nil, false and the error from the transaction (for example
// models.ErrActualLRPCannotBeCrashed for a disallowed transition), so callers
// never observe partially populated results next to a non-nil error.
func (db *SQLDB) CrashActualLRP(logger lager.Logger, key *models.ActualLRPKey, instanceKey *models.ActualLRPInstanceKey, crashReason string) (*models.ActualLRPGroup, *models.ActualLRPGroup, bool, error) {
	logger = logger.WithData(lager.Data{"key": key, "instance_key": instanceKey, "crash_reason": crashReason})
	logger.Info("starting")
	defer logger.Info("complete")

	var immediateRestart bool
	var beforeActualLRP models.ActualLRP
	var actualLRP *models.ActualLRP

	err := db.transact(logger, func(logger lager.Logger, tx *sql.Tx) error {
		var err error
		actualLRP, err = db.fetchActualLRPForUpdate(logger, key.ProcessGuid, key.Index, false, tx)
		if err != nil {
			logger.Error("failed-to-get-actual-lrp", err)
			return err
		}
		// Snapshot the LRP before mutating it so the caller can see the prior state.
		beforeActualLRP = *actualLRP

		latestChangeTime := time.Duration(db.clock.Now().UnixNano() - actualLRP.Since)

		var newCrashCount int32
		if latestChangeTime > models.CrashResetTimeout && actualLRP.State == models.ActualLRPStateRunning {
			newCrashCount = 1
		} else {
			newCrashCount = actualLRP.CrashCount + 1
		}

		// NOTE(review): this passes the stored LRP's own key, not the caller's
		// key — confirm that is intended.
		if !actualLRP.AllowsTransitionTo(&actualLRP.ActualLRPKey, instanceKey, models.ActualLRPStateCrashed) {
			logger.Error("failed-to-transition-to-crashed", nil, lager.Data{"from_state": actualLRP.State, "same_instance_key": actualLRP.ActualLRPInstanceKey.Equal(instanceKey)})
			return models.ErrActualLRPCannotBeCrashed
		}

		actualLRP.ModificationTag.Increment()
		actualLRP.State = models.ActualLRPStateCrashed

		actualLRP.ActualLRPInstanceKey.InstanceGuid = ""
		actualLRP.ActualLRPInstanceKey.CellId = ""
		actualLRP.ActualLRPNetInfo = models.ActualLRPNetInfo{}
		actualLRP.CrashCount = newCrashCount
		actualLRP.CrashReason = crashReason
		evacuating := false

		// Assigned (not just set to true) so the flag always reflects the most
		// recent execution of this function.
		immediateRestart = actualLRP.ShouldRestartImmediately(models.NewDefaultRestartCalculator())
		if immediateRestart {
			actualLRP.State = models.ActualLRPStateUnclaimed
		}

		now := db.clock.Now().UnixNano()
		actualLRP.Since = now

		// The crash reason is truncated for storage only; the returned LRP
		// keeps the full text.
		_, err = db.update(logger, tx, actualLRPsTable,
			SQLAttributes{
				"state":                  actualLRP.State,
				"cell_id":                actualLRP.CellId,
				"instance_guid":          actualLRP.InstanceGuid,
				"modification_tag_index": actualLRP.ModificationTag.Index,
				"crash_count":            actualLRP.CrashCount,
				"crash_reason":           truncateString(actualLRP.CrashReason, 1024),
				"since":                  actualLRP.Since,
				"net_info":               []byte{},
			},
			"process_guid = ? AND instance_index = ? AND evacuating = ?",
			key.ProcessGuid, key.Index, evacuating,
		)
		if err != nil {
			logger.Error("failed-to-crash-actual-lrp", err)
			return db.convertSQLError(err)
		}

		return nil
	})
	if err != nil {
		// Do not hand back half-built groups (the "after" LRP may be nil) or a
		// stale restart decision together with an error.
		return nil, nil, false, err
	}

	return &models.ActualLRPGroup{Instance: &beforeActualLRP}, &models.ActualLRPGroup{Instance: actualLRP}, immediateRestart, nil
}
예제 #7
0
			Expect(calc.ShouldRestart(0, 0, 3)).To(BeFalse())
			Expect(calc.ShouldRestart(nanoseconds(30), 0, 3)).To(BeTrue())

			Expect(calc.ShouldRestart(nanoseconds(30), 0, 4)).To(BeFalse())
			Expect(calc.ShouldRestart(nanoseconds(59), 0, 4)).To(BeFalse())
			Expect(calc.ShouldRestart(nanoseconds(60), 0, 4)).To(BeTrue())
			Expect(calc.ShouldRestart(nanoseconds(60), 0, 5)).To(BeFalse())
			Expect(calc.ShouldRestart(nanoseconds(118), 0, 5)).To(BeFalse())
			Expect(calc.ShouldRestart(nanoseconds(119), 0, 5)).To(BeTrue())
		})
	})

	Describe("Validate", func() {
		It("the default values are valid", func() {
			calc := models.NewDefaultRestartCalculator()
			Expect(calc.Validate()).NotTo(HaveOccurred())
		})

		It("invalid when MaxBackoffDuration is lower than the CrashBackoffMinDuration", func() {
			calc := models.NewRestartCalculator(models.DefaultImmediateRestarts, models.CrashBackoffMinDuration-time.Second, models.DefaultMaxRestarts)
			Expect(calc.Validate()).To(HaveOccurred())
		})
	})
})

var _ = Describe("ActualLRP", func() {
	Describe("ShouldRestartCrash", func() {
		Context("when the lpr is CRASHED", func() {
			const maxWaitTime = 16 * time.Minute
			var now = time.Now().UnixNano()