Пример #1
0
// Test registers Ginkgo specs for this fixture: ShouldRestartCrash must
// report false one second before Since+WaitTime and true exactly at
// Since+WaitTime, using the default restart calculator.
func (test crashInfoBackoffTest) Test() {
	description := fmt.Sprintf("when the crashCount is %d and the wait time is %s", test.CrashCount, test.WaitTime)

	Context(description, func() {
		It("should NOT restart before the expected wait time", func() {
			calculator := models.NewDefaultRestartCalculator()
			// Probe one second short of the moment the wait time is reached.
			deadline := test.GetSince() + test.WaitTime.Nanoseconds()
			tooEarly := time.Unix(0, deadline-time.Second.Nanoseconds())
			Expect(test.ShouldRestartCrash(tooEarly, calculator)).To(BeFalse())
		})

		It("should restart after the expected wait time", func() {
			calculator := models.NewDefaultRestartCalculator()
			// Probe exactly at the moment the wait time is reached.
			deadline := time.Unix(0, test.GetSince()+test.WaitTime.Nanoseconds())
			Expect(test.ShouldRestartCrash(deadline, calculator)).To(BeTrue())
		})
	})
}
Пример #2
0
// ConvergeLRPs runs one convergence pass: it bumps the run counter, gathers
// (and prunes) the LRP data for the given cell set, calculates the required
// changes with the default restart calculator, and returns whatever
// ResolveConvergence produces for those changes.
//
// If gathering the input fails, the error is logged and all three results
// are nil. The elapsed time is sent as a metric when the function returns;
// a failure to send it is logged and otherwise ignored.
func (db *ETCDDB) ConvergeLRPs(logger lager.Logger, cellSet models.CellSet) ([]*auctioneer.LRPStartRequest, []*models.ActualLRPKeyWithSchedulingInfo, []*models.ActualLRPKey) {
	startedAt := db.clock.Now()
	convergeLRPRunsCounter.Increment()

	logger = logger.Session("etcd")
	logger.Info("starting-convergence")
	defer logger.Info("finished-convergence")

	// Deferred calls run last-in-first-out, so the duration metric is sent
	// before the "finished-convergence" line is logged.
	// NOTE(review): startedAt comes from db.clock but the elapsed time is
	// measured with time.Since — confirm this is intended when db.clock is
	// not the wall clock.
	defer func() {
		if sendErr := convergeLRPDuration.Send(time.Since(startedAt)); sendErr != nil {
			logger.Error("failed-sending-converge-lrp-duration-metric", sendErr)
		}
	}()

	logger.Debug("gathering-convergence-input")
	input, gatherErr := db.GatherAndPruneLRPs(logger, cellSet)
	if gatherErr != nil {
		logger.Error("failed-gathering-convergence-input", gatherErr)
		return nil, nil, nil
	}
	logger.Debug("succeeded-gathering-convergence-input")

	calculator := models.NewDefaultRestartCalculator()
	changes := CalculateConvergence(logger, db.clock, calculator, input)
	return db.ResolveConvergence(logger, input.DesiredLRPs, changes)
}
Пример #3
0
// CrashActualLRP records a crash for the actual LRP identified by key and
// instanceKey and stores the updated record with a compare-and-swap against
// the index it was read at.
//
// It returns the LRP group as it was before the change, the group after the
// change, and whether the LRP was put straight back into the UNCLAIMED state
// for an immediate restart. It fails with the lookup error if the LRP cannot
// be read, with the serialization error if the record cannot be encoded, and
// with models.ErrActualLRPCannotBeCrashed if the transition is not allowed
// or the compare-and-swap is rejected.
func (db *ETCDDB) CrashActualLRP(logger lager.Logger, key *models.ActualLRPKey, instanceKey *models.ActualLRPInstanceKey, errorMessage string) (*models.ActualLRPGroup, *models.ActualLRPGroup, bool, error) {
	logger = logger.WithData(lager.Data{"actual_lrp_key": key, "actual_lrp_instance_key": instanceKey})
	logger.Info("starting")

	lrp, prevIndex, err := db.rawActualLRPByProcessGuidAndIndex(logger, key.ProcessGuid, key.Index)
	if err != nil {
		logger.Error("failed-to-get-actual-lrp", err)
		return nil, nil, false, err
	}
	// Copy the record before it is mutated below so the caller gets the
	// "before" picture.
	before := *lrp

	// The crash count starts over at 1 when the LRP was RUNNING and its last
	// state change is older than CrashResetTimeout; otherwise it accumulates.
	sinceLastChange := time.Duration(db.clock.Now().UnixNano() - lrp.Since)
	crashCount := lrp.CrashCount + 1
	if sinceLastChange > models.CrashResetTimeout && lrp.State == models.ActualLRPStateRunning {
		crashCount = 1
	}

	logger.Debug("retrieved-lrp")
	if !lrp.AllowsTransitionTo(key, instanceKey, models.ActualLRPStateCrashed) {
		logger.Error("failed-to-transition-to-crashed", nil, lager.Data{"from_state": lrp.State, "same_instance_key": lrp.ActualLRPInstanceKey.Equal(instanceKey)})
		return nil, nil, false, models.ErrActualLRPCannotBeCrashed
	}

	lrp.State = models.ActualLRPStateCrashed
	lrp.Since = db.clock.Now().UnixNano()
	lrp.CrashCount = crashCount
	lrp.ActualLRPInstanceKey = models.ActualLRPInstanceKey{}
	lrp.ActualLRPNetInfo = models.EmptyActualLRPNetInfo()
	lrp.ModificationTag.Increment()
	lrp.CrashReason = errorMessage

	// When the calculator allows an immediate restart the record is stored
	// as UNCLAIMED rather than CRASHED.
	immediateRestart := lrp.ShouldRestartImmediately(models.NewDefaultRestartCalculator())
	if immediateRestart {
		lrp.State = models.ActualLRPStateUnclaimed
	}

	payload, serializeErr := db.serializeModel(logger, lrp)
	if serializeErr != nil {
		return nil, nil, false, serializeErr
	}

	if _, casErr := db.client.CompareAndSwap(ActualLRPSchemaPath(key.ProcessGuid, key.Index), payload, 0, prevIndex); casErr != nil {
		logger.Error("failed", casErr)
		return nil, nil, false, models.ErrActualLRPCannotBeCrashed
	}

	logger.Info("succeeded")
	return &models.ActualLRPGroup{Instance: &before}, &models.ActualLRPGroup{Instance: lrp}, immediateRestart, nil
}
Пример #4
0
// Test registers a Ginkgo spec for this fixture: with the default restart
// calculator, ShouldRestartCrash must report true no matter which point in
// time it is asked about.
func (test crashInfoAlwaysStartTest) Test() {
	description := fmt.Sprintf("when the crashCount is %d", test.CrashCount)

	Context(description, func() {
		It("should restart regardless of the wait time", func() {
			calculator := models.NewDefaultRestartCalculator()
			anHourLater := test.GetSince() + time.Hour.Nanoseconds()

			// Probe the Unix epoch, the Since timestamp itself, and one hour
			// past the Since timestamp.
			for _, nanos := range []int64{0, test.GetSince(), anHourLater} {
				Expect(test.ShouldRestartCrash(time.Unix(0, nanos), calculator)).To(BeTrue())
			}
		})
	})
}
Пример #5
0
// ConvergeLRPs runs one convergence pass: it bumps the run counter, gathers
// (and prunes) the LRP data, calculates the required changes with the
// default restart calculator, and hands them to ResolveConvergence.
//
// If gathering the input fails, the error is logged and the pass ends
// without resolving anything. The elapsed time is sent as a metric when the
// function returns.
func (db *ETCDDB) ConvergeLRPs(logger lager.Logger) {
	startedAt := db.clock.Now()
	convergeLRPRunsCounter.Increment()

	logger = logger.Session("converge-lrps")
	logger.Info("starting-convergence")
	defer logger.Info("finished-convergence")

	// Deferred calls run last-in-first-out, so the duration metric is sent
	// before the "finished-convergence" line is logged. The elapsed time
	// must be computed inside the closure so it is measured at return.
	defer func() {
		elapsed := time.Since(startedAt)
		convergeLRPDuration.Send(elapsed)
	}()

	logger.Debug("gathering-convergence-input")
	input, gatherErr := db.GatherAndPruneLRPs(logger)
	if gatherErr != nil {
		logger.Error("failed-gathering-convergence-input", gatherErr)
		return
	}
	logger.Debug("succeeded-gathering-convergence-input")

	calculator := models.NewDefaultRestartCalculator()
	changes := CalculateConvergence(logger, db.clock, calculator, input)
	db.ResolveConvergence(logger, input.DesiredLRPs, changes)
}
Пример #6
0
			Expect(calc.ShouldRestart(0, 0, 3)).To(BeFalse())
			Expect(calc.ShouldRestart(nanoseconds(30), 0, 3)).To(BeTrue())

			Expect(calc.ShouldRestart(nanoseconds(30), 0, 4)).To(BeFalse())
			Expect(calc.ShouldRestart(nanoseconds(59), 0, 4)).To(BeFalse())
			Expect(calc.ShouldRestart(nanoseconds(60), 0, 4)).To(BeTrue())
			Expect(calc.ShouldRestart(nanoseconds(60), 0, 5)).To(BeFalse())
			Expect(calc.ShouldRestart(nanoseconds(118), 0, 5)).To(BeFalse())
			Expect(calc.ShouldRestart(nanoseconds(119), 0, 5)).To(BeTrue())
		})
	})

	// Validate specs: the default calculator must pass validation, while a
	// calculator whose max backoff duration is one second below
	// CrashBackoffMinDuration must be rejected.
	Describe("Validate", func() {
		It("the default values are valid", func() {
			defaults := models.NewDefaultRestartCalculator()
			Expect(defaults.Validate()).NotTo(HaveOccurred())
		})

		It("invalid when MaxBackoffDuration is lower than the CrashBackoffMinDuration", func() {
			tooShortMaxBackoff := models.CrashBackoffMinDuration - time.Second
			misconfigured := models.NewRestartCalculator(models.DefaultImmediateRestarts, tooShortMaxBackoff, models.DefaultMaxRestarts)
			Expect(misconfigured.Validate()).To(HaveOccurred())
		})
	})
})

var _ = Describe("ActualLRP", func() {
	Describe("ShouldRestartCrash", func() {
		Context("when the lpr is CRASHED", func() {
			const maxWaitTime = 16 * time.Minute
			var now = time.Now().UnixNano()
Пример #7
0
// CrashActualLRP records a crash for the actual LRP named in request and
// stores the updated record with a compare-and-swap against the index it was
// read at. When the default restart calculator allows an immediate restart,
// the record is stored as UNCLAIMED and an auction is requested for the LRP.
//
// It returns nil on success. It returns the lookup error if the LRP cannot
// be read, models.ErrActualLRPCannotBeCrashed if the transition is not
// allowed or the compare-and-swap is rejected, models.ErrSerializeJSON if
// the record cannot be encoded, and the auction error if requesting the
// restart auction fails.
func (db *ETCDDB) CrashActualLRP(logger lager.Logger, request *models.CrashActualLRPRequest) *models.Error {
	key := request.ActualLrpKey
	instanceKey := request.ActualLrpInstanceKey
	errorMessage := request.ErrorMessage
	logger.Info("starting")

	lrp, prevIndex, bbsErr := db.rawActuaLLRPByProcessGuidAndIndex(logger, key.ProcessGuid, key.Index)
	if bbsErr != nil {
		logger.Error("failed-to-get-actual-lrp", bbsErr)
		return bbsErr
	}

	latestChangeTime := time.Duration(db.clock.Now().UnixNano() - lrp.Since)

	// The crash count starts over at 1 when the LRP was RUNNING and its last
	// state change is older than CrashResetTimeout; otherwise it accumulates.
	var newCrashCount int32
	if latestChangeTime > models.CrashResetTimeout && lrp.State == models.ActualLRPStateRunning {
		newCrashCount = 1
	} else {
		newCrashCount = lrp.CrashCount + 1
	}

	logger.Debug("retrieved-lrp")
	if !lrp.AllowsTransitionTo(key, instanceKey, models.ActualLRPStateCrashed) {
		err := fmt.Errorf("cannot transition crashed lrp from state %s to state %s", lrp.State, models.ActualLRPStateCrashed)
		logger.Error("failed-to-transition-actual", err)
		return models.ErrActualLRPCannotBeCrashed
	}

	// Explicit guard: UNCLAIMED and CRASHED LRPs cannot be crashed, and a
	// CLAIMED or RUNNING LRP can only be crashed by its own instance key.
	if lrp.State == models.ActualLRPStateUnclaimed || lrp.State == models.ActualLRPStateCrashed ||
		((lrp.State == models.ActualLRPStateClaimed || lrp.State == models.ActualLRPStateRunning) &&
			!lrp.ActualLRPInstanceKey.Equal(instanceKey)) {
		return models.ErrActualLRPCannotBeCrashed
	}

	lrp.State = models.ActualLRPStateCrashed
	lrp.Since = db.clock.Now().UnixNano()
	lrp.CrashCount = newCrashCount
	lrp.ActualLRPInstanceKey = models.ActualLRPInstanceKey{}
	lrp.ActualLRPNetInfo = models.EmptyActualLRPNetInfo()
	lrp.ModificationTag.Increment()
	lrp.CrashReason = errorMessage

	var immediateRestart bool
	if lrp.ShouldRestartImmediately(models.NewDefaultRestartCalculator()) {
		lrp.State = models.ActualLRPStateUnclaimed
		immediateRestart = true
	}

	lrpRawJSON, err := json.Marshal(lrp)
	if err != nil {
		return models.ErrSerializeJSON
	}

	_, err = db.client.CompareAndSwap(ActualLRPSchemaPath(key.ProcessGuid, key.Index), string(lrpRawJSON), 0, "", prevIndex)
	if err != nil {
		logger.Error("failed", err)
		return models.ErrActualLRPCannotBeCrashed
	}

	if immediateRestart {
		// Check the auction's own result. The previous code tested err, which
		// is always nil at this point, so auction failures were silently
		// dropped and the call reported success.
		if auctionErr := db.requestLRPAuctionForLRPKey(logger, key); auctionErr != nil {
			return auctionErr
		}
	}

	logger.Info("succeeded")
	return nil
}
Пример #8
0
// CrashActualLRP records a crash for the actual LRP identified by key and
// instanceKey and stores the updated record with a compare-and-swap against
// the index it was read at. When the default restart calculator allows an
// immediate restart, the record is stored as UNCLAIMED and an auction is
// requested for the LRP.
//
// It returns nil on success. It returns the lookup error if the LRP cannot
// be read, models.ErrActualLRPCannotBeCrashed if the transition is not
// allowed or the compare-and-swap is rejected, the serialization error if
// the record cannot be encoded, and the auction error if requesting the
// restart auction fails.
func (db *ETCDDB) CrashActualLRP(logger lager.Logger, key *models.ActualLRPKey, instanceKey *models.ActualLRPInstanceKey, errorMessage string) error {
	logger = logger.Session("crash-actual-lrp", lager.Data{"actual_lrp_key": key, "actual_lrp_instance_key": instanceKey})
	logger.Info("starting")

	lrp, prevIndex, err := db.rawActuaLLRPByProcessGuidAndIndex(logger, key.ProcessGuid, key.Index)
	if err != nil {
		logger.Error("failed-to-get-actual-lrp", err)
		return err
	}

	// The crash count starts over at 1 when the LRP was RUNNING and its last
	// state change is older than CrashResetTimeout; otherwise it accumulates.
	sinceLastChange := time.Duration(db.clock.Now().UnixNano() - lrp.Since)
	crashCount := lrp.CrashCount + 1
	if sinceLastChange > models.CrashResetTimeout && lrp.State == models.ActualLRPStateRunning {
		crashCount = 1
	}

	logger.Debug("retrieved-lrp")
	if !lrp.AllowsTransitionTo(key, instanceKey, models.ActualLRPStateCrashed) {
		transitionErr := fmt.Errorf("cannot transition crashed lrp from state %s to state %s", lrp.State, models.ActualLRPStateCrashed)
		logger.Error("failed-to-transition-actual", transitionErr)
		return models.ErrActualLRPCannotBeCrashed
	}

	// Explicit guard: UNCLAIMED and CRASHED LRPs cannot be crashed, and a
	// CLAIMED or RUNNING LRP can only be crashed by its own instance key.
	var rejected bool
	switch lrp.State {
	case models.ActualLRPStateUnclaimed, models.ActualLRPStateCrashed:
		rejected = true
	case models.ActualLRPStateClaimed, models.ActualLRPStateRunning:
		rejected = !lrp.ActualLRPInstanceKey.Equal(instanceKey)
	}
	if rejected {
		logger.Debug("cannot-be-crashed", lager.Data{"state": lrp.State, "same-instance-key": lrp.ActualLRPInstanceKey.Equal(instanceKey)})
		return models.ErrActualLRPCannotBeCrashed
	}

	lrp.State = models.ActualLRPStateCrashed
	lrp.Since = db.clock.Now().UnixNano()
	lrp.CrashCount = crashCount
	lrp.ActualLRPInstanceKey = models.ActualLRPInstanceKey{}
	lrp.ActualLRPNetInfo = models.EmptyActualLRPNetInfo()
	lrp.ModificationTag.Increment()
	lrp.CrashReason = errorMessage

	// When the calculator allows an immediate restart the record is stored
	// as UNCLAIMED rather than CRASHED.
	immediateRestart := lrp.ShouldRestartImmediately(models.NewDefaultRestartCalculator())
	if immediateRestart {
		lrp.State = models.ActualLRPStateUnclaimed
	}

	payload, serializeErr := db.serializeModel(logger, lrp)
	if serializeErr != nil {
		return serializeErr
	}

	if _, casErr := db.client.CompareAndSwap(ActualLRPSchemaPath(key.ProcessGuid, key.Index), payload, 0, prevIndex); casErr != nil {
		logger.Error("failed", casErr)
		return models.ErrActualLRPCannotBeCrashed
	}

	if immediateRestart {
		if auctionErr := db.requestLRPAuctionForLRPKey(logger, key); auctionErr != nil {
			return auctionErr
		}
	}

	logger.Info("succeeded")
	return nil
}