Пример #1
0
// LRPInstances builds one cc_messages.LRPInstance per entry of
// actualLRPGroups, in the same order.
//
// For each group the LRP returned by Resolve is used (the second return
// value of Resolve is ignored). Since is converted from nanoseconds to whole
// seconds, and Uptime is the number of whole seconds between that LRP's Since
// and the current time reported by clk.
//
// When addInfo is non-nil it is invoked with a pointer to the freshly built
// instance and the resolved LRP before the instance is stored, so it can fill
// in additional fields.
func LRPInstances(
	actualLRPGroups []*models.ActualLRPGroup,
	addInfo func(*cc_messages.LRPInstance, *models.ActualLRP),
	clk clock.Clock,
) []cc_messages.LRPInstance {
	// actual.Since and UnixNano are both in nanoseconds.
	const nanosPerSecond = 1e9

	result := make([]cc_messages.LRPInstance, 0, len(actualLRPGroups))
	for _, group := range actualLRPGroups {
		lrp, _ := group.Resolve()

		entry := cc_messages.LRPInstance{
			ProcessGuid:  lrp.ProcessGuid,
			InstanceGuid: lrp.InstanceGuid,
			Index:        uint(lrp.Index),
			Since:        lrp.Since / nanosPerSecond,
			Uptime:       (clk.Now().UnixNano() - lrp.Since) / nanosPerSecond,
			State:        cc_conv.StateFor(lrp.State),
		}

		if addInfo != nil {
			addInfo(&entry, lrp)
		}

		result = append(result, entry)
	}

	return result
}
Пример #2
0
// waitForGarden blocks until garden answers a ping.
//
// Until we get a successful response from garden, it periodically emits
// metrics saying how long we've been trying, while retrying the connection
// indefinitely:
//
//   - a nil ping result sends 0 on stalledDuration and returns nil;
//   - a garden.UnrecoverableError is logged and returned to the caller;
//   - any other error is logged and another ping is scheduled after
//     PingGardenInterval.
//
// Every StalledMetricHeartbeatInterval the time elapsed since the first
// attempt is sent on stalledDuration.
//
// Both timers are stopped when the function returns so they do not stay
// armed after the wait is over.
func waitForGarden(logger lager.Logger, gardenClient GardenClient.Client, clock clock.Clock) error {
	pingStart := clock.Now()
	logger = logger.Session("wait-for-garden", lager.Data{"initialTime:": pingStart})

	// A zero-duration timer makes the first ping fire immediately.
	pingRequest := clock.NewTimer(0)
	defer pingRequest.Stop()

	pingResponse := make(chan error)

	heartbeatTimer := clock.NewTimer(StalledMetricHeartbeatInterval)
	defer heartbeatTimer.Stop()

	for {
		select {
		case <-pingRequest.C():
			// Ping in the background so the heartbeat keeps being emitted
			// while a ping is in flight. pingRequest is only re-armed after
			// a response has been received, so at most one ping goroutine is
			// outstanding and its send below is always received.
			go func() {
				logger.Info("ping-garden", lager.Data{"wait-time-ns:": clock.Since(pingStart)})
				pingResponse <- gardenClient.Ping()
			}()

		case err := <-pingResponse:
			switch err.(type) {
			case nil:
				logger.Info("ping-garden-success", lager.Data{"wait-time-ns:": clock.Since(pingStart)})
				// send 0 to indicate ping responded successfully
				stalledDuration.Send(0)
				return nil
			case garden.UnrecoverableError:
				logger.Error("failed-to-ping-garden-with-unrecoverable-error", err)
				return err
			default:
				logger.Error("failed-to-ping-garden", err)
				pingRequest.Reset(PingGardenInterval)
			}

		case <-heartbeatTimer.C():
			logger.Info("emitting-stalled-garden-heartbeat", lager.Data{"wait-time-ns:": clock.Since(pingStart)})
			stalledDuration.Send(clock.Since(pingStart))
			heartbeatTimer.Reset(StalledMetricHeartbeatInterval)
		}
	}
}
Пример #3
0
// Send runs one pass of the sender.
//
// It records the current time from clock, then verifies store freshness and
// loads the pending start messages, pending stop messages and apps from the
// store. A failure in any of those steps is logged and returned immediately.
//
// Afterwards the pending start and stop messages are sent, the sent-message
// metrics are incremented, and the start/stop messages queued for saving or
// deletion are written back to the store. A failure in these follow-up steps
// is logged and clears sender.didSucceed, but does not stop the remaining
// steps from running. If sender.didSucceed is false at the end, a generic
// error is returned; otherwise nil.
func (sender *Sender) Send(clock clock.Clock) error {
	sender.currentTime = clock.Now()
	if err := sender.store.VerifyFreshness(sender.currentTime); err != nil {
		sender.logger.Error("Store is not fresh", err)
		return err
	}

	pendingStartMessages, err := sender.store.GetPendingStartMessages()
	if err != nil {
		sender.logger.Error("Failed to fetch pending start messages", err)
		return err
	}

	pendingStopMessages, err := sender.store.GetPendingStopMessages()
	if err != nil {
		sender.logger.Error("Failed to fetch pending stop messages", err)
		return err
	}

	sender.apps, err = sender.store.GetApps()
	if err != nil {
		sender.logger.Error("Failed to fetch apps", err)
		return err
	}

	sender.sendStartMessages(pendingStartMessages)
	sender.sendStopMessages(pendingStopMessages)

	// noteFailure logs a non-nil error from a follow-up step and marks the
	// pass as failed without aborting the steps that come after it.
	noteFailure := func(subject string, stepErr error) {
		if stepErr == nil {
			return
		}
		sender.logger.Error(subject, stepErr)
		sender.didSucceed = false
	}

	noteFailure(
		"Failed to increment metrics",
		sender.metricsAccountant.IncrementSentMessageMetrics(sender.sentStartMessages, sender.sentStopMessages),
	)
	noteFailure(
		"Failed to save start messages",
		sender.store.SavePendingStartMessages(sender.startMessagesToSave...),
	)
	noteFailure(
		"Failed to delete start messages",
		sender.store.DeletePendingStartMessages(sender.startMessagesToDelete...),
	)
	noteFailure(
		"Failed to save stop messages",
		sender.store.SavePendingStopMessages(sender.stopMessagesToSave...),
	)
	noteFailure(
		"Failed to delete stop messages",
		sender.store.DeletePendingStopMessages(sender.stopMessagesToDelete...),
	)

	if sender.didSucceed {
		return nil
	}

	return errors.New("Sender failed. See logs for details.")
}
Пример #4
0
// CalculateConvergence walks every process guid in input.AllProcessGuids and
// classifies its actual LRPs into the returned ConvergenceChanges.
//
// For a process guid that has a desired LRP:
//   - every index below desired.Instances with no actual LRP is recorded in
//     ActualLRPKeysForMissingIndices;
//   - each existing actual LRP is placed in at most one bucket, checked in
//     this order: missing cell, extra index (only when the desired LRP's
//     domain is in input.Domains), restartable crash, stale unclaimed.
//
// For a process guid without a desired LRP, every actual LRP whose domain is
// in input.Domains is recorded in ActualLRPsForExtraIndices; the others are
// skipped.
//
// The total missing and extra counts are reported through missingLRPs and
// extraLRPs before returning.
func CalculateConvergence(
	logger lager.Logger,
	clock clock.Clock,
	restartCalculator models.RestartCalculator,
	input *models.ConvergenceInput,
) *models.ConvergenceChanges {
	sess := logger.Session("calculate-convergence")

	sess.Info("start")
	defer sess.Info("done")

	var (
		missingLRPCount int
		extraLRPCount   int
	)

	changes := &models.ConvergenceChanges{}
	now := clock.Now()

	for processGuid := range input.AllProcessGuids {
		pLog := sess.WithData(lager.Data{
			"process_guid": processGuid,
		})

		desired, hasDesired := input.DesiredLRPs[processGuid]
		actualsByIndex := input.ActualLRPs[processGuid]

		if !hasDesired {
			// Nothing is desired for this process: every actual LRP in a
			// known domain is surplus.
			for index, actual := range actualsByIndex {
				if !input.Domains.Contains(actual.Domain) {
					pLog.Info("skipping-unfresh-domain")
					continue
				}

				pLog.Info("no-longer-desired", lager.Data{"index": index})
				extraLRPCount++
				changes.ActualLRPsForExtraIndices = append(changes.ActualLRPsForExtraIndices, actual)
			}
			continue
		}

		// Desired indices that have no actual LRP at all.
		for index := int32(0); index < desired.Instances; index++ {
			if _, present := actualsByIndex[index]; present {
				continue
			}

			pLog.Info("missing", lager.Data{"index": index})
			missingLRPCount++
			lrpKey := models.NewActualLRPKey(desired.ProcessGuid, index, desired.Domain)
			changes.ActualLRPKeysForMissingIndices = append(
				changes.ActualLRPKeysForMissingIndices,
				&lrpKey,
			)
		}

		// Existing actual LRPs: the first matching case wins, so each LRP
		// lands in at most one bucket.
		for index, actual := range actualsByIndex {
			switch {
			case actual.CellIsMissing(input.Cells):
				pLog.Info("missing-cell", lager.Data{"index": index, "cell_id": actual.CellId})
				changes.ActualLRPsWithMissingCells = append(changes.ActualLRPsWithMissingCells, actual)

			case actual.Index >= desired.Instances && input.Domains.Contains(desired.Domain):
				pLog.Info("extra", lager.Data{"index": index})
				extraLRPCount++
				changes.ActualLRPsForExtraIndices = append(changes.ActualLRPsForExtraIndices, actual)

			case actual.ShouldRestartCrash(now, restartCalculator):
				pLog.Info("restart-crash", lager.Data{"index": index})
				changes.RestartableCrashedActualLRPs = append(changes.RestartableCrashedActualLRPs, actual)

			case actual.ShouldStartUnclaimed(now):
				pLog.Info("stale-unclaimed", lager.Data{"index": index})
				changes.StaleUnclaimedActualLRPs = append(changes.StaleUnclaimedActualLRPs, actual)
			}
		}
	}

	missingLRPs.Send(missingLRPCount)
	extraLRPs.Send(extraLRPCount)

	return changes
}
Пример #5
0
// dumpApp prints a human-readable summary of app to standard output.
//
// The summary contains the app guid and version, its desired state (or
// "NO"), its instance heartbeats (or "NONE"), its crash counts if any, and
// the pending start and stop messages from starts and stops whose AppGuid and
// AppVersion match app. For each pending message the time remaining until it
// is sent (or, once sent, until SentOn+KeepAlive) is shown relative to the
// current time reported by clock.
//
// The DEA guid of each heartbeat is abbreviated to its first five bytes; a
// guid shorter than that is printed in full instead of panicking.
func dumpApp(app *models.App, starts map[string]models.PendingStartMessage, stops map[string]models.PendingStopMessage, clock clock.Clock) {
	// Number of leading bytes of a DEA guid shown in the heartbeat list.
	const deaGuidPrefixLen = 5

	// until reports how far the given Unix timestamp (in seconds) lies from
	// the clock's current time; the result is negative for past timestamps.
	until := func(unixSeconds int64) time.Duration {
		return time.Unix(unixSeconds, 0).Sub(clock.Now())
	}

	fmt.Printf("\n")
	fmt.Printf("Guid: %s | Version: %s\n", app.AppGuid, app.AppVersion)
	if app.IsDesired() {
		fmt.Printf("  Desired: [%d] instances, (%s, %s)\n", app.Desired.NumberOfInstances, app.Desired.State, app.Desired.PackageState)
	} else {
		fmt.Printf("  Desired: NO\n")
	}

	if len(app.InstanceHeartbeats) == 0 {
		fmt.Printf("  Heartbeats: NONE\n")
	} else {
		fmt.Printf("  Heartbeats:\n")
		for _, heartbeat := range app.InstanceHeartbeats {
			// Only truncate when the guid is long enough; slicing a shorter
			// string to [0:5] would panic.
			deaGuid := heartbeat.DeaGuid
			if len(deaGuid) > deaGuidPrefixLen {
				deaGuid = deaGuid[:deaGuidPrefixLen]
			}
			fmt.Printf("    [%d %s] %s on %s\n", heartbeat.InstanceIndex, heartbeat.State, heartbeat.InstanceGuid, deaGuid)
		}
	}

	if len(app.CrashCounts) != 0 {
		fmt.Printf("  CrashCounts:")
		for _, crashCount := range app.CrashCounts {
			fmt.Printf(" [%d]:%d", crashCount.InstanceIndex, crashCount.CrashCount)
		}
		fmt.Printf("\n")
	}

	// Keep only the pending messages that belong to this app version.
	var appStarts []models.PendingStartMessage
	for _, start := range starts {
		if start.AppGuid == app.AppGuid && start.AppVersion == app.AppVersion {
			appStarts = append(appStarts, start)
		}
	}

	var appStops []models.PendingStopMessage
	for _, stop := range stops {
		if stop.AppGuid == app.AppGuid && stop.AppVersion == app.AppVersion {
			appStops = append(appStops, stop)
		}
	}

	if len(appStarts) > 0 {
		fmt.Printf("  Pending Starts:\n")
		for _, start := range appStarts {
			message := []string{
				fmt.Sprintf("[%d]", start.IndexToStart),
				fmt.Sprintf("priority:%.2f", start.Priority),
			}
			if start.SkipVerification {
				message = append(message, "NO VERIFICATION")
			}
			if start.SentOn != 0 {
				// Already sent: show the time left until SentOn+KeepAlive.
				message = append(message, "send:SENT")
				message = append(message, fmt.Sprintf("delete:%s", until(start.SentOn+int64(start.KeepAlive))))
			} else {
				message = append(message, fmt.Sprintf("send:%s", until(start.SendOn)))
			}

			fmt.Printf("    %s\n", strings.Join(message, " "))
		}
	}

	if len(appStops) > 0 {
		fmt.Printf("  Pending Stops:\n")
		for _, stop := range appStops {
			message := []string{stop.InstanceGuid}
			if stop.SentOn != 0 {
				// Already sent: show the time left until SentOn+KeepAlive.
				message = append(message, "send:SENT")
				message = append(message, fmt.Sprintf("delete:%s", until(stop.SentOn+int64(stop.KeepAlive))))
			} else {
				message = append(message, fmt.Sprintf("send:%s", until(stop.SendOn)))
			}

			fmt.Printf("    %s\n", strings.Join(message, " "))
		}
	}
}