Example #1
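// GatherAndPruneDesiredLRPs walks the desired-LRP subtree in etcd, where each
// desired LRP is stored as two records: a SchedulingInfo and a RunInfo. It
// deserializes both halves in parallel, deletes malformed or orphaned records,
// and returns the complete DesiredLRPs keyed by process GUID.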
func (db *ETCDDB) GatherAndPruneDesiredLRPs(logger lager.Logger, guids map[string]struct{}, lmc *LRPMetricCounter) (map[string]*models.DesiredLRP, error) {
	desiredLRPsRoot, modelErr := db.fetchRecursiveRaw(logger, DesiredLRPComponentsSchemaRoot)

	if modelErr == models.ErrResourceNotFound {
		logger.Info("actual-lrp-schema-root-not-found")
		return map[string]*models.DesiredLRP{}, nil
	}

	if modelErr != nil {
		return nil, modelErr
	}

	schedulingInfos := map[string]*models.DesiredLRPSchedulingInfo{}
	runInfos := map[string]*models.DesiredLRPRunInfo{}

	var malformedSchedulingInfos, malformedRunInfos []string

	var guidsLock, schedulingInfosLock, runInfosLock sync.Mutex

	works := []func(){}
	logger.Debug("walking-desired-lrp-components-tree")

	for _, componentRoot := range desiredLRPsRoot.Nodes {
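		// Each child of the root holds one kind of record; build a work
		// function per node so deserialization can run concurrently.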
		switch componentRoot.Key {
		case DesiredLRPSchedulingInfoSchemaRoot:
			for _, node := range componentRoot.Nodes {
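				// Shadow the loop variable so each closure captures its own node.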
				node := node
				works = append(works, func() {
					var schedulingInfo models.DesiredLRPSchedulingInfo
					err := db.deserializeModel(logger, node, &schedulingInfo)
					if err == nil {
						err = schedulingInfo.Validate()
					}
					if err != nil {
						logger.Error("failed-to-deserialize-scheduling-info", err)
						schedulingInfosLock.Lock()
						malformedSchedulingInfos = append(malformedSchedulingInfos, node.Key)
						schedulingInfosLock.Unlock()
					} else {
						schedulingInfosLock.Lock()
						schedulingInfos[schedulingInfo.ProcessGuid] = &schedulingInfo
						schedulingInfosLock.Unlock()
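						// Tally desired instances for the convergence metric;
						// the atomic add avoids taking a lock on the counter.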
						atomic.AddInt32(&lmc.desiredLRPs, schedulingInfo.Instances)

						guidsLock.Lock()
						guids[schedulingInfo.ProcessGuid] = struct{}{}
						guidsLock.Unlock()
					}
				})
			}
		case DesiredLRPRunInfoSchemaRoot:
			for _, node := range componentRoot.Nodes {
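				// Shadow the loop variable, as above.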
				node := node
				works = append(works, func() {
					var runInfo models.DesiredLRPRunInfo
					err := db.deserializeModel(logger, node, &runInfo)
					if err == nil {
						err = runInfo.Validate()
					}
					if err != nil {
						logger.Error("failed-to-deserialize-run-info", err)
						runInfosLock.Lock()
						malformedRunInfos = append(malformedRunInfos, node.Key)
						runInfosLock.Unlock()
					} else {
						runInfosLock.Lock()
						runInfos[runInfo.ProcessGuid] = &runInfo
						runInfosLock.Unlock()
					}
				})
			}
		default:
			err := fmt.Errorf("unrecognized node under desired LRPs root node: %s", componentRoot.Key)
			logger.Error("unrecognized-node", err)
			return nil, err
		}
	}

	throttler, err := workpool.NewThrottler(db.convergenceWorkersSize, works)
	if err != nil {
		logger.Error("failed-to-create-throttler", err)
		return nil, err
	}

	throttler.Work()

	db.batchDeleteNodes(malformedSchedulingInfos, logger)
	db.batchDeleteNodes(malformedRunInfos, logger)

	malformedSchedulingInfosMetric.Add(uint64(len(malformedSchedulingInfos)))
	malformedRunInfosMetric.Add(uint64(len(malformedRunInfos)))

	logger.Debug("done-walking-desired-lrp-tree")

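	// Join the two halves. A SchedulingInfo with no matching RunInfo is
	// incomplete, so schedule it for deletion instead.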
	desireds := make(map[string]*models.DesiredLRP)
	var schedInfosToDelete []string
	for guid, schedulingInfo := range schedulingInfos {
		runInfo, ok := runInfos[guid]
		if !ok {
			err := fmt.Errorf("Missing runInfo for GUID %s", guid)
			logger.Error("runInfo-not-found-error", err)
			schedInfosToDelete = append(schedInfosToDelete, DesiredLRPSchedulingInfoSchemaPath(guid))
		} else {
			desiredLRP := models.NewDesiredLRP(*schedulingInfo, *runInfo)
			desireds[guid] = &desiredLRP
		}
	}
	db.batchDeleteNodes(schedInfosToDelete, logger)

	// Check to see if we have orphaned RunInfos
	if len(runInfos) != len(schedulingInfos) {
		var runInfosToDelete []string
		for guid, runInfo := range runInfos {
			// If there is no corresponding SchedulingInfo and the RunInfo has
			// existed for longer than desiredLRPCreationTimeout, consider it orphaned
			// and delete it.
			_, ok := schedulingInfos[guid]
			if !ok && db.clock.Since(time.Unix(0, runInfo.CreatedAt)) > db.desiredLRPCreationTimeout {
				orphanedRunInfosMetric.Add(1)
				runInfosToDelete = append(runInfosToDelete, DesiredLRPRunInfoSchemaPath(guid))
			}
		}

		db.batchDeleteNodes(runInfosToDelete, logger)
	}

	return desireds, nil
}
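Example #2
// Up migrates stored Tasks and DesiredLRP RunInfos from the deprecated
// second-based timeout fields to millisecond fields, rewriting each etcd
// node in place.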
func (b *TimeoutToMilliseconds) Up(logger lager.Logger) error {
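	// Recursively fetch all Task nodes; a fetch error is logged but not
	// fatal, since the tree may simply not exist yet.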
	response, err := b.storeClient.Get(etcd.TaskSchemaRoot, false, true)
	if err != nil {
		logger.Error("failed-fetching-tasks", err)
	}

	if response != nil {
		for _, node := range response.Node.Nodes {
			task := new(models.Task)
			err := b.serializer.Unmarshal(logger, []byte(node.Value), task)
			if err != nil {
				logger.Error("failed-to-deserialize-task", err)
				continue
			}

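			// Convert deprecated second-based timeouts nested in the task's action tree.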
			updateTimeoutInAction(logger, task.Action)

			value, err := b.serializer.Marshal(logger, format.ENCODED_PROTO, task)
			if err != nil {
				return err
			}

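			// CompareAndSwap against ModifiedIndex so a concurrent writer fails
			// the migration rather than being silently overwritten.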
			_, err = b.storeClient.CompareAndSwap(node.Key, value, etcd.NO_TTL, node.ModifiedIndex)
			if err != nil {
				return err
			}
		}
	}

	// Apply the same timeout migration to every DesiredLRP RunInfo.
	response, err = b.storeClient.Get(etcd.DesiredLRPRunInfoSchemaRoot, false, true)
	if err != nil {
		logger.Error("failed-fetching-desired-lrp-run-info", err)
	}

	if response != nil {
		for _, node := range response.Node.Nodes {
			runInfo := new(models.DesiredLRPRunInfo)
			err := b.serializer.Unmarshal(logger, []byte(node.Value), runInfo)
			if err != nil {
				logger.Error("failed-to-deserialize-desired-lrp-run-info", err)
				continue
			}
			logger.Info("update-run-info", lager.Data{"deprecated_timeout_ns": runInfo.DeprecatedStartTimeoutS})
			runInfo.StartTimeoutMs = int64(runInfo.DeprecatedStartTimeoutS) * 1000
			updateTimeoutInAction(logger, runInfo.GetMonitor())
			updateTimeoutInAction(logger, runInfo.GetSetup())
			updateTimeoutInAction(logger, runInfo.GetAction())

			value, err := b.serializer.Marshal(logger, format.ENCODED_PROTO, runInfo)
			if err != nil {
				return err
			}

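			// Guard the write with CAS, as with tasks above.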
			_, err = b.storeClient.CompareAndSwap(node.Key, value, etcd.NO_TTL, node.ModifiedIndex)
			if err != nil {
				return err
			}
		}
	}

	return nil
}