Example #1
func (db *ETCDDB) batchDeleteNodes(keys []string, logger lager.Logger) {
	if len(keys) == 0 {
		return
	}

	works := []func(){}

	for _, key := range keys {
		key := key
		works = append(works, func() {
			logger.Info("deleting", lager.Data{"key": key})
			_, err := db.client.Delete(key, true)
			if err != nil {
				logger.Error("failed-to-delete", err, lager.Data{
					"key": key,
				})
			}
		})
	}

	throttler, err := workpool.NewThrottler(db.convergenceWorkersSize, works)
	if err != nil {
		// NewThrottler only fails when the worker count is below one; bail out
		// rather than calling Work on a nil throttler.
		logger.Error("failed-to-create-throttler", err)
		return
	}

	throttler.Work()
}
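All of these examples share the same shape: collect a slice of func() closures, hand them to workpool.NewThrottler together with a maximum worker count, and call Work(), which blocks until every closure has finished. A minimal, self-contained sketch of that shape follows; it assumes the code.cloudfoundry.org/workpool import path, and the key list and worker count are purely illustrative.

package main

import (
	"fmt"

	"code.cloudfoundry.org/workpool"
)

func main() {
	keys := []string{"/v1/a", "/v1/b", "/v1/c"}

	works := []func(){}
	for _, key := range keys {
		key := key // copy the loop variable so each closure captures its own key
		works = append(works, func() {
			fmt.Println("processing", key)
		})
	}

	// At most 2 closures run concurrently; Work() returns once all have run.
	throttler, err := workpool.NewThrottler(2, works)
	if err != nil {
		fmt.Println("failed to create throttler:", err)
		return
	}
	throttler.Work()
}

The key := key copy matters for code built before Go 1.22: without it every closure would observe the final value of the shared loop variable.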
Example #2
func (db *ETCDDB) batchDeleteTasks(taskGuids []string, logger lager.Logger) {
	if len(taskGuids) == 0 {
		return
	}

	works := []func(){}

	for _, taskGuid := range taskGuids {
		taskGuid := taskGuid
		works = append(works, func() {
			_, err := db.client.Delete(taskGuid, true)
			if err != nil {
				logger.Error("failed-to-delete", err, lager.Data{
					"task_guid": taskGuid,
				})
			}
		})
	}

	throttler, err := workpool.NewThrottler(db.convergenceWorkersSize, works)
	if err != nil {
		logger.Error("failed-to-create-throttler", err)
		return
	}

	throttler.Work()
}
Example #3
func (db *ETCDDB) ResolveConvergence(logger lager.Logger, desiredLRPs map[string]*models.DesiredLRP, changes *models.ConvergenceChanges) ([]*auctioneer.LRPStartRequest, []*models.ActualLRPKeyWithSchedulingInfo, []*models.ActualLRPKey) {
	startRequests := newStartRequests(desiredLRPs)
	for _, actual := range changes.StaleUnclaimedActualLRPs {
		startRequests.Add(logger, &actual.ActualLRPKey)
	}

	works := []func(){}

	keysToRetire := make([]*models.ActualLRPKey, len(changes.ActualLRPsForExtraIndices))
	for i, actual := range changes.ActualLRPsForExtraIndices {
		keysToRetire[i] = &actual.ActualLRPKey
	}

	keysWithMissingCells := []*models.ActualLRPKeyWithSchedulingInfo{}
	for _, actual := range changes.ActualLRPsWithMissingCells {
		desiredLRP, ok := desiredLRPs[actual.ProcessGuid]
		if !ok {
			logger.Debug("actual-with-missing-cell-no-desired")
			continue
		}

		schedInfo := desiredLRP.DesiredLRPSchedulingInfo()

		key := &models.ActualLRPKeyWithSchedulingInfo{
			Key:            &actual.ActualLRPKey,
			SchedulingInfo: &schedInfo,
		}

		keysWithMissingCells = append(keysWithMissingCells, key)
	}

	for _, actualKey := range changes.ActualLRPKeysForMissingIndices {
		works = append(works, db.resolveActualsWithMissingIndices(logger, desiredLRPs[actualKey.ProcessGuid], actualKey, startRequests))
	}

	for _, actual := range changes.RestartableCrashedActualLRPs {
		works = append(works, db.resolveRestartableCrashedActualLRPS(logger, actual, startRequests))
	}

	throttler, err := workpool.NewThrottler(db.convergenceWorkersSize, works)
	if err != nil {
		logger.Error("failed-constructing-throttler", err, lager.Data{"max_workers": db.convergenceWorkersSize, "num_works": len(works)})
		return nil, nil, nil
	}

	logger.Debug("waiting-for-lrp-convergence-work")
	throttler.Work()
	logger.Debug("done-waiting-for-lrp-convergence-work")

	return startRequests.Slice(), keysWithMissingCells, keysToRetire
}
Example #4
func (db *ETCDDB) deleteLeaves(logger lager.Logger, keys []string) error {
	works := []func(){}

	for _, key := range keys {
		key := key
		works = append(works, func() {
			_, err := db.client.DeleteDir(key)
			if err != nil {
				logger.Error("failed-deleting-leaf-node", err, lager.Data{"key": key})
			}
		})
	}

	throttler, err := workpool.NewThrottler(db.convergenceWorkersSize, works)
	if err != nil {
		return err
	}

	throttler.Work()

	return nil
}
Example #5
func (h *DesiredLRPHandler) createUnclaimedActualLRPs(logger lager.Logger, keys []*models.ActualLRPKey) []int {
	count := len(keys)
	createdIndicesChan := make(chan int, count)

	works := make([]func(), count)
	logger = logger.Session("create-unclaimed-actual-lrp")
	for i, key := range keys {
		key := key
		works[i] = func() {
			logger.Info("starting", lager.Data{"actual_lrp_key": key})
			actualLRPGroup, err := h.actualLRPDB.CreateUnclaimedActualLRP(logger, key)
			if err != nil {
				logger.Info("failed", lager.Data{"actual_lrp_key": key, "err_message": err.Error()})
			} else {
				go h.actualHub.Emit(models.NewActualLRPCreatedEvent(actualLRPGroup))
				createdIndicesChan <- int(key.Index)
			}
		}
	}

	throttlerSize := h.updateWorkersCount
	throttler, err := workpool.NewThrottler(throttlerSize, works)
	if err != nil {
		logger.Error("failed-constructing-throttler", err, lager.Data{"max_workers": throttlerSize, "num_works": len(works)})
		return []int{}
	}

	go func() {
		throttler.Work()
		close(createdIndicesChan)
	}()

	createdIndices := make([]int, 0, count)
	for createdIndex := range createdIndicesChan {
		createdIndices = append(createdIndices, createdIndex)
	}

	return createdIndices
}
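Example #5 adds one wrinkle to the pattern: each work item may produce a result. It sends results on a channel buffered to the number of works (so sends never block), and closes that channel from a goroutine only after Work() returns, which lets a plain range loop collect everything. A reduced sketch of just that shape, using hypothetical names and the same assumed workpool package:

package main

import (
	"fmt"

	"code.cloudfoundry.org/workpool"
)

// collectSquares is a hypothetical stand-in for createUnclaimedActualLRPs:
// each work item produces one value, sent on a channel sized to the number
// of works so the send can never block.
func collectSquares(inputs []int) []int {
	results := make(chan int, len(inputs))

	works := make([]func(), len(inputs))
	for i, n := range inputs {
		n := n
		works[i] = func() {
			results <- n * n
		}
	}

	throttler, err := workpool.NewThrottler(3, works)
	if err != nil {
		return nil
	}

	// Close the channel only after every work item has finished, so the
	// range below terminates.
	go func() {
		throttler.Work()
		close(results)
	}()

	out := make([]int, 0, len(inputs))
	for r := range results {
		out = append(out, r)
	}
	return out
}

func main() {
	fmt.Println(collectSquares([]int{1, 2, 3, 4}))
}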
Example #6
func (db *ETCDDB) batchCompareAndSwapTasks(tasksToCAS []compareAndSwappableTask, logger lager.Logger) error {
	if len(tasksToCAS) == 0 {
		return nil
	}

	works := []func(){}

	for _, taskToCAS := range tasksToCAS {
		task := taskToCAS.NewTask
		task.UpdatedAt = db.clock.Now().UnixNano()
		value, err := db.serializeModel(logger, task)
		if err != nil {
			logger.Error("failed-to-marshal", err, lager.Data{
				"task_guid": task.TaskGuid,
			})
			continue
		}

		index := taskToCAS.OldIndex
		works = append(works, func() {
			_, err := db.client.CompareAndSwap(TaskSchemaPathByGuid(task.TaskGuid), value, NO_TTL, index)
			if err != nil {
				logger.Error("failed-to-compare-and-swap", err, lager.Data{
					"task_guid": task.TaskGuid,
				})
			}
		})
	}

	throttler, err := workpool.NewThrottler(db.convergenceWorkersSize, works)
	if err != nil {
		return err
	}

	throttler.Work()
	return nil
}
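Example #6 is worth contrasting with the earlier batch deletes: the serialized value and the old index are computed before the closure is appended, so records that fail to marshal are skipped up front and only the remote compare-and-swap runs inside the throttled work. A sketch of that split, with hypothetical types and the same assumed workpool package:

package main

import (
	"encoding/json"
	"fmt"

	"code.cloudfoundry.org/workpool"
)

type record struct {
	ID    string
	Value int
}

func main() {
	records := []record{{"a", 1}, {"b", 2}}

	works := []func(){}
	for _, r := range records {
		// Serialize up front: a record that fails here never becomes a work item.
		payload, err := json.Marshal(r)
		if err != nil {
			fmt.Println("skipping record", r.ID, err)
			continue
		}

		id := r.ID // per-iteration copies captured by the closure
		works = append(works, func() {
			// Only the (simulated) remote write runs under the throttler.
			fmt.Printf("writing %s: %s\n", id, payload)
		})
	}

	throttler, err := workpool.NewThrottler(2, works)
	if err != nil {
		fmt.Println("failed to create throttler:", err)
		return
	}
	throttler.Work()
}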
Example #7
func (db *ETCDDB) GatherAndPruneDesiredLRPs(logger lager.Logger, guids map[string]struct{}, lmc *LRPMetricCounter) (map[string]*models.DesiredLRP, error) {
	desiredLRPsRoot, modelErr := db.fetchRecursiveRaw(logger, DesiredLRPComponentsSchemaRoot)

	if modelErr == models.ErrResourceNotFound {
		logger.Info("actual-lrp-schema-root-not-found")
		return map[string]*models.DesiredLRP{}, nil
	}

	if modelErr != nil {
		return nil, modelErr
	}

	schedulingInfos := map[string]*models.DesiredLRPSchedulingInfo{}
	runInfos := map[string]*models.DesiredLRPRunInfo{}

	var malformedSchedulingInfos, malformedRunInfos []string

	var guidsLock, schedulingInfosLock, runInfosLock sync.Mutex

	works := []func(){}
	logger.Debug("walking-desired-lrp-components-tree")

	for _, componentRoot := range desiredLRPsRoot.Nodes {
		switch componentRoot.Key {
		case DesiredLRPSchedulingInfoSchemaRoot:
			for _, node := range componentRoot.Nodes {
				node := node
				works = append(works, func() {
					var schedulingInfo models.DesiredLRPSchedulingInfo
					err := db.deserializeModel(logger, node, &schedulingInfo)
					if err != nil || schedulingInfo.Validate() != nil {
						logger.Error("failed-to-deserialize-scheduling-info", err)
						schedulingInfosLock.Lock()
						malformedSchedulingInfos = append(malformedSchedulingInfos, node.Key)
						schedulingInfosLock.Unlock()
					} else {
						schedulingInfosLock.Lock()
						schedulingInfos[schedulingInfo.ProcessGuid] = &schedulingInfo
						schedulingInfosLock.Unlock()
						atomic.AddInt32(&lmc.desiredLRPs, schedulingInfo.Instances)

						guidsLock.Lock()
						guids[schedulingInfo.ProcessGuid] = struct{}{}
						guidsLock.Unlock()
					}
				})
			}
		case DesiredLRPRunInfoSchemaRoot:
			for _, node := range componentRoot.Nodes {
				node := node
				works = append(works, func() {
					var runInfo models.DesiredLRPRunInfo
					err := db.deserializeModel(logger, node, &runInfo)
					if err != nil || runInfo.Validate() != nil {
						runInfosLock.Lock()
						malformedRunInfos = append(malformedRunInfos, node.Key)
						runInfosLock.Unlock()
					} else {
						runInfosLock.Lock()
						runInfos[runInfo.ProcessGuid] = &runInfo
						runInfosLock.Unlock()
					}
				})
			}
		default:
			err := fmt.Errorf("unrecognized node under desired LRPs root node: %s", componentRoot.Key)
			logger.Error("unrecognized-node", err)
			return nil, err
		}
	}

	throttler, err := workpool.NewThrottler(db.convergenceWorkersSize, works)
	if err != nil {
		logger.Error("failed-to-create-throttler", err)
		return nil, err
	}

	throttler.Work()

	db.batchDeleteNodes(malformedSchedulingInfos, logger)
	db.batchDeleteNodes(malformedRunInfos, logger)

	malformedSchedulingInfosMetric.Add(uint64(len(malformedSchedulingInfos)))
	malformedRunInfosMetric.Add(uint64(len(malformedRunInfos)))

	logger.Debug("done-walking-desired-lrp-tree")

	desireds := make(map[string]*models.DesiredLRP)
	var schedInfosToDelete []string
	for guid, schedulingInfo := range schedulingInfos {
		runInfo, ok := runInfos[guid]
		if !ok {
			err := fmt.Errorf("Missing runInfo for GUID %s", guid)
			logger.Error("runInfo-not-found-error", err)
			schedInfosToDelete = append(schedInfosToDelete, DesiredLRPSchedulingInfoSchemaPath(guid))
		} else {
			desiredLRP := models.NewDesiredLRP(*schedulingInfo, *runInfo)
			desireds[guid] = &desiredLRP
		}
	}
	db.batchDeleteNodes(schedInfosToDelete, logger)

	// Check to see if we have orphaned RunInfos
	if len(runInfos) != len(schedulingInfos) {
		var runInfosToDelete []string
		for guid, runInfo := range runInfos {
			// If there is no corresponding SchedulingInfo and the RunInfo has
			// existed for longer than desiredLRPCreationTimeout, consider it orphaned
			// and delete it.
			_, ok := schedulingInfos[guid]
			if !ok && db.clock.Since(time.Unix(0, runInfo.CreatedAt)) > db.desiredLRPCreationTimeout {
				orphanedRunInfosMetric.Add(1)
				runInfosToDelete = append(runInfosToDelete, DesiredLRPRunInfoSchemaPath(guid))
			}
		}

		db.batchDeleteNodes(runInfosToDelete, logger)
	}

	return desireds, nil
}
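Example #7 shows how results are aggregated from concurrent works: each shared map or slice gets its own sync.Mutex, while simple counters are bumped with sync/atomic. A reduced sketch of that aggregation pattern, with hypothetical data and the same assumed workpool package:

package main

import (
	"fmt"
	"sync"
	"sync/atomic"

	"code.cloudfoundry.org/workpool"
)

func main() {
	items := []string{"good", "bad", "good", "good", "bad"}

	var mu sync.Mutex   // guards the malformed slice below
	var goodCount int32 // updated with sync/atomic, no lock needed
	malformed := []string{}

	works := make([]func(), len(items))
	for i, item := range items {
		item := item
		works[i] = func() {
			if item == "bad" {
				mu.Lock()
				malformed = append(malformed, item)
				mu.Unlock()
				return
			}
			atomic.AddInt32(&goodCount, 1)
		}
	}

	throttler, err := workpool.NewThrottler(2, works)
	if err != nil {
		fmt.Println("failed to create throttler:", err)
		return
	}
	throttler.Work()

	fmt.Println("good:", atomic.LoadInt32(&goodCount), "malformed:", len(malformed))
}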
Example #8
func (db *ETCDDB) gatherAndOptionallyPruneActualLRPs(logger lager.Logger, guids map[string]struct{}, doPrune bool, lmc *LRPMetricCounter) (map[string]map[int32]*models.ActualLRP, error) {
	response, modelErr := db.fetchRecursiveRaw(logger, ActualLRPSchemaRoot)

	if modelErr == models.ErrResourceNotFound {
		logger.Info("actual-lrp-schema-root-not-found")
		return map[string]map[int32]*models.ActualLRP{}, nil
	}

	if modelErr != nil {
		return nil, modelErr
	}

	actuals := map[string]map[int32]*models.ActualLRP{}
	var guidKeysToDelete, indexKeysToDelete []string
	var actualsToDelete []string
	var guidsLock, actualsLock, guidKeysToDeleteLock, indexKeysToDeleteLock,
		crashingDesiredsLock, actualsToDeleteLock sync.Mutex

	logger.Debug("walking-actual-lrp-tree")
	works := []func(){}
	crashingDesireds := map[string]struct{}{}

	for _, guidGroup := range response.Nodes {
		guidGroup := guidGroup
		works = append(works, func() {
			guidGroupWillBeEmpty := true

			for _, indexGroup := range guidGroup.Nodes {
				indexGroupWillBeEmpty := true

				for _, actualNode := range indexGroup.Nodes {
					actual := new(models.ActualLRP)
					err := db.deserializeModel(logger, actualNode, actual)
					if err != nil {
						actualsToDeleteLock.Lock()
						actualsToDelete = append(actualsToDelete, actualNode.Key)
						actualsToDeleteLock.Unlock()

						continue
					}

					err = actual.Validate()
					if err != nil {
						actualsToDeleteLock.Lock()
						actualsToDelete = append(actualsToDelete, actualNode.Key)
						actualsToDeleteLock.Unlock()

						continue
					}

					indexGroupWillBeEmpty = false
					guidGroupWillBeEmpty = false

					switch actual.State {
					case models.ActualLRPStateUnclaimed:
						atomic.AddInt32(&lmc.unclaimedLRPs, 1)
					case models.ActualLRPStateClaimed:
						atomic.AddInt32(&lmc.claimedLRPs, 1)
					case models.ActualLRPStateRunning:
						atomic.AddInt32(&lmc.runningLRPs, 1)
					case models.ActualLRPStateCrashed:
						crashingDesiredsLock.Lock()
						crashingDesireds[actual.ProcessGuid] = struct{}{}
						crashingDesiredsLock.Unlock()
						atomic.AddInt32(&lmc.crashedActualLRPs, 1)
					}

					guidsLock.Lock()
					guids[actual.ProcessGuid] = struct{}{}
					guidsLock.Unlock()

					if path.Base(actualNode.Key) == ActualLRPInstanceKey {
						actualsLock.Lock()
						if actuals[actual.ProcessGuid] == nil {
							actuals[actual.ProcessGuid] = map[int32]*models.ActualLRP{}
						}
						actuals[actual.ProcessGuid][actual.Index] = actual
						actualsLock.Unlock()
					}
				}

				if indexGroupWillBeEmpty {
					indexKeysToDeleteLock.Lock()
					indexKeysToDelete = append(indexKeysToDelete, indexGroup.Key)
					indexKeysToDeleteLock.Unlock()
				}
			}

			if guidGroupWillBeEmpty {
				guidKeysToDeleteLock.Lock()
				guidKeysToDelete = append(guidKeysToDelete, guidGroup.Key)
				guidKeysToDeleteLock.Unlock()
			}
		})
	}
	logger.Debug("done-walking-actual-lrp-tree")

	throttler, err := workpool.NewThrottler(db.convergenceWorkersSize, works)
	if err != nil {
		logger.Error("failed-to-create-throttler", err)
		return nil, err
	}

	throttler.Work()

	if doPrune {
		logger.Info("deleting-invalid-actual-lrps", lager.Data{"num_lrps": len(actualsToDelete)})
		db.batchDeleteNodes(actualsToDelete, logger)
		actualLRPsDeleted.Add(uint64(len(actualsToDelete)))

		logger.Info("deleting-empty-actual-indices", lager.Data{"num_indices": len(indexKeysToDelete)})
		err = db.deleteLeaves(logger, indexKeysToDelete)
		if err != nil {
			logger.Error("failed-deleting-empty-actual-indices", err, lager.Data{"num_indices": len(indexKeysToDelete)})
		} else {
			logger.Info("succeeded-deleting-empty-actual-indices", lager.Data{"num_indices": len(indexKeysToDelete)})
		}

		logger.Info("deleting-empty-actual-guids", lager.Data{"num_guids": len(guidKeysToDelete)})
		err = db.deleteLeaves(logger, guidKeysToDelete)
		if err != nil {
			logger.Error("failed-deleting-empty-actual-guids", err, lager.Data{"num_guids": len(guidKeysToDelete)})
		} else {
			logger.Info("succeeded-deleting-empty-actual-guids", lager.Data{"num_guids": len(guidKeysToDelete)})
		}
	}

	lmc.crashingDesiredLRPs = int32(len(crashingDesireds))

	return actuals, nil
}
Example #9
func (h *LRPConvergenceController) ConvergeLRPs(logger lager.Logger) error {
	logger = h.logger.Session("converge-lrps")
	var err error

	logger.Debug("listing-cells")
	var cellSet models.CellSet
	cellSet, err = h.serviceClient.Cells(logger)
	if err == models.ErrResourceNotFound {
		logger.Info("no-cells-found")
		cellSet = models.CellSet{}
	} else if err != nil {
		logger.Error("failed-listing-cells", err)
		// convergence should run again later
		return nil
	}
	logger.Debug("succeeded-listing-cells")

	startRequests, keysWithMissingCells, keysToRetire := h.db.ConvergeLRPs(logger, cellSet)

	retireLogger := logger.WithData(lager.Data{"retiring_lrp_count": len(keysToRetire)})
	works := []func(){}
	for _, key := range keysToRetire {
		key := key
		works = append(works, func() {
			h.retirer.RetireActualLRP(retireLogger, key.ProcessGuid, key.Index)
		})
	}

	errChan := make(chan *models.Error, 1)

	startRequestLock := &sync.Mutex{}
	for _, key := range keysWithMissingCells {
		key := key
		works = append(works, func() {
			before, after, err := h.db.UnclaimActualLRP(logger, key.Key)
			if err == nil {
				h.actualHub.Emit(models.NewActualLRPChangedEvent(before, after))
				startRequest := auctioneer.NewLRPStartRequestFromSchedulingInfo(key.SchedulingInfo, int(key.Key.Index))
				startRequestLock.Lock()
				startRequests = append(startRequests, &startRequest)
				startRequestLock.Unlock()
			} else {
				bbsErr := models.ConvertError(err)
				if bbsErr.GetType() != models.Error_Unrecoverable {
					return
				}

				logger.Error("unrecoverable-error", bbsErr)
				select {
				case errChan <- bbsErr:
				default:
				}
			}
		})
	}

	var throttler *workpool.Throttler
	throttler, err = workpool.NewThrottler(h.convergenceWorkersSize, works)
	if err != nil {
		logger.Error("failed-constructing-throttler", err, lager.Data{"max_workers": h.convergenceWorkersSize, "num_works": len(works)})
		return nil
	}

	retireLogger.Debug("retiring-actual-lrps")
	throttler.Work()
	retireLogger.Debug("done-retiring-actual-lrps")

	select {
	case err := <-errChan:
		return err
	default:
	}

	startLogger := logger.WithData(lager.Data{"start_requests_count": len(startRequests)})
	if len(startRequests) > 0 {
		startLogger.Debug("requesting-start-auctions")
		err = h.auctioneerClient.RequestLRPAuctions(logger, startRequests)
		if err != nil {
			startLogger.Error("failed-to-request-starts", err, lager.Data{"lrp_start_auctions": startRequests})
		}
		startLogger.Debug("done-requesting-start-auctions")
	}

	return nil
}
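The convergence handler above captures at most one unrecoverable error from all of its throttled works: the error channel is buffered to one, the send is non-blocking so the first error wins and later ones are dropped, and a non-blocking receive after Work() surfaces the error if there was one. A self-contained sketch of that first-error-wins pattern, again assuming the same workpool package:

package main

import (
	"fmt"

	"code.cloudfoundry.org/workpool"
)

func main() {
	errChan := make(chan error, 1) // room for exactly one error

	works := make([]func(), 5)
	for i := range works {
		i := i
		works[i] = func() {
			if i%2 == 1 {
				// Non-blocking send: the first failing work wins, later errors are dropped.
				select {
				case errChan <- fmt.Errorf("work %d failed", i):
				default:
				}
			}
		}
	}

	throttler, err := workpool.NewThrottler(2, works)
	if err != nil {
		fmt.Println("failed to create throttler:", err)
		return
	}
	throttler.Work()

	// Non-blocking receive: report the captured error, if any.
	select {
	case err := <-errChan:
		fmt.Println("first error:", err)
	default:
		fmt.Println("no errors")
	}
}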