// batchDeleteNodes recursively deletes the given etcd keys, fanning the
// deletes out across the convergence worker pool. Individual failures are
// logged and skipped.
func (db *ETCDDB) batchDeleteNodes(keys []string, logger lager.Logger) {
	if len(keys) == 0 {
		return
	}

	works := []func(){}

	for _, key := range keys {
		key := key
		works = append(works, func() {
			logger.Info("deleting", lager.Data{"key": key})
			_, err := db.client.Delete(key, true)
			if err != nil {
				logger.Error("failed-to-delete", err, lager.Data{
					"key": key,
				})
			}
		})
	}

	throttler, err := workpool.NewThrottler(db.convergenceWorkersSize, works)
	if err != nil {
		logger.Error("failed-to-create-throttler", err)
		return
	}

	throttler.Work()
}
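// The helpers in this file all share the same fan-out shape: build a slice of
// func() closures, hand them to workpool.NewThrottler with a bounded worker
// count, and call Work() to run them with at most that many goroutines at a
// time. The following is a minimal, self-contained sketch of that pattern in
// isolation; it is illustrative only, it assumes the workpool package resolves
// to code.cloudfoundry.org/workpool, and names such as maxWorkers and keys are
// placeholders rather than identifiers from the BBS code.
package main

import (
	"fmt"

	"code.cloudfoundry.org/workpool"
)

func main() {
	keys := []string{"/v1/desired/a", "/v1/desired/b", "/v1/desired/c"}
	maxWorkers := 2

	// Capture the loop variable per iteration so each closure operates on its own key.
	works := []func(){}
	for _, key := range keys {
		key := key
		works = append(works, func() {
			fmt.Println("would delete", key)
		})
	}

	// A failed construction leaves throttler nil, so bail out before calling Work.
	throttler, err := workpool.NewThrottler(maxWorkers, works)
	if err != nil {
		panic(err)
	}

	// Work blocks until every queued func has run, never exceeding maxWorkers
	// concurrent goroutines.
	throttler.Work()
}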
// batchDeleteTasks deletes the given task nodes from etcd, throttled by the
// convergence worker pool. Individual failures are logged and skipped.
func (db *ETCDDB) batchDeleteTasks(taskGuids []string, logger lager.Logger) {
	if len(taskGuids) == 0 {
		return
	}

	works := []func(){}

	for _, taskGuid := range taskGuids {
		taskGuid := taskGuid
		works = append(works, func() {
			_, err := db.client.Delete(taskGuid, true)
			if err != nil {
				logger.Error("failed-to-delete", err, lager.Data{
					"task_guid": taskGuid,
				})
			}
		})
	}

	throttler, err := workpool.NewThrottler(db.convergenceWorkersSize, works)
	if err != nil {
		logger.Error("failed-to-create-throttler", err)
		return
	}

	throttler.Work()
}
func (db *ETCDDB) ResolveConvergence(logger lager.Logger, desiredLRPs map[string]*models.DesiredLRP, changes *models.ConvergenceChanges) ([]*auctioneer.LRPStartRequest, []*models.ActualLRPKeyWithSchedulingInfo, []*models.ActualLRPKey) {
	startRequests := newStartRequests(desiredLRPs)
	for _, actual := range changes.StaleUnclaimedActualLRPs {
		startRequests.Add(logger, &actual.ActualLRPKey)
	}

	works := []func(){}

	keysToRetire := make([]*models.ActualLRPKey, len(changes.ActualLRPsForExtraIndices))
	for i, actual := range changes.ActualLRPsForExtraIndices {
		keysToRetire[i] = &actual.ActualLRPKey
	}

	keysWithMissingCells := []*models.ActualLRPKeyWithSchedulingInfo{}
	for _, actual := range changes.ActualLRPsWithMissingCells {
		desiredLRP, ok := desiredLRPs[actual.ProcessGuid]
		if !ok {
			logger.Debug("actual-with-missing-cell-no-desired")
			continue
		}
		schedInfo := desiredLRP.DesiredLRPSchedulingInfo()
		key := &models.ActualLRPKeyWithSchedulingInfo{
			Key:            &actual.ActualLRPKey,
			SchedulingInfo: &schedInfo,
		}
		keysWithMissingCells = append(keysWithMissingCells, key)
	}

	for _, actualKey := range changes.ActualLRPKeysForMissingIndices {
		works = append(works, db.resolveActualsWithMissingIndices(logger, desiredLRPs[actualKey.ProcessGuid], actualKey, startRequests))
	}

	for _, actual := range changes.RestartableCrashedActualLRPs {
		works = append(works, db.resolveRestartableCrashedActualLRPS(logger, actual, startRequests))
	}

	throttler, err := workpool.NewThrottler(db.convergenceWorkersSize, works)
	if err != nil {
		logger.Error("failed-constructing-throttler", err, lager.Data{"max_workers": db.convergenceWorkersSize, "num_works": len(works)})
		return nil, nil, nil
	}

	logger.Debug("waiting-for-lrp-convergence-work")
	throttler.Work()
	logger.Debug("done-waiting-for-lrp-convergence-work")

	return startRequests.Slice(), keysWithMissingCells, keysToRetire
}
func (db *ETCDDB) deleteLeaves(logger lager.Logger, keys []string) error {
	works := []func(){}

	for _, key := range keys {
		key := key
		works = append(works, func() {
			_, err := db.client.DeleteDir(key)
			if err != nil {
				logger.Error("failed-deleting-leaf-node", err, lager.Data{"key": key})
			}
		})
	}

	throttler, err := workpool.NewThrottler(db.convergenceWorkersSize, works)
	if err != nil {
		return err
	}

	throttler.Work()

	return nil
}
// createUnclaimedActualLRPs creates an unclaimed ActualLRP for each of the
// given keys in parallel and returns the indices that were successfully
// created.
func (h *DesiredLRPHandler) createUnclaimedActualLRPs(logger lager.Logger, keys []*models.ActualLRPKey) []int {
	count := len(keys)
	createdIndicesChan := make(chan int, count)

	works := make([]func(), count)
	logger = logger.Session("create-unclaimed-actual-lrp")

	for i, key := range keys {
		key := key
		works[i] = func() {
			logger.Info("starting", lager.Data{"actual_lrp_key": key})
			actualLRPGroup, err := h.actualLRPDB.CreateUnclaimedActualLRP(logger, key)
			if err != nil {
				logger.Info("failed", lager.Data{"actual_lrp_key": key, "err_message": err.Error()})
			} else {
				go h.actualHub.Emit(models.NewActualLRPCreatedEvent(actualLRPGroup))
				createdIndicesChan <- int(key.Index)
			}
		}
	}

	throttlerSize := h.updateWorkersCount
	throttler, err := workpool.NewThrottler(throttlerSize, works)
	if err != nil {
		logger.Error("failed-constructing-throttler", err, lager.Data{"max_workers": throttlerSize, "num_works": len(works)})
		return []int{}
	}

	// Close the channel only once all throttled work has finished, so the
	// collecting range below terminates.
	go func() {
		throttler.Work()
		close(createdIndicesChan)
	}()

	createdIndices := make([]int, 0, count)
	for createdIndex := range createdIndicesChan {
		createdIndices = append(createdIndices, createdIndex)
	}

	return createdIndices
}
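// createUnclaimedActualLRPs layers a result-collection idiom on top of the
// throttler: workers send into a channel buffered to the number of work items
// (so sends never block), Work() runs in a separate goroutine, and the channel
// is closed once Work() returns so the collecting range knows when to stop.
// The self-contained sketch below isolates that idiom. It again assumes the
// code.cloudfoundry.org/workpool import; the inputs and worker count are
// placeholders, not values from the BBS code.
package main

import (
	"fmt"

	"code.cloudfoundry.org/workpool"
)

func main() {
	inputs := []int{0, 1, 2, 3, 4}
	results := make(chan int, len(inputs)) // buffered: senders never block

	works := make([]func(), len(inputs))
	for i, n := range inputs {
		n := n
		works[i] = func() {
			if n%2 == 0 { // pretend odd inputs fail and report nothing
				results <- n * n
			}
		}
	}

	throttler, err := workpool.NewThrottler(2, works)
	if err != nil {
		panic(err)
	}

	// Close the channel only after every worker has finished, so the
	// collecting loop below terminates.
	go func() {
		throttler.Work()
		close(results)
	}()

	collected := []int{}
	for r := range results {
		collected = append(collected, r)
	}
	fmt.Println(collected)
}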
func (db *ETCDDB) batchCompareAndSwapTasks(tasksToCAS []compareAndSwappableTask, logger lager.Logger) error {
	if len(tasksToCAS) == 0 {
		return nil
	}

	works := []func(){}

	for _, taskToCAS := range tasksToCAS {
		task := taskToCAS.NewTask
		task.UpdatedAt = db.clock.Now().UnixNano()
		value, err := db.serializeModel(logger, task)
		if err != nil {
			logger.Error("failed-to-marshal", err, lager.Data{
				"task_guid": task.TaskGuid,
			})
			continue
		}

		index := taskToCAS.OldIndex
		works = append(works, func() {
			_, err := db.client.CompareAndSwap(TaskSchemaPathByGuid(task.TaskGuid), value, NO_TTL, index)
			if err != nil {
				logger.Error("failed-to-compare-and-swap", err, lager.Data{
					"task_guid": task.TaskGuid,
				})
			}
		})
	}

	throttler, err := workpool.NewThrottler(db.convergenceWorkersSize, works)
	if err != nil {
		return err
	}

	throttler.Work()

	return nil
}
// GatherAndPruneDesiredLRPs walks the desired LRP components tree, deletes
// malformed or orphaned entries, and returns the valid DesiredLRPs keyed by
// process guid.
func (db *ETCDDB) GatherAndPruneDesiredLRPs(logger lager.Logger, guids map[string]struct{}, lmc *LRPMetricCounter) (map[string]*models.DesiredLRP, error) {
	desiredLRPsRoot, modelErr := db.fetchRecursiveRaw(logger, DesiredLRPComponentsSchemaRoot)

	if modelErr == models.ErrResourceNotFound {
		logger.Info("desired-lrp-schema-root-not-found")
		return map[string]*models.DesiredLRP{}, nil
	}

	if modelErr != nil {
		return nil, modelErr
	}

	schedulingInfos := map[string]*models.DesiredLRPSchedulingInfo{}
	runInfos := map[string]*models.DesiredLRPRunInfo{}

	var malformedSchedulingInfos, malformedRunInfos []string
	var guidsLock, schedulingInfosLock, runInfosLock sync.Mutex

	works := []func(){}
	logger.Debug("walking-desired-lrp-components-tree")

	for _, componentRoot := range desiredLRPsRoot.Nodes {
		switch componentRoot.Key {
		case DesiredLRPSchedulingInfoSchemaRoot:
			for _, node := range componentRoot.Nodes {
				node := node
				works = append(works, func() {
					var schedulingInfo models.DesiredLRPSchedulingInfo
					err := db.deserializeModel(logger, node, &schedulingInfo)
					if err != nil || schedulingInfo.Validate() != nil {
						logger.Error("failed-to-deserialize-scheduling-info", err)
						schedulingInfosLock.Lock()
						malformedSchedulingInfos = append(malformedSchedulingInfos, node.Key)
						schedulingInfosLock.Unlock()
					} else {
						schedulingInfosLock.Lock()
						schedulingInfos[schedulingInfo.ProcessGuid] = &schedulingInfo
						schedulingInfosLock.Unlock()

						atomic.AddInt32(&lmc.desiredLRPs, schedulingInfo.Instances)

						guidsLock.Lock()
						guids[schedulingInfo.ProcessGuid] = struct{}{}
						guidsLock.Unlock()
					}
				})
			}
		case DesiredLRPRunInfoSchemaRoot:
			for _, node := range componentRoot.Nodes {
				node := node
				works = append(works, func() {
					var runInfo models.DesiredLRPRunInfo
					err := db.deserializeModel(logger, node, &runInfo)
					if err != nil || runInfo.Validate() != nil {
						runInfosLock.Lock()
						malformedRunInfos = append(malformedRunInfos, node.Key)
						runInfosLock.Unlock()
					} else {
						runInfosLock.Lock()
						runInfos[runInfo.ProcessGuid] = &runInfo
						runInfosLock.Unlock()
					}
				})
			}
		default:
			err := fmt.Errorf("unrecognized node under desired LRPs root node: %s", componentRoot.Key)
			logger.Error("unrecognized-node", err)
			return nil, err
		}
	}

	throttler, err := workpool.NewThrottler(db.convergenceWorkersSize, works)
	if err != nil {
		logger.Error("failed-to-create-throttler", err)
		return nil, err
	}

	throttler.Work()

	db.batchDeleteNodes(malformedSchedulingInfos, logger)
	db.batchDeleteNodes(malformedRunInfos, logger)

	malformedSchedulingInfosMetric.Add(uint64(len(malformedSchedulingInfos)))
	malformedRunInfosMetric.Add(uint64(len(malformedRunInfos)))

	logger.Debug("done-walking-desired-lrp-tree")

	desireds := make(map[string]*models.DesiredLRP)
	var schedInfosToDelete []string
	for guid, schedulingInfo := range schedulingInfos {
		runInfo, ok := runInfos[guid]
		if !ok {
			err := fmt.Errorf("Missing runInfo for GUID %s", guid)
			logger.Error("runInfo-not-found-error", err)
			schedInfosToDelete = append(schedInfosToDelete, DesiredLRPSchedulingInfoSchemaPath(guid))
		} else {
			desiredLRP := models.NewDesiredLRP(*schedulingInfo, *runInfo)
			desireds[guid] = &desiredLRP
		}
	}
	db.batchDeleteNodes(schedInfosToDelete, logger)

	// Check to see if we have orphaned RunInfos
	if len(runInfos) != len(schedulingInfos) {
		var runInfosToDelete []string
		for guid, runInfo := range runInfos {
			// If there is no corresponding SchedulingInfo and the RunInfo has
			// existed for longer than desiredLRPCreationTimeout, consider it
			// orphaned and delete it.
			_, ok := schedulingInfos[guid]
			if !ok && db.clock.Since(time.Unix(0, runInfo.CreatedAt)) > db.desiredLRPCreationTimeout {
				orphanedRunInfosMetric.Add(1)
				runInfosToDelete = append(runInfosToDelete, DesiredLRPRunInfoSchemaPath(guid))
			}
		}

		db.batchDeleteNodes(runInfosToDelete, logger)
	}

	return desireds, nil
}
// gatherAndOptionallyPruneActualLRPs walks the actual LRP tree, accumulating
// metrics and valid ActualLRPs keyed by process guid and index; when doPrune
// is set it also deletes invalid entries and empty index/guid directories.
func (db *ETCDDB) gatherAndOptionallyPruneActualLRPs(logger lager.Logger, guids map[string]struct{}, doPrune bool, lmc *LRPMetricCounter) (map[string]map[int32]*models.ActualLRP, error) {
	response, modelErr := db.fetchRecursiveRaw(logger, ActualLRPSchemaRoot)

	if modelErr == models.ErrResourceNotFound {
		logger.Info("actual-lrp-schema-root-not-found")
		return map[string]map[int32]*models.ActualLRP{}, nil
	}

	if modelErr != nil {
		return nil, modelErr
	}

	actuals := map[string]map[int32]*models.ActualLRP{}
	var guidKeysToDelete, indexKeysToDelete []string
	var actualsToDelete []string
	var guidsLock, actualsLock, guidKeysToDeleteLock, indexKeysToDeleteLock, crashingDesiredsLock, actualsToDeleteLock sync.Mutex

	logger.Debug("walking-actual-lrp-tree")

	works := []func(){}
	crashingDesireds := map[string]struct{}{}

	for _, guidGroup := range response.Nodes {
		guidGroup := guidGroup
		works = append(works, func() {
			guidGroupWillBeEmpty := true

			for _, indexGroup := range guidGroup.Nodes {
				indexGroupWillBeEmpty := true

				for _, actualNode := range indexGroup.Nodes {
					actual := new(models.ActualLRP)
					err := db.deserializeModel(logger, actualNode, actual)
					if err != nil {
						actualsToDeleteLock.Lock()
						actualsToDelete = append(actualsToDelete, actualNode.Key)
						actualsToDeleteLock.Unlock()
						continue
					}

					err = actual.Validate()
					if err != nil {
						actualsToDeleteLock.Lock()
						actualsToDelete = append(actualsToDelete, actualNode.Key)
						actualsToDeleteLock.Unlock()
						continue
					}

					indexGroupWillBeEmpty = false
					guidGroupWillBeEmpty = false

					switch actual.State {
					case models.ActualLRPStateUnclaimed:
						atomic.AddInt32(&lmc.unclaimedLRPs, 1)
					case models.ActualLRPStateClaimed:
						atomic.AddInt32(&lmc.claimedLRPs, 1)
					case models.ActualLRPStateRunning:
						atomic.AddInt32(&lmc.runningLRPs, 1)
					case models.ActualLRPStateCrashed:
						crashingDesiredsLock.Lock()
						crashingDesireds[actual.ProcessGuid] = struct{}{}
						crashingDesiredsLock.Unlock()
						atomic.AddInt32(&lmc.crashedActualLRPs, 1)
					}

					guidsLock.Lock()
					guids[actual.ProcessGuid] = struct{}{}
					guidsLock.Unlock()

					if path.Base(actualNode.Key) == ActualLRPInstanceKey {
						actualsLock.Lock()
						if actuals[actual.ProcessGuid] == nil {
							actuals[actual.ProcessGuid] = map[int32]*models.ActualLRP{}
						}
						actuals[actual.ProcessGuid][actual.Index] = actual
						actualsLock.Unlock()
					}
				}

				if indexGroupWillBeEmpty {
					indexKeysToDeleteLock.Lock()
					indexKeysToDelete = append(indexKeysToDelete, indexGroup.Key)
					indexKeysToDeleteLock.Unlock()
				}
			}

			if guidGroupWillBeEmpty {
				guidKeysToDeleteLock.Lock()
				guidKeysToDelete = append(guidKeysToDelete, guidGroup.Key)
				guidKeysToDeleteLock.Unlock()
			}
		})
	}
	logger.Debug("done-walking-actual-lrp-tree")

	throttler, err := workpool.NewThrottler(db.convergenceWorkersSize, works)
	if err != nil {
		logger.Error("failed-to-create-throttler", err)
		return nil, err
	}

	throttler.Work()

	if doPrune {
		logger.Info("deleting-invalid-actual-lrps", lager.Data{"num_lrps": len(actualsToDelete)})
		db.batchDeleteNodes(actualsToDelete, logger)
		actualLRPsDeleted.Add(uint64(len(actualsToDelete)))

		logger.Info("deleting-empty-actual-indices", lager.Data{"num_indices": len(indexKeysToDelete)})
		err = db.deleteLeaves(logger, indexKeysToDelete)
		if err != nil {
			logger.Error("failed-deleting-empty-actual-indices", err, lager.Data{"num_indices": len(indexKeysToDelete)})
		} else {
			logger.Info("succeeded-deleting-empty-actual-indices", lager.Data{"num_indices": len(indexKeysToDelete)})
		}

		logger.Info("deleting-empty-actual-guids", lager.Data{"num_guids": len(guidKeysToDelete)})
		err = db.deleteLeaves(logger, guidKeysToDelete)
		if err != nil {
			logger.Error("failed-deleting-empty-actual-guids", err, lager.Data{"num_guids": len(guidKeysToDelete)})
		} else {
			logger.Info("succeeded-deleting-empty-actual-guids", lager.Data{"num_guids": len(guidKeysToDelete)})
		}
	}

	lmc.crashingDesiredLRPs = int32(len(crashingDesireds))

	return actuals, nil
}
func (h *LRPConvergenceController) ConvergeLRPs(logger lager.Logger) error {
	logger = h.logger.Session("converge-lrps")

	var err error

	logger.Debug("listing-cells")
	var cellSet models.CellSet
	cellSet, err = h.serviceClient.Cells(logger)
	if err == models.ErrResourceNotFound {
		logger.Info("no-cells-found")
		cellSet = models.CellSet{}
	} else if err != nil {
		logger.Error("failed-listing-cells", err)
		// convergence should run again later
		return nil
	}
	logger.Debug("succeeded-listing-cells")

	startRequests, keysWithMissingCells, keysToRetire := h.db.ConvergeLRPs(logger, cellSet)

	retireLogger := logger.WithData(lager.Data{"retiring_lrp_count": len(keysToRetire)})
	works := []func(){}
	for _, key := range keysToRetire {
		key := key
		works = append(works, func() {
			h.retirer.RetireActualLRP(retireLogger, key.ProcessGuid, key.Index)
		})
	}

	errChan := make(chan *models.Error, 1)

	startRequestLock := &sync.Mutex{}
	for _, key := range keysWithMissingCells {
		key := key
		works = append(works, func() {
			before, after, err := h.db.UnclaimActualLRP(logger, key.Key)
			if err == nil {
				h.actualHub.Emit(models.NewActualLRPChangedEvent(before, after))
				startRequest := auctioneer.NewLRPStartRequestFromSchedulingInfo(key.SchedulingInfo, int(key.Key.Index))
				startRequestLock.Lock()
				startRequests = append(startRequests, &startRequest)
				startRequestLock.Unlock()
			} else {
				bbsErr := models.ConvertError(err)
				if bbsErr.GetType() != models.Error_Unrecoverable {
					return
				}

				logger.Error("unrecoverable-error", bbsErr)
				select {
				case errChan <- bbsErr:
				default:
				}
			}
		})
	}

	var throttler *workpool.Throttler
	throttler, err = workpool.NewThrottler(h.convergenceWorkersSize, works)
	if err != nil {
		logger.Error("failed-constructing-throttler", err, lager.Data{"max_workers": h.convergenceWorkersSize, "num_works": len(works)})
		return nil
	}

	retireLogger.Debug("retiring-actual-lrps")
	throttler.Work()
	retireLogger.Debug("done-retiring-actual-lrps")

	select {
	case err := <-errChan:
		return err
	default:
	}

	startLogger := logger.WithData(lager.Data{"start_requests_count": len(startRequests)})
	if len(startRequests) > 0 {
		startLogger.Debug("requesting-start-auctions")
		err = h.auctioneerClient.RequestLRPAuctions(logger, startRequests)
		if err != nil {
			startLogger.Error("failed-to-request-starts", err, lager.Data{"lrp_start_auctions": startRequests})
		}
		startLogger.Debug("done-requesting-start-auctions")
	}

	return nil
}