func (h *DesiredLRPHandler) RemoveDesiredLRP(logger lager.Logger, w http.ResponseWriter, req *http.Request) { logger = logger.Session("remove-desired-lrp") request := &models.RemoveDesiredLRPRequest{} response := &models.DesiredLRPLifecycleResponse{} defer func() { exitIfUnrecoverable(logger, h.exitChan, response.Error) }() defer writeResponse(w, response) err := parseRequest(logger, req, request) if err != nil { response.Error = models.ConvertError(err) return } logger = logger.WithData(lager.Data{"process_guid": request.ProcessGuid}) desiredLRP, err := h.desiredLRPDB.DesiredLRPByProcessGuid(logger.Session("fetch-desired"), request.ProcessGuid) if err != nil { response.Error = models.ConvertError(err) return } err = h.desiredLRPDB.RemoveDesiredLRP(logger.Session("remove-desired"), request.ProcessGuid) if err != nil { response.Error = models.ConvertError(err) return } go h.desiredHub.Emit(models.NewDesiredLRPRemovedEvent(desiredLRP)) h.stopInstancesFrom(logger, request.ProcessGuid, 0) }
// The stager calls this when it wants to claim a completed task. This ensures that only one // stager ever attempts to handle a completed task func (db *ETCDDB) ResolvingTask(logger lager.Logger, taskGuid string) error { logger = logger.WithData(lager.Data{"task_guid": taskGuid}) logger.Info("starting") defer logger.Info("finished") task, index, err := db.taskByGuidWithIndex(logger, taskGuid) if err != nil { logger.Error("failed-getting-task", err) return err } err = task.ValidateTransitionTo(models.Task_Resolving) if err != nil { logger.Error("invalid-state-transition", err) return err } task.UpdatedAt = db.clock.Now().UnixNano() task.State = models.Task_Resolving value, err := db.serializeModel(logger, task) if err != nil { return err } _, err = db.client.CompareAndSwap(TaskSchemaPathByGuid(taskGuid), value, NO_TTL, index) if err != nil { return ErrorFromEtcdError(logger, err) } return nil }
// The cell calls this when it has finished running the task (be it success or failure) // stagerTaskBBS will retry this repeatedly if it gets a StoreTimeout error (up to N seconds?) // This really really shouldn't fail. If it does, blog about it and walk away. If it failed in a // consistent way (i.e. key already exists), there's probably a flaw in our design. func (db *ETCDDB) CompleteTask(logger lager.Logger, taskGuid, cellId string, failed bool, failureReason, result string) (*models.Task, error) { logger = logger.WithData(lager.Data{"task_guid": taskGuid, "cell_id": cellId}) logger.Info("starting") defer logger.Info("finished") task, index, err := db.taskByGuidWithIndex(logger, taskGuid) if err != nil { logger.Error("failed-getting-task", err) return nil, err } if task.State == models.Task_Running && task.CellId != cellId { err = models.NewRunningOnDifferentCellError(cellId, task.CellId) logger.Error("invalid-cell-id", err) return nil, err } if err = task.ValidateTransitionTo(models.Task_Completed); err != nil { logger.Error("invalid-state-transition", err) return nil, err } return task, db.completeTask(logger, task, index, failed, failureReason, result) }
func (db *ETCDDB) FailActualLRP(logger lager.Logger, key *models.ActualLRPKey, errorMessage string) (*models.ActualLRPGroup, *models.ActualLRPGroup, error) { logger = logger.WithData(lager.Data{"actual_lrp_key": key, "error_message": errorMessage}) logger.Info("starting") lrp, prevIndex, err := db.rawActualLRPByProcessGuidAndIndex(logger, key.ProcessGuid, key.Index) if err != nil { logger.Error("failed-to-get-actual-lrp", err) return nil, nil, err } beforeActualLRP := *lrp if lrp.State != models.ActualLRPStateUnclaimed { return nil, nil, models.ErrActualLRPCannotBeFailed } lrp.ModificationTag.Increment() lrp.PlacementError = errorMessage lrp.Since = db.clock.Now().UnixNano() lrpData, serialErr := db.serializeModel(logger, lrp) if serialErr != nil { return nil, nil, serialErr } _, err = db.client.CompareAndSwap(ActualLRPSchemaPath(key.ProcessGuid, key.Index), lrpData, 0, prevIndex) if err != nil { logger.Error("failed", err) return nil, nil, models.ErrActualLRPCannotBeFailed } logger.Info("succeeded") return &models.ActualLRPGroup{Instance: &beforeActualLRP}, &models.ActualLRPGroup{Instance: lrp}, nil }
func (db *ETCDDB) DesireTask(logger lager.Logger, taskDef *models.TaskDefinition, taskGuid, domain string) error { logger = logger.WithData(lager.Data{"task_guid": taskGuid}) logger.Info("starting") defer logger.Info("finished") now := db.clock.Now().UnixNano() task := &models.Task{ TaskDefinition: taskDef, TaskGuid: taskGuid, Domain: domain, State: models.Task_Pending, CreatedAt: now, UpdatedAt: now, } value, err := db.serializeModel(logger, task) if err != nil { return err } logger.Debug("persisting-task") _, err = db.client.Create(TaskSchemaPathByGuid(task.TaskGuid), value, NO_TTL) if err != nil { return ErrorFromEtcdError(logger, err) } logger.Debug("succeeded-persisting-task") return nil }
func HandleCompletedTask(logger lager.Logger, httpClient *http.Client, taskDB db.TaskDB, task *models.Task) { logger = logger.Session("handle-completed-task", lager.Data{"task_guid": task.TaskGuid}) if task.CompletionCallbackUrl != "" { modelErr := taskDB.ResolvingTask(logger, task.TaskGuid) if modelErr != nil { logger.Error("marking-task-as-resolving-failed", modelErr) return } logger = logger.WithData(lager.Data{"callback_url": task.CompletionCallbackUrl}) json, err := json.Marshal(&models.TaskCallbackResponse{ TaskGuid: task.TaskGuid, Failed: task.Failed, FailureReason: task.FailureReason, Result: task.Result, Annotation: task.Annotation, CreatedAt: task.CreatedAt, }) if err != nil { logger.Error("marshalling-task-failed", err) return } var statusCode int for i := 0; i < MAX_CB_RETRIES; i++ { request, err := http.NewRequest("POST", task.CompletionCallbackUrl, bytes.NewReader(json)) if err != nil { logger.Error("building-request-failed", err) return } request.Header.Set("Content-Type", "application/json") response, err := httpClient.Do(request) if err != nil { matched, _ := regexp.MatchString("Client.Timeout|use of closed network connection", err.Error()) if matched { continue } logger.Error("doing-request-failed", err) return } defer response.Body.Close() statusCode = response.StatusCode if shouldResolve(statusCode) { modelErr := taskDB.DeleteTask(logger, task.TaskGuid) if modelErr != nil { logger.Error("delete-task-failed", modelErr) } return } } logger.Info("callback-failed", lager.Data{"status_code": statusCode}) } return }
// The stager calls this when it wants to claim a completed task. This ensures that only one // stager ever attempts to handle a completed task func (db *SQLDB) ResolvingTask(logger lager.Logger, taskGuid string) error { logger = logger.WithData(lager.Data{"task_guid": taskGuid}) logger.Info("starting") defer logger.Info("complete") return db.transact(logger, func(logger lager.Logger, tx *sql.Tx) error { task, err := db.fetchTaskForUpdate(logger, taskGuid, tx) if err != nil { logger.Error("failed-locking-task", err) return err } if err = task.ValidateTransitionTo(models.Task_Resolving); err != nil { logger.Error("invalid-state-transition", err) return err } now := db.clock.Now().UnixNano() _, err = db.update(logger, tx, tasksTable, SQLAttributes{ "state": models.Task_Resolving, "updated_at": now, }, "guid = ?", taskGuid, ) if err != nil { logger.Error("failed-updating-tasks", err) return db.convertSQLError(err) } return nil }) }
// RemoveDesiredLRP deletes the DesiredLRPSchedulingInfo and the DesiredLRPRunInfo // from the database. We delete DesiredLRPSchedulingInfo first because the system // uses it to determine wheter the lrp is present. In the event that only the // RunInfo fails to delete, the orphaned DesiredLRPRunInfo will be garbage // collected later by convergence. func (db *ETCDDB) RemoveDesiredLRP(logger lager.Logger, processGuid string) error { logger = logger.WithData(lager.Data{"process_guid": processGuid}) logger.Info("starting") defer logger.Info("complete") _, schedulingInfoErr := db.client.Delete(DesiredLRPSchedulingInfoSchemaPath(processGuid), true) schedulingInfoErr = ErrorFromEtcdError(logger, schedulingInfoErr) if schedulingInfoErr != nil && schedulingInfoErr != models.ErrResourceNotFound { logger.Error("failed-deleting-scheduling-info", schedulingInfoErr) return schedulingInfoErr } _, runInfoErr := db.client.Delete(DesiredLRPRunInfoSchemaPath(processGuid), true) runInfoErr = ErrorFromEtcdError(logger, runInfoErr) if runInfoErr != nil && runInfoErr != models.ErrResourceNotFound { logger.Error("failed-deleting-run-info", runInfoErr) return runInfoErr } if schedulingInfoErr == models.ErrResourceNotFound && runInfoErr == models.ErrResourceNotFound { // If neither component of the desired LRP exists, don't bother trying to delete running instances return models.ErrResourceNotFound } return nil }
// The cell calls this when the user requested to cancel the task // stagerTaskBBS will retry this repeatedly if it gets a StoreTimeout error (up to N seconds?) // Will fail if the task has already been cancelled or completed normally func (db *ETCDDB) CancelTask(logger lager.Logger, taskGuid string) (*models.Task, string, error) { logger = logger.WithData(lager.Data{"task_guid": taskGuid}) logger.Info("starting") defer logger.Info("finished") task, index, err := db.taskByGuidWithIndex(logger, taskGuid) if err != nil { logger.Error("failed-to-fetch-task", err) return nil, "", err } if err = task.ValidateTransitionTo(models.Task_Completed); err != nil { if task.State != models.Task_Pending { logger.Error("invalid-state-transition", err) return nil, "", err } } logger.Info("completing-task") cellID := task.CellId err = db.completeTask(logger, task, index, true, "task was cancelled", "") if err != nil { logger.Error("failed-completing-task", err) return nil, "", err } logger.Info("succeeded-completing-task") return task, cellID, nil }
func (db *SQLDB) ActualLRPGroupsByProcessGuid(logger lager.Logger, processGuid string) ([]*models.ActualLRPGroup, error) { logger = logger.WithData(lager.Data{"process_guid": processGuid}) logger.Debug("starting") defer logger.Debug("complete") return db.getActualLRPS(logger, "process_guid = ?", processGuid) }
func (db *SQLDB) ClaimActualLRP(logger lager.Logger, processGuid string, index int32, instanceKey *models.ActualLRPInstanceKey) (*models.ActualLRPGroup, *models.ActualLRPGroup, error) { logger = logger.WithData(lager.Data{"process_guid": processGuid, "index": index, "instance_key": instanceKey}) logger.Info("starting") defer logger.Info("complete") var beforeActualLRP models.ActualLRP var actualLRP *models.ActualLRP err := db.transact(logger, func(logger lager.Logger, tx *sql.Tx) error { var err error actualLRP, err = db.fetchActualLRPForUpdate(logger, processGuid, index, false, tx) if err != nil { logger.Error("failed-fetching-actual-lrp-for-share", err) return err } beforeActualLRP = *actualLRP if !actualLRP.AllowsTransitionTo(&actualLRP.ActualLRPKey, instanceKey, models.ActualLRPStateClaimed) { logger.Error("cannot-transition-to-claimed", nil, lager.Data{"from_state": actualLRP.State, "same_instance_key": actualLRP.ActualLRPInstanceKey.Equal(instanceKey)}) return models.ErrActualLRPCannotBeClaimed } if actualLRP.State == models.ActualLRPStateClaimed && actualLRP.ActualLRPInstanceKey.Equal(instanceKey) { return nil } actualLRP.ModificationTag.Increment() actualLRP.State = models.ActualLRPStateClaimed actualLRP.ActualLRPInstanceKey = *instanceKey actualLRP.PlacementError = "" actualLRP.ActualLRPNetInfo = models.ActualLRPNetInfo{} actualLRP.Since = db.clock.Now().UnixNano() _, err = db.update(logger, tx, actualLRPsTable, SQLAttributes{ "state": actualLRP.State, "cell_id": actualLRP.CellId, "instance_guid": actualLRP.InstanceGuid, "modification_tag_index": actualLRP.ModificationTag.Index, "placement_error": actualLRP.PlacementError, "since": actualLRP.Since, "net_info": []byte{}, }, "process_guid = ? AND instance_index = ? AND evacuating = ?", processGuid, index, false, ) if err != nil { logger.Error("failed-claiming-actual-lrp", err) return db.convertSQLError(err) } return nil }) return &models.ActualLRPGroup{Instance: &beforeActualLRP}, &models.ActualLRPGroup{Instance: actualLRP}, err }
func (db *ETCDDB) StartActualLRP(logger lager.Logger, key *models.ActualLRPKey, instanceKey *models.ActualLRPInstanceKey, netInfo *models.ActualLRPNetInfo) (*models.ActualLRPGroup, *models.ActualLRPGroup, error) { logger = logger.WithData(lager.Data{ "actual_lrp_key": key, "actual_lrp_instance_key": instanceKey, "net_info": netInfo, }) lrp, prevIndex, err := db.rawActualLRPByProcessGuidAndIndex(logger, key.ProcessGuid, key.Index) bbsErr := models.ConvertError(err) if bbsErr != nil { if bbsErr.Type == models.Error_ResourceNotFound { lrp, err := db.createRunningActualLRP(logger, key, instanceKey, netInfo) return nil, &models.ActualLRPGroup{Instance: lrp}, err } logger.Error("failed-to-get-actual-lrp", err) return nil, nil, err } beforeActualLRP := *lrp if lrp.ActualLRPKey.Equal(key) && lrp.ActualLRPInstanceKey.Equal(instanceKey) && lrp.ActualLRPNetInfo.Equal(netInfo) && lrp.State == models.ActualLRPStateRunning { lrpGroup := &models.ActualLRPGroup{Instance: lrp} return lrpGroup, lrpGroup, nil } if !lrp.AllowsTransitionTo(key, instanceKey, models.ActualLRPStateRunning) { logger.Error("failed-to-transition-actual-lrp-to-started", nil) return nil, nil, models.ErrActualLRPCannotBeStarted } logger.Info("starting") defer logger.Info("completed") lrp.ModificationTag.Increment() lrp.State = models.ActualLRPStateRunning lrp.Since = db.clock.Now().UnixNano() lrp.ActualLRPInstanceKey = *instanceKey lrp.ActualLRPNetInfo = *netInfo lrp.PlacementError = "" lrpData, serializeErr := db.serializeModel(logger, lrp) if serializeErr != nil { return nil, nil, serializeErr } _, err = db.client.CompareAndSwap(ActualLRPSchemaPath(key.ProcessGuid, key.Index), lrpData, 0, prevIndex) if err != nil { logger.Error("failed", err) return nil, nil, models.ErrActualLRPCannotBeStarted } return &models.ActualLRPGroup{Instance: &beforeActualLRP}, &models.ActualLRPGroup{Instance: lrp}, nil }
func (db *SQLDB) UnclaimActualLRP(logger lager.Logger, key *models.ActualLRPKey) (*models.ActualLRPGroup, *models.ActualLRPGroup, error) { logger = logger.WithData(lager.Data{"key": key}) var beforeActualLRP models.ActualLRP var actualLRP *models.ActualLRP processGuid := key.ProcessGuid index := key.Index err := db.transact(logger, func(logger lager.Logger, tx *sql.Tx) error { var err error actualLRP, err = db.fetchActualLRPForUpdate(logger, processGuid, index, false, tx) if err != nil { logger.Error("failed-fetching-actual-lrp-for-share", err) return err } beforeActualLRP = *actualLRP if actualLRP.State == models.ActualLRPStateUnclaimed { logger.Debug("already-unclaimed") return models.ErrActualLRPCannotBeUnclaimed } logger.Info("starting") defer logger.Info("complete") now := db.clock.Now().UnixNano() actualLRP.ModificationTag.Increment() actualLRP.State = models.ActualLRPStateUnclaimed actualLRP.ActualLRPInstanceKey.CellId = "" actualLRP.ActualLRPInstanceKey.InstanceGuid = "" actualLRP.Since = now actualLRP.ActualLRPNetInfo = models.ActualLRPNetInfo{} _, err = db.update(logger, tx, actualLRPsTable, SQLAttributes{ "state": actualLRP.State, "cell_id": actualLRP.CellId, "instance_guid": actualLRP.InstanceGuid, "modification_tag_index": actualLRP.ModificationTag.Index, "since": actualLRP.Since, "net_info": []byte{}, }, "process_guid = ? AND instance_index = ? AND evacuating = ?", processGuid, index, false, ) if err != nil { logger.Error("failed-to-unclaim-actual-lrp", err) return db.convertSQLError(err) } return nil }) return &models.ActualLRPGroup{Instance: &beforeActualLRP}, &models.ActualLRPGroup{Instance: actualLRP}, err }
func (db *ETCDDB) CrashActualLRP(logger lager.Logger, key *models.ActualLRPKey, instanceKey *models.ActualLRPInstanceKey, errorMessage string) (*models.ActualLRPGroup, *models.ActualLRPGroup, bool, error) { logger = logger.WithData(lager.Data{"actual_lrp_key": key, "actual_lrp_instance_key": instanceKey}) logger.Info("starting") lrp, prevIndex, err := db.rawActualLRPByProcessGuidAndIndex(logger, key.ProcessGuid, key.Index) if err != nil { logger.Error("failed-to-get-actual-lrp", err) return nil, nil, false, err } beforeActualLRP := *lrp latestChangeTime := time.Duration(db.clock.Now().UnixNano() - lrp.Since) var newCrashCount int32 if latestChangeTime > models.CrashResetTimeout && lrp.State == models.ActualLRPStateRunning { newCrashCount = 1 } else { newCrashCount = lrp.CrashCount + 1 } logger.Debug("retrieved-lrp") if !lrp.AllowsTransitionTo(key, instanceKey, models.ActualLRPStateCrashed) { logger.Error("failed-to-transition-to-crashed", nil, lager.Data{"from_state": lrp.State, "same_instance_key": lrp.ActualLRPInstanceKey.Equal(instanceKey)}) return nil, nil, false, models.ErrActualLRPCannotBeCrashed } lrp.State = models.ActualLRPStateCrashed lrp.Since = db.clock.Now().UnixNano() lrp.CrashCount = newCrashCount lrp.ActualLRPInstanceKey = models.ActualLRPInstanceKey{} lrp.ActualLRPNetInfo = models.EmptyActualLRPNetInfo() lrp.ModificationTag.Increment() lrp.CrashReason = errorMessage var immediateRestart bool if lrp.ShouldRestartImmediately(models.NewDefaultRestartCalculator()) { lrp.State = models.ActualLRPStateUnclaimed immediateRestart = true } lrpData, serializeErr := db.serializeModel(logger, lrp) if serializeErr != nil { return nil, nil, false, serializeErr } _, err = db.client.CompareAndSwap(ActualLRPSchemaPath(key.ProcessGuid, key.Index), lrpData, 0, prevIndex) if err != nil { logger.Error("failed", err) return nil, nil, false, models.ErrActualLRPCannotBeCrashed } logger.Info("succeeded") return &models.ActualLRPGroup{Instance: &beforeActualLRP}, &models.ActualLRPGroup{Instance: lrp}, immediateRestart, nil }
func (db *ETCDDB) DesiredLRPs(logger lager.Logger, filter models.DesiredLRPFilter) ([]*models.DesiredLRP, error) { logger = logger.WithData(lager.Data{"filter": filter}) logger.Info("start") defer logger.Info("complete") desireds, _, err := db.desiredLRPs(logger, filter) if err != nil { logger.Error("failed", err) } return desireds, err }
func (db *SQLDB) DesiredLRPByProcessGuid(logger lager.Logger, processGuid string) (*models.DesiredLRP, error) { logger = logger.WithData(lager.Data{"process_guid": processGuid}) logger.Debug("starting") defer logger.Debug("complete") row := db.one(logger, db.db, desiredLRPsTable, desiredLRPColumns, NoLockRow, "process_guid = ?", processGuid, ) return db.fetchDesiredLRP(logger, row) }
func (h *DesiredLRPHandler) UpdateDesiredLRP(logger lager.Logger, w http.ResponseWriter, req *http.Request) { logger = logger.Session("update-desired-lrp") request := &models.UpdateDesiredLRPRequest{} response := &models.DesiredLRPLifecycleResponse{} defer func() { exitIfUnrecoverable(logger, h.exitChan, response.Error) }() defer writeResponse(w, response) err := parseRequest(logger, req, request) if err != nil { logger.Error("failed-parsing-request", err) response.Error = models.ConvertError(err) return } logger = logger.WithData(lager.Data{"guid": request.ProcessGuid}) logger.Debug("updating-desired-lrp") beforeDesiredLRP, err := h.desiredLRPDB.UpdateDesiredLRP(logger, request.ProcessGuid, request.Update) if err != nil { logger.Debug("failed-updating-desired-lrp") response.Error = models.ConvertError(err) return } logger.Debug("completed-updating-desired-lrp") desiredLRP, err := h.desiredLRPDB.DesiredLRPByProcessGuid(logger, request.ProcessGuid) if err != nil { logger.Error("failed-fetching-desired-lrp", err) return } if request.Update.Instances != nil { logger.Debug("updating-lrp-instances") previousInstanceCount := beforeDesiredLRP.Instances requestedInstances := *request.Update.Instances - previousInstanceCount logger = logger.WithData(lager.Data{"instances_delta": requestedInstances}) if requestedInstances > 0 { logger.Debug("increasing-the-instances") schedulingInfo := desiredLRP.DesiredLRPSchedulingInfo() h.startInstanceRange(logger, previousInstanceCount, *request.Update.Instances, &schedulingInfo) } if requestedInstances < 0 { logger.Debug("decreasing-the-instances") numExtraActualLRP := previousInstanceCount + requestedInstances h.stopInstancesFrom(logger, request.ProcessGuid, int(numExtraActualLRP)) } } go h.desiredHub.Emit(models.NewDesiredLRPChangedEvent(beforeDesiredLRP, desiredLRP)) }
func (db *ETCDDB) RemoveActualLRP(logger lager.Logger, processGuid string, index int32, instanceKey *models.ActualLRPInstanceKey) error { logger = logger.WithData(lager.Data{"process_guid": processGuid, "index": index, "instance_key": instanceKey}) lrp, prevIndex, err := db.rawActualLRPByProcessGuidAndIndex(logger, processGuid, index) if err != nil { return err } if instanceKey != nil && !lrp.ActualLRPInstanceKey.Equal(instanceKey) { logger.Debug("not-found", lager.Data{"actual_lrp_instance_key": lrp.ActualLRPInstanceKey, "instance_key": instanceKey}) return models.ErrResourceNotFound } return db.removeActualLRP(logger, lrp, prevIndex) }
func (db *SQLDB) UpdateDesiredLRP(logger lager.Logger, processGuid string, update *models.DesiredLRPUpdate) (*models.DesiredLRP, error) { logger = logger.WithData(lager.Data{"process_guid": processGuid}) logger.Info("starting") defer logger.Info("complete") var beforeDesiredLRP *models.DesiredLRP err := db.transact(logger, func(logger lager.Logger, tx *sql.Tx) error { var err error row := db.one(logger, tx, desiredLRPsTable, desiredLRPColumns, LockRow, "process_guid = ?", processGuid, ) beforeDesiredLRP, err = db.fetchDesiredLRP(logger, row) if err != nil { logger.Error("failed-lock-desired", err) return err } updateAttributes := SQLAttributes{"modification_tag_index": beforeDesiredLRP.ModificationTag.Index + 1} if update.Annotation != nil { updateAttributes["annotation"] = *update.Annotation } if update.Instances != nil { updateAttributes["instances"] = *update.Instances } if update.Routes != nil { encodedData, err := db.encodeRouteData(logger, update.Routes) if err != nil { return err } updateAttributes["routes"] = encodedData } _, err = db.update(logger, tx, desiredLRPsTable, updateAttributes, `process_guid = ?`, processGuid) if err != nil { logger.Error("failed-executing-query", err) return db.convertSQLError(err) } return nil }) return beforeDesiredLRP, err }
func (db *SQLDB) FailActualLRP(logger lager.Logger, key *models.ActualLRPKey, placementError string) (*models.ActualLRPGroup, *models.ActualLRPGroup, error) { logger = logger.WithData(lager.Data{"actual_lrp_key": key, "placement_error": placementError}) logger.Info("starting") defer logger.Info("complete") var beforeActualLRP models.ActualLRP var actualLRP *models.ActualLRP err := db.transact(logger, func(logger lager.Logger, tx *sql.Tx) error { var err error actualLRP, err = db.fetchActualLRPForUpdate(logger, key.ProcessGuid, key.Index, false, tx) if err != nil { logger.Error("failed-to-get-actual-lrp", err) return err } beforeActualLRP = *actualLRP if actualLRP.State != models.ActualLRPStateUnclaimed { logger.Error("cannot-fail-actual-lrp", nil, lager.Data{"from_state": actualLRP.State}) return models.ErrActualLRPCannotBeFailed } now := db.clock.Now().UnixNano() actualLRP.ModificationTag.Increment() actualLRP.PlacementError = placementError actualLRP.Since = now evacuating := false _, err = db.update(logger, tx, actualLRPsTable, SQLAttributes{ "modification_tag_index": actualLRP.ModificationTag.Index, "placement_error": truncateString(actualLRP.PlacementError, 1024), "since": actualLRP.Since, }, "process_guid = ? AND instance_index = ? AND evacuating = ?", key.ProcessGuid, key.Index, evacuating, ) if err != nil { logger.Error("failed-failing-actual-lrp", err) return db.convertSQLError(err) } return nil }) return &models.ActualLRPGroup{Instance: &beforeActualLRP}, &models.ActualLRPGroup{Instance: actualLRP}, err }
func (db *ETCDDB) ClaimActualLRP(logger lager.Logger, processGuid string, index int32, instanceKey *models.ActualLRPInstanceKey) (*models.ActualLRPGroup, *models.ActualLRPGroup, error) { logger = logger.WithData(lager.Data{"process_guid": processGuid, "index": index, "actual_lrp_instance_key": instanceKey}) logger.Info("starting") lrp, prevIndex, err := db.rawActualLRPByProcessGuidAndIndex(logger, processGuid, index) if err != nil { logger.Error("failed", err) return nil, nil, err } beforeActualLRP := *lrp if !lrp.AllowsTransitionTo(&lrp.ActualLRPKey, instanceKey, models.ActualLRPStateClaimed) { return nil, nil, models.ErrActualLRPCannotBeClaimed } if lrp.State == models.ActualLRPStateClaimed && lrp.ActualLRPInstanceKey.Equal(instanceKey) { return &models.ActualLRPGroup{Instance: &beforeActualLRP}, &models.ActualLRPGroup{Instance: lrp}, nil } lrp.PlacementError = "" lrp.State = models.ActualLRPStateClaimed lrp.ActualLRPInstanceKey = *instanceKey lrp.ActualLRPNetInfo = models.ActualLRPNetInfo{} lrp.ModificationTag.Increment() lrp.Since = db.clock.Now().UnixNano() err = lrp.Validate() if err != nil { logger.Error("failed", err) return nil, nil, models.NewError(models.Error_InvalidRecord, err.Error()) } lrpData, serializeErr := db.serializeModel(logger, lrp) if serializeErr != nil { return nil, nil, serializeErr } _, err = db.client.CompareAndSwap(ActualLRPSchemaPath(processGuid, index), lrpData, 0, prevIndex) if err != nil { logger.Error("compare-and-swap-failed", err) return nil, nil, models.ErrActualLRPCannotBeClaimed } logger.Info("succeeded") return &models.ActualLRPGroup{Instance: &beforeActualLRP}, &models.ActualLRPGroup{Instance: lrp}, nil }
func (db *SQLDB) ActualLRPGroupByProcessGuidAndIndex(logger lager.Logger, processGuid string, index int32) (*models.ActualLRPGroup, error) { logger = logger.WithData(lager.Data{"process_guid": processGuid, "index": index}) logger.Debug("starting") defer logger.Debug("complete") groups, err := db.getActualLRPS(logger, "process_guid = ? AND instance_index = ?", processGuid, index) if err != nil { return nil, err } if len(groups) == 0 { logger.Error("failed-to-find-actual-lrp-group", models.ErrResourceNotFound) return nil, models.ErrResourceNotFound } return groups[0], nil }
func (db *SQLDB) DesiredLRPSchedulingInfos(logger lager.Logger, filter models.DesiredLRPFilter) ([]*models.DesiredLRPSchedulingInfo, error) { logger = logger.WithData(lager.Data{"filter": filter}) logger.Debug("start") defer logger.Debug("complete") var wheres []string var values []interface{} if filter.Domain != "" { wheres = append(wheres, "domain = ?") values = append(values, filter.Domain) } results := []*models.DesiredLRPSchedulingInfo{} err := db.transact(logger, func(logger lager.Logger, tx *sql.Tx) error { rows, err := db.all(logger, tx, desiredLRPsTable, schedulingInfoColumns, NoLockRow, strings.Join(wheres, " AND "), values..., ) if err != nil { logger.Error("failed-query", err) return db.convertSQLError(err) } defer rows.Close() for rows.Next() { desiredLRPSchedulingInfo, err := db.fetchDesiredLRPSchedulingInfo(logger, rows) if err != nil { logger.Error("failed-reading-row", err) continue } results = append(results, desiredLRPSchedulingInfo) } if rows.Err() != nil { logger.Error("failed-fetching-row", rows.Err()) return db.convertSQLError(rows.Err()) } return nil }) return results, err }
func (db *SQLDB) ActualLRPGroups(logger lager.Logger, filter models.ActualLRPFilter) ([]*models.ActualLRPGroup, error) { logger = logger.WithData(lager.Data{"filter": filter}) logger.Debug("starting") defer logger.Debug("complete") var wheres []string var values []interface{} if filter.Domain != "" { wheres = append(wheres, "domain = ?") values = append(values, filter.Domain) } if filter.CellID != "" { wheres = append(wheres, "cell_id = ?") values = append(values, filter.CellID) } return db.getActualLRPS(logger, strings.Join(wheres, " AND "), values...) }
func (db *SQLDB) CreateUnclaimedActualLRP(logger lager.Logger, key *models.ActualLRPKey) (*models.ActualLRPGroup, error) { logger = logger.WithData(lager.Data{"key": key}) logger.Info("starting") defer logger.Info("complete") guid, err := db.guidProvider.NextGUID() if err != nil { logger.Error("failed-to-generate-guid", err) return nil, models.ErrGUIDGeneration } now := db.clock.Now().UnixNano() err = db.transact(logger, func(logger lager.Logger, tx *sql.Tx) error { _, err := db.insert(logger, db.db, actualLRPsTable, SQLAttributes{ "process_guid": key.ProcessGuid, "instance_index": key.Index, "domain": key.Domain, "state": models.ActualLRPStateUnclaimed, "since": now, "net_info": []byte{}, "modification_tag_epoch": guid, "modification_tag_index": 0, }, ) return err }) if err != nil { logger.Error("failed-to-create-unclaimed-actual-lrp", err) return nil, db.convertSQLError(err) } return &models.ActualLRPGroup{ Instance: &models.ActualLRP{ ActualLRPKey: *key, State: models.ActualLRPStateUnclaimed, Since: now, ModificationTag: models.ModificationTag{Epoch: guid, Index: 0}, }, }, nil }
func (db *SQLDB) DesiredLRPByProcessGuid(logger lager.Logger, processGuid string) (*models.DesiredLRP, error) { logger = logger.WithData(lager.Data{"process_guid": processGuid}) logger.Debug("starting") defer logger.Debug("complete") var desiredLRP *models.DesiredLRP err := db.transact(logger, func(logger lager.Logger, tx *sql.Tx) error { var err error row := db.one(logger, tx, desiredLRPsTable, desiredLRPColumns, NoLockRow, "process_guid = ?", processGuid, ) desiredLRP, err = db.fetchDesiredLRP(logger, row) return err }) return desiredLRP, err }
func (db *SQLDB) RemoveDesiredLRP(logger lager.Logger, processGuid string) error { logger = logger.WithData(lager.Data{"process_guid": processGuid}) logger.Info("starting") defer logger.Info("complete") return db.transact(logger, func(logger lager.Logger, tx *sql.Tx) error { err := db.lockDesiredLRPByGuidForUpdate(logger, processGuid, tx) if err != nil { logger.Error("failed-lock-desired", err) return err } _, err = db.delete(logger, tx, desiredLRPsTable, "process_guid = ?", processGuid) if err != nil { logger.Error("failed-deleting-from-db", err) return db.convertSQLError(err) } return nil }) }
// The stager calls this when it wants to signal that it has received a completion and is handling it // stagerTaskBBS will retry this repeatedly if it gets a StoreTimeout error (up to N seconds?) // If this fails, the stager should assume that someone else is handling the completion and should bail func (db *ETCDDB) DeleteTask(logger lager.Logger, taskGuid string) error { logger = logger.WithData(lager.Data{"task_guid": taskGuid}) logger.Info("starting") defer logger.Info("finished") task, _, err := db.taskByGuidWithIndex(logger, taskGuid) if err != nil { logger.Error("failed-getting-task", err) return err } if task.State != models.Task_Resolving { err = models.NewTaskTransitionError(task.State, models.Task_Resolving) logger.Error("invalid-state-transition", err) return err } _, err = db.client.Delete(TaskSchemaPathByGuid(taskGuid), false) return ErrorFromEtcdError(logger, err) }
func (db *ETCDDB) DesiredLRPSchedulingInfos(logger lager.Logger, filter models.DesiredLRPFilter) ([]*models.DesiredLRPSchedulingInfo, error) { logger = logger.WithData(lager.Data{"filter": filter}) logger.Info("start") defer logger.Info("complete") root, err := db.fetchRecursiveRaw(logger, DesiredLRPSchedulingInfoSchemaRoot) bbsErr := models.ConvertError(err) if bbsErr != nil { if bbsErr.Type == models.Error_ResourceNotFound { return []*models.DesiredLRPSchedulingInfo{}, nil } return nil, err } schedulingInfoMap, _ := db.deserializeScheduleInfos(logger, root.Nodes, filter) schedulingInfos := make([]*models.DesiredLRPSchedulingInfo, 0, len(schedulingInfoMap)) for _, schedulingInfo := range schedulingInfoMap { schedulingInfos = append(schedulingInfos, schedulingInfo) } return schedulingInfos, nil }
func (db *ETCDDB) StartTask(logger lager.Logger, taskGuid, cellID string) (bool, error) { logger.Debug("starting") defer logger.Debug("finished") task, index, err := db.taskByGuidWithIndex(logger, taskGuid) if err != nil { logger.Error("failed-to-fetch-task", err) return false, err } logger = logger.WithData(lager.Data{"task": task.LagerData()}) if task.State == models.Task_Running && task.CellId == cellID { logger.Info("task-already-running") return false, nil } if err = task.ValidateTransitionTo(models.Task_Running); err != nil { return false, err } task.UpdatedAt = db.clock.Now().UnixNano() task.State = models.Task_Running task.CellId = cellID value, err := db.serializeModel(logger, task) if err != nil { return false, err } _, err = db.client.CompareAndSwap(TaskSchemaPathByGuid(taskGuid), value, NO_TTL, index) if err != nil { logger.Error("failed-persisting-task", err) return false, ErrorFromEtcdError(logger, err) } return true, nil }