func (k *inMemoryRegistry) UpdateStatus(status *mesos.TaskStatus) (*T, StateType) {
	taskId := status.GetTaskId().GetValue()

	k.rw.Lock()
	defer k.rw.Unlock()
	task, state := k._get(taskId)

	switch status.GetState() {
	case mesos.TaskState_TASK_STAGING:
		k.handleTaskStaging(task, state, status)
	case mesos.TaskState_TASK_STARTING:
		k.handleTaskStarting(task, state, status)
	case mesos.TaskState_TASK_RUNNING:
		k.handleTaskRunning(task, state, status)
	case mesos.TaskState_TASK_FINISHED:
		k.handleTaskFinished(task, state, status)
	case mesos.TaskState_TASK_FAILED:
		k.handleTaskFailed(task, state, status)
	case mesos.TaskState_TASK_ERROR:
		k.handleTaskError(task, state, status)
	case mesos.TaskState_TASK_KILLED:
		k.handleTaskKilled(task, state, status)
	case mesos.TaskState_TASK_LOST:
		k.handleTaskLost(task, state, status)
	default:
		log.Warningf("unhandled status update for task: %v", taskId)
	}

	return task.Clone(), state
}
func (sched *NoneScheduler) StatusUpdate(driver sched.SchedulerDriver, status *mesos.TaskStatus) {
	taskId := status.GetTaskId().GetValue()
	log.Infoln("Status update: task", taskId, "is in state", status.State.Enum().String())

	c := sched.queue.GetCommandById(taskId)
	if c == nil {
		log.Errorln("Unable to find command for task", taskId)
		driver.Abort()
		return // Abort is asynchronous; return here to avoid dereferencing a nil command below
	}
	if c.Status.GetState() == status.GetState() {
		// ignore repeated status updates
		return
	}
	c.Status = status

	// send status update to CommandHandler
	if status.GetState() == mesos.TaskState_TASK_RUNNING {
		sched.handler.CommandRunning(c)
	} else if status.GetState() == mesos.TaskState_TASK_FINISHED {
		sched.handler.CommandEnded(c)
		sched.handler.CommandFinished(c)
	} else if status.GetState() == mesos.TaskState_TASK_FAILED ||
		status.GetState() == mesos.TaskState_TASK_LOST ||
		status.GetState() == mesos.TaskState_TASK_KILLED {
		sched.handler.CommandEnded(c)
		sched.handler.CommandFailed(c)
	}

	// stop if Commands channel was closed and all tasks are finished
	if sched.queue.Closed() && !sched.handler.HasRunningTasks() {
		log.Infoln("All tasks finished, stopping framework.")
		sched.handler.FinishAllCommands()
		driver.Stop(false)
	}
}
func (sched *Scheduler) StatusUpdate(driver sched.SchedulerDriver, status *mesos.TaskStatus) {
	log.Infoln("Status update: task", status.TaskId.GetValue(), " is in state ", status.State.Enum().String())

	if status.GetState() == mesos.TaskState_TASK_FINISHED {
		sched.tasksFinished++
		log.Infof("%v of %v tasks finished.", sched.tasksFinished, sched.totalTasks)
	}

	//TODO if a job is finished, failed, error, lost, killed
	// figure out how this impacts dependent jobs and update job graph

	/*
		//never shut down framework!
		if sched.tasksFinished >= sched.totalTasks {
			log.Infoln("Total tasks completed, stopping framework.")
			driver.Stop(false)
		}
	*/

	/*
		if status.GetState() == mesos.TaskState_TASK_LOST ||
			status.GetState() == mesos.TaskState_TASK_KILLED ||
			status.GetState() == mesos.TaskState_TASK_FAILED {
			log.Infoln(
				"Aborting because task", status.TaskId.GetValue(),
				"is in unexpected state", status.State.String(),
				"with message", status.GetMessage(),
			)
			driver.Abort()
		}
	*/
}
// mesos.Scheduler interface method.
// Invoked when the status of a task has changed.
func (this *TransformScheduler) StatusUpdate(driver scheduler.SchedulerDriver, status *mesos.TaskStatus) {
	fmt.Printf("Status update: task %s is in state %s\n", status.TaskId.GetValue(), status.State.Enum().String())

	if status.GetState() == mesos.TaskState_TASK_LOST ||
		status.GetState() == mesos.TaskState_TASK_FAILED ||
		status.GetState() == mesos.TaskState_TASK_FINISHED {
		this.removeTask(status.GetTaskId())
		this.decRunningInstances()
	}
}
func (sched *Scheduler) StatusUpdate(driver sched.SchedulerDriver, status *mesos.TaskStatus) {
	log.Infoln("Status update: task", status.TaskId.GetValue(), " is in state ", status.State.Enum().String())

	if status.GetState() == mesos.TaskState_TASK_FINISHED {
		sched.tasksFinished++
		log.Infof("%v of %v tasks finished.", sched.tasksFinished, sched.totalTasks)
	}
}
// StatusUpdate is called when a status update message is sent to the scheduler.
func (k *KubernetesScheduler) StatusUpdate(driver bindings.SchedulerDriver, taskStatus *mesos.TaskStatus) {
	source, reason := "none", "none"
	if taskStatus.Source != nil {
		source = (*taskStatus.Source).String()
	}
	if taskStatus.Reason != nil {
		reason = (*taskStatus.Reason).String()
	}
	taskState := taskStatus.GetState()
	metrics.StatusUpdates.WithLabelValues(source, reason, taskState.String()).Inc()

	log.Infof(
		"task status update %q from %q for task %q on slave %q executor %q for reason %q",
		taskState.String(),
		source,
		taskStatus.TaskId.GetValue(),
		taskStatus.SlaveId.GetValue(),
		taskStatus.ExecutorId.GetValue(),
		reason)

	switch taskState {
	case mesos.TaskState_TASK_RUNNING, mesos.TaskState_TASK_FINISHED, mesos.TaskState_TASK_STARTING, mesos.TaskState_TASK_STAGING:
		if _, state := k.taskRegistry.UpdateStatus(taskStatus); state == podtask.StateUnknown {
			if taskState != mesos.TaskState_TASK_FINISHED {
				//TODO(jdef) what if I receive this after a TASK_LOST or TASK_KILLED?
				//I don't want to reincarnate then.. TASK_LOST is a special case because
				//the master is stateless and there are scenarios where I may get TASK_LOST
				//followed by TASK_RUNNING.
				//TODO(jdef) consider running this asynchronously since there are API server
				//calls that may be made
				k.reconcileNonTerminalTask(driver, taskStatus)
			} // else, we don't really care about FINISHED tasks that aren't registered
			return
		}
		if _, exists := k.slaves.getSlave(taskStatus.GetSlaveId().GetValue()); !exists {
			// a registered task has an update reported by a slave that we don't recognize.
			// this should never happen! So we don't reconcile it.
			log.Errorf("Ignore status %+v because the slave does not exist", taskStatus)
			return
		}
	case mesos.TaskState_TASK_FAILED:
		if task, _ := k.taskRegistry.UpdateStatus(taskStatus); task != nil {
			if task.Has(podtask.Launched) && !task.Has(podtask.Bound) {
				go k.plugin.reconcilePod(task.Pod)
				return
			}
		} else {
			// unknown task failed, not much we can do about it
			return
		}
		// last-ditch effort to reconcile our records
		fallthrough
	case mesos.TaskState_TASK_LOST, mesos.TaskState_TASK_KILLED:
		k.reconcileTerminalTask(driver, taskStatus)
	}
}
func (ctx *RunOnceApplicationContext) updateTaskState(status *mesos.TaskStatus) {
	for _, task := range ctx.tasks {
		if task.TaskID == status.GetTaskId().GetValue() {
			task.State = status.GetState()
			return
		}
	}

	framework.Logger.Warn("Got unexpected status update for unknown task with ID %s", status.GetTaskId().GetValue())
}
func (sched *Scheduler) StatusUpdate(driver sched.SchedulerDriver, status *mesos.TaskStatus) {
	if glog.V(1) {
		glog.Infof("status update from task %s in state %s under executor %s on slave %s: %s",
			status.GetTaskId().GetValue(),
			status.GetState(),
			status.GetExecutorId().GetValue(),
			status.GetSlaveId().GetValue(),
			status.GetMessage(),
		)
	}
}
// StatusUpdate handles status update messages received from the Mesos master.
//
// Currently this method only logs status updates. This might change in the future.
func (bw *BasicWorker) StatusUpdate(driver scheduler.SchedulerDriver, status *mesos.TaskStatus) {
	taskId := status.TaskId.GetValue()
	taskStatus := status.GetState()
	log.Println("Task", taskId, "is in state", taskStatus.String())

	switch taskStatus {
	case mesos.TaskState_TASK_RUNNING:
		log.Printf("Marking task %s as %s", taskId, taurus.RUNNING)
	case mesos.TaskState_TASK_KILLED, mesos.TaskState_TASK_FINISHED, mesos.TaskState_TASK_FAILED, mesos.TaskState_TASK_LOST:
		log.Printf("Marking task %s as %s", taskId, taurus.STOPPED)
	}
}
func statusString(status *mesos.TaskStatus) string {
	s := fmt.Sprintf("%s %s slave: %s", status.GetTaskId().GetValue(), status.GetState().String(), idString(status.GetSlaveId().GetValue()))

	if status.GetState() != mesos.TaskState_TASK_RUNNING {
		s += " reason: " + status.GetReason().String()
	}

	if status.GetMessage() != "" {
		s += " message: " + status.GetMessage()
	}

	return s
}
func (sched *SdcScheduler) StatusUpdate(driver sched.SchedulerDriver, status *mesos.TaskStatus) {
	log.Infoln("Status update: task", status.TaskId.GetValue(), " is in state ", status.State.Enum().String())

	if status.GetState() == mesos.TaskState_TASK_FINISHED {
		sched.tasksFinished++
		// Calling KillTask here is detected as TASK_LOST and stops the framework.
		// driver.KillTask(status.TaskId)
		// log.Infoln("!! Status update: task", status.TaskId.GetValue(), " is in state ", status.State.Enum().String())
		// return
	}

	if sched.tasksFinished >= sched.totalTasks {
		// log.Infoln("Total tasks completed, stopping framework.")
		log.Infoln("Total tasks completed.")
		sched.tasksFinished = 0
		sched.totalTasks = 0
		sched.tasksLaunched = 0
		// driver.Stop(false)
	}

	if status.GetState() == mesos.TaskState_TASK_LOST ||
		status.GetState() == mesos.TaskState_TASK_KILLED ||
		status.GetState() == mesos.TaskState_TASK_FAILED ||
		status.GetState() == mesos.TaskState_TASK_ERROR {
		log.Infoln(
			"Aborting because task", status.TaskId.GetValue(),
			"is in unexpected state", status.State.String(),
			"with message", status.GetMessage(),
		)
		driver.Abort()
	}
}
func (sched *MesosRunonceScheduler) StatusUpdate(driver sched.SchedulerDriver, status *mesos.TaskStatus) {
	log.V(1).Infoln("Status update: task", status.TaskId.GetValue(), " is in state ", status.State.Enum().String())
	eventCh <- status

	if status.GetState() == mesos.TaskState_TASK_FINISHED {
		sched.tasksFinished++
	}
	if sched.tasksFinished >= sched.totalTasks {
		log.V(1).Infoln("Total tasks completed, stopping framework.")
		driver.Stop(false)
	}

	if status.GetState() == mesos.TaskState_TASK_LOST ||
		status.GetState() == mesos.TaskState_TASK_KILLED ||
		status.GetState() == mesos.TaskState_TASK_FAILED ||
		status.GetState() == mesos.TaskState_TASK_ERROR {
		exitStatus = 1
		log.Warningf("mesos TaskStatus: %v", status)
		driver.Stop(false)
		log.Errorln(
			"Aborting because task", status.TaskId.GetValue(),
			"is in unexpected state", status.State.String(),
			"with message", status.GetMessage(),
		)
	}
}
func testErrorStatus(t *testing.T, ctx *RunOnceApplicationContext, driver *mesostest.MockSchedulerDriver, status *mesos.TaskStatus) {
	appDone := ctx.StatusUpdate(driver, status)
	So(appDone, ShouldBeTrue)

	select {
	case sts := <-ctx.StatusChan:
		So(sts.Error, ShouldNotBeNil)
		So(sts.Error.Error(), ShouldContainSubstring, "failed to run on host")
	default:
		t.Fail()
	}

	So(ctx.tasks[0].State, ShouldEqual, status.GetState())
}
func (s *StackDeployScheduler) StatusUpdate(driver scheduler.SchedulerDriver, status *mesos.TaskStatus) {
	Logger.Info("[StatusUpdate] %s", pretty.Status(status))

	if status.GetState() == mesos.TaskState_TASK_FINISHED {
		driver.ReviveOffers()
	}

	for _, runner := range MesosTaskRunners {
		if runner.StatusUpdate(driver, status) {
			return
		}
	}

	Logger.Warn("Received status update that was not handled by any Mesos Task Runner: %s", pretty.Status(status))
}
func (s *Scheduler) StatusUpdate(driver scheduler.SchedulerDriver, status *mesos.TaskStatus) {
	Logger.Infof("[StatusUpdate] %s", pretty.Status(status))

	id := s.idFromTaskId(status.GetTaskId().GetValue())

	switch status.GetState() {
	case mesos.TaskState_TASK_RUNNING:
		s.onTaskStarted(id, status)
	case mesos.TaskState_TASK_LOST, mesos.TaskState_TASK_FAILED, mesos.TaskState_TASK_ERROR:
		s.onTaskFailed(id, status)
	case mesos.TaskState_TASK_FINISHED, mesos.TaskState_TASK_KILLED:
		s.onTaskFinished(id, status)
	default:
		Logger.Warnf("Got unexpected task state %s for task %s", pretty.Status(status), id)
	}

	s.cluster.Save()
}
func (s *Scheduler) StatusUpdate(driver scheduler.SchedulerDriver, status *mesos.TaskStatus) {
	Logger.Infof("[StatusUpdate] %s", statusString(status))

	slave := s.slaveFromTaskId(status.GetTaskId().GetValue())

	if status.GetState() == mesos.TaskState_TASK_FAILED ||
		status.GetState() == mesos.TaskState_TASK_KILLED ||
		status.GetState() == mesos.TaskState_TASK_LOST ||
		status.GetState() == mesos.TaskState_TASK_ERROR ||
		status.GetState() == mesos.TaskState_TASK_FINISHED {
		s.cluster.Remove(slave)
	}
}
func (k *framework) reconcileTerminalTask(driver bindings.SchedulerDriver, taskStatus *mesos.TaskStatus) {
	task, state := k.sched.Tasks().UpdateStatus(taskStatus)

	if (state == podtask.StateRunning || state == podtask.StatePending) &&
		((taskStatus.GetSource() == mesos.TaskStatus_SOURCE_MASTER && taskStatus.GetReason() == mesos.TaskStatus_REASON_RECONCILIATION) ||
			(taskStatus.GetSource() == mesos.TaskStatus_SOURCE_SLAVE && taskStatus.GetReason() == mesos.TaskStatus_REASON_EXECUTOR_TERMINATED) ||
			(taskStatus.GetSource() == mesos.TaskStatus_SOURCE_SLAVE && taskStatus.GetReason() == mesos.TaskStatus_REASON_EXECUTOR_UNREGISTERED) ||
			(taskStatus.GetSource() == mesos.TaskStatus_SOURCE_EXECUTOR && taskStatus.GetMessage() == messages.ContainersDisappeared) ||
			(taskStatus.GetSource() == mesos.TaskStatus_SOURCE_EXECUTOR && taskStatus.GetMessage() == messages.KubeletPodLaunchFailed) ||
			(taskStatus.GetSource() == mesos.TaskStatus_SOURCE_EXECUTOR && taskStatus.GetMessage() == messages.TaskKilled && !task.Has(podtask.Deleted))) {
		//--
		// pod-task has metadata that refers to:
		// (1) a task that Mesos no longer knows about, or else
		// (2) a pod that the Kubelet will never report as "failed"
		// (3) a pod that the kubeletExecutor reported as lost (likely due to docker daemon crash/restart)
		// (4) a pod that the kubeletExecutor reported as lost because the kubelet didn't manage to launch it (in time)
		// (5) a pod that the kubeletExecutor killed, but the scheduler didn't ask for that (maybe killed by the master)
		// For now, destroy the pod and hope that there's a replication controller backing it up.
		// TODO(jdef) for case #2 don't delete the pod, just update its status to Failed
		pod := &task.Pod
		log.Warningf("deleting rogue pod %v/%v for lost task %v", pod.Namespace, pod.Name, task.ID)
		if err := k.client.Core().Pods(pod.Namespace).Delete(pod.Name, api.NewDeleteOptions(0)); err != nil && !errors.IsNotFound(err) {
			log.Errorf("failed to delete pod %v/%v for terminal task %v: %v", pod.Namespace, pod.Name, task.ID, err)
		}
	} else if taskStatus.GetReason() == mesos.TaskStatus_REASON_EXECUTOR_TERMINATED || taskStatus.GetReason() == mesos.TaskStatus_REASON_EXECUTOR_UNREGISTERED {
		// attempt to prevent dangling pods in the pod and task registries
		log.V(1).Infof("request explicit reconciliation to clean up for task %v after executor reported (terminated/unregistered)", taskStatus.TaskId.GetValue())
		k.tasksReconciler.RequestExplicit()
	} else if taskStatus.GetState() == mesos.TaskState_TASK_LOST && state == podtask.StateRunning && taskStatus.ExecutorId != nil && taskStatus.SlaveId != nil {
		//TODO(jdef) this may not be meaningful once we have proper checkpointing and master detection
		//If we're reconciling and receive this then the executor may be
		//running a task that we need it to kill. It's possible that the framework
		//is unrecognized by the master at this point, so KillTask is not guaranteed
		//to do anything. The underlying driver transport may be able to send a
		//FrameworkMessage directly to the slave to terminate the task.
		log.V(2).Infof("forwarding TASK_LOST message to executor %v on slave %v", taskStatus.ExecutorId, taskStatus.SlaveId)
		data := fmt.Sprintf("%s:%s", messages.TaskLost, task.ID) //TODO(jdef) use a real message type
		if _, err := driver.SendFrameworkMessage(taskStatus.ExecutorId, taskStatus.SlaveId, data); err != nil {
			log.Error(err.Error())
		}
	}
}
func Status(status *mesos.TaskStatus) string {
	var buffer bytes.Buffer
	buffer.WriteString(fmt.Sprintf("%s %s", status.GetTaskId().GetValue(), status.GetState().String()))

	if status.GetSlaveId() != nil && status.GetSlaveId().GetValue() != "" {
		buffer.WriteString(" slave: ")
		buffer.WriteString(ID(status.GetSlaveId().GetValue()))
	}

	if status.GetState() != mesos.TaskState_TASK_RUNNING {
		buffer.WriteString(" reason: ")
		buffer.WriteString(status.GetReason().String())
	}

	if status.GetMessage() != "" {
		buffer.WriteString(" message: ")
		buffer.WriteString(status.GetMessage())
	}

	return buffer.String()
}
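// A minimal sketch (not from the original sources) of exercising the Status formatter
// above with a hand-built update, assuming it lives in the same package as Status and
// that the generated mesos types and the protobuf "proto" helpers are imported. The
// IDs, message, and the exampleStatusOutput name are hypothetical.
func exampleStatusOutput() {
	status := &mesos.TaskStatus{
		TaskId:  &mesos.TaskID{Value: proto.String("example-task-1")},
		SlaveId: &mesos.SlaveID{Value: proto.String("example-slave-1")},
		State:   mesos.TaskState_TASK_FAILED.Enum(),
		Message: proto.String("container exited with status 1"),
	}
	// Prints something like:
	// "example-task-1 TASK_FAILED slave: <id> reason: <reason> message: container exited with status 1"
	fmt.Println(Status(status))
}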
// SendStatusUpdate sends status updates to the slave.
func (driver *MesosExecutorDriver) SendStatusUpdate(taskStatus *mesosproto.TaskStatus) (mesosproto.Status, error) {
	log.V(3).Infoln("Sending task status update: ", taskStatus.String())

	if stat := driver.Status(); stat != mesosproto.Status_DRIVER_RUNNING {
		return stat, fmt.Errorf("Unable to SendStatusUpdate, expecting driver.status %s, but got %s", mesosproto.Status_DRIVER_RUNNING, stat)
	}

	if taskStatus.GetState() == mesosproto.TaskState_TASK_STAGING {
		err := fmt.Errorf("Executor is not allowed to send TASK_STAGING status update. Aborting!")
		log.Errorln(err)
		if err0 := driver.stop(mesosproto.Status_DRIVER_ABORTED); err0 != nil {
			log.Errorln("Error while stopping the driver", err0)
		}
		return driver.Status(), err
	}

	// Set up status update.
	update := driver.makeStatusUpdate(taskStatus)
	log.Infof("Executor sending status update %v\n", update.String())

	// Capture the status update.
	driver.lock.Lock()
	driver.updates[uuid.UUID(update.GetUuid()).String()] = update
	driver.lock.Unlock()

	// Put the status update in the message.
	message := &mesosproto.StatusUpdateMessage{
		Update: update,
		Pid:    proto.String(driver.self.String()),
	}

	// Send the message.
	if err := driver.send(driver.slaveUPID, message); err != nil {
		log.Errorf("Failed to send %v: %v\n", message, err)
		return driver.status, err
	}
	return driver.Status(), nil
}
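// A minimal sketch (not from the original sources) of how an executor typically drives
// SendStatusUpdate from its LaunchTask callback, assuming the mesos-go executor bindings
// are imported as exec and the protobuf types/helpers as mesos and proto. exampleExecutor
// and the placeholder work are hypothetical; only the TaskStatus construction and the
// driver call mirror the method above.
func (e *exampleExecutor) LaunchTask(driver exec.ExecutorDriver, taskInfo *mesos.TaskInfo) {
	// Report TASK_RUNNING before starting the real work.
	running := &mesos.TaskStatus{
		TaskId: taskInfo.GetTaskId(),
		State:  mesos.TaskState_TASK_RUNNING.Enum(),
	}
	if _, err := driver.SendStatusUpdate(running); err != nil {
		log.Errorf("failed to send TASK_RUNNING for %s: %v", taskInfo.GetTaskId().GetValue(), err)
		return
	}

	// ... perform the task's work here ...

	// Report a terminal state so the scheduler-side StatusUpdate handlers above fire.
	finished := &mesos.TaskStatus{
		TaskId:  taskInfo.GetTaskId(),
		State:   mesos.TaskState_TASK_FINISHED.Enum(),
		Message: proto.String("work completed"),
	}
	if _, err := driver.SendStatusUpdate(finished); err != nil {
		log.Errorf("failed to send TASK_FINISHED for %s: %v", taskInfo.GetTaskId().GetValue(), err)
	}
}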
// mesos.Scheduler interface method.
// Invoked when the status of a task has changed.
func (this *ElodinaTransportScheduler) StatusUpdate(driver scheduler.SchedulerDriver, status *mesos.TaskStatus) {
	if status.GetState() == mesos.TaskState_TASK_RUNNING {
		this.taskIdToTaskState[*status.TaskId.Value].pending = true
	} else if status.GetState() == mesos.TaskState_TASK_LOST ||
		status.GetState() == mesos.TaskState_TASK_FAILED ||
		status.GetState() == mesos.TaskState_TASK_FINISHED {
		this.TakenTopicPartitions.RemoveAll(this.taskIdToTaskState[*status.TaskId.Value].GetAssignment())
		delete(this.taskIdToTaskState, *status.TaskId.Value)
	}
}
// mesos.Scheduler interface method.
// Invoked when the status of a task has changed.
func (this *ElodinaTransportScheduler) StatusUpdate(driver scheduler.SchedulerDriver, status *mesos.TaskStatus) {
	log.Logger.Info("Received status %s for task %s", status.GetState().Enum(), status.TaskId.GetValue())

	if status.GetState() == mesos.TaskState_TASK_RUNNING {
		this.taskIdToTaskState[status.TaskId.GetValue()].pending = true
	} else if isTerminated(status.GetState()) {
		this.TakenTopicPartitions.RemoveAll(this.taskIdToTaskState[status.TaskId.GetValue()].GetAssignment())
		delete(this.taskIdToTaskState, status.TaskId.GetValue())
	}
}
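// isTerminated is referenced above but not defined in this excerpt. A plausible sketch,
// assuming it simply tests for the terminal Mesos task states; the exact set of states
// used by the original helper is an assumption.
func isTerminated(state mesos.TaskState) bool {
	switch state {
	case mesos.TaskState_TASK_FINISHED,
		mesos.TaskState_TASK_FAILED,
		mesos.TaskState_TASK_KILLED,
		mesos.TaskState_TASK_LOST,
		mesos.TaskState_TASK_ERROR:
		return true
	default:
		return false
	}
}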
// FetchLogs obtains the requested log file from the mesos-agent, resolving the sandbox
// directory according to the state reported in the task status.
func FetchLogs(status *mesos.TaskStatus, offset int, file string, frameworkId string) ([]byte, error) {
	var (
		dir      string
		hostname string
		err      error
	)

	switch status.GetState() {
	case mesos.TaskState_TASK_FAILED, mesos.TaskState_TASK_KILLED:
		hostDir, err := hostDirFromState(status, frameworkId)
		if err != nil {
			return nil, err
		}
		hostname, dir = hostDir.Host, hostDir.Dir
	default:
		hostDir, err := hostDirFromTaskStatus(status)
		if err != nil {
			return nil, err
		}
		hostname, dir = hostDir.Host, hostDir.Dir
	}

	url := fmt.Sprintf("http://%s:5051/files/read.json?path=%s/%s&offset=%d", hostname, dir, file, offset)
	bodyData, err := fetchUrl(url)
	if err != nil {
		return nil, err
	}

	var logData LogData
	err = json.Unmarshal(bodyData, &logData)
	if err != nil {
		return nil, err
	}

	return []byte(logData.Data), nil
}
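// A minimal sketch (not from the original sources) of calling FetchLogs from a
// status-update path to capture a failed task's stderr. The logStderrOnFailure name,
// the frameworkID parameter, and the choice of file/offset are assumptions for
// illustration only.
func logStderrOnFailure(status *mesos.TaskStatus, frameworkID string) {
	if status.GetState() != mesos.TaskState_TASK_FAILED {
		return
	}
	data, err := FetchLogs(status, 0, "stderr", frameworkID)
	if err != nil {
		log.Printf("failed to fetch stderr for task %s: %v", status.GetTaskId().GetValue(), err)
		return
	}
	log.Printf("stderr for failed task %s:\n%s", status.GetTaskId().GetValue(), data)
}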
func (ctx *RunOnceApplicationContext) StatusUpdate(driver scheduler.SchedulerDriver, status *mesos.TaskStatus) bool {
	ctx.lock.Lock()
	defer ctx.lock.Unlock()

	hostname := hostnameFromTaskID(status.GetTaskId().GetValue())
	ctx.updateTaskState(status)

	switch status.GetState() {
	case mesos.TaskState_TASK_RUNNING:
		log.Infof("Task %s received status update in state %s", status.GetTaskId().GetValue(), status.GetState().String())
	case mesos.TaskState_TASK_LOST, mesos.TaskState_TASK_FAILED, mesos.TaskState_TASK_ERROR:
		//TODO also kill all other running tasks sometime?
		ctx.StatusChan <- framework.NewApplicationRunStatus(ctx.Application, fmt.Errorf("Application %s failed to run on host %s with status %s: %s",
			ctx.Application.ID, hostname, status.GetState().String(), status.GetMessage()))
		return true
	case mesos.TaskState_TASK_FINISHED, mesos.TaskState_TASK_KILLED:
		if ctx.allTasksFinished() {
			ctx.StatusChan <- framework.NewApplicationRunStatus(ctx.Application, nil)
			return true
		}
	default:
		log.Warningf("Got unexpected task state %s", pretty.Status(status))
	}

	return false
}
func (sched *ExampleScheduler) StatusUpdate(driver sched.SchedulerDriver, status *mesos.TaskStatus) {
	log.Infoln("Status update: task", status.TaskId.GetValue(), " is in state ", status.State.Enum().String())

	if status.GetState() == mesos.TaskState_TASK_FINISHED {
		sched.tasksFinished++
		driver.ReviveOffers() // TODO(jdef) rate-limit this
	}

	if sched.tasksFinished >= sched.totalTasks {
		log.Infoln("Total tasks completed, stopping framework.")
		driver.Stop(false)
	}

	if status.GetState() == mesos.TaskState_TASK_LOST ||
		status.GetState() == mesos.TaskState_TASK_KILLED ||
		status.GetState() == mesos.TaskState_TASK_FAILED ||
		status.GetState() == mesos.TaskState_TASK_ERROR {
		sched.tasksErrored++
	}
}
func (sched *ExampleScheduler) StatusUpdate(driver sched.SchedulerDriver, status *mesos.TaskStatus) {
	log.Infoln("Status update: task", status.TaskId.GetValue(), " is in state ", status.State.Enum().String())

	if status.GetState() == mesos.TaskState_TASK_FINISHED {
		sched.tasksFinished++
	}

	if sched.tasksFinished >= sched.totalTasks {
		log.Infoln("Total tasks completed, stopping framework.")
		driver.Stop(false)
	}

	if status.GetState() == mesos.TaskState_TASK_LOST ||
		status.GetState() == mesos.TaskState_TASK_KILLED ||
		status.GetState() == mesos.TaskState_TASK_FAILED ||
		status.GetState() == mesos.TaskState_TASK_ERROR {
		log.Infoln(
			"Aborting because task", status.TaskId.GetValue(),
			"is in unexpected state", status.State.String(),
			"with message", status.GetMessage(),
		)
		driver.Abort()
	}
}
func (sched *ScraperScheduler) StatusUpdate(driver sched.SchedulerDriver, status *mesos.TaskStatus) {
	log.Infoln("Status update: task", status.TaskId.GetValue(), " is in state ", status.State.Enum().String())

	if status.GetState() == mesos.TaskState_TASK_FINISHED {
		sched.tasksFinished++
		log.Infof("%v of %v tasks finished.", sched.tasksFinished, sched.totalTasks)

		uri := string(status.Data)
		db, err := sql.Open("sqlite3", "./database.db")
		if err != nil {
			log.Fatalf("Failed to connect to database: %v\n", err)
		}
		defer db.Close()

		tx, err := db.Begin()
		if err != nil {
			log.Fatal(err)
		}
		stmt, err := tx.Prepare("insert into runs(uri, storage_path, last_scrape_time) values(?, ?, ?)")
		if err != nil {
			log.Fatal(err)
		}
		path := base64.StdEncoding.EncodeToString([]byte(uri))
		_, err = stmt.Exec(uri, path, time.Now().Unix())
		if err != nil {
			log.Fatal(err)
		}
		defer stmt.Close()
		tx.Commit()
	}

	if sched.tasksFinished >= sched.totalTasks {
		log.Infoln("Tasks that we know about are done!")
	}

	if status.GetState() == mesos.TaskState_TASK_LOST ||
		status.GetState() == mesos.TaskState_TASK_KILLED ||
		status.GetState() == mesos.TaskState_TASK_FAILED {
		log.Infoln(
			"Aborting because task", status.TaskId.GetValue(),
			"is in unexpected state", status.State.String(),
			"with message", status.GetMessage(),
		)
		driver.Abort()
	}
}
// reconcile an unknown (from the perspective of our registry) non-terminal task
func (k *KubernetesScheduler) reconcileNonTerminalTask(driver bindings.SchedulerDriver, taskStatus *mesos.TaskStatus) {
	// attempt to recover task from pod info:
	// - task data may contain an api.PodStatusResult; if status.reason == REASON_RECONCILIATION then status.data == nil
	// - the Name can be parsed by container.ParseFullName() to yield a pod Name and Namespace
	// - pull the pod metadata down from the api server
	// - perform task recovery based on pod metadata
	taskId := taskStatus.TaskId.GetValue()
	if taskStatus.GetReason() == mesos.TaskStatus_REASON_RECONCILIATION && taskStatus.GetSource() == mesos.TaskStatus_SOURCE_MASTER {
		// there will be no data in the task status that we can use to determine the associated pod
		switch taskStatus.GetState() {
		case mesos.TaskState_TASK_STAGING:
			// there is still hope for this task, don't kill it just yet
			//TODO(jdef) there should probably be a limit for how long we tolerate tasks stuck in this state
			return
		default:
			// for TASK_{STARTING,RUNNING} we should have already attempted to recoverTasks() for.
			// if the scheduler failed over before the executor fired TASK_STARTING, then we should *not*
			// be processing this reconciliation update before we process the one from the executor.
			// point: we don't know what this task is (perhaps there was unrecoverable metadata in the pod),
			// so it gets killed.
			log.Errorf("killing non-terminal, unrecoverable task %v", taskId)
		}
	} else if podStatus, err := podtask.ParsePodStatusResult(taskStatus); err != nil {
		// possible rogue pod exists at this point because we can't identify it; should kill the task
		log.Errorf("possible rogue pod; illegal task status data for task %v, expected an api.PodStatusResult: %v", taskId, err)
	} else if name, namespace, err := container.ParsePodFullName(podStatus.Name); err != nil {
		// possible rogue pod exists at this point because we can't identify it; should kill the task
		log.Errorf("possible rogue pod; illegal api.PodStatusResult, unable to parse full pod name from: '%v' for task %v: %v",
			podStatus.Name, taskId, err)
	} else if pod, err := k.client.Pods(namespace).Get(name); err == nil {
		if t, ok, err := podtask.RecoverFrom(*pod); ok {
			log.Infof("recovered task %v from metadata in pod %v/%v", taskId, namespace, name)
			_, err := k.taskRegistry.Register(t, nil)
			if err != nil {
				// someone beat us to it?!
				log.Warningf("failed to register recovered task: %v", err)
				return
			} else {
				k.taskRegistry.UpdateStatus(taskStatus)
			}
			return
		} else if err != nil {
			//should kill the pod and the task
			log.Errorf("killing pod, failed to recover task from pod %v/%v: %v", namespace, name, err)
			if err := k.client.Pods(namespace).Delete(name, nil); err != nil {
				log.Errorf("failed to delete pod %v/%v: %v", namespace, name, err)
			}
		} else {
			//this is pretty unexpected: we received a TASK_{STARTING,RUNNING} message, but the apiserver's pod
			//metadata is not appropriate for task reconstruction -- which should almost certainly never
			//be the case unless someone swapped out the pod on us (and kept the same namespace/name) while
			//we were failed over.
			//kill this task, allow the newly launched scheduler to schedule the new pod
			log.Warningf("unexpected pod metadata for task %v in apiserver, assuming new unscheduled pod spec: %+v", taskId, pod)
		}
	} else if errors.IsNotFound(err) {
		// pod lookup failed, should delete the task since the pod is no longer valid; may be redundant, that's ok
		log.Infof("killing task %v since pod %v/%v no longer exists", taskId, namespace, name)
	} else if errors.IsServerTimeout(err) {
		log.V(2).Infof("failed to reconcile task due to API server timeout: %v", err)
		return
	} else {
		log.Errorf("unexpected API server error, aborting reconcile for task %v: %v", taskId, err)
		return
	}

	if _, err := driver.KillTask(taskStatus.TaskId); err != nil {
		log.Errorf("failed to kill task %v: %v", taskId, err)
	}
}
func (s *EtcdScheduler) StatusUpdate(
	driver scheduler.SchedulerDriver,
	status *mesos.TaskStatus,
) {
	s.mut.Lock()
	defer s.mut.Unlock()

	log.Infoln(
		"Status update: task",
		status.TaskId.GetValue(),
		" is in state ",
		status.State.Enum().String(),
	)

	node, err := config.Parse(status.GetTaskId().GetValue())
	if err != nil {
		log.Errorf("scheduler: failed to unmarshal config.Node from TaskId: %s", err)
		return
	}
	node.SlaveID = status.SlaveId.GetValue()

	// record that we've heard about this task
	s.heardFrom[status.GetTaskId().GetValue()] = struct{}{}

	switch status.GetState() {
	case mesos.TaskState_TASK_LOST,
		mesos.TaskState_TASK_FINISHED,
		mesos.TaskState_TASK_KILLED,
		mesos.TaskState_TASK_ERROR,
		mesos.TaskState_TASK_FAILED:

		log.Errorf("Task contraction: %+v", status.GetState())
		log.Errorf("message: %s", status.GetMessage())
		log.Errorf("reason: %+v", status.GetReason())

		atomic.AddUint32(&s.Stats.FailedServers, 1)

		// TODO(tyler) kill this
		// Pump the brakes so that we have time to deconfigure the lost node
		// before adding a new one. If we don't deconfigure first, we risk
		// split brain.
		s.PumpTheBrakes()

		// now we know this task is dead
		delete(s.pending, node.Name)
		delete(s.running, node.Name)
		delete(s.tasks, node.Name)

		// We don't have to clean up the state in ZK for this
		// as it is fine to eventually just persist when we
		// receive a new TASK_RUNNING below.
		delete(s.reconciliationInfo, status.TaskId.GetValue())

		s.QueueLaunchAttempt()

		// TODO(tyler) do we want to lock if the first task fails?
		// TODO(tyler) can we handle a total loss at reconciliation time,
		//             when s.state == Immutable?
		if len(s.running) == 0 && s.state == Mutable {
			log.Error("TOTAL CLUSTER LOSS! LOCKING SCHEDULER, " +
				"FOLLOW RESTORATION GUIDE AT " +
				"https://github.com/mesosphere/" +
				"etcd-mesos/blob/master/docs/response.md")
			s.state = Immutable
		}
	case mesos.TaskState_TASK_STARTING:
	case mesos.TaskState_TASK_RUNNING:
		// We update data to ZK synchronously because it must happen
		// in-order. If we spun off a goroutine this would possibly retry
		// and succeed in the wrong order, and older data would win.
		// We keep this simple here, as if ZK is healthy this won't take long.
		// If this takes long, we're probably about to die anyway, as ZK is
		// displeased and mesos-go will panic when it loses contact.
		s.reconciliationInfo[status.TaskId.GetValue()] = status.SlaveId.GetValue()
		err = s.updateReconciliationInfoFunc(
			s.reconciliationInfo,
			s.ZkServers,
			s.ZkChroot,
			s.FrameworkName,
		)
		if err != nil {
			log.Errorf("Failed to persist reconciliation info: %+v", err)
		}

		delete(s.pending, node.Name)
		_, present := s.running[node.Name]
		if !present {
			s.running[node.Name] = node
			s.tasks[node.Name] = status.TaskId
		}

		// During reconciliation, we may find nodes with higher IDs due to ntp drift
		etcdIndexParts := strings.Split(node.Name, "-")
		if len(etcdIndexParts) != 2 {
			log.Warning("Task has a Name that does not follow the form etcd-<index>")
		} else {
			etcdIndex, err := strconv.ParseInt(etcdIndexParts[1], 10, 64)
			if err != nil {
				log.Warning("Task has a Name that does not follow the form etcd-<index>")
			} else {
				if etcdIndex > s.highestInstanceID {
					s.highestInstanceID = etcdIndex + 1
				}
			}
		}
	default:
		log.Warningf("Received unhandled task state: %+v", status.GetState())
	}
}
func (sched *ExampleScheduler) StatusUpdate(driver sched.SchedulerDriver, status *mesos.TaskStatus) {
	log.Infoln("Status update: task", status.TaskId.GetValue(), " is in state ", status.State.Enum().String())

	for _, task := range sched.tasks {
		if task.name == status.TaskId.GetValue() &&
			(status.GetState() == mesos.TaskState_TASK_FINISHED ||
				status.GetState() == mesos.TaskState_TASK_LOST ||
				status.GetState() == mesos.TaskState_TASK_KILLED ||
				status.GetState() == mesos.TaskState_TASK_FAILED ||
				status.GetState() == mesos.TaskState_TASK_ERROR) {
			// No matter what the outcome was, move to finished state so that we can unreserve resources
			task.state = FinishedState
		}
	}

	if status.GetState() == mesos.TaskState_TASK_LOST ||
		status.GetState() == mesos.TaskState_TASK_KILLED ||
		status.GetState() == mesos.TaskState_TASK_FAILED ||
		status.GetState() == mesos.TaskState_TASK_ERROR {
		log.Infoln(
			"Task", status.TaskId.GetValue(),
			"is in unexpected state", status.State.String(),
			"with message", status.GetMessage(),
			". Unreserving resources",
		)
	}
}
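// A minimal sketch (not from the original sources) of how schedulers like the ones above
// are wired into mesos-go so that their StatusUpdate callbacks fire, assuming the scheduler
// bindings are imported as sched and the protobuf types/helpers as mesos and proto. The
// master address, framework name, and newExampleScheduler constructor are hypothetical.
func runFramework() error {
	driver, err := sched.NewMesosSchedulerDriver(sched.DriverConfig{
		Scheduler: newExampleScheduler(), // any type implementing the sched.Scheduler interface
		Framework: &mesos.FrameworkInfo{
			User: proto.String(""), // Mesos fills in the current user when left empty
			Name: proto.String("example-framework"),
		},
		Master: "127.0.0.1:5050",
	})
	if err != nil {
		return err
	}
	// Run blocks; Mesos invokes StatusUpdate on the registered scheduler as task states change.
	_, err = driver.Run()
	return err
}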