func (sched *ExampleScheduler) StatusUpdate(driver sched.SchedulerDriver, status *mesos.TaskStatus) {
	log.Infoln("Status update: task", status.TaskId.GetValue(), " is in state ", status.State.Enum().String())
	for _, task := range sched.tasks {
		if task.name == status.TaskId.GetValue() &&
			(status.GetState() == mesos.TaskState_TASK_FINISHED ||
				status.GetState() == mesos.TaskState_TASK_LOST ||
				status.GetState() == mesos.TaskState_TASK_KILLED ||
				status.GetState() == mesos.TaskState_TASK_FAILED ||
				status.GetState() == mesos.TaskState_TASK_ERROR) {
			// No matter what the outcome was, move to finished state so that we can unreserve resources
			task.state = FinishedState
		}
	}
	if status.GetState() == mesos.TaskState_TASK_LOST ||
		status.GetState() == mesos.TaskState_TASK_KILLED ||
		status.GetState() == mesos.TaskState_TASK_FAILED ||
		status.GetState() == mesos.TaskState_TASK_ERROR {
		log.Infoln(
			"Task", status.TaskId.GetValue(),
			"is in unexpected state", status.State.String(),
			"with message", status.GetMessage(),
			". Unreserving resources",
		)
	}
}
func (sched *SdcScheduler) StatusUpdate(driver sched.SchedulerDriver, status *mesos.TaskStatus) {
	log.Infoln("Status update: task", status.TaskId.GetValue(), " is in state ", status.State.Enum().String())
	if status.GetState() == mesos.TaskState_TASK_FINISHED {
		sched.tasksFinished++
		// Calling KillTask here causes TASK_LOST to be detected and stops the framework.
		// driver.KillTask(status.TaskId)
		// log.Infoln("!! Status update: task", status.TaskId.GetValue(), " is in state ", status.State.Enum().String())
		// return
	}
	if sched.tasksFinished >= sched.totalTasks {
		// log.Infoln("Total tasks completed, stopping framework.")
		log.Infoln("Total tasks completed.")
		sched.tasksFinished = 0
		sched.totalTasks = 0
		sched.tasksLaunched = 0
		// driver.Stop(false)
	}
	if status.GetState() == mesos.TaskState_TASK_LOST ||
		status.GetState() == mesos.TaskState_TASK_KILLED ||
		status.GetState() == mesos.TaskState_TASK_FAILED ||
		status.GetState() == mesos.TaskState_TASK_ERROR {
		log.Infoln(
			"Aborting because task", status.TaskId.GetValue(),
			"is in unexpected state", status.State.String(),
			"with message", status.GetMessage(),
		)
		driver.Abort()
	}
}
func (sched *MesosRunonceScheduler) StatusUpdate(driver sched.SchedulerDriver, status *mesos.TaskStatus) {
	log.V(1).Infoln("Status update: task", status.TaskId.GetValue(), " is in state ", status.State.Enum().String())
	eventCh <- status
	if status.GetState() == mesos.TaskState_TASK_FINISHED {
		sched.tasksFinished++
	}
	if sched.tasksFinished >= sched.totalTasks {
		log.V(1).Infoln("Total tasks completed, stopping framework.")
		driver.Stop(false)
	}
	if status.GetState() == mesos.TaskState_TASK_LOST ||
		status.GetState() == mesos.TaskState_TASK_KILLED ||
		status.GetState() == mesos.TaskState_TASK_FAILED ||
		status.GetState() == mesos.TaskState_TASK_ERROR {
		exitStatus = 1
		log.Warningf("mesos TaskStatus: %v", status)
		driver.Stop(false)
		log.Errorln(
			"Aborting because task", status.TaskId.GetValue(),
			"is in unexpected state", status.State.String(),
			"with message", status.GetMessage(),
		)
	}
}
func (sched *Scheduler) StatusUpdate(driver sched.SchedulerDriver, status *mesos.TaskStatus) {
	if glog.V(1) {
		glog.Infof("status update from task %s in state %s under executor %s on slave %s: %s",
			status.GetTaskId().GetValue(),
			status.GetState(),
			status.GetExecutorId().GetValue(),
			status.GetSlaveId().GetValue(),
			status.GetMessage(),
		)
	}
}
func statusString(status *mesos.TaskStatus) string {
	s := fmt.Sprintf("%s %s slave: %s",
		status.GetTaskId().GetValue(),
		status.GetState().String(),
		idString(status.GetSlaveId().GetValue()))
	if status.GetState() != mesos.TaskState_TASK_RUNNING {
		s += " reason: " + status.GetReason().String()
	}
	if status.GetMessage() != "" {
		s += " message: " + status.GetMessage()
	}
	return s
}
func (sched *ScraperScheduler) StatusUpdate(driver sched.SchedulerDriver, status *mesos.TaskStatus) {
	log.Infoln("Status update: task", status.TaskId.GetValue(), " is in state ", status.State.Enum().String())
	if status.GetState() == mesos.TaskState_TASK_FINISHED {
		sched.tasksFinished++
		log.Infof("%v of %v tasks finished.", sched.tasksFinished, sched.totalTasks)
		uri := string(status.Data)
		db, err := sql.Open("sqlite3", "./database.db")
		if err != nil {
			log.Fatalf("Failed to connect to database: %v\n", err)
		}
		defer db.Close()
		tx, err := db.Begin()
		if err != nil {
			log.Fatal(err)
		}
		stmt, err := tx.Prepare("insert into runs(uri, storage_path, last_scrape_time) values(?, ?, ?)")
		if err != nil {
			log.Fatal(err)
		}
		defer stmt.Close()
		path := base64.StdEncoding.EncodeToString([]byte(uri))
		_, err = stmt.Exec(uri, path, time.Now().Unix())
		if err != nil {
			log.Fatal(err)
		}
		tx.Commit()
	}
	if sched.tasksFinished >= sched.totalTasks {
		log.Infoln("Tasks that we know about are done!")
	}
	if status.GetState() == mesos.TaskState_TASK_LOST ||
		status.GetState() == mesos.TaskState_TASK_KILLED ||
		status.GetState() == mesos.TaskState_TASK_FAILED {
		log.Infoln(
			"Aborting because task", status.TaskId.GetValue(),
			"is in unexpected state", status.State.String(),
			"with message", status.GetMessage(),
		)
		driver.Abort()
	}
}
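The insert in ScraperScheduler.StatusUpdate assumes a runs table already exists in ./database.db. A minimal sketch of creating that table, assuming the mattn/go-sqlite3 driver (which registers the "sqlite3" name used above); the column names are taken from the insert statement, while the column types are an assumption:

package main

import (
	"database/sql"
	"log"

	_ "github.com/mattn/go-sqlite3" // registers the "sqlite3" driver name used by sql.Open above
)

func main() {
	db, err := sql.Open("sqlite3", "./database.db")
	if err != nil {
		log.Fatalf("failed to open database: %v", err)
	}
	defer db.Close()

	// Column names match the insert in ScraperScheduler.StatusUpdate;
	// the types (text/integer) are assumed, not taken from the source.
	_, err = db.Exec(`create table if not exists runs (
		uri text,
		storage_path text,
		last_scrape_time integer
	)`)
	if err != nil {
		log.Fatalf("failed to create runs table: %v", err)
	}
}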
func (k *framework) reconcileTerminalTask(driver bindings.SchedulerDriver, taskStatus *mesos.TaskStatus) {
	task, state := k.sched.Tasks().UpdateStatus(taskStatus)

	if (state == podtask.StateRunning || state == podtask.StatePending) &&
		((taskStatus.GetSource() == mesos.TaskStatus_SOURCE_MASTER && taskStatus.GetReason() == mesos.TaskStatus_REASON_RECONCILIATION) ||
			(taskStatus.GetSource() == mesos.TaskStatus_SOURCE_SLAVE && taskStatus.GetReason() == mesos.TaskStatus_REASON_EXECUTOR_TERMINATED) ||
			(taskStatus.GetSource() == mesos.TaskStatus_SOURCE_SLAVE && taskStatus.GetReason() == mesos.TaskStatus_REASON_EXECUTOR_UNREGISTERED) ||
			(taskStatus.GetSource() == mesos.TaskStatus_SOURCE_EXECUTOR && taskStatus.GetMessage() == messages.ContainersDisappeared) ||
			(taskStatus.GetSource() == mesos.TaskStatus_SOURCE_EXECUTOR && taskStatus.GetMessage() == messages.KubeletPodLaunchFailed) ||
			(taskStatus.GetSource() == mesos.TaskStatus_SOURCE_EXECUTOR && taskStatus.GetMessage() == messages.TaskKilled && !task.Has(podtask.Deleted))) {
		//--
		// pod-task has metadata that refers to:
		// (1) a task that Mesos no longer knows about, or else
		// (2) a pod that the Kubelet will never report as "failed"
		// (3) a pod that the kubeletExecutor reported as lost (likely due to docker daemon crash/restart)
		// (4) a pod that the kubeletExecutor reported as lost because the kubelet didn't manage to launch it (in time)
		// (5) a pod that the kubeletExecutor killed, but the scheduler didn't ask for that (maybe killed by the master)
		// For now, destroy the pod and hope that there's a replication controller backing it up.
		// TODO(jdef) for case #2 don't delete the pod, just update its status to Failed
		pod := &task.Pod
		log.Warningf("deleting rogue pod %v/%v for lost task %v", pod.Namespace, pod.Name, task.ID)
		if err := k.client.Core().Pods(pod.Namespace).Delete(pod.Name, api.NewDeleteOptions(0)); err != nil && !errors.IsNotFound(err) {
			log.Errorf("failed to delete pod %v/%v for terminal task %v: %v", pod.Namespace, pod.Name, task.ID, err)
		}
	} else if taskStatus.GetReason() == mesos.TaskStatus_REASON_EXECUTOR_TERMINATED ||
		taskStatus.GetReason() == mesos.TaskStatus_REASON_EXECUTOR_UNREGISTERED {
		// attempt to prevent dangling pods in the pod and task registries
		log.V(1).Infof("request explicit reconciliation to clean up for task %v after executor reported (terminated/unregistered)", taskStatus.TaskId.GetValue())
		k.tasksReconciler.RequestExplicit()
	} else if taskStatus.GetState() == mesos.TaskState_TASK_LOST && state == podtask.StateRunning &&
		taskStatus.ExecutorId != nil && taskStatus.SlaveId != nil {
		//TODO(jdef) this may not be meaningful once we have proper checkpointing and master detection
		//If we're reconciling and receive this then the executor may be
		//running a task that we need it to kill. It's possible that the framework
		//is unrecognized by the master at this point, so KillTask is not guaranteed
		//to do anything. The underlying driver transport may be able to send a
		//FrameworkMessage directly to the slave to terminate the task.
		log.V(2).Infof("forwarding TASK_LOST message to executor %v on slave %v", taskStatus.ExecutorId, taskStatus.SlaveId)
		data := fmt.Sprintf("%s:%s", messages.TaskLost, task.ID) //TODO(jdef) use a real message type
		if _, err := driver.SendFrameworkMessage(taskStatus.ExecutorId, taskStatus.SlaveId, data); err != nil {
			log.Error(err.Error())
		}
	}
}
func Status(status *mesos.TaskStatus) string {
	var buffer bytes.Buffer
	buffer.WriteString(fmt.Sprintf("%s %s", status.GetTaskId().GetValue(), status.GetState().String()))
	if status.GetSlaveId() != nil && status.GetSlaveId().GetValue() != "" {
		buffer.WriteString(" slave: ")
		buffer.WriteString(ID(status.GetSlaveId().GetValue()))
	}
	if status.GetState() != mesos.TaskState_TASK_RUNNING {
		buffer.WriteString(" reason: ")
		buffer.WriteString(status.GetReason().String())
	}
	if status.GetMessage() != "" {
		buffer.WriteString(" message: ")
		buffer.WriteString(status.GetMessage())
	}
	return buffer.String()
}
func (sched *ExampleScheduler) StatusUpdate(driver sched.SchedulerDriver, status *mesos.TaskStatus) {
	log.Infoln("Status update: task", status.TaskId.GetValue(), " is in state ", status.State.Enum().String())
	if status.GetState() == mesos.TaskState_TASK_FINISHED {
		sched.tasksFinished++
	}
	if sched.tasksFinished >= sched.totalTasks {
		log.Infoln("Total tasks completed, stopping framework.")
		driver.Stop(false)
	}
	if status.GetState() == mesos.TaskState_TASK_LOST ||
		status.GetState() == mesos.TaskState_TASK_KILLED ||
		status.GetState() == mesos.TaskState_TASK_FAILED {
		log.Infoln(
			"Aborting because task", status.TaskId.GetValue(),
			"is in unexpected state", status.State.String(),
			"with message", status.GetMessage(),
		)
		driver.Abort()
	}
}
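For context, a scheduler such as ExampleScheduler is handed to the driver at startup, and the driver then invokes StatusUpdate (and the other callbacks) as events arrive. A minimal wiring sketch, assuming the classic mesos-go v0 bindings; the import paths, master address, and the fact that ExampleScheduler implements the full sched.Scheduler interface are assumptions, not taken from the source:

package main

import (
	"log"

	"github.com/gogo/protobuf/proto"
	mesos "github.com/mesos/mesos-go/mesosproto"
	sched "github.com/mesos/mesos-go/scheduler"
)

func main() {
	// Framework metadata registered with the Mesos master.
	fwinfo := &mesos.FrameworkInfo{
		User: proto.String(""), // Mesos fills in the current user when left empty
		Name: proto.String("example-framework"),
	}

	// ExampleScheduler must implement the full sched.Scheduler interface;
	// StatusUpdate (above) is only one of its callbacks.
	config := sched.DriverConfig{
		Scheduler: &ExampleScheduler{},
		Framework: fwinfo,
		Master:    "127.0.0.1:5050", // assumed master address
	}

	driver, err := sched.NewMesosSchedulerDriver(config)
	if err != nil {
		log.Fatalf("unable to create scheduler driver: %v", err)
	}
	// Run blocks until the driver stops or aborts, e.g. via the
	// driver.Stop(false) / driver.Abort() calls made in StatusUpdate.
	if stat, err := driver.Run(); err != nil {
		log.Fatalf("framework stopped with status %s and error: %v", stat.String(), err)
	}
}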
func (s *EtcdScheduler) StatusUpdate(
	driver scheduler.SchedulerDriver,
	status *mesos.TaskStatus,
) {
	s.mut.Lock()
	defer s.mut.Unlock()
	log.Infoln(
		"Status update: task", status.TaskId.GetValue(),
		" is in state ", status.State.Enum().String(),
	)

	node, err := config.Parse(status.GetTaskId().GetValue())
	if err != nil {
		log.Errorf("scheduler: failed to unmarshal config.Node from TaskId: %s", err)
		return
	}
	node.SlaveID = status.SlaveId.GetValue()

	// record that we've heard about this task
	s.heardFrom[status.GetTaskId().GetValue()] = struct{}{}

	switch status.GetState() {
	case mesos.TaskState_TASK_LOST,
		mesos.TaskState_TASK_FINISHED,
		mesos.TaskState_TASK_KILLED,
		mesos.TaskState_TASK_ERROR,
		mesos.TaskState_TASK_FAILED:
		log.Errorf("Task contraction: %+v", status.GetState())
		log.Errorf("message: %s", status.GetMessage())
		log.Errorf("reason: %+v", status.GetReason())

		atomic.AddUint32(&s.Stats.FailedServers, 1)

		// TODO(tyler) kill this
		// Pump the brakes so that we have time to deconfigure the lost node
		// before adding a new one. If we don't deconfigure first, we risk
		// split brain.
		s.PumpTheBrakes()

		// now we know this task is dead
		delete(s.pending, node.Name)
		delete(s.running, node.Name)
		delete(s.tasks, node.Name)

		// We don't have to clean up the state in ZK for this
		// as it is fine to eventually just persist when we
		// receive a new TASK_RUNNING below.
		delete(s.reconciliationInfo, status.TaskId.GetValue())

		s.QueueLaunchAttempt()

		// TODO(tyler) do we want to lock if the first task fails?
		// TODO(tyler) can we handle a total loss at reconciliation time,
		// when s.state == Immutable?
		if len(s.running) == 0 && s.state == Mutable {
			log.Error("TOTAL CLUSTER LOSS! LOCKING SCHEDULER, " +
				"FOLLOW RESTORATION GUIDE AT " +
				"https://github.com/mesosphere/" +
				"etcd-mesos/blob/master/docs/response.md")
			s.state = Immutable
		}
	case mesos.TaskState_TASK_STARTING:
	case mesos.TaskState_TASK_RUNNING:
		// We update data to ZK synchronously because it must happen
		// in-order. If we spun off a goroutine this would possibly retry
		// and succeed in the wrong order, and older data would win.
		// We keep this simple here, as if ZK is healthy this won't take long.
		// If this takes long, we're probably about to die anyway, as ZK is
		// displeased and mesos-go will panic when it loses contact.
		s.reconciliationInfo[status.TaskId.GetValue()] = status.SlaveId.GetValue()
		err = s.updateReconciliationInfoFunc(
			s.reconciliationInfo,
			s.ZkServers,
			s.ZkChroot,
			s.FrameworkName,
		)
		if err != nil {
			log.Errorf("Failed to persist reconciliation info: %+v", err)
		}

		delete(s.pending, node.Name)
		_, present := s.running[node.Name]
		if !present {
			s.running[node.Name] = node
			s.tasks[node.Name] = status.TaskId
		}

		// During reconciliation, we may find nodes with higher IDs due to ntp drift
		etcdIndexParts := strings.Split(node.Name, "-")
		if len(etcdIndexParts) != 2 {
			log.Warning("Task has a Name that does not follow the form etcd-<index>")
		} else {
			etcdIndex, err := strconv.ParseInt(etcdIndexParts[1], 10, 64)
			if err != nil {
				log.Warning("Task has a Name that does not follow the form etcd-<index>")
			} else {
				if etcdIndex > s.highestInstanceID {
					s.highestInstanceID = etcdIndex + 1
				}
			}
		}
	default:
		log.Warningf("Received unhandled task state: %+v", status.GetState())
	}
}
func (ctx *RunOnceApplicationContext) StatusUpdate(driver scheduler.SchedulerDriver, status *mesos.TaskStatus) bool {
	ctx.lock.Lock()
	defer ctx.lock.Unlock()

	hostname := hostnameFromTaskID(status.GetTaskId().GetValue())
	ctx.updateTaskState(status)

	switch status.GetState() {
	case mesos.TaskState_TASK_RUNNING:
		log.Infof("Task %s received status update in state %s", status.GetTaskId().GetValue(), status.GetState().String())
	case mesos.TaskState_TASK_LOST, mesos.TaskState_TASK_FAILED, mesos.TaskState_TASK_ERROR:
		//TODO also kill all other running tasks sometime?
		ctx.StatusChan <- framework.NewApplicationRunStatus(ctx.Application, fmt.Errorf("Application %s failed to run on host %s with status %s: %s",
			ctx.Application.ID, hostname, status.GetState().String(), status.GetMessage()))
		return true
	case mesos.TaskState_TASK_FINISHED, mesos.TaskState_TASK_KILLED:
		if ctx.allTasksFinished() {
			ctx.StatusChan <- framework.NewApplicationRunStatus(ctx.Application, nil)
			return true
		}
	default:
		log.Warningf("Got unexpected task state %s", pretty.Status(status))
	}

	return false
}