Exemple #1
0
func (k *inMemoryRegistry) UpdateStatus(status *mesos.TaskStatus) (*T, StateType) {
	taskId := status.GetTaskId().GetValue()

	k.rw.Lock()
	defer k.rw.Unlock()
	task, state := k._get(taskId)

	switch status.GetState() {
	case mesos.TaskState_TASK_STAGING:
		k.handleTaskStaging(task, state, status)
	case mesos.TaskState_TASK_STARTING:
		k.handleTaskStarting(task, state, status)
	case mesos.TaskState_TASK_RUNNING:
		k.handleTaskRunning(task, state, status)
	case mesos.TaskState_TASK_FINISHED:
		k.handleTaskFinished(task, state, status)
	case mesos.TaskState_TASK_FAILED:
		k.handleTaskFailed(task, state, status)
	case mesos.TaskState_TASK_ERROR:
		k.handleTaskError(task, state, status)
	case mesos.TaskState_TASK_KILLED:
		k.handleTaskKilled(task, state, status)
	case mesos.TaskState_TASK_LOST:
		k.handleTaskLost(task, state, status)
	default:
		log.Warningf("unhandled status update for task: %v", taskId)
	}
	return task.Clone(), state
}
Exemple #2
0
func (sched *NoneScheduler) StatusUpdate(driver sched.SchedulerDriver, status *mesos.TaskStatus) {
	taskId := status.GetTaskId().GetValue()
	log.Infoln("Status update: task", taskId, "is in state", status.State.Enum().String())

	c := sched.queue.GetCommandById(taskId)
	if c == nil {
		log.Errorln("Unable to find command for task", taskId)
		driver.Abort()
	}
	if c.Status.GetState() == status.GetState() {
		// ignore repeated status updates
		return
	}
	c.Status = status

	// send status update to CommandHandler
	if status.GetState() == mesos.TaskState_TASK_RUNNING {
		sched.handler.CommandRunning(c)
	} else if status.GetState() == mesos.TaskState_TASK_FINISHED {
		sched.handler.CommandEnded(c)
		sched.handler.CommandFinished(c)
	} else if status.GetState() == mesos.TaskState_TASK_FAILED ||
		status.GetState() == mesos.TaskState_TASK_LOST ||
		status.GetState() == mesos.TaskState_TASK_KILLED {
		sched.handler.CommandEnded(c)
		sched.handler.CommandFailed(c)
	}

	// stop if Commands channel was closed and all tasks are finished
	if sched.queue.Closed() && !sched.handler.HasRunningTasks() {
		log.Infoln("All tasks finished, stopping framework.")
		sched.handler.FinishAllCommands()
		driver.Stop(false)
	}
}
Exemple #3
0
func (sched *Scheduler) StatusUpdate(driver sched.SchedulerDriver, status *mesos.TaskStatus) {
	log.Infoln("Status update: task", status.TaskId.GetValue(), " is in state ", status.State.Enum().String())

	if status.GetState() == mesos.TaskState_TASK_FINISHED {
		sched.tasksFinished++
		log.Infoln("%v of %v tasks finished.", sched.tasksFinished, sched.totalTasks)
	}

	//TODO if a job is finished, failed, error, lost, killed
	// figure out how this impacts dependent jobs and update job graph

	/*
		  //never shut down framework!
			if sched.tasksFinished >= sched.totalTasks {
				log.Infoln("Total tasks completed, stopping framework.")
				driver.Stop(false)
			}
	*/

	/*
		if status.GetState() == mesos.TaskState_TASK_LOST ||
			status.GetState() == mesos.TaskState_TASK_KILLED ||
			status.GetState() == mesos.TaskState_TASK_FAILED {
			log.Infoln(
				"Aborting because task", status.TaskId.GetValue(),
				"is in unexpected state", status.State.String(),
				"with message", status.GetMessage(),
			)
			driver.Abort()
		}
	*/
}
Exemple #4
0
// mesos.Scheduler interface method.
// Invoked when the status of a task has changed.
func (this *TransformScheduler) StatusUpdate(driver scheduler.SchedulerDriver, status *mesos.TaskStatus) {
	fmt.Printf("Status update: task %s is in state %s\n", status.TaskId.GetValue(), status.State.Enum().String())

	if status.GetState() == mesos.TaskState_TASK_LOST || status.GetState() == mesos.TaskState_TASK_FAILED || status.GetState() == mesos.TaskState_TASK_FINISHED {
		this.removeTask(status.GetTaskId())
		this.decRunningInstances()
	}
}
func (sched *Scheduler) StatusUpdate(driver sched.SchedulerDriver, status *mesos.TaskStatus) {
	log.Infoln("Status update: task", status.TaskId.GetValue(), " is in state ", status.State.Enum().String())

	if status.GetState() == mesos.TaskState_TASK_FINISHED {
		sched.tasksFinished++
		log.Infoln("%v of %v tasks finished.", sched.tasksFinished, sched.totalTasks)
	}
}
Exemple #6
0
// StatusUpdate is called when a status update message is sent to the scheduler.
func (k *KubernetesScheduler) StatusUpdate(driver bindings.SchedulerDriver, taskStatus *mesos.TaskStatus) {

	source, reason := "none", "none"
	if taskStatus.Source != nil {
		source = (*taskStatus.Source).String()
	}
	if taskStatus.Reason != nil {
		reason = (*taskStatus.Reason).String()
	}
	taskState := taskStatus.GetState()
	metrics.StatusUpdates.WithLabelValues(source, reason, taskState.String()).Inc()

	log.Infof(
		"task status update %q from %q for task %q on slave %q executor %q for reason %q",
		taskState.String(),
		source,
		taskStatus.TaskId.GetValue(),
		taskStatus.SlaveId.GetValue(),
		taskStatus.ExecutorId.GetValue(),
		reason)

	switch taskState {
	case mesos.TaskState_TASK_RUNNING, mesos.TaskState_TASK_FINISHED, mesos.TaskState_TASK_STARTING, mesos.TaskState_TASK_STAGING:
		if _, state := k.taskRegistry.UpdateStatus(taskStatus); state == podtask.StateUnknown {
			if taskState != mesos.TaskState_TASK_FINISHED {
				//TODO(jdef) what if I receive this after a TASK_LOST or TASK_KILLED?
				//I don't want to reincarnate then..  TASK_LOST is a special case because
				//the master is stateless and there are scenarios where I may get TASK_LOST
				//followed by TASK_RUNNING.
				//TODO(jdef) consider running this asynchronously since there are API server
				//calls that may be made
				k.reconcileNonTerminalTask(driver, taskStatus)
			} // else, we don't really care about FINISHED tasks that aren't registered
			return
		}
		if _, exists := k.slaves.getSlave(taskStatus.GetSlaveId().GetValue()); !exists {
			// a registered task has an update reported by a slave that we don't recognize.
			// this should never happen! So we don't reconcile it.
			log.Errorf("Ignore status %+v because the slave does not exist", taskStatus)
			return
		}
	case mesos.TaskState_TASK_FAILED:
		if task, _ := k.taskRegistry.UpdateStatus(taskStatus); task != nil {
			if task.Has(podtask.Launched) && !task.Has(podtask.Bound) {
				go k.plugin.reconcilePod(task.Pod)
				return
			}
		} else {
			// unknown task failed, not much we can do about it
			return
		}
		// last-ditch effort to reconcile our records
		fallthrough
	case mesos.TaskState_TASK_LOST, mesos.TaskState_TASK_KILLED:
		k.reconcileTerminalTask(driver, taskStatus)
	}
}
func (ctx *RunOnceApplicationContext) updateTaskState(status *mesos.TaskStatus) {
	for _, task := range ctx.tasks {
		if task.TaskID == status.GetTaskId().GetValue() {
			task.State = status.GetState()
			return
		}
	}

	framework.Logger.Warn("Got unexpected status update for unknown task with ID %s", status.GetTaskId().GetValue())
}
Exemple #8
0
func (sched *Scheduler) StatusUpdate(driver sched.SchedulerDriver, status *mesos.TaskStatus) {
	if glog.V(1) {
		glog.Infof("status update from task %s in state %s under executor %s on slave %s: %s",
			status.GetTaskId().GetValue(),
			status.GetState(),
			status.GetExecutorId().GetValue(),
			status.GetSlaveId().GetValue(),
			status.GetMessage(),
		)
	}
}
Exemple #9
0
// StatusUpdate handles status updates messages received from Mesos master
//
// Currently this method only logs status updates. This might change in the future
func (bw *BasicWorker) StatusUpdate(driver scheduler.SchedulerDriver, status *mesos.TaskStatus) {
	taskId := status.TaskId.GetValue()
	taskStatus := status.GetState()
	log.Println("Task", taskId, "is in state", taskStatus.String())

	switch taskStatus {
	case mesos.TaskState_TASK_RUNNING:
		log.Printf("Marking task %s as %s", taskId, taurus.RUNNING)
	case mesos.TaskState_TASK_KILLED, mesos.TaskState_TASK_FINISHED,
		mesos.TaskState_TASK_FAILED, mesos.TaskState_TASK_LOST:
		log.Printf("Marking task %s as %s", taskId, taurus.STOPPED)
	}
}
Exemple #10
0
func statusString(status *mesos.TaskStatus) string {
	s := fmt.Sprintf("%s %s slave: %s", status.GetTaskId().GetValue(), status.GetState().String(), idString(status.GetSlaveId().GetValue()))

	if status.GetState() != mesos.TaskState_TASK_RUNNING {
		s += " reason: " + status.GetReason().String()
	}

	if status.GetMessage() != "" {
		s += " message: " + status.GetMessage()
	}

	return s
}
Exemple #11
0
func (sched *SdcScheduler) StatusUpdate(driver sched.SchedulerDriver, status *mesos.TaskStatus) {
	log.Infoln("Status update: task", status.TaskId.GetValue(), " is in state ", status.State.Enum().String())

	if status.GetState() == mesos.TaskState_TASK_FINISHED {
		sched.tasksFinished++
		// KillTaskを実行するとTASK_LOSTが検知され、フレームワークが止まる
		// driver.KillTask(status.TaskId)
		// log.Infoln("!! Status update: task", status.TaskId.GetValue(), " is in state ", status.State.Enum().String())
		// return
	}

	if sched.tasksFinished >= sched.totalTasks {
		// log.Infoln("Total tasks completed, stopping framework.")
		log.Infoln("Total tasks completed.")
		sched.tasksFinished = 0
		sched.totalTasks = 0
		sched.tasksLaunched = 0
		// driver.Stop(false)
	}

	if status.GetState() == mesos.TaskState_TASK_LOST ||
		status.GetState() == mesos.TaskState_TASK_KILLED ||
		status.GetState() == mesos.TaskState_TASK_FAILED ||
		status.GetState() == mesos.TaskState_TASK_ERROR {
		log.Infoln(
			"Aborting because task", status.TaskId.GetValue(),
			"is in unexpected state", status.State.String(),
			"with message", status.GetMessage(),
		)
		driver.Abort()
	}
}
Exemple #12
0
func (sched *MesosRunonceScheduler) StatusUpdate(driver sched.SchedulerDriver, status *mesos.TaskStatus) {
	log.V(1).Infoln("Status update: task", status.TaskId.GetValue(), " is in state ", status.State.Enum().String())
	eventCh <- status

	if status.GetState() == mesos.TaskState_TASK_FINISHED {
		sched.tasksFinished++
	}

	if sched.tasksFinished >= sched.totalTasks {
		log.V(1).Infoln("Total tasks completed, stopping framework.")
		driver.Stop(false)
	}

	if status.GetState() == mesos.TaskState_TASK_LOST ||
		status.GetState() == mesos.TaskState_TASK_KILLED ||
		status.GetState() == mesos.TaskState_TASK_FAILED ||
		status.GetState() == mesos.TaskState_TASK_ERROR {
		exitStatus = 1
		log.Warningf("mesos TaskStatus: %v", status)
		driver.Stop(false)
		log.Errorln(
			"Aborting because task", status.TaskId.GetValue(),
			"is in unexpected state", status.State.String(),
			"with message.", status.GetMessage(),
		)
	}
}
Exemple #13
0
func testErrorStatus(t *testing.T, ctx *RunOnceApplicationContext, driver *mesostest.MockSchedulerDriver, status *mesos.TaskStatus) {
	appDone := ctx.StatusUpdate(driver, status)
	So(appDone, ShouldBeTrue)

	select {
	case sts := <-ctx.StatusChan:
		So(sts.Error, ShouldNotBeNil)
		So(sts.Error.Error(), ShouldContainSubstring, "failed to run on host")
	default:
		t.Fail()
	}

	So(ctx.tasks[0].State, ShouldEqual, status.GetState())
}
Exemple #14
0
func (s *StackDeployScheduler) StatusUpdate(driver scheduler.SchedulerDriver, status *mesos.TaskStatus) {
	Logger.Info("[StatusUpdate] %s", pretty.Status(status))

	if status.GetState() == mesos.TaskState_TASK_FINISHED {
		driver.ReviveOffers()
	}

	for _, runner := range MesosTaskRunners {
		if runner.StatusUpdate(driver, status) {
			return
		}
	}

	Logger.Warn("Received status update that was not handled by any Mesos Task Runner: %s", pretty.Status(status))
}
func (s *Scheduler) StatusUpdate(driver scheduler.SchedulerDriver, status *mesos.TaskStatus) {
	Logger.Infof("[StatusUpdate] %s", pretty.Status(status))

	id := s.idFromTaskId(status.GetTaskId().GetValue())

	switch status.GetState() {
	case mesos.TaskState_TASK_RUNNING:
		s.onTaskStarted(id, status)
	case mesos.TaskState_TASK_LOST, mesos.TaskState_TASK_FAILED, mesos.TaskState_TASK_ERROR:
		s.onTaskFailed(id, status)
	case mesos.TaskState_TASK_FINISHED, mesos.TaskState_TASK_KILLED:
		s.onTaskFinished(id, status)
	default:
		Logger.Warnf("Got unexpected task state %s for task %s", pretty.Status(status), id)
	}

	s.cluster.Save()
}
Exemple #16
0
func (s *Scheduler) StatusUpdate(driver scheduler.SchedulerDriver, status *mesos.TaskStatus) {
	Logger.Infof("[StatusUpdate] %s", statusString(status))

	slave := s.slaveFromTaskId(status.GetTaskId().GetValue())

	if status.GetState() == mesos.TaskState_TASK_FAILED || status.GetState() == mesos.TaskState_TASK_KILLED ||
		status.GetState() == mesos.TaskState_TASK_LOST || status.GetState() == mesos.TaskState_TASK_ERROR ||
		status.GetState() == mesos.TaskState_TASK_FINISHED {
		s.cluster.Remove(slave)
	}
}
Exemple #17
0
func (k *framework) reconcileTerminalTask(driver bindings.SchedulerDriver, taskStatus *mesos.TaskStatus) {
	task, state := k.sched.Tasks().UpdateStatus(taskStatus)

	if (state == podtask.StateRunning || state == podtask.StatePending) &&
		((taskStatus.GetSource() == mesos.TaskStatus_SOURCE_MASTER && taskStatus.GetReason() == mesos.TaskStatus_REASON_RECONCILIATION) ||
			(taskStatus.GetSource() == mesos.TaskStatus_SOURCE_SLAVE && taskStatus.GetReason() == mesos.TaskStatus_REASON_EXECUTOR_TERMINATED) ||
			(taskStatus.GetSource() == mesos.TaskStatus_SOURCE_SLAVE && taskStatus.GetReason() == mesos.TaskStatus_REASON_EXECUTOR_UNREGISTERED) ||
			(taskStatus.GetSource() == mesos.TaskStatus_SOURCE_EXECUTOR && taskStatus.GetMessage() == messages.ContainersDisappeared) ||
			(taskStatus.GetSource() == mesos.TaskStatus_SOURCE_EXECUTOR && taskStatus.GetMessage() == messages.KubeletPodLaunchFailed) ||
			(taskStatus.GetSource() == mesos.TaskStatus_SOURCE_EXECUTOR && taskStatus.GetMessage() == messages.TaskKilled && !task.Has(podtask.Deleted))) {
		//--
		// pod-task has metadata that refers to:
		// (1) a task that Mesos no longer knows about, or else
		// (2) a pod that the Kubelet will never report as "failed"
		// (3) a pod that the kubeletExecutor reported as lost (likely due to docker daemon crash/restart)
		// (4) a pod that the kubeletExecutor reported as lost because the kubelet didn't manage to launch it (in time)
		// (5) a pod that the kubeletExecutor killed, but the scheduler didn't ask for that (maybe killed by the master)
		// For now, destroy the pod and hope that there's a replication controller backing it up.
		// TODO(jdef) for case #2 don't delete the pod, just update it's status to Failed
		pod := &task.Pod
		log.Warningf("deleting rogue pod %v/%v for lost task %v", pod.Namespace, pod.Name, task.ID)
		if err := k.client.Core().Pods(pod.Namespace).Delete(pod.Name, api.NewDeleteOptions(0)); err != nil && !errors.IsNotFound(err) {
			log.Errorf("failed to delete pod %v/%v for terminal task %v: %v", pod.Namespace, pod.Name, task.ID, err)
		}
	} else if taskStatus.GetReason() == mesos.TaskStatus_REASON_EXECUTOR_TERMINATED || taskStatus.GetReason() == mesos.TaskStatus_REASON_EXECUTOR_UNREGISTERED {
		// attempt to prevent dangling pods in the pod and task registries
		log.V(1).Infof("request explicit reconciliation to clean up for task %v after executor reported (terminated/unregistered)", taskStatus.TaskId.GetValue())
		k.tasksReconciler.RequestExplicit()
	} else if taskStatus.GetState() == mesos.TaskState_TASK_LOST && state == podtask.StateRunning && taskStatus.ExecutorId != nil && taskStatus.SlaveId != nil {
		//TODO(jdef) this may not be meaningful once we have proper checkpointing and master detection
		//If we're reconciling and receive this then the executor may be
		//running a task that we need it to kill. It's possible that the framework
		//is unrecognized by the master at this point, so KillTask is not guaranteed
		//to do anything. The underlying driver transport may be able to send a
		//FrameworkMessage directly to the slave to terminate the task.
		log.V(2).Info("forwarding TASK_LOST message to executor %v on slave %v", taskStatus.ExecutorId, taskStatus.SlaveId)
		data := fmt.Sprintf("%s:%s", messages.TaskLost, task.ID) //TODO(jdef) use a real message type
		if _, err := driver.SendFrameworkMessage(taskStatus.ExecutorId, taskStatus.SlaveId, data); err != nil {
			log.Error(err.Error())
		}
	}
}
Exemple #18
0
func Status(status *mesos.TaskStatus) string {
	var buffer bytes.Buffer
	buffer.WriteString(fmt.Sprintf("%s %s", status.GetTaskId().GetValue(), status.GetState().String()))
	if status.GetSlaveId() != nil && status.GetSlaveId().GetValue() != "" {
		buffer.WriteString(" slave: ")
		buffer.WriteString(ID(status.GetSlaveId().GetValue()))
	}

	if status.GetState() != mesos.TaskState_TASK_RUNNING {
		buffer.WriteString(" reason: ")
		buffer.WriteString(status.GetReason().String())
	}

	if status.GetMessage() != "" {
		buffer.WriteString(" message: ")
		buffer.WriteString(status.GetMessage())
	}

	return buffer.String()
}
Exemple #19
0
// SendStatusUpdate sends status updates to the slave.
func (driver *MesosExecutorDriver) SendStatusUpdate(taskStatus *mesosproto.TaskStatus) (mesosproto.Status, error) {
	log.V(3).Infoln("Sending task status update: ", taskStatus.String())

	if stat := driver.Status(); stat != mesosproto.Status_DRIVER_RUNNING {
		return stat, fmt.Errorf("Unable to SendStatusUpdate, expecting driver.status %s, but got %s", mesosproto.Status_DRIVER_RUNNING, stat)
	}

	if taskStatus.GetState() == mesosproto.TaskState_TASK_STAGING {
		err := fmt.Errorf("Executor is not allowed to send TASK_STAGING status update. Aborting!")
		log.Errorln(err)
		if err0 := driver.stop(mesosproto.Status_DRIVER_ABORTED); err0 != nil {
			log.Errorln("Error while stopping the driver", err0)
		}

		return driver.Status(), err
	}

	// Set up status update.
	update := driver.makeStatusUpdate(taskStatus)
	log.Infof("Executor sending status update %v\n", update.String())

	// Capture the status update.
	driver.lock.Lock()
	driver.updates[uuid.UUID(update.GetUuid()).String()] = update
	driver.lock.Unlock()

	// Put the status update in the message.
	message := &mesosproto.StatusUpdateMessage{
		Update: update,
		Pid:    proto.String(driver.self.String()),
	}
	// Send the message.
	if err := driver.send(driver.slaveUPID, message); err != nil {
		log.Errorf("Failed to send %v: %v\n", message, err)
		return driver.status, err
	}

	return driver.Status(), nil
}
Exemple #20
0
// mesos.Scheduler interface method.
// Invoked when the status of a task has changed.
func (this *ElodinaTransportScheduler) StatusUpdate(driver scheduler.SchedulerDriver, status *mesos.TaskStatus) {
	if *status.GetState().Enum() == mesos.TaskState_TASK_RUNNING {
		this.taskIdToTaskState[*status.TaskId.Value].pending = true
	} else if status.GetState() == mesos.TaskState_TASK_LOST || status.GetState() == mesos.TaskState_TASK_FAILED || status.GetState() == mesos.TaskState_TASK_FINISHED {
		this.TakenTopicPartitions.RemoveAll(this.taskIdToTaskState[*status.TaskId.Value].GetAssignment())
		delete(this.taskIdToTaskState, *status.TaskId.Value)
	}
}
Exemple #21
0
// mesos.Scheduler interface method.
// Invoked when the status of a task has changed.
func (this *ElodinaTransportScheduler) StatusUpdate(driver scheduler.SchedulerDriver, status *mesos.TaskStatus) {
	log.Logger.Info("Received status %s for task %s", status.GetState().Enum(), status.TaskId.GetValue())
	if status.GetState() == mesos.TaskState_TASK_RUNNING {
		this.taskIdToTaskState[status.TaskId.GetValue()].pending = true
	} else if isTerminated(status.GetState()) {
		this.TakenTopicPartitions.RemoveAll(this.taskIdToTaskState[status.TaskId.GetValue()].GetAssignment())
		delete(this.taskIdToTaskState, status.TaskId.GetValue())
	}
}
Exemple #22
0
// Will obtain the log file you desire from the mesos-agent and react to the status of the message accordingly.
func FetchLogs(status *mesos.TaskStatus, offset int, file string, frameworkId string) ([]byte, error) {
	var (
		dir      string
		hostname string
		err      error
	)
	switch status.GetState() {
	case mesos.TaskState_TASK_FAILED,
		mesos.TaskState_TASK_KILLED:
		hostDir, err := hostDirFromState(status, frameworkId)
		if err != nil {
			return nil, err
		}
		hostname, dir = hostDir.Host, hostDir.Dir
	default:
		hostDir, err := hostDirFromTaskStatus(status)
		if err != nil {
			return nil, err
		}
		hostname, dir = hostDir.Host, hostDir.Dir
	}

	url := fmt.Sprintf("http://%s:5051/files/read.json?path=%s/%s&offset=%d",
		hostname, dir, file, offset)
	bodyData, err := fetchUrl(url)
	if err != nil {
		return nil, err
	}

	var logData LogData
	err = json.Unmarshal(bodyData, &logData)
	if err != nil {
		return nil, err
	}
	return []byte(logData.Data), nil
}
func (ctx *RunOnceApplicationContext) StatusUpdate(driver scheduler.SchedulerDriver, status *mesos.TaskStatus) bool {
	ctx.lock.Lock()
	defer ctx.lock.Unlock()

	hostname := hostnameFromTaskID(status.GetTaskId().GetValue())
	ctx.updateTaskState(status)
	switch status.GetState() {
	case mesos.TaskState_TASK_RUNNING:
		log.Infof("Task %s received status update in state %s", status.GetTaskId().GetValue(), status.GetState().String())
	case mesos.TaskState_TASK_LOST, mesos.TaskState_TASK_FAILED, mesos.TaskState_TASK_ERROR:
		//TODO also kill all other running tasks sometime?
		ctx.StatusChan <- framework.NewApplicationRunStatus(ctx.Application, fmt.Errorf("Application %s failed to run on host %s with status %s: %s", ctx.Application.ID, hostname, status.GetState().String(), status.GetMessage()))
		return true
	case mesos.TaskState_TASK_FINISHED, mesos.TaskState_TASK_KILLED:
		if ctx.allTasksFinished() {
			ctx.StatusChan <- framework.NewApplicationRunStatus(ctx.Application, nil)
			return true
		}
	default:
		log.Warningf("Got unexpected task state %s", pretty.Status(status))
	}

	return false
}
Exemple #24
0
func (sched *ExampleScheduler) StatusUpdate(driver sched.SchedulerDriver, status *mesos.TaskStatus) {
	log.Infoln("Status update: task", status.TaskId.GetValue(), " is in state ", status.State.Enum().String())
	if status.GetState() == mesos.TaskState_TASK_FINISHED {
		sched.tasksFinished++
		driver.ReviveOffers() // TODO(jdef) rate-limit this
	}

	if sched.tasksFinished >= sched.totalTasks {
		log.Infoln("Total tasks completed, stopping framework.")
		driver.Stop(false)
	}

	if status.GetState() == mesos.TaskState_TASK_LOST ||
		status.GetState() == mesos.TaskState_TASK_KILLED ||
		status.GetState() == mesos.TaskState_TASK_FAILED ||
		status.GetState() == mesos.TaskState_TASK_ERROR {
		sched.tasksErrored++
	}
}
Exemple #25
0
func (sched *ExampleScheduler) StatusUpdate(driver sched.SchedulerDriver, status *mesos.TaskStatus) {
	log.Infoln("Status update: task", status.TaskId.GetValue(), " is in state ", status.State.Enum().String())
	if status.GetState() == mesos.TaskState_TASK_FINISHED {
		sched.tasksFinished++
	}

	if sched.tasksFinished >= sched.totalTasks {
		log.Infoln("Total tasks completed, stopping framework.")
		driver.Stop(false)
	}

	if status.GetState() == mesos.TaskState_TASK_LOST ||
		status.GetState() == mesos.TaskState_TASK_KILLED ||
		status.GetState() == mesos.TaskState_TASK_FAILED ||
		status.GetState() == mesos.TaskState_TASK_ERROR {
		log.Infoln(
			"Aborting because task", status.TaskId.GetValue(),
			"is in unexpected state", status.State.String(),
			"with message", status.GetMessage(),
		)
		driver.Abort()
	}
}
Exemple #26
0
func (sched *ScraperScheduler) StatusUpdate(driver sched.SchedulerDriver, status *mesos.TaskStatus) {
	log.Infoln("Status update: task", status.TaskId.GetValue(), " is in state ", status.State.Enum().String())

	if status.GetState() == mesos.TaskState_TASK_FINISHED {
		sched.tasksFinished++
		log.Infof("%v of %v tasks finished.", sched.tasksFinished, sched.totalTasks)
		uri := string(status.Data)

		db, err := sql.Open("sqlite3", "./database.db")
		if err != nil {
			log.Fatal("Failed to connect to database: %v\n", err)
		}
		defer db.Close()
		tx, err := db.Begin()
		if err != nil {
			log.Fatal(err)
		}
		stmt, err := tx.Prepare("insert into runs(uri, storage_path, last_scrape_time) values(?, ?, ?)")
		if err != nil {
			log.Fatal(err)
		}

		path := base64.StdEncoding.EncodeToString([]byte(uri))
		_, err = stmt.Exec(uri, path, time.Now().Unix())
		if err != nil {
			log.Fatal(err)
		}
		defer stmt.Close()
		tx.Commit()
	}

	if sched.tasksFinished >= sched.totalTasks {
		log.Infoln("Tasks that we know about are done!")
	}

	if status.GetState() == mesos.TaskState_TASK_LOST ||
		status.GetState() == mesos.TaskState_TASK_KILLED ||
		status.GetState() == mesos.TaskState_TASK_FAILED {
		log.Infoln(
			"Aborting because task", status.TaskId.GetValue(),
			"is in unexpected state", status.State.String(),
			"with message", status.GetMessage(),
		)
		driver.Abort()
	}
}
Exemple #27
0
// reconcile an unknown (from the perspective of our registry) non-terminal task
func (k *KubernetesScheduler) reconcileNonTerminalTask(driver bindings.SchedulerDriver, taskStatus *mesos.TaskStatus) {
	// attempt to recover task from pod info:
	// - task data may contain an api.PodStatusResult; if status.reason == REASON_RECONCILIATION then status.data == nil
	// - the Name can be parsed by container.ParseFullName() to yield a pod Name and Namespace
	// - pull the pod metadata down from the api server
	// - perform task recovery based on pod metadata
	taskId := taskStatus.TaskId.GetValue()
	if taskStatus.GetReason() == mesos.TaskStatus_REASON_RECONCILIATION && taskStatus.GetSource() == mesos.TaskStatus_SOURCE_MASTER {
		// there will be no data in the task status that we can use to determine the associated pod
		switch taskStatus.GetState() {
		case mesos.TaskState_TASK_STAGING:
			// there is still hope for this task, don't kill it just yet
			//TODO(jdef) there should probably be a limit for how long we tolerate tasks stuck in this state
			return
		default:
			// for TASK_{STARTING,RUNNING} we should have already attempted to recoverTasks() for.
			// if the scheduler failed over before the executor fired TASK_STARTING, then we should *not*
			// be processing this reconciliation update before we process the one from the executor.
			// point: we don't know what this task is (perhaps there was unrecoverable metadata in the pod),
			// so it gets killed.
			log.Errorf("killing non-terminal, unrecoverable task %v", taskId)
		}
	} else if podStatus, err := podtask.ParsePodStatusResult(taskStatus); err != nil {
		// possible rogue pod exists at this point because we can't identify it; should kill the task
		log.Errorf("possible rogue pod; illegal task status data for task %v, expected an api.PodStatusResult: %v", taskId, err)
	} else if name, namespace, err := container.ParsePodFullName(podStatus.Name); err != nil {
		// possible rogue pod exists at this point because we can't identify it; should kill the task
		log.Errorf("possible rogue pod; illegal api.PodStatusResult, unable to parse full pod name from: '%v' for task %v: %v",
			podStatus.Name, taskId, err)
	} else if pod, err := k.client.Pods(namespace).Get(name); err == nil {
		if t, ok, err := podtask.RecoverFrom(*pod); ok {
			log.Infof("recovered task %v from metadata in pod %v/%v", taskId, namespace, name)
			_, err := k.taskRegistry.Register(t, nil)
			if err != nil {
				// someone beat us to it?!
				log.Warningf("failed to register recovered task: %v", err)
				return
			} else {
				k.taskRegistry.UpdateStatus(taskStatus)
			}
			return
		} else if err != nil {
			//should kill the pod and the task
			log.Errorf("killing pod, failed to recover task from pod %v/%v: %v", namespace, name, err)
			if err := k.client.Pods(namespace).Delete(name, nil); err != nil {
				log.Errorf("failed to delete pod %v/%v: %v", namespace, name, err)
			}
		} else {
			//this is pretty unexpected: we received a TASK_{STARTING,RUNNING} message, but the apiserver's pod
			//metadata is not appropriate for task reconstruction -- which should almost certainly never
			//be the case unless someone swapped out the pod on us (and kept the same namespace/name) while
			//we were failed over.

			//kill this task, allow the newly launched scheduler to schedule the new pod
			log.Warningf("unexpected pod metadata for task %v in apiserver, assuming new unscheduled pod spec: %+v", taskId, pod)
		}
	} else if errors.IsNotFound(err) {
		// pod lookup failed, should delete the task since the pod is no longer valid; may be redundant, that's ok
		log.Infof("killing task %v since pod %v/%v no longer exists", taskId, namespace, name)
	} else if errors.IsServerTimeout(err) {
		log.V(2).Infof("failed to reconcile task due to API server timeout: %v", err)
		return
	} else {
		log.Errorf("unexpected API server error, aborting reconcile for task %v: %v", taskId, err)
		return
	}
	if _, err := driver.KillTask(taskStatus.TaskId); err != nil {
		log.Errorf("failed to kill task %v: %v", taskId, err)
	}
}
Exemple #28
0
func (s *EtcdScheduler) StatusUpdate(
	driver scheduler.SchedulerDriver,
	status *mesos.TaskStatus,
) {
	s.mut.Lock()
	defer s.mut.Unlock()
	log.Infoln(
		"Status update: task",
		status.TaskId.GetValue(),
		" is in state ",
		status.State.Enum().String(),
	)

	node, err := config.Parse(status.GetTaskId().GetValue())
	if err != nil {
		log.Errorf("scheduler: failed to unmarshal config.Node from TaskId: %s", err)
		return
	}
	node.SlaveID = status.SlaveId.GetValue()

	// record that we've heard about this task
	s.heardFrom[status.GetTaskId().GetValue()] = struct{}{}

	switch status.GetState() {
	case mesos.TaskState_TASK_LOST,
		mesos.TaskState_TASK_FINISHED,
		mesos.TaskState_TASK_KILLED,
		mesos.TaskState_TASK_ERROR,
		mesos.TaskState_TASK_FAILED:

		log.Errorf("Task contraction: %+v", status.GetState())
		log.Errorf("message: %s", status.GetMessage())
		log.Errorf("reason: %+v", status.GetReason())

		atomic.AddUint32(&s.Stats.FailedServers, 1)

		// TODO(tyler) kill this
		// Pump the brakes so that we have time to deconfigure the lost node
		// before adding a new one.  If we don't deconfigure first, we risk
		// split brain.
		s.PumpTheBrakes()

		// now we know this task is dead
		delete(s.pending, node.Name)
		delete(s.running, node.Name)
		delete(s.tasks, node.Name)

		// We don't have to clean up the state in ZK for this
		// as it is fine to eventually just persist when we
		// receive a new TASK_RUNNING below.
		delete(s.reconciliationInfo, status.TaskId.GetValue())

		s.QueueLaunchAttempt()

		// TODO(tyler) do we want to lock if the first task fails?
		// TODO(tyler) can we handle a total loss at reconciliation time,
		//             when s.state == Immutable?
		if len(s.running) == 0 && s.state == Mutable {
			log.Error("TOTAL CLUSTER LOSS!  LOCKING SCHEDULER, " +
				"FOLLOW RESTORATION GUIDE AT " +
				"https://github.com/mesosphere/" +
				"etcd-mesos/blob/master/docs/response.md")
			s.state = Immutable
		}
	case mesos.TaskState_TASK_STARTING:
	case mesos.TaskState_TASK_RUNNING:
		// We update data to ZK synchronously because it must happen
		// in-order.  If we spun off a goroutine this would possibly retry
		// and succeed in the wrong order, and older data would win.
		// We keep this simple here, as if ZK is healthy this won't take long.
		// If this takes long, we're probably about to die anyway, as ZK is
		// displeased and mesos-go will panic when it loses contact.
		s.reconciliationInfo[status.TaskId.GetValue()] = status.SlaveId.GetValue()
		err = s.updateReconciliationInfoFunc(
			s.reconciliationInfo,
			s.ZkServers,
			s.ZkChroot,
			s.FrameworkName,
		)
		if err != nil {
			log.Errorf("Failed to persist reconciliation info: %+v", err)
		}

		delete(s.pending, node.Name)
		_, present := s.running[node.Name]
		if !present {
			s.running[node.Name] = node
			s.tasks[node.Name] = status.TaskId
		}

		// During reconcilliation, we may find nodes with higher ID's due to ntp drift
		etcdIndexParts := strings.Split(node.Name, "-")
		if len(etcdIndexParts) != 2 {
			log.Warning("Task has a Name that does not follow the form etcd-<index>")
		} else {
			etcdIndex, err := strconv.ParseInt(etcdIndexParts[1], 10, 64)
			if err != nil {
				log.Warning("Task has a Name that does not follow the form etcd-<index>")
			} else {
				if etcdIndex > s.highestInstanceID {
					s.highestInstanceID = etcdIndex + 1
				}
			}
		}
	default:
		log.Warningf("Received unhandled task state: %+v", status.GetState())
	}
}
Exemple #29
0
func (sched *ExampleScheduler) StatusUpdate(driver sched.SchedulerDriver, status *mesos.TaskStatus) {
	log.Infoln("Status update: task", status.TaskId.GetValue(), " is in state ", status.State.Enum().String())
	for _, task := range sched.tasks {
		if task.name == status.TaskId.GetValue() &&
			(status.GetState() == mesos.TaskState_TASK_FINISHED ||
				status.GetState() == mesos.TaskState_TASK_LOST ||
				status.GetState() == mesos.TaskState_TASK_KILLED ||
				status.GetState() == mesos.TaskState_TASK_FAILED ||
				status.GetState() == mesos.TaskState_TASK_ERROR) {

			// No matter what the outcome was, move to finished state so that we can unreserve resources
			task.state = FinishedState
		}
	}

	if status.GetState() == mesos.TaskState_TASK_LOST ||
		status.GetState() == mesos.TaskState_TASK_KILLED ||
		status.GetState() == mesos.TaskState_TASK_FAILED ||
		status.GetState() == mesos.TaskState_TASK_ERROR {
		log.Infoln(
			"Task", status.TaskId.GetValue(),
			"is in unexpected state", status.State.String(),
			"with message", status.GetMessage(),
			". Unreserving resources",
		)
	}
}