Example #1
0
func (sched *NoneScheduler) StatusUpdate(driver sched.SchedulerDriver, status *mesos.TaskStatus) {
	taskId := status.GetTaskId().GetValue()
	log.Infoln("Status update: task", taskId, "is in state", status.State.Enum().String())

	c := sched.queue.GetCommandById(taskId)
	if c == nil {
		log.Errorln("Unable to find command for task", taskId)
		driver.Abort()
	}
	if c.Status.GetState() == status.GetState() {
		// ignore repeated status updates
		return
	}
	c.Status = status

	// send status update to CommandHandler
	if status.GetState() == mesos.TaskState_TASK_RUNNING {
		sched.handler.CommandRunning(c)
	} else if status.GetState() == mesos.TaskState_TASK_FINISHED {
		sched.handler.CommandEnded(c)
		sched.handler.CommandFinished(c)
	} else if status.GetState() == mesos.TaskState_TASK_FAILED ||
		status.GetState() == mesos.TaskState_TASK_LOST ||
		status.GetState() == mesos.TaskState_TASK_KILLED {
		sched.handler.CommandEnded(c)
		sched.handler.CommandFailed(c)
	}

	// stop if Commands channel was closed and all tasks are finished
	if sched.queue.Closed() && !sched.handler.HasRunningTasks() {
		log.Infoln("All tasks finished, stopping framework.")
		sched.handler.FinishAllCommands()
		driver.Stop(false)
	}
}
Example #2
0
func (sched *MesosRunonceScheduler) StatusUpdate(driver sched.SchedulerDriver, status *mesos.TaskStatus) {
	log.V(1).Infoln("Status update: task", status.TaskId.GetValue(), " is in state ", status.State.Enum().String())
	eventCh <- status

	if status.GetState() == mesos.TaskState_TASK_FINISHED {
		sched.tasksFinished++
	}

	if sched.tasksFinished >= sched.totalTasks {
		log.V(1).Infoln("Total tasks completed, stopping framework.")
		driver.Stop(false)
	}

	if status.GetState() == mesos.TaskState_TASK_LOST ||
		status.GetState() == mesos.TaskState_TASK_KILLED ||
		status.GetState() == mesos.TaskState_TASK_FAILED ||
		status.GetState() == mesos.TaskState_TASK_ERROR {
		exitStatus = 1
		log.Warningf("mesos TaskStatus: %v", status)
		driver.Stop(false)
		log.Errorln(
			"Aborting because task", status.TaskId.GetValue(),
			"is in unexpected state", status.State.String(),
			"with message.", status.GetMessage(),
		)
	}
}
Example #3
0
func (s *SchedulerServer) failover(driver bindings.SchedulerDriver, hks hyperkube.Interface) error {
	if driver != nil {
		stat, err := driver.Stop(true)
		if stat != mesos.Status_DRIVER_STOPPED {
			return fmt.Errorf("failed to stop driver for failover, received unexpected status code: %v", stat)
		} else if err != nil {
			return err
		}
	}

	// there's no guarantee that all goroutines are actually programmed intelligently with 'done'
	// signals, so we'll need to restart if we want to really stop everything

	// run the same command that we were launched with
	//TODO(jdef) assumption here is that the sheduler is the only service running in this process, we should probably validate that somehow
	args := []string{}
	flags := pflag.CommandLine
	if hks != nil {
		args = append(args, hks.Name())
		flags = hks.Flags()
	}
	flags.Visit(func(flag *pflag.Flag) {
		if flag.Name != "api-servers" && flag.Name != "etcd-servers" {
			args = append(args, fmt.Sprintf("--%s=%s", flag.Name, flag.Value.String()))
		}
	})
	if !s.Graceful {
		args = append(args, "--graceful")
	}
	if len(s.APIServerList) > 0 {
		args = append(args, "--api-servers="+strings.Join(s.APIServerList, ","))
	}
	if len(s.EtcdServerList) > 0 {
		args = append(args, "--etcd-servers="+strings.Join(s.EtcdServerList, ","))
	}
	args = append(args, flags.Args()...)

	log.V(1).Infof("spawning scheduler for graceful failover: %s %+v", s.executable, args)

	cmd := exec.Command(s.executable, args...)
	cmd.Stdin = os.Stdin
	cmd.Stdout = os.Stdout
	cmd.Stderr = os.Stderr
	cmd.SysProcAttr = makeDisownedProcAttr()

	// TODO(jdef) pass in a pipe FD so that we can block, waiting for the child proc to be ready
	//cmd.ExtraFiles = []*os.File{}

	exitcode := 0
	log.Flush() // TODO(jdef) it would be really nice to ensure that no one else in our process was still logging
	if err := cmd.Start(); err != nil {
		//log to stdtout here to avoid conflicts with normal stderr logging
		fmt.Fprintf(os.Stdout, "failed to spawn failover process: %v\n", err)
		os.Exit(1)
	}
	os.Exit(exitcode)
	select {} // will never reach here
}
Example #4
0
func (sched *ExampleScheduler) StatusUpdate(driver sched.SchedulerDriver, status *mesos.TaskStatus) {
	log.Infoln("Status update: task", status.TaskId.GetValue(), " is in state ", status.State.Enum().String())
	if status.GetState() == mesos.TaskState_TASK_FINISHED {
		sched.tasksFinished++
		driver.ReviveOffers() // TODO(jdef) rate-limit this
	}

	if sched.tasksFinished >= sched.totalTasks {
		log.Infoln("Total tasks completed, stopping framework.")
		driver.Stop(false)
	}

	if status.GetState() == mesos.TaskState_TASK_LOST ||
		status.GetState() == mesos.TaskState_TASK_KILLED ||
		status.GetState() == mesos.TaskState_TASK_FAILED ||
		status.GetState() == mesos.TaskState_TASK_ERROR {
		sched.tasksErrored++
	}
}
func (sched *ExampleScheduler) StatusUpdate(driver sched.SchedulerDriver, status *mesos.TaskStatus) {
	log.Infoln("Status update: task", status.TaskId.GetValue(), " is in state ", status.State.Enum().String())
	if status.GetState() == mesos.TaskState_TASK_FINISHED {
		sched.tasksFinished++
	}

	if sched.tasksFinished >= sched.totalTasks {
		log.Infoln("Total tasks completed, stopping framework.")
		driver.Stop(false)
	}

	if status.GetState() == mesos.TaskState_TASK_LOST ||
		status.GetState() == mesos.TaskState_TASK_KILLED ||
		status.GetState() == mesos.TaskState_TASK_FAILED {
		log.Infoln(
			"Aborting because task", status.TaskId.GetValue(),
			"is in unexpected state", status.State.String(),
			"with message", status.GetMessage(),
		)
		driver.Abort()
	}
}
Example #6
0
func (sched *ExampleScheduler) ResourceOffers(driver sched.SchedulerDriver, offers []*mesos.Offer) {
	for _, offer := range offers {
		operations := []*mesos.Offer_Operation{}
		resourcesToCreate := []*mesos.Resource{}
		resourcesToDestroy := []*mesos.Resource{}
		resourcesToReserve := []*mesos.Resource{}
		resourcesToUnreserve := []*mesos.Resource{}
		taskInfosToLaunch := []*mesos.TaskInfo{}
		totalUnreserved := 0

		unreservedCpus, unreservedMem, unreservedDisk := getUnreservedResources(offer.Resources)
		reservedCpus, reservedMem, reservedDisk := getReservedResources(offer.Resources)

		log.Infoln("Received Offer <", offer.Id.GetValue(), "> with unreserved cpus=", unreservedCpus, " mem=", unreservedMem, " disk=", unreservedDisk)
		log.Infoln("Received Offer <", offer.Id.GetValue(), "> with reserved cpus=", reservedCpus, " mem=", reservedMem, " disk=", reservedDisk)

		for _, task := range sched.tasks {
			switch task.state {
			case InitState:
				if CPUS_PER_TASK <= unreservedCpus &&
					MEM_PER_TASK <= unreservedMem &&
					DISK_PER_TASK <= unreservedDisk {

					resourcesToReserve = append(resourcesToReserve, []*mesos.Resource{
						util.NewScalarResourceWithReservation("cpus", CPUS_PER_TASK, *mesosAuthPrincipal, *role),
						util.NewScalarResourceWithReservation("mem", MEM_PER_TASK, *mesosAuthPrincipal, *role),
						util.NewScalarResourceWithReservation("disk", DISK_PER_TASK, *mesosAuthPrincipal, *role),
					}...)
					resourcesToCreate = append(resourcesToCreate,
						util.NewVolumeResourceWithReservation(DISK_PER_TASK, task.containerPath, task.persistenceId, mesos.Volume_RW.Enum(), *mesosAuthPrincipal, *role))
					task.state = RequestedReservationState
					unreservedCpus = unreservedCpus - CPUS_PER_TASK
					unreservedMem = unreservedMem - MEM_PER_TASK
					unreservedDisk = unreservedDisk - DISK_PER_TASK
				}
			case RequestedReservationState:
				if CPUS_PER_TASK <= reservedCpus &&
					MEM_PER_TASK <= reservedMem &&
					DISK_PER_TASK <= reservedDisk &&
					resourcesHaveVolume(offer.Resources, task.persistenceId) {

					taskId := &mesos.TaskID{
						Value: proto.String(task.name),
					}

					taskInfo := &mesos.TaskInfo{
						Name:     proto.String("go-task-" + taskId.GetValue()),
						TaskId:   taskId,
						SlaveId:  offer.SlaveId,
						Executor: task.executor,
						Resources: []*mesos.Resource{
							util.NewScalarResourceWithReservation("cpus", CPUS_PER_TASK, *mesosAuthPrincipal, *role),
							util.NewScalarResourceWithReservation("mem", MEM_PER_TASK, *mesosAuthPrincipal, *role),
							util.NewVolumeResourceWithReservation(DISK_PER_TASK, task.containerPath, task.persistenceId, mesos.Volume_RW.Enum(), *mesosAuthPrincipal, *role),
						},
					}

					taskInfosToLaunch = append(taskInfosToLaunch, taskInfo)
					task.state = LaunchedState
					reservedCpus = reservedCpus - CPUS_PER_TASK
					reservedMem = reservedMem - MEM_PER_TASK
					reservedDisk = reservedDisk - DISK_PER_TASK

					log.Infof("Prepared task: %s with offer %s for launch\n", taskInfo.GetName(), offer.Id.GetValue())
				}
			case FinishedState:
				resourcesToDestroy = append(resourcesToDestroy,
					util.NewVolumeResourceWithReservation(DISK_PER_TASK, task.containerPath, task.persistenceId, mesos.Volume_RW.Enum(), *mesosAuthPrincipal, *role))
				resourcesToUnreserve = append(resourcesToUnreserve, []*mesos.Resource{
					util.NewScalarResourceWithReservation("cpus", CPUS_PER_TASK, *mesosAuthPrincipal, *role),
					util.NewScalarResourceWithReservation("mem", MEM_PER_TASK, *mesosAuthPrincipal, *role),
					util.NewScalarResourceWithReservation("disk", DISK_PER_TASK, *mesosAuthPrincipal, *role),
				}...)
				task.state = UnreservedState
			case UnreservedState:
				totalUnreserved = totalUnreserved + 1
			}
		}

		// Clean up reservations we no longer need
		if len(resourcesToReserve) == 0 && len(resourcesToCreate) == 0 && len(taskInfosToLaunch) == 0 {
			if reservedCpus >= 0.0 {
				resourcesToUnreserve = append(resourcesToUnreserve, util.NewScalarResourceWithReservation("cpus", reservedCpus, *mesosAuthPrincipal, *role))
			}
			if reservedMem >= 0.0 {
				resourcesToUnreserve = append(resourcesToUnreserve, util.NewScalarResourceWithReservation("mem", reservedCpus, *mesosAuthPrincipal, *role))
			}
			if reservedDisk >= 0.0 {
				filtered := util.FilterResources(offer.Resources, func(res *mesos.Resource) bool {
					return res.GetName() == "disk" &&
						res.Reservation != nil &&
						res.Disk != nil
				})
				for _, volume := range filtered {
					resourcesToDestroy = append(resourcesToDestroy,
						util.NewVolumeResourceWithReservation(
							volume.GetScalar().GetValue(),
							volume.Disk.Volume.GetContainerPath(),
							volume.Disk.Persistence.GetId(),
							volume.Disk.Volume.Mode,
							*mesosAuthPrincipal,
							*role))
				}
				resourcesToUnreserve = append(resourcesToUnreserve, util.NewScalarResourceWithReservation("mem", reservedDisk, *mesosAuthPrincipal, *role))
			}
		}

		// Make a single operation per type
		if len(resourcesToReserve) > 0 {
			operations = append(operations, util.NewReserveOperation(resourcesToReserve))
		}
		if len(resourcesToCreate) > 0 {
			operations = append(operations, util.NewCreateOperation(resourcesToCreate))
		}
		if len(resourcesToUnreserve) > 0 {
			operations = append(operations, util.NewUnreserveOperation(resourcesToUnreserve))
		}
		if len(resourcesToDestroy) > 0 {
			operations = append(operations, util.NewDestroyOperation(resourcesToDestroy))
		}
		if len(taskInfosToLaunch) > 0 {
			operations = append(operations, util.NewLaunchOperation(taskInfosToLaunch))
		}

		log.Infoln("Accepting offers with ", len(operations), "operations for offer", offer.Id.GetValue())
		refuseSeconds := 5.0
		if len(operations) == 0 {
			refuseSeconds = 5.0
		}
		driver.AcceptOffers([]*mesos.OfferID{offer.Id}, operations, &mesos.Filters{RefuseSeconds: proto.Float64(refuseSeconds)})

		if totalUnreserved >= len(sched.tasks) {
			log.Infoln("Total tasks completed and unreserved, stopping framework.")
			driver.Stop(false)
		}
	}
}