func (sched *NoneScheduler) StatusUpdate(driver sched.SchedulerDriver, status *mesos.TaskStatus) {
	taskId := status.GetTaskId().GetValue()
	log.Infoln("Status update: task", taskId, "is in state", status.State.Enum().String())

	c := sched.queue.GetCommandById(taskId)
	if c == nil {
		log.Errorln("Unable to find command for task", taskId)
		driver.Abort()
		return // avoid dereferencing a nil command below
	}
	if c.Status.GetState() == status.GetState() {
		// ignore repeated status updates
		return
	}
	c.Status = status

	// send status update to CommandHandler
	if status.GetState() == mesos.TaskState_TASK_RUNNING {
		sched.handler.CommandRunning(c)
	} else if status.GetState() == mesos.TaskState_TASK_FINISHED {
		sched.handler.CommandEnded(c)
		sched.handler.CommandFinished(c)
	} else if status.GetState() == mesos.TaskState_TASK_FAILED ||
		status.GetState() == mesos.TaskState_TASK_LOST ||
		status.GetState() == mesos.TaskState_TASK_KILLED {
		sched.handler.CommandEnded(c)
		sched.handler.CommandFailed(c)
	}

	// stop if Commands channel was closed and all tasks are finished
	if sched.queue.Closed() && !sched.handler.HasRunningTasks() {
		log.Infoln("All tasks finished, stopping framework.")
		sched.handler.FinishAllCommands()
		driver.Stop(false)
	}
}
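// The CommandHandler used above is not shown; from the calls in StatusUpdate,
// an interface along these lines is implied (a sketch, not the original
// definition; the *Command type and method names beyond those called above
// are assumptions):
type CommandHandler interface {
	CommandRunning(c *Command)  // task entered TASK_RUNNING
	CommandEnded(c *Command)    // task left the running set, for any reason
	CommandFinished(c *Command) // task ended in TASK_FINISHED
	CommandFailed(c *Command)   // task ended in FAILED / LOST / KILLED
	HasRunningTasks() bool
	FinishAllCommands()
}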
func (sched *MesosRunonceScheduler) StatusUpdate(driver sched.SchedulerDriver, status *mesos.TaskStatus) {
	log.V(1).Infoln("Status update: task", status.TaskId.GetValue(), " is in state ", status.State.Enum().String())
	eventCh <- status

	if status.GetState() == mesos.TaskState_TASK_FINISHED {
		sched.tasksFinished++
	}

	if sched.tasksFinished >= sched.totalTasks {
		log.V(1).Infoln("Total tasks completed, stopping framework.")
		driver.Stop(false)
	}

	if status.GetState() == mesos.TaskState_TASK_LOST ||
		status.GetState() == mesos.TaskState_TASK_KILLED ||
		status.GetState() == mesos.TaskState_TASK_FAILED ||
		status.GetState() == mesos.TaskState_TASK_ERROR {
		exitStatus = 1
		log.Warningf("mesos TaskStatus: %v", status)
		driver.Stop(false)
		log.Errorln(
			"Aborting because task", status.TaskId.GetValue(),
			"is in unexpected state", status.State.String(),
			"with message:", status.GetMessage(),
		)
	}
}
func (s *SchedulerServer) failover(driver bindings.SchedulerDriver, hks hyperkube.Interface) error {
	if driver != nil {
		stat, err := driver.Stop(true)
		if stat != mesos.Status_DRIVER_STOPPED {
			return fmt.Errorf("failed to stop driver for failover, received unexpected status code: %v", stat)
		} else if err != nil {
			return err
		}
	}

	// there's no guarantee that all goroutines are actually programmed intelligently with 'done'
	// signals, so we'll need to restart if we want to really stop everything

	// run the same command that we were launched with
	//TODO(jdef) assumption here is that the scheduler is the only service running in this process, we should probably validate that somehow
	args := []string{}
	flags := pflag.CommandLine
	if hks != nil {
		args = append(args, hks.Name())
		flags = hks.Flags()
	}
	flags.Visit(func(flag *pflag.Flag) {
		if flag.Name != "api-servers" && flag.Name != "etcd-servers" {
			args = append(args, fmt.Sprintf("--%s=%s", flag.Name, flag.Value.String()))
		}
	})
	if !s.Graceful {
		args = append(args, "--graceful")
	}
	if len(s.APIServerList) > 0 {
		args = append(args, "--api-servers="+strings.Join(s.APIServerList, ","))
	}
	if len(s.EtcdServerList) > 0 {
		args = append(args, "--etcd-servers="+strings.Join(s.EtcdServerList, ","))
	}
	args = append(args, flags.Args()...)

	log.V(1).Infof("spawning scheduler for graceful failover: %s %+v", s.executable, args)

	cmd := exec.Command(s.executable, args...)
	cmd.Stdin = os.Stdin
	cmd.Stdout = os.Stdout
	cmd.Stderr = os.Stderr
	cmd.SysProcAttr = makeDisownedProcAttr()

	// TODO(jdef) pass in a pipe FD so that we can block, waiting for the child proc to be ready
	//cmd.ExtraFiles = []*os.File{}

	exitcode := 0
	log.Flush() // TODO(jdef) it would be really nice to ensure that no one else in our process was still logging
	if err := cmd.Start(); err != nil {
		//log to stdout here to avoid conflicts with normal stderr logging
		fmt.Fprintf(os.Stdout, "failed to spawn failover process: %v\n", err)
		os.Exit(1)
	}
	os.Exit(exitcode)
	select {} // will never reach here
}
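// makeDisownedProcAttr is referenced above but not shown. A minimal sketch of
// what it might look like on Linux follows; the implementation is an
// assumption, not the original: it detaches the spawned scheduler into its
// own process group so it is not taken down by the parent's os.Exit.
func makeDisownedProcAttr() *syscall.SysProcAttr {
	return &syscall.SysProcAttr{
		Setpgid: true, // assumption: start the child in a new process group
	}
}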
func (sched *ExampleScheduler) StatusUpdate(driver sched.SchedulerDriver, status *mesos.TaskStatus) {
	log.Infoln("Status update: task", status.TaskId.GetValue(), " is in state ", status.State.Enum().String())

	if status.GetState() == mesos.TaskState_TASK_FINISHED {
		sched.tasksFinished++
		driver.ReviveOffers() // TODO(jdef) rate-limit this
	}

	if sched.tasksFinished >= sched.totalTasks {
		log.Infoln("Total tasks completed, stopping framework.")
		driver.Stop(false)
	}

	if status.GetState() == mesos.TaskState_TASK_LOST ||
		status.GetState() == mesos.TaskState_TASK_KILLED ||
		status.GetState() == mesos.TaskState_TASK_FAILED ||
		status.GetState() == mesos.TaskState_TASK_ERROR {
		sched.tasksErrored++
	}
}
func (sched *ExampleScheduler) StatusUpdate(driver sched.SchedulerDriver, status *mesos.TaskStatus) {
	log.Infoln("Status update: task", status.TaskId.GetValue(), " is in state ", status.State.Enum().String())

	if status.GetState() == mesos.TaskState_TASK_FINISHED {
		sched.tasksFinished++
	}

	if sched.tasksFinished >= sched.totalTasks {
		log.Infoln("Total tasks completed, stopping framework.")
		driver.Stop(false)
	}

	if status.GetState() == mesos.TaskState_TASK_LOST ||
		status.GetState() == mesos.TaskState_TASK_KILLED ||
		status.GetState() == mesos.TaskState_TASK_FAILED {
		log.Infoln(
			"Aborting because task", status.TaskId.GetValue(),
			"is in unexpected state", status.State.String(),
			"with message", status.GetMessage(),
		)
		driver.Abort()
	}
}
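// For context (not part of the snippets above): a minimal sketch of how a
// scheduler like ExampleScheduler is typically wired into the mesos-go v0
// driver so that callbacks such as StatusUpdate and ResourceOffers are
// invoked. frameworkInfo, master, and newExampleScheduler are assumed to be
// defined elsewhere.
func runExampleFramework(frameworkInfo *mesos.FrameworkInfo, master string) {
	driver, err := sched.NewMesosSchedulerDriver(sched.DriverConfig{
		Scheduler: newExampleScheduler(), // assumed constructor
		Framework: frameworkInfo,
		Master:    master,
	})
	if err != nil {
		log.Errorf("unable to create scheduler driver: %v", err)
		return
	}
	// Run blocks until the driver is stopped or aborted, e.g. by the
	// driver.Stop(false) / driver.Abort() calls in the callbacks above.
	if stat, err := driver.Run(); err != nil {
		log.Infof("framework stopped with status %s and error: %v", stat.String(), err)
	}
}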
func (sched *ExampleScheduler) ResourceOffers(driver sched.SchedulerDriver, offers []*mesos.Offer) {
	for _, offer := range offers {
		operations := []*mesos.Offer_Operation{}
		resourcesToCreate := []*mesos.Resource{}
		resourcesToDestroy := []*mesos.Resource{}
		resourcesToReserve := []*mesos.Resource{}
		resourcesToUnreserve := []*mesos.Resource{}
		taskInfosToLaunch := []*mesos.TaskInfo{}
		totalUnreserved := 0

		unreservedCpus, unreservedMem, unreservedDisk := getUnreservedResources(offer.Resources)
		reservedCpus, reservedMem, reservedDisk := getReservedResources(offer.Resources)
		log.Infoln("Received Offer <", offer.Id.GetValue(), "> with unreserved cpus=", unreservedCpus, " mem=", unreservedMem, " disk=", unreservedDisk)
		log.Infoln("Received Offer <", offer.Id.GetValue(), "> with reserved cpus=", reservedCpus, " mem=", reservedMem, " disk=", reservedDisk)

		// Advance each task through its state machine based on what the offer can satisfy.
		for _, task := range sched.tasks {
			switch task.state {
			case InitState:
				// Reserve resources and create a persistent volume for the task.
				if CPUS_PER_TASK <= unreservedCpus &&
					MEM_PER_TASK <= unreservedMem &&
					DISK_PER_TASK <= unreservedDisk {
					resourcesToReserve = append(resourcesToReserve, []*mesos.Resource{
						util.NewScalarResourceWithReservation("cpus", CPUS_PER_TASK, *mesosAuthPrincipal, *role),
						util.NewScalarResourceWithReservation("mem", MEM_PER_TASK, *mesosAuthPrincipal, *role),
						util.NewScalarResourceWithReservation("disk", DISK_PER_TASK, *mesosAuthPrincipal, *role),
					}...)
					resourcesToCreate = append(resourcesToCreate,
						util.NewVolumeResourceWithReservation(DISK_PER_TASK, task.containerPath, task.persistenceId, mesos.Volume_RW.Enum(), *mesosAuthPrincipal, *role))
					task.state = RequestedReservationState
					unreservedCpus = unreservedCpus - CPUS_PER_TASK
					unreservedMem = unreservedMem - MEM_PER_TASK
					unreservedDisk = unreservedDisk - DISK_PER_TASK
				}
			case RequestedReservationState:
				// Launch the task once its reservation and volume show up in the offer.
				if CPUS_PER_TASK <= reservedCpus &&
					MEM_PER_TASK <= reservedMem &&
					DISK_PER_TASK <= reservedDisk &&
					resourcesHaveVolume(offer.Resources, task.persistenceId) {
					taskId := &mesos.TaskID{
						Value: proto.String(task.name),
					}
					taskInfo := &mesos.TaskInfo{
						Name:     proto.String("go-task-" + taskId.GetValue()),
						TaskId:   taskId,
						SlaveId:  offer.SlaveId,
						Executor: task.executor,
						Resources: []*mesos.Resource{
							util.NewScalarResourceWithReservation("cpus", CPUS_PER_TASK, *mesosAuthPrincipal, *role),
							util.NewScalarResourceWithReservation("mem", MEM_PER_TASK, *mesosAuthPrincipal, *role),
							util.NewVolumeResourceWithReservation(DISK_PER_TASK, task.containerPath, task.persistenceId, mesos.Volume_RW.Enum(), *mesosAuthPrincipal, *role),
						},
					}
					taskInfosToLaunch = append(taskInfosToLaunch, taskInfo)
					task.state = LaunchedState
					reservedCpus = reservedCpus - CPUS_PER_TASK
					reservedMem = reservedMem - MEM_PER_TASK
					reservedDisk = reservedDisk - DISK_PER_TASK
					log.Infof("Prepared task: %s with offer %s for launch\n", taskInfo.GetName(), offer.Id.GetValue())
				}
			case FinishedState:
				// Destroy the task's volume and give back its reservation.
				resourcesToDestroy = append(resourcesToDestroy,
					util.NewVolumeResourceWithReservation(DISK_PER_TASK, task.containerPath, task.persistenceId, mesos.Volume_RW.Enum(), *mesosAuthPrincipal, *role))
				resourcesToUnreserve = append(resourcesToUnreserve, []*mesos.Resource{
					util.NewScalarResourceWithReservation("cpus", CPUS_PER_TASK, *mesosAuthPrincipal, *role),
					util.NewScalarResourceWithReservation("mem", MEM_PER_TASK, *mesosAuthPrincipal, *role),
					util.NewScalarResourceWithReservation("disk", DISK_PER_TASK, *mesosAuthPrincipal, *role),
				}...)
				task.state = UnreservedState
			case UnreservedState:
				totalUnreserved = totalUnreserved + 1
			}
		}

		// Clean up reservations we no longer need
		if len(resourcesToReserve) == 0 && len(resourcesToCreate) == 0 && len(taskInfosToLaunch) == 0 {
			if reservedCpus > 0.0 {
				resourcesToUnreserve = append(resourcesToUnreserve,
					util.NewScalarResourceWithReservation("cpus", reservedCpus, *mesosAuthPrincipal, *role))
			}
			if reservedMem > 0.0 {
				resourcesToUnreserve = append(resourcesToUnreserve,
					util.NewScalarResourceWithReservation("mem", reservedMem, *mesosAuthPrincipal, *role))
			}
			if reservedDisk > 0.0 {
				filtered := util.FilterResources(offer.Resources, func(res *mesos.Resource) bool {
					return res.GetName() == "disk" && res.Reservation != nil && res.Disk != nil
				})
				for _, volume := range filtered {
					resourcesToDestroy = append(resourcesToDestroy, util.NewVolumeResourceWithReservation(
						volume.GetScalar().GetValue(),
						volume.Disk.Volume.GetContainerPath(),
						volume.Disk.Persistence.GetId(),
						volume.Disk.Volume.Mode,
						*mesosAuthPrincipal,
						*role))
				}
				resourcesToUnreserve = append(resourcesToUnreserve,
					util.NewScalarResourceWithReservation("disk", reservedDisk, *mesosAuthPrincipal, *role))
			}
		}

		// Make a single operation per type
		if len(resourcesToReserve) > 0 {
			operations = append(operations, util.NewReserveOperation(resourcesToReserve))
		}
		if len(resourcesToCreate) > 0 {
			operations = append(operations, util.NewCreateOperation(resourcesToCreate))
		}
		if len(resourcesToUnreserve) > 0 {
			operations = append(operations, util.NewUnreserveOperation(resourcesToUnreserve))
		}
		if len(resourcesToDestroy) > 0 {
			operations = append(operations, util.NewDestroyOperation(resourcesToDestroy))
		}
		if len(taskInfosToLaunch) > 0 {
			operations = append(operations, util.NewLaunchOperation(taskInfosToLaunch))
		}

		log.Infoln("Accepting offers with ", len(operations), "operations for offer", offer.Id.GetValue())
		refuseSeconds := 5.0
		driver.AcceptOffers([]*mesos.OfferID{offer.Id}, operations, &mesos.Filters{RefuseSeconds: proto.Float64(refuseSeconds)})

		if totalUnreserved >= len(sched.tasks) {
			log.Infoln("Total tasks completed and unreserved, stopping framework.")
			driver.Stop(false)
		}
	}
}
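// The helpers getUnreservedResources and getReservedResources used above are
// not shown. A minimal sketch of how they could be implemented with
// mesosutil's FilterResources follows (an assumption, not the original code):
// dynamically reserved resources carry a ReservationInfo, unreserved ones do not.
func sumScalarResources(resources []*mesos.Resource, reserved bool) (cpus, mem, disk float64) {
	filtered := util.FilterResources(resources, func(res *mesos.Resource) bool {
		return (res.Reservation != nil) == reserved
	})
	for _, res := range filtered {
		switch res.GetName() {
		case "cpus":
			cpus += res.GetScalar().GetValue()
		case "mem":
			mem += res.GetScalar().GetValue()
		case "disk":
			disk += res.GetScalar().GetValue()
		}
	}
	return cpus, mem, disk
}

func getUnreservedResources(resources []*mesos.Resource) (float64, float64, float64) {
	return sumScalarResources(resources, false)
}

func getReservedResources(resources []*mesos.Resource) (float64, float64, float64) {
	return sumScalarResources(resources, true)
}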