// StatusUpdate handles incoming task status updates from Mesos: it keeps the
// metrics and the stored task in sync, and re-queues tasks that failed before
// they ever started running.
func (s *eremeticScheduler) StatusUpdate(driver sched.SchedulerDriver, status *mesos.TaskStatus) {
	id := status.TaskId.GetValue()

	log.Debugf("Received task status [%s] for task [%s]", status.State.String(), id)

	task, err := database.ReadTask(id)
	if err != nil {
		log.Debugf("Error reading task from database: %s", err)
	}

	if task.ID == "" {
		task = types.EremeticTask{
			ID:      id,
			SlaveId: status.SlaveId.GetValue(),
		}
	}

	if !task.IsRunning() && *status.State == mesos.TaskState_TASK_RUNNING {
		TasksRunning.Inc()
	}

	if types.IsTerminal(status.State) {
		TasksTerminated.With(prometheus.Labels{"status": status.State.String()}).Inc()
		if task.WasRunning() {
			TasksRunning.Dec()
		}
	}

	task.UpdateStatus(types.Status{
		Status: status.State.String(),
		Time:   time.Now().Unix(),
	})

	if *status.State == mesos.TaskState_TASK_FAILED && !task.WasRunning() {
		if task.Retry >= maxRetries {
			log.Warnf("giving up on %s after %d retry attempts", id, task.Retry)
		} else {
			log.Infof("task %s was never running. re-scheduling", id)
			task.UpdateStatus(types.Status{
				Status: mesos.TaskState_TASK_STAGING.String(),
				Time:   time.Now().Unix(),
			})
			task.Retry++
			go func() {
				QueueSize.Inc()
				s.tasks <- id
			}()
		}
	}

	if types.IsTerminal(status.State) {
		handler.NotifyCallback(&task)
	}

	database.PutTask(&task)
}
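// The sketch below is illustrative only and not part of the scheduler: it
// mirrors the "Failing immediately" test further down and shows how a
// TASK_FAILED update for a task that never reached TASK_RUNNING ends up back
// on the s.tasks channel with a fresh TASK_STAGING status (assuming maxRetries
// is greater than zero). The task ID and channel size are arbitrary.
func exampleStatusUpdateRequeue(s *eremeticScheduler) {
	s.tasks = make(chan string, 1)
	id := "eremetic-task.example"
	database.PutTask(&types.EremeticTask{ID: id})

	// A failure before the task ever ran triggers the retry branch above.
	s.StatusUpdate(nil, &mesos.TaskStatus{
		TaskId: &mesos.TaskID{Value: proto.String(id)},
		State:  mesos.TaskState_TASK_FAILED.Enum(),
	})

	requeued := <-s.tasks // receives "eremetic-task.example"
	log.Debugf("task %s was re-queued for scheduling", requeued)
}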
// GetTaskInfo returns information about the given task.
func GetTaskInfo(scheduler types.Scheduler) http.HandlerFunc {
	return func(w http.ResponseWriter, r *http.Request) {
		vars := mux.Vars(r)
		id := vars["taskId"]
		log.Debugf("Fetching task for id: %s", id)
		task, _ := database.ReadTask(id)

		if strings.Contains(r.Header.Get("Accept"), "text/html") {
			renderHTML(w, r, task, id)
		} else {
			if reflect.DeepEqual(task, (types.EremeticTask{})) {
				writeJSON(http.StatusNotFound, nil, w)
				return
			}
			writeJSON(http.StatusOK, task, w)
		}
	}
}
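// A minimal sketch of wiring the handler up with gorilla/mux. The route path
// and the exampleTaskRoute name are assumptions for illustration; the real
// routing setup lives elsewhere in the project.
func exampleTaskRoute(scheduler types.Scheduler) *mux.Router {
	router := mux.NewRouter()
	// The {taskId} variable is what mux.Vars(r)["taskId"] reads above.
	router.HandleFunc("/task/{taskId}", GetTaskInfo(scheduler))
	return router
}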
// ResourceOffers handles resource offers from the Mesos master: it matches
// queued tasks against the offers, launches what it can, and declines the rest.
func (s *eremeticScheduler) ResourceOffers(driver sched.SchedulerDriver, offers []*mesos.Offer) {
	log.Tracef("Received %d resource offers", len(offers))
	var offer *mesos.Offer

loop:
	for len(offers) > 0 {
		select {
		case <-s.shutdown:
			log.Info("Shutting down: declining offers")
			break loop
		case tid := <-s.tasks:
			log.Debugf("Trying to find offer to launch %s with", tid)
			t, _ := database.ReadTask(tid)
			offer, offers = matchOffer(t, offers)

			if offer == nil {
				log.Warnf("Could not find a matching offer for %s", tid)
				TasksDelayed.Inc()
				go func() { s.tasks <- tid }()
				break loop
			}

			log.Debugf("Preparing to launch task %s with offer %s", tid, offer.Id.GetValue())
			t, task := s.newTask(t, offer)
			database.PutTask(&t)
			driver.LaunchTasks([]*mesos.OfferID{offer.Id}, []*mesos.TaskInfo{task}, defaultFilter)
			TasksLaunched.Inc()
			QueueSize.Dec()

			continue
		default:
			break loop
		}
	}

	log.Trace("No tasks to launch. Declining offers.")
	for _, offer := range offers {
		driver.DeclineOffer(offer.Id, defaultFilter)
	}
}
// ReconcileTasks asks the Mesos master to reconcile all non-terminal tasks in
// the database, resending the request with increasing delay until every task
// has received a status update or the job is cancelled.
func ReconcileTasks(driver sched.SchedulerDriver) *Reconcile {
	cancel := make(chan struct{})
	done := make(chan struct{})

	go func() {
		var (
			c     uint
			delay int
		)

		tasks, err := database.ListNonTerminalTasks()
		if err != nil {
			log.Errorf("Failed to list non-terminal tasks: %s", err)
			close(done)
			return
		}

		log.Infof("Trying to reconcile with %d task(s)", len(tasks))
		start := time.Now()

		for len(tasks) > 0 {
			select {
			case <-cancel:
				log.Info("Cancelling reconciliation job")
				close(done)
				return
			case <-time.After(time.Duration(delay) * time.Second):
				// Keep only the tasks that have not received a status update
				// since reconciliation started.
				ntasks := []*types.EremeticTask{}
				for _, t := range tasks {
					nt, err := database.ReadTask(t.ID)
					if err != nil {
						log.Warnf("Task %s not found in database", t.ID)
						continue
					}
					if nt.LastUpdated().Before(start) {
						ntasks = append(ntasks, &nt)
					}
				}
				tasks = ntasks

				// Send a reconciliation request for the remaining tasks.
				if len(tasks) > 0 {
					var statuses []*mesos.TaskStatus
					for _, t := range tasks {
						statuses = append(statuses, &mesos.TaskStatus{
							State:   mesos.TaskState_TASK_STAGING.Enum(),
							TaskId:  &mesos.TaskID{Value: proto.String(t.ID)},
							SlaveId: &mesos.SlaveID{Value: proto.String(t.SlaveId)},
						})
					}
					log.Debugf("Sending reconciliation request #%d", c)
					driver.ReconcileTasks(statuses)
				}

				// Back off exponentially, capped at maxReconciliationDelay.
				if delay < maxReconciliationDelay {
					delay = 10 << c
					if delay >= maxReconciliationDelay {
						delay = maxReconciliationDelay
					}
				}

				c++
			}
		}

		log.Info("Reconciliation done")
		close(done)
	}()

	return &Reconcile{
		cancel: cancel,
		done:   done,
	}
}
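// A small usage sketch, mirroring the tests below: start a reconciliation job
// after (re-)registration and make sure it can be stopped on shutdown. The
// shutdown channel and the exampleReconcileUsage name are hypothetical and
// only here for illustration.
func exampleReconcileUsage(driver sched.SchedulerDriver, shutdown chan struct{}) {
	r := ReconcileTasks(driver)

	select {
	case <-r.done:
		// Every non-terminal task received a status update; nothing left to do.
	case <-shutdown:
		r.Cancel()
		<-r.done
	}
}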
func TestReconcile(t *testing.T) {
	dir, _ := os.Getwd()
	database.NewDB(fmt.Sprintf("%s/../db/test.db", dir))
	database.Clean()
	defer database.Close()

	maxReconciliationDelay = 1

	Convey("ReconcileTasks", t, func() {
		Convey("Finishes when there are no tasks", func() {
			driver := NewMockScheduler()
			r := ReconcileTasks(driver)

			select {
			case <-r.done:
			}

			So(driver.AssertNotCalled(t, "ReconcileTasks"), ShouldBeTrue)
		})

		Convey("Sends reconcile request", func() {
			driver := NewMockScheduler()
			driver.On("ReconcileTasks").Run(func(mock.Arguments) {
				t, err := database.ReadTask("1234")
				if err != nil {
					panic("mock error")
				}
				t.UpdateStatus(types.Status{
					Status: mesos.TaskState_TASK_RUNNING.String(),
					Time:   time.Now().Unix() + 1,
				})
				database.PutTask(&t)
			}).Once()

			database.PutTask(&types.EremeticTask{
				ID: "1234",
				Status: []types.Status{
					types.Status{
						Status: mesos.TaskState_TASK_STAGING.String(),
						Time:   time.Now().Unix(),
					},
				},
			})

			r := ReconcileTasks(driver)

			select {
			case <-r.done:
			}

			So(driver.AssertCalled(t, "ReconcileTasks"), ShouldBeTrue)
		})

		Convey("Cancel reconciliation", func() {
			driver := NewMockScheduler()

			database.PutTask(&types.EremeticTask{
				ID: "1234",
				Status: []types.Status{
					types.Status{
						Status: mesos.TaskState_TASK_STAGING.String(),
						Time:   time.Now().Unix(),
					},
				},
			})

			r := ReconcileTasks(driver)
			r.Cancel()

			select {
			case <-r.done:
			}

			So(driver.AssertNotCalled(t, "ReconcileTasks"), ShouldBeTrue)
		})
	})
}
func TestScheduler(t *testing.T) {
	dir, _ := os.Getwd()
	database.NewDB(fmt.Sprintf("%s/../db/test.db", dir))
	database.Clean()
	defer database.Close()

	Convey("eremeticScheduler", t, func() {
		s := eremeticScheduler{}
		id := "eremetic-task.9999"
		database.PutTask(&types.EremeticTask{ID: id})

		Convey("newTask", func() {
			task := types.EremeticTask{
				ID: "eremetic-task.1234",
			}
			offer := mesos.Offer{
				FrameworkId: &mesos.FrameworkID{
					Value: proto.String("framework-id"),
				},
				SlaveId: &mesos.SlaveID{
					Value: proto.String("slave-id"),
				},
				Hostname: proto.String("hostname"),
			}

			taskData, mesosTask := s.newTask(task, &offer)

			So(mesosTask.GetTaskId().GetValue(), ShouldEqual, task.ID)
			So(taskData.SlaveId, ShouldEqual, "slave-id")
		})

		Convey("createEremeticScheduler", func() {
			s := createEremeticScheduler()
			So(s.tasksCreated, ShouldEqual, 0)
		})

		Convey("API", func() {
			Convey("Registered", func() {
				driver := NewMockScheduler()
				driver.On("ReconcileTasks").Return("ok").Once()
				fID := mesos.FrameworkID{Value: proto.String("1234")}
				mInfo := mesos.MasterInfo{}
				s.Registered(driver, &fID, &mInfo)
				So(driver.AssertCalled(t, "ReconcileTasks"), ShouldBeTrue)
			})

			Convey("Reregistered", func() {
				driver := NewMockScheduler()
				driver.On("ReconcileTasks").Return("ok").Once()
				database.Clean()
				s.Reregistered(driver, &mesos.MasterInfo{})
				So(driver.AssertCalled(t, "ReconcileTasks"), ShouldBeTrue)
			})

			Convey("Disconnected", func() {
				s.Disconnected(nil)
			})

			Convey("ResourceOffers", func() {
				driver := NewMockScheduler()
				var offers []*mesos.Offer

				Convey("No offers", func() {
					s.ResourceOffers(driver, offers)
					So(driver.AssertNotCalled(t, "DeclineOffer"), ShouldBeTrue)
					So(driver.AssertNotCalled(t, "LaunchTasks"), ShouldBeTrue)
				})

				Convey("No tasks", func() {
					offers = append(offers, &mesos.Offer{Id: &mesos.OfferID{Value: proto.String("1234")}})
					driver.On("DeclineOffer").Return("declined").Once()
					s.ResourceOffers(driver, offers)
					So(driver.AssertCalled(t, "DeclineOffer"), ShouldBeTrue)
					So(driver.AssertNotCalled(t, "LaunchTasks"), ShouldBeTrue)
				})
			})

			Convey("StatusUpdate", func() {
				Convey("Running then failing", func() {
					s.StatusUpdate(nil, &mesos.TaskStatus{
						TaskId: &mesos.TaskID{
							Value: proto.String(id),
						},
						State: mesos.TaskState_TASK_RUNNING.Enum(),
					})
					task, _ := database.ReadTask(id)

					So(len(task.Status), ShouldEqual, 1)
					So(task.Status[0].Status, ShouldEqual, mesos.TaskState_TASK_RUNNING.String())

					s.StatusUpdate(nil, &mesos.TaskStatus{
						TaskId: &mesos.TaskID{
							Value: proto.String(id),
						},
						State: mesos.TaskState_TASK_FAILED.Enum(),
					})
					task, _ = database.ReadTask(id)

					So(len(task.Status), ShouldEqual, 2)
					So(task.Status[0].Status, ShouldEqual, mesos.TaskState_TASK_RUNNING.String())
					So(task.Status[1].Status, ShouldEqual, mesos.TaskState_TASK_FAILED.String())
				})

				Convey("Failing immediately", func() {
					s.tasks = make(chan string, 100)
					s.StatusUpdate(nil, &mesos.TaskStatus{
						TaskId: &mesos.TaskID{
							Value: proto.String(id),
						},
						State: mesos.TaskState_TASK_FAILED.Enum(),
					})
					task, _ := database.ReadTask(id)

					So(len(task.Status), ShouldEqual, 2)
					So(task.Status[0].Status, ShouldEqual, mesos.TaskState_TASK_FAILED.String())
					So(task.Status[1].Status, ShouldEqual, mesos.TaskState_TASK_STAGING.String())

					select {
					case c := <-s.tasks:
						So(c, ShouldEqual, id)
					}
				})
			})

			Convey("FrameworkMessage", func() {
				driver := NewMockScheduler()
				message := `{"message": "this is a message"}`

				Convey("From Eremetic", func() {
					source := "eremetic-executor"
					executor := mesos.ExecutorID{
						Value: proto.String(source),
					}
					s.FrameworkMessage(driver, &executor, &mesos.SlaveID{}, message)
				})

				Convey("From an unknown source", func() {
					source := "other-source"
					executor := mesos.ExecutorID{
						Value: proto.String(source),
					}
					s.FrameworkMessage(driver, &executor, &mesos.SlaveID{}, message)
				})

				Convey("Malformed JSON", func() {
					source := "eremetic-executor"
					executor := mesos.ExecutorID{
						Value: proto.String(source),
					}
					s.FrameworkMessage(driver, &executor, &mesos.SlaveID{}, "not a json")
				})
			})

			Convey("OfferRescinded", func() {
				s.OfferRescinded(nil, &mesos.OfferID{})
			})

			Convey("SlaveLost", func() {
				s.SlaveLost(nil, &mesos.SlaveID{})
			})

			Convey("ExecutorLost", func() {
				s.ExecutorLost(nil, &mesos.ExecutorID{}, &mesos.SlaveID{}, 2)
			})

			Convey("Error", func() {
				s.Error(nil, "Error")
			})
		})
	})

	Convey("ScheduleTask", t, func() {
		Convey("Given a valid Request", func() {
			scheduler := &eremeticScheduler{
				tasks: make(chan string, 100),
			}
			request := types.Request{
				TaskCPUs:    0.5,
				TaskMem:     22.0,
				DockerImage: "busybox",
				Command:     "echo hello",
			}

			Convey("It should put a task id on the channel", func() {
				taskID, err := scheduler.ScheduleTask(request)
				So(err, ShouldBeNil)

				select {
				case c := <-scheduler.tasks:
					So(c, ShouldEqual, taskID)
					task, _ := database.ReadTask(taskID)
					So(task.TaskCPUs, ShouldEqual, request.TaskCPUs)
					So(task.TaskMem, ShouldEqual, request.TaskMem)
					So(task.Command, ShouldEqual, request.Command)
					So(task.User, ShouldEqual, "root")
					So(task.Environment, ShouldBeEmpty)
					So(task.Image, ShouldEqual, request.DockerImage)
					So(task.ID, ShouldStartWith, "eremetic-task.")
				}
			})
		})
	})
}