// StatusUpdate takes care of updating the status func (s *eremeticScheduler) StatusUpdate(driver sched.SchedulerDriver, status *mesos.TaskStatus) { id := status.TaskId.GetValue() log.Debugf("Received task status [%s] for task [%s]", status.State.String(), id) task, err := database.ReadTask(id) if err != nil { log.Debugf("Error reading task from database: %s", err) } if task.ID == "" { task = types.EremeticTask{ ID: id, SlaveId: status.SlaveId.GetValue(), } } if !task.IsRunning() && *status.State == mesos.TaskState_TASK_RUNNING { TasksRunning.Inc() } if types.IsTerminal(status.State) { TasksTerminated.With(prometheus.Labels{"status": status.State.String()}).Inc() if task.WasRunning() { TasksRunning.Dec() } } task.UpdateStatus(types.Status{ Status: status.State.String(), Time: time.Now().Unix(), }) if *status.State == mesos.TaskState_TASK_FAILED && !task.WasRunning() { if task.Retry >= maxRetries { log.Warnf("giving up on %s after %d retry attempts", id, task.Retry) } else { log.Infof("task %s was never running. re-scheduling", id) task.UpdateStatus(types.Status{ Status: mesos.TaskState_TASK_STAGING.String(), Time: time.Now().Unix(), }) task.Retry++ go func() { QueueSize.Inc() s.tasks <- id }() } } if types.IsTerminal(status.State) { handler.NotifyCallback(&task) } database.PutTask(&task) }
// ResourceOffers handles the Resource Offers func (s *eremeticScheduler) ResourceOffers(driver sched.SchedulerDriver, offers []*mesos.Offer) { log.Tracef("Received %d resource offers", len(offers)) var offer *mesos.Offer loop: for len(offers) > 0 { select { case <-s.shutdown: log.Info("Shutting down: declining offers") break loop case tid := <-s.tasks: log.Debugf("Trying to find offer to launch %s with", tid) t, _ := database.ReadTask(tid) offer, offers = matchOffer(t, offers) if offer == nil { log.Warnf("Could not find a matching offer for %s", tid) TasksDelayed.Inc() go func() { s.tasks <- tid }() break loop } log.Debugf("Preparing to launch task %s with offer %s", tid, offer.Id.GetValue()) t, task := s.newTask(t, offer) database.PutTask(&t) driver.LaunchTasks([]*mesos.OfferID{offer.Id}, []*mesos.TaskInfo{task}, defaultFilter) TasksLaunched.Inc() QueueSize.Dec() continue default: break loop } } log.Trace("No tasks to launch. Declining offers.") for _, offer := range offers { driver.DeclineOffer(offer.Id, defaultFilter) } }
func ReconcileTasks(driver sched.SchedulerDriver) *Reconcile { cancel := make(chan struct{}) done := make(chan struct{}) go func() { var ( c uint delay int ) tasks, err := database.ListNonTerminalTasks() if err != nil { log.Errorf("Failed to list non-terminal tasks: %s", err) close(done) return } log.Infof("Trying to reconcile with %d task(s)", len(tasks)) start := time.Now() for len(tasks) > 0 { select { case <-cancel: log.Info("Cancelling reconciliation job") close(done) return case <-time.After(time.Duration(delay) * time.Second): // Filter tasks that has received a status update ntasks := []*types.EremeticTask{} for _, t := range tasks { nt, err := database.ReadTask(t.ID) if err != nil { log.Warnf("Task %s not found in database", t.ID) continue } if nt.LastUpdated().Before(start) { ntasks = append(ntasks, &nt) } } tasks = ntasks // Send reconciliation request if len(tasks) > 0 { var statuses []*mesos.TaskStatus for _, t := range tasks { statuses = append(statuses, &mesos.TaskStatus{ State: mesos.TaskState_TASK_STAGING.Enum(), TaskId: &mesos.TaskID{Value: proto.String(t.ID)}, SlaveId: &mesos.SlaveID{Value: proto.String(t.SlaveId)}, }) } log.Debugf("Sending reconciliation request #%d", c) driver.ReconcileTasks(statuses) } if delay < maxReconciliationDelay { delay = 10 << c if delay >= maxReconciliationDelay { delay = maxReconciliationDelay } } c += 1 } } log.Info("Reconciliation done") close(done) }() return &Reconcile{ cancel: cancel, done: done, } }