func main() { if len(os.Args) == 2 && os.Args[1] == "--version" { fmt.Println(Version) os.Exit(0) } readConfig() setupLogging() setupMetrics() defer database.Close() bind := fmt.Sprintf("%s:%d", viper.GetString("address"), viper.GetInt("port")) // Catch interrupt go func() { c := make(chan os.Signal, 1) signal.Notify(c, os.Interrupt, os.Kill) s := <-c if s != os.Interrupt && s != os.Kill { return } log.Info("Eremetic is shutting down") os.Exit(0) }() sched := scheduler.Create() router := routes.Create(sched) log.Infof("listening to %s", bind) go scheduler.Run(sched) err := http.ListenAndServe(bind, router) if err != nil { log.Error(err.Error()) os.Exit(1) } }
// StatusUpdate takes care of updating the status func (s *eremeticScheduler) StatusUpdate(driver sched.SchedulerDriver, status *mesos.TaskStatus) { id := status.TaskId.GetValue() log.Debugf("Received task status [%s] for task [%s]", status.State.String(), id) task, err := database.ReadTask(id) if err != nil { log.Debugf("Error reading task from database: %s", err) } if task.ID == "" { task = types.EremeticTask{ ID: id, SlaveId: status.SlaveId.GetValue(), } } if !task.IsRunning() && *status.State == mesos.TaskState_TASK_RUNNING { TasksRunning.Inc() } if types.IsTerminal(status.State) { TasksTerminated.With(prometheus.Labels{"status": status.State.String()}).Inc() if task.WasRunning() { TasksRunning.Dec() } } task.UpdateStatus(types.Status{ Status: status.State.String(), Time: time.Now().Unix(), }) if *status.State == mesos.TaskState_TASK_FAILED && !task.WasRunning() { if task.Retry >= maxRetries { log.Warnf("giving up on %s after %d retry attempts", id, task.Retry) } else { log.Infof("task %s was never running. re-scheduling", id) task.UpdateStatus(types.Status{ Status: mesos.TaskState_TASK_STAGING.String(), Time: time.Now().Unix(), }) task.Retry++ go func() { QueueSize.Inc() s.tasks <- id }() } } if types.IsTerminal(status.State) { handler.NotifyCallback(&task) } database.PutTask(&task) }
// ResourceOffers handles the Resource Offers func (s *eremeticScheduler) ResourceOffers(driver sched.SchedulerDriver, offers []*mesos.Offer) { log.Tracef("Received %d resource offers", len(offers)) for _, offer := range offers { select { case <-s.shutdown: log.Infof("Shutting down: declining offer on [%s]", offer.Hostname) driver.DeclineOffer(offer.Id, defaultFilter) continue case tid := <-s.tasks: log.Debugf("Preparing to launch task %s with offer %s", tid, offer.Id.GetValue()) t, _ := database.ReadTask(tid) task := s.newTask(offer, &t) database.PutTask(&t) driver.LaunchTasks([]*mesos.OfferID{offer.Id}, []*mesos.TaskInfo{task}, defaultFilter) continue default: } log.Trace("No tasks to launch. Declining offer.") driver.DeclineOffer(offer.Id, defaultFilter) } }
func ReconcileTasks(driver sched.SchedulerDriver) *Reconcile { cancel := make(chan struct{}) done := make(chan struct{}) go func() { var ( c uint delay int ) tasks, err := database.ListNonTerminalTasks() if err != nil { log.Errorf("Failed to list non-terminal tasks: %s", err) close(done) return } log.Infof("Trying to reconcile with %d task(s)", len(tasks)) start := time.Now() for len(tasks) > 0 { select { case <-cancel: log.Info("Cancelling reconciliation job") close(done) return case <-time.After(time.Duration(delay) * time.Second): // Filter tasks that has received a status update ntasks := []*types.EremeticTask{} for _, t := range tasks { nt, err := database.ReadTask(t.ID) if err != nil { log.Warnf("Task %s not found in database", t.ID) continue } if nt.LastUpdated().Before(start) { ntasks = append(ntasks, &nt) } } tasks = ntasks // Send reconciliation request if len(tasks) > 0 { var statuses []*mesos.TaskStatus for _, t := range tasks { statuses = append(statuses, &mesos.TaskStatus{ State: mesos.TaskState_TASK_STAGING.Enum(), TaskId: &mesos.TaskID{Value: proto.String(t.ID)}, SlaveId: &mesos.SlaveID{Value: proto.String(t.SlaveId)}, }) } log.Debugf("Sending reconciliation request #%d", c) driver.ReconcileTasks(statuses) } if delay < maxReconciliationDelay { delay = 10 << c if delay >= maxReconciliationDelay { delay = maxReconciliationDelay } } c += 1 } } log.Info("Reconciliation done") close(done) }() return &Reconcile{ cancel: cancel, done: done, } }