// QueuePendingTasks watches the Job Store for PENDING Jobs, generates the
// appropriate Tasks and publishes them to the Pending Task queue.
//
// QueuePendingTasks runs in a separate goroutine started in the Worker.Start call.
// It returns an error if it can't read the Job Store or if the goroutine it is
// running in has been stopped.
func (bw *BasicWorker) QueuePendingTasks() error {
	state := taurus.PENDING
	queue := Pending
	errChan := make(chan error)
	ticker := time.NewTicker(StoreScanTick)
	go func() {
		var qpErr error
	queuer:
		for {
			select {
			case <-bw.done:
				ticker.Stop()
				qpErr = nil
				log.Printf("Finishing %s Task queuer", state)
				break queuer
			case <-ticker.C:
				jobs, err := bw.store.GetJobs(state)
				if err != nil {
					qpErr = fmt.Errorf("error reading new Jobs: %s", err)
					break queuer
				}
				for _, job := range jobs {
					ctx, cancel := context.WithTimeout(context.Background(), MasterTimeout)
					launchedTasks, err := taurus.MesosTasks(ctx, bw.master, job.Id, nil)
					// Release the timeout context as soon as the call returns,
					// on both the success and the error path.
					cancel()
					if err != nil {
						log.Printf("Failed to retrieve Tasks for Job %s: %s", job.Id, err)
						continue
					}
					log.Printf("Job %s has %d launched tasks", job.Id, len(launchedTasks))
					for _, jobTask := range job.Tasks {
						// Guard against unsigned underflow when more tasks have
						// already been launched than the requested replica count.
						launched := uint32(len(launchedTasks))
						if launched >= jobTask.Replicas {
							continue
						}
						for i := uint32(0); i < jobTask.Replicas-launched; i++ {
							taskInfo := taurus.CreateMesosTaskInfo(job.Id, jobTask)
							task := &taurus.Task{
								Info:  taskInfo,
								JobId: job.Id,
							}
							taskId := taskInfo.TaskId.GetValue()
							log.Printf("Queueing task: %s", taskId)
							if err := bw.queue.Publish(queue, task); err != nil {
								log.Printf("Failed to queue %s: %s", taskId, err)
								continue
							}
						}
					}
				}
			}
		}
		errChan <- qpErr
		log.Printf("%s tasks queuer ticker stopped", state)
	}()
	return <-errChan
}
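// QueuePendingTasks (and the two workers below) all follow the same shape: a
// ticker-driven scan of the Job Store that stops when the done channel is
// signalled. The helper below is a hypothetical, self-contained sketch of that
// pattern in isolation (scanLoop is not part of the Taurus API); it assumes
// only the standard library's time package and shows the labelled-loop/select
// idiom the workers rely on.
func scanLoop(done <-chan struct{}, tick time.Duration, scan func() error) error {
	errChan := make(chan error)
	ticker := time.NewTicker(tick)
	go func() {
		var loopErr error
	loop:
		for {
			select {
			case <-done:
				// Stop requested: shut the ticker down and exit cleanly.
				ticker.Stop()
				break loop
			case <-ticker.C:
				// Run one scan; any error terminates the loop.
				if err := scan(); err != nil {
					loopErr = err
					break loop
				}
			}
		}
		errChan <- loopErr
	}()
	// Block until the loop goroutine finishes, mirroring the workers above.
	return <-errChan
}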
// ReconcilePendingJobs monitors the launched Tasks of each PENDING Job and
// marks the Job as RUNNING once a launch has been attempted for all of the
// Job's Tasks.
//
// ReconcilePendingJobs runs in a separate goroutine started in the Worker.Start call.
// It returns an error if it can't read the Job Store or if the goroutine it is
// running in has been stopped.
func (bw *BasicWorker) ReconcilePendingJobs() error {
	oldState := taurus.PENDING
	newState := taurus.RUNNING
	errChan := make(chan error)
	ticker := time.NewTicker(ReconcileScanTick)
	go func() {
		var reconErr error
	reconciler:
		for {
			select {
			case <-bw.done:
				ticker.Stop()
				reconErr = nil
				log.Printf("Finishing %s Reconciler", oldState)
				break reconciler
			case <-ticker.C:
				jobs, err := bw.store.GetJobs(oldState)
				if err != nil {
					reconErr = fmt.Errorf("error reading %s Jobs: %s", oldState, err)
					break reconciler
				}
				for _, job := range jobs {
					ctx, cancel := context.WithTimeout(context.Background(), MasterTimeout)
					launchedTasks, err := taurus.MesosTasks(ctx, bw.master, job.Id, nil)
					// Release the timeout context as soon as the call returns,
					// on both the success and the error path.
					cancel()
					if err != nil {
						log.Printf("Failed to retrieve Tasks for Job %s: %s", job.Id, err)
						continue
					}
					log.Printf("Job %s has %d launched tasks", job.Id, len(launchedTasks))
					jobTaskCount := uint32(0)
					for _, jobTask := range job.Tasks {
						jobTaskCount += jobTask.Replicas
					}
					if uint32(len(launchedTasks)) == jobTaskCount {
						job.State = newState
						if err := bw.store.UpdateJob(job); err != nil {
							reconErr = fmt.Errorf("failed to update job %s: %s", job.Id, err)
							break reconciler
						}
						log.Printf("Job %s marked as %s", job.Id, newState)
					}
				}
			}
		}
		errChan <- reconErr
		log.Printf("%s Task Reconciler ticker stopped", oldState)
	}()
	return <-errChan
}
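// The reconciliation condition above compares the number of launched Mesos
// tasks against the total replica count summed across the Job's task
// definitions. A hypothetical, minimal sketch of that check (fullyLaunched is
// not part of the Taurus API):
func fullyLaunched(replicas []uint32, launched uint32) bool {
	var want uint32
	for _, r := range replicas {
		want += r
	}
	return launched == want
}

// Example: a Job whose tasks request {2, 3} replicas is fully launched only
// once all 5 tasks have been attempted:
//
//	fullyLaunched([]uint32{2, 3}, 4) // false: only 4 of 5 replicas launched
//	fullyLaunched([]uint32{2, 3}, 5) // true: the Job can be marked RUNNING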
// KillJobTasks monitors all Jobs marked as STOPPED and kills all of their
// running Tasks.
//
// KillJobTasks runs in a separate goroutine started in the Worker.Start call.
// It returns an error if it can't read the Job Store or if the goroutine it is
// running in has been stopped.
func (bw *BasicWorker) KillJobTasks(driver scheduler.SchedulerDriver) error {
	state := taurus.STOPPED
	errChan := make(chan error)
	ticker := time.NewTicker(StoreScanTick)
	go func() {
		var killErr error
	killer:
		for {
			select {
			case <-bw.done:
				ticker.Stop()
				killErr = nil
				log.Printf("Finishing %s Task killer", state)
				break killer
			case <-ticker.C:
				jobs, err := bw.store.GetJobs(state)
				if err != nil {
					killErr = fmt.Errorf("error reading %s Jobs: %s", state, err)
					break killer
				}
				for _, job := range jobs {
					ctx, cancel := context.WithTimeout(context.Background(), MasterTimeout)
					mesosTasks, err := taurus.MesosTasks(ctx, bw.master, job.Id, mesos.TaskState_TASK_RUNNING.Enum())
					// Release the timeout context as soon as the call returns,
					// on both the success and the error path.
					cancel()
					if err != nil {
						log.Printf("Failed to read tasks for Job %s: %s", job.Id, err)
						continue
					}
					for taskId := range mesosTasks {
						mesosTaskId := mesosutil.NewTaskID(taskId)
						killStatus, err := driver.KillTask(mesosTaskId)
						if err != nil {
							log.Printf("Failed to kill task %s (driver status %s): %s", taskId, killStatus, err)
							continue
						}
					}
				}
			}
		}
		errChan <- killErr
		log.Printf("%s tasks killer ticker stopped", state)
	}()
	return <-errChan
}
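// Each of the three loops above is documented as being started from
// Worker.Start, which is not shown in this section. The sketch below is one
// plausible wiring (an assumption, not the real Taurus implementation): run
// each loop in its own goroutine and surface the first error any of them
// returns. A clean shutdown via bw.done makes every loop return nil.
func (bw *BasicWorker) startWorkersSketch(driver scheduler.SchedulerDriver) error {
	// Buffer the channel so the later loops can exit without a receiver.
	errChan := make(chan error, 3)
	go func() { errChan <- bw.QueuePendingTasks() }()
	go func() { errChan <- bw.ReconcilePendingJobs() }()
	go func() { errChan <- bw.KillJobTasks(driver) }()
	// Block until the first loop returns.
	return <-errChan
}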