func (sched *SdcScheduler) StatusUpdate(driver sched.SchedulerDriver, status *mesos.TaskStatus) {
	log.Infoln("Status update: task", status.TaskId.GetValue(), " is in state ", status.State.Enum().String())

	if status.GetState() == mesos.TaskState_TASK_FINISHED {
		sched.tasksFinished++
		// Calling KillTask here causes a TASK_LOST to be detected, which stops the framework.
		// driver.KillTask(status.TaskId)
		// log.Infoln("!! Status update: task", status.TaskId.GetValue(), " is in state ", status.State.Enum().String())
		// return
	}

	if sched.tasksFinished >= sched.totalTasks {
		// log.Infoln("Total tasks completed, stopping framework.")
		log.Infoln("Total tasks completed.")
		sched.tasksFinished = 0
		sched.totalTasks = 0
		sched.tasksLaunched = 0
		// driver.Stop(false)
	}

	if status.GetState() == mesos.TaskState_TASK_LOST ||
		status.GetState() == mesos.TaskState_TASK_KILLED ||
		status.GetState() == mesos.TaskState_TASK_FAILED ||
		status.GetState() == mesos.TaskState_TASK_ERROR {
		log.Infoln(
			"Aborting because task", status.TaskId.GetValue(),
			"is in unexpected state", status.State.String(),
			"with message", status.GetMessage(),
		)
		driver.Abort()
	}
}
// perform one-time initialization actions upon the first registration event received from Mesos.
func (k *framework) onInitialRegistration(driver bindings.SchedulerDriver) {
	defer close(k.registration)

	if k.failoverTimeout > 0 {
		refreshInterval := k.schedulerConfig.FrameworkIdRefreshInterval.Duration
		if k.failoverTimeout < k.schedulerConfig.FrameworkIdRefreshInterval.Duration.Seconds() {
			refreshInterval = time.Duration(math.Max(1, k.failoverTimeout/2)) * time.Second
		}

		// wait until we've written the framework ID at least once before proceeding
		firstStore := make(chan struct{})
		go runtime.Until(func() {
			// only close firstStore once
			select {
			case <-firstStore:
			default:
				defer close(firstStore)
			}
			err := k.storeFrameworkId(context.TODO(), k.frameworkId.GetValue())
			if err != nil {
				log.Errorf("failed to store framework ID: %v", err)
				if err == frameworkid.ErrMismatch {
					// we detected a framework ID in storage that doesn't match what we're trying
					// to save. this is a dangerous state:
					// (1) perhaps we failed to initially recover the framework ID and so mesos
					// issued us a new one. now that we're trying to save it there's a mismatch.
					// (2) we've somehow bungled the framework ID and we're out of alignment with
					// what mesos is expecting.
					// (3) multiple schedulers were launched at the same time, and both have
					// registered with mesos (because when they each checked, there was no ID in
					// storage, so they asked for a new one). one of them has already written the
					// ID to storage -- we lose.
					log.Error("aborting due to framework ID mismatch")
					driver.Abort()
				}
			}
		}, refreshInterval, k.terminate)

		// wait for the first store attempt of the framework ID
		select {
		case <-firstStore:
		case <-k.terminate:
		}
	}

	r1 := k.makeTaskRegistryReconciler()
	r2 := k.makePodRegistryReconciler()

	k.tasksReconciler = taskreconciler.New(k.asRegisteredMaster, taskreconciler.MakeComposite(k.terminate, r1, r2),
		k.reconcileCooldown, k.schedulerConfig.ExplicitReconciliationAbortTimeout.Duration, k.terminate)
	go k.tasksReconciler.Run(driver, k.terminate)

	if k.reconcileInterval > 0 {
		ri := time.Duration(k.reconcileInterval) * time.Second
		time.AfterFunc(k.schedulerConfig.InitialImplicitReconciliationDelay.Duration, func() {
			runtime.Until(k.tasksReconciler.RequestImplicit, ri, k.terminate)
		})
		log.Infof("will perform implicit task reconciliation at interval: %v after %v",
			ri, k.schedulerConfig.InitialImplicitReconciliationDelay.Duration)
	}

	k.installDebugHandlers(k.mux)
}
func (sched *NoneScheduler) StatusUpdate(driver sched.SchedulerDriver, status *mesos.TaskStatus) {
	taskId := status.GetTaskId().GetValue()
	log.Infoln("Status update: task", taskId, "is in state", status.State.Enum().String())

	c := sched.queue.GetCommandById(taskId)
	if c == nil {
		log.Errorln("Unable to find command for task", taskId)
		driver.Abort()
		return // avoid dereferencing the missing command below
	}
	if c.Status.GetState() == status.GetState() {
		// ignore repeated status updates
		return
	}
	c.Status = status

	// send status update to CommandHandler
	if status.GetState() == mesos.TaskState_TASK_RUNNING {
		sched.handler.CommandRunning(c)
	} else if status.GetState() == mesos.TaskState_TASK_FINISHED {
		sched.handler.CommandEnded(c)
		sched.handler.CommandFinished(c)
	} else if status.GetState() == mesos.TaskState_TASK_FAILED ||
		status.GetState() == mesos.TaskState_TASK_LOST ||
		status.GetState() == mesos.TaskState_TASK_KILLED {
		sched.handler.CommandEnded(c)
		sched.handler.CommandFailed(c)
	}

	// stop if Commands channel was closed and all tasks are finished
	if sched.queue.Closed() && !sched.handler.HasRunningTasks() {
		log.Infoln("All tasks finished, stopping framework.")
		sched.handler.FinishAllCommands()
		driver.Stop(false)
	}
}
func (sched *ScraperScheduler) StatusUpdate(driver sched.SchedulerDriver, status *mesos.TaskStatus) {
	log.Infoln("Status update: task", status.TaskId.GetValue(), " is in state ", status.State.Enum().String())

	if status.GetState() == mesos.TaskState_TASK_FINISHED {
		sched.tasksFinished++
		log.Infof("%v of %v tasks finished.", sched.tasksFinished, sched.totalTasks)

		uri := string(status.Data)
		db, err := sql.Open("sqlite3", "./database.db")
		if err != nil {
			log.Fatalf("Failed to connect to database: %v\n", err)
		}
		defer db.Close()

		tx, err := db.Begin()
		if err != nil {
			log.Fatal(err)
		}
		stmt, err := tx.Prepare("insert into runs(uri, storage_path, last_scrape_time) values(?, ?, ?)")
		if err != nil {
			log.Fatal(err)
		}
		defer stmt.Close()

		path := base64.StdEncoding.EncodeToString([]byte(uri))
		_, err = stmt.Exec(uri, path, time.Now().Unix())
		if err != nil {
			log.Fatal(err)
		}
		tx.Commit()
	}

	if sched.tasksFinished >= sched.totalTasks {
		log.Infoln("Tasks that we know about are done!")
	}

	if status.GetState() == mesos.TaskState_TASK_LOST ||
		status.GetState() == mesos.TaskState_TASK_KILLED ||
		status.GetState() == mesos.TaskState_TASK_FAILED {
		log.Infoln(
			"Aborting because task", status.TaskId.GetValue(),
			"is in unexpected state", status.State.String(),
			"with message", status.GetMessage(),
		)
		driver.Abort()
	}
}
func (sched *ExampleScheduler) StatusUpdate(driver sched.SchedulerDriver, status *mesos.TaskStatus) {
	log.Infoln("Status update: task", status.TaskId.GetValue(), " is in state ", status.State.Enum().String())

	if status.GetState() == mesos.TaskState_TASK_FINISHED {
		sched.tasksFinished++
	}

	if sched.tasksFinished >= sched.totalTasks {
		log.Infoln("Total tasks completed, stopping framework.")
		driver.Stop(false)
	}

	if status.GetState() == mesos.TaskState_TASK_LOST ||
		status.GetState() == mesos.TaskState_TASK_KILLED ||
		status.GetState() == mesos.TaskState_TASK_FAILED {
		log.Infoln(
			"Aborting because task", status.TaskId.GetValue(),
			"is in unexpected state", status.State.String(),
			"with message", status.GetMessage(),
		)
		driver.Abort()
	}
}
func (s *EtcdScheduler) reseedCluster(driver scheduler.SchedulerDriver) {
	// This CAS allows us to:
	// 1. ensure non-concurrent execution
	// 2. signal to shouldLaunch that we're already reseeding
	if !atomic.CompareAndSwapInt32(&s.reseeding, notReseeding, reseedUnderway) {
		return
	}
	atomic.AddUint32(&s.Stats.ClusterReseeds, 1)

	s.mut.Lock()
	s.state = Immutable
	defer func() {
		s.state = Mutable
		atomic.StoreInt32(&s.reseeding, notReseeding)
		s.mut.Unlock()
	}()

	candidates := rpc.RankReseedCandidates(s.running)
	if len(candidates) == 0 {
		log.Error("Failed to retrieve any candidates for reseeding! " +
			"No recovery possible!")
		driver.Abort()
	}

	killable := []string{}
	newSeed := ""
	log.Infof("Candidates for reseed: %+v", candidates)
	for _, node := range candidates {
		// 1. restart node with --force-new-cluster
		// 2. ensure it passes health check
		// 3. ensure its member list only contains itself
		// 4. kill everybody else
		if newSeed != "" {
			log.Warningf("Marking node %s from previous cluster as inferior", node.Node)
			killable = append(killable, node.Node)
		} else {
			log.Warningf("Attempting to re-seed cluster with candidate %s "+
				"with Raft index %d!", node.Node, node.RaftIndex)
			if s.reseedNode(node.Node, driver) {
				newSeed = node.Node
				continue
			}
			// Mark this node as killable, as it did not become healthy on time.
			log.Errorf("Failed reseed attempt on node %s, trying the next-best node.", node.Node)
			log.Warningf("Marking node %s from previous cluster as inferior", node.Node)
			killable = append(killable, node.Node)
		}
	}

	if newSeed != "" {
		log.Warningf("We think we have a new healthy leader: %s", newSeed)
		log.Warning("Terminating stale members of previous cluster.")
		for node, taskID := range s.tasks {
			if node != newSeed {
				log.Warningf("Killing old node %s", node)
				driver.KillTask(taskID)
			}
		}
	}
}