Example #1
func (sched *SdcScheduler) StatusUpdate(driver sched.SchedulerDriver, status *mesos.TaskStatus) {
	log.Infoln("Status update: task", status.TaskId.GetValue(), " is in state ", status.State.Enum().String())

	if status.GetState() == mesos.TaskState_TASK_FINISHED {
		sched.tasksFinished++
		// Calling KillTask here causes a TASK_LOST to be detected, which stops the framework
		// driver.KillTask(status.TaskId)
		// log.Infoln("!! Status update: task", status.TaskId.GetValue(), " is in state ", status.State.Enum().String())
		// return
	}

	if sched.tasksFinished >= sched.totalTasks {
		// log.Infoln("Total tasks completed, stopping framework.")
		log.Infoln("Total tasks completed.")
		sched.tasksFinished = 0
		sched.totalTasks = 0
		sched.tasksLaunched = 0
		// driver.Stop(false)
	}

	if status.GetState() == mesos.TaskState_TASK_LOST ||
		status.GetState() == mesos.TaskState_TASK_KILLED ||
		status.GetState() == mesos.TaskState_TASK_FAILED ||
		status.GetState() == mesos.TaskState_TASK_ERROR {
		log.Infoln(
			"Aborting because task", status.TaskId.GetValue(),
			"is in unexpected state", status.State.String(),
			"with message", status.GetMessage(),
		)
		driver.Abort()
	}
}
Example #2
// perform one-time initialization actions upon the first registration event received from Mesos.
func (k *framework) onInitialRegistration(driver bindings.SchedulerDriver) {
	defer close(k.registration)

	if k.failoverTimeout > 0 {
		refreshInterval := k.schedulerConfig.FrameworkIdRefreshInterval.Duration
		if k.failoverTimeout < k.schedulerConfig.FrameworkIdRefreshInterval.Duration.Seconds() {
			refreshInterval = time.Duration(math.Max(1, k.failoverTimeout/2)) * time.Second
		}

		// wait until we've written the framework ID at least once before proceeding
		firstStore := make(chan struct{})
		go runtime.Until(func() {
			// only close firstStore once
			select {
			case <-firstStore:
			default:
				defer close(firstStore)
			}
			err := k.storeFrameworkId(context.TODO(), k.frameworkId.GetValue())
			if err != nil {
				log.Errorf("failed to store framework ID: %v", err)
				if err == frameworkid.ErrMismatch {
					// we detected a framework ID in storage that doesn't match what we're trying
					// to save. this is a dangerous state:
					// (1) perhaps we failed to initially recover the framework ID and so mesos
					// issued us a new one. now that we're trying to save it there's a mismatch.
					// (2) we've somehow bungled the framework ID and we're out of alignment with
					// what mesos is expecting.
					// (3) multiple schedulers were launched at the same time, and both have
					// registered with mesos (because when they each checked, there was no ID in
					// storage, so they asked for a new one). one of them has already written the
					// ID to storage -- we lose.
					log.Error("aborting due to framework ID mismatch")
					driver.Abort()
				}
			}
		}, refreshInterval, k.terminate)

		// wait for the first store attempt of the framework ID
		select {
		case <-firstStore:
		case <-k.terminate:
		}
	}

	r1 := k.makeTaskRegistryReconciler()
	r2 := k.makePodRegistryReconciler()

	k.tasksReconciler = taskreconciler.New(k.asRegisteredMaster, taskreconciler.MakeComposite(k.terminate, r1, r2),
		k.reconcileCooldown, k.schedulerConfig.ExplicitReconciliationAbortTimeout.Duration, k.terminate)
	go k.tasksReconciler.Run(driver, k.terminate)

	if k.reconcileInterval > 0 {
		ri := time.Duration(k.reconcileInterval) * time.Second
		time.AfterFunc(k.schedulerConfig.InitialImplicitReconciliationDelay.Duration, func() { runtime.Until(k.tasksReconciler.RequestImplicit, ri, k.terminate) })
		log.Infof("will perform implicit task reconciliation at interval: %v after %v", ri, k.schedulerConfig.InitialImplicitReconciliationDelay.Duration)
	}

	k.installDebugHandlers(k.mux)
}
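The interval selection at the top of this example can be read on its own: the framework ID is re-stored at the configured refresh interval, but if the failover timeout is shorter than that interval, the refresh is tightened to half the timeout, never dropping below one second. A minimal sketch of that rule as a standalone helper (the function name and signature are hypothetical; the timeout is assumed to be expressed in seconds, as in the code above):

import (
	"math"
	"time"
)

// frameworkIDRefreshInterval mirrors the interval selection used above (hypothetical helper).
func frameworkIDRefreshInterval(configured time.Duration, failoverTimeoutSeconds float64) time.Duration {
	if failoverTimeoutSeconds < configured.Seconds() {
		// refresh at half the failover timeout, but at least once per second
		return time.Duration(math.Max(1, failoverTimeoutSeconds/2)) * time.Second
	}
	return configured
}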
Example #3
func (sched *NoneScheduler) StatusUpdate(driver sched.SchedulerDriver, status *mesos.TaskStatus) {
	taskId := status.GetTaskId().GetValue()
	log.Infoln("Status update: task", taskId, "is in state", status.State.Enum().String())

	c := sched.queue.GetCommandById(taskId)
	if c == nil {
		log.Errorln("Unable to find command for task", taskId)
		driver.Abort()
		return
	}
	if c.Status.GetState() == status.GetState() {
		// ignore repeated status updates
		return
	}
	c.Status = status

	// send status update to CommandHandler
	if status.GetState() == mesos.TaskState_TASK_RUNNING {
		sched.handler.CommandRunning(c)
	} else if status.GetState() == mesos.TaskState_TASK_FINISHED {
		sched.handler.CommandEnded(c)
		sched.handler.CommandFinished(c)
	} else if status.GetState() == mesos.TaskState_TASK_FAILED ||
		status.GetState() == mesos.TaskState_TASK_LOST ||
		status.GetState() == mesos.TaskState_TASK_KILLED {
		sched.handler.CommandEnded(c)
		sched.handler.CommandFailed(c)
	}

	// stop if Commands channel was closed and all tasks are finished
	if sched.queue.Closed() && !sched.handler.HasRunningTasks() {
		log.Infoln("All tasks finished, stopping framework.")
		sched.handler.FinishAllCommands()
		driver.Stop(false)
	}
}
Example #4
func (sched *ScraperScheduler) StatusUpdate(driver sched.SchedulerDriver, status *mesos.TaskStatus) {
	log.Infoln("Status update: task", status.TaskId.GetValue(), " is in state ", status.State.Enum().String())

	if status.GetState() == mesos.TaskState_TASK_FINISHED {
		sched.tasksFinished++
		log.Infof("%v of %v tasks finished.", sched.tasksFinished, sched.totalTasks)
		uri := string(status.Data)

		db, err := sql.Open("sqlite3", "./database.db")
		if err != nil {
			log.Fatal("Failed to connect to database: %v\n", err)
		}
		defer db.Close()
		tx, err := db.Begin()
		if err != nil {
			log.Fatal(err)
		}
		stmt, err := tx.Prepare("insert into runs(uri, storage_path, last_scrape_time) values(?, ?, ?)")
		if err != nil {
			log.Fatal(err)
		}

		path := base64.StdEncoding.EncodeToString([]byte(uri))
		_, err = stmt.Exec(uri, path, time.Now().Unix())
		if err != nil {
			log.Fatal(err)
		}
		defer stmt.Close()
		if err := tx.Commit(); err != nil {
			log.Fatal(err)
		}
	}

	if sched.tasksFinished >= sched.totalTasks {
		log.Infoln("Tasks that we know about are done!")
	}

	if status.GetState() == mesos.TaskState_TASK_LOST ||
		status.GetState() == mesos.TaskState_TASK_KILLED ||
		status.GetState() == mesos.TaskState_TASK_FAILED {
		log.Infoln(
			"Aborting because task", status.TaskId.GetValue(),
			"is in unexpected state", status.State.String(),
			"with message", status.GetMessage(),
		)
		driver.Abort()
	}
}
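The insert statement above assumes a runs table already exists in database.db. A hypothetical schema consistent with the columns used here (the column types are guesses, not taken from this page) could be created once at startup:

// Hypothetical: create the runs table that the prepared insert above writes to.
// Column types are assumptions inferred from the values bound in stmt.Exec.
if _, err := db.Exec(`CREATE TABLE IF NOT EXISTS runs (
	uri TEXT,
	storage_path TEXT,
	last_scrape_time INTEGER
)`); err != nil {
	log.Fatal(err)
}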
Example #5
func (sched *ExampleScheduler) StatusUpdate(driver sched.SchedulerDriver, status *mesos.TaskStatus) {
	log.Infoln("Status update: task", status.TaskId.GetValue(), " is in state ", status.State.Enum().String())
	if status.GetState() == mesos.TaskState_TASK_FINISHED {
		sched.tasksFinished++
	}

	if sched.tasksFinished >= sched.totalTasks {
		log.Infoln("Total tasks completed, stopping framework.")
		driver.Stop(false)
	}

	if status.GetState() == mesos.TaskState_TASK_LOST ||
		status.GetState() == mesos.TaskState_TASK_KILLED ||
		status.GetState() == mesos.TaskState_TASK_FAILED {
		log.Infoln(
			"Aborting because task", status.TaskId.GetValue(),
			"is in unexpected state", status.State.String(),
			"with message", status.GetMessage(),
		)
		driver.Abort()
	}
}
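The StatusUpdate callbacks in examples 1, 4, and 5 all rely on counter fields (tasksLaunched, tasksFinished, totalTasks) whose struct definitions are not shown on this page. A minimal sketch of what such a scheduler struct might look like (field names are taken from the code above; everything else is an assumption):

// Hypothetical scheduler state backing the StatusUpdate callbacks above.
type ExampleScheduler struct {
	tasksLaunched int // incremented as tasks are launched from resource offers
	tasksFinished int // incremented on TASK_FINISHED status updates
	totalTasks    int // number of tasks the framework intends to run
}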
Example #6
func (s *EtcdScheduler) reseedCluster(driver scheduler.SchedulerDriver) {
	// This CAS allows us to:
	//	1. ensure non-concurrent execution
	//	2. signal to shouldLaunch that we're already reseeding
	if !atomic.CompareAndSwapInt32(&s.reseeding, notReseeding, reseedUnderway) {
		return
	}
	atomic.AddUint32(&s.Stats.ClusterReseeds, 1)

	s.mut.Lock()
	s.state = Immutable

	defer func() {
		s.state = Mutable
		atomic.StoreInt32(&s.reseeding, notReseeding)
		s.mut.Unlock()
	}()

	candidates := rpc.RankReseedCandidates(s.running)
	if len(candidates) == 0 {
		log.Error("Failed to retrieve any candidates for reseeding! " +
			"No recovery possible!")
		driver.Abort()
		return
	}

	killable := []string{}
	newSeed := ""
	log.Infof("Candidates for reseed: %+v", candidates)
	for _, node := range candidates {
		// 1. restart node with --force-new-cluster
		// 2. ensure it passes health check
		// 3. ensure its member list only contains itself
		// 4. kill everybody else
		if newSeed != "" {
			log.Warningf("Marking node %s from previous cluster as inferior", node.Node)
			killable = append(killable, node.Node)
		} else {
			log.Warningf("Attempting to re-seed cluster with candidate %s "+
				"with Raft index %d!", node.Node, node.RaftIndex)
			if s.reseedNode(node.Node, driver) {
				newSeed = node.Node
				continue
			}
			// Mark this node as killable, as it did not become healthy on time.
			log.Errorf("Failed reseed attempt on node %s, trying the next-best node.",
				node.Node)
			log.Warningf("Marking node %s from previous cluster as inferior", node.Node)
			killable = append(killable, node.Node)
		}
	}
	if newSeed != "" {
		log.Warningf("We think we have a new healthy leader: %s", newSeed)
		log.Warning("Terminating stale members of previous cluster.")
		for node, taskID := range s.tasks {
			if node != newSeed {
				log.Warningf("Killing old node %s", node)
				driver.KillTask(taskID)
			}
		}
	}
}
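The compare-and-swap at the start of reseedCluster is a standard Go idiom for making a recovery routine non-reentrant: only the caller that flips the flag from notReseeding to reseedUnderway proceeds, and concurrent callers return immediately. A stripped-down sketch of the same guard, using only the standard library (the constant values and all names other than the two constants are assumptions, not taken from this page):

import "sync/atomic"

const (
	notReseeding   int32 = 0 // values assumed; the real constants live elsewhere in the EtcdScheduler source
	reseedUnderway int32 = 1
)

var reseeding int32

// tryReseed runs work at most once at a time; concurrent callers get false back.
func tryReseed(work func()) bool {
	if !atomic.CompareAndSwapInt32(&reseeding, notReseeding, reseedUnderway) {
		return false // a reseed is already underway
	}
	defer atomic.StoreInt32(&reseeding, notReseeding)
	work()
	return true
}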