Example #1
func (r *Reconciler) reconcile(driver scheduler.SchedulerDriver, implicit bool) {
	if time.Since(r.reconcileTime) >= r.ReconcileDelay {
		r.taskLock.Lock()
		defer r.taskLock.Unlock()

		r.reconciles++
		r.reconcileTime = time.Now()

		if r.reconciles > r.ReconcileMaxTries {
			for task := range r.tasks {
				Logger.Info("Reconciling exceeded %d tries, sending killTask for task %s", r.ReconcileMaxTries, task)
				driver.KillTask(util.NewTaskID(task))
			}
			r.reconciles = 0
		} else {
			if implicit {
				driver.ReconcileTasks(nil)
			} else {
				statuses := make([]*mesos.TaskStatus, 0)
				for task := range r.tasks {
					Logger.Debug("Reconciling %d/%d task state for task id %s", r.reconciles, r.ReconcileMaxTries, task)
					statuses = append(statuses, util.NewTaskStatus(util.NewTaskID(task), mesos.TaskState_TASK_STAGING))
				}
				driver.ReconcileTasks(statuses)
			}
		}
	}
}
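The Reconciler fields this method touches are not shown in the example. A minimal sketch of what they plausibly look like, inferred from the usage above (field names match the code, types are assumptions):

type Reconciler struct {
	ReconcileDelay    time.Duration // minimum time between reconciliation rounds
	ReconcileMaxTries int           // give up and kill tasks after this many attempts

	taskLock      sync.Mutex
	tasks         map[string]struct{} // task IDs still awaiting a reconciled status update
	reconciles    int                 // attempts made in the current round
	reconcileTime time.Time           // when the last attempt was made
}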
Example #2
func (sched *ExampleScheduler) Reregistered(driver sched.SchedulerDriver, masterInfo *mesos.MasterInfo) {
	log.Infoln("Framework Re-Registered with Master ", masterInfo)
	_, err := driver.ReconcileTasks([]*mesos.TaskStatus{})
	if err != nil {
		log.Errorf("failed to request task reconciliation: %v", err)
	}
}
Example #3
// Reregistered is called when the Scheduler is Reregistered
func (s *eremeticScheduler) Reregistered(driver sched.SchedulerDriver, masterInfo *mesos.MasterInfo) {
	log.Debugf("Framework re-registered with master %s", masterInfo)
	if !s.initialised {
		driver.ReconcileTasks([]*mesos.TaskStatus{})
		s.initialised = true
	} else {
		s.Reconcile(driver)
	}
}
Example #4
// Registered is called when the Scheduler is Registered
func (s *eremeticScheduler) Registered(driver sched.SchedulerDriver, frameworkID *mesos.FrameworkID, masterInfo *mesos.MasterInfo) {
	log.Debugf("Framework %s registered with master %s", frameworkID.GetValue(), masterInfo.GetHostname())
	if !s.initialised {
		driver.ReconcileTasks([]*mesos.TaskStatus{})
		s.initialised = true
	} else {
		s.Reconcile(driver)
	}
}
Example #5
// Reregistered is called when the Scheduler is Reregistered
func (s *Scheduler) Reregistered(driver mesossched.SchedulerDriver, masterInfo *mesosproto.MasterInfo) {
	logrus.WithFields(logrus.Fields{
		"master_id": masterInfo.GetId(),
		"master":    masterInfo.GetHostname(),
	}).Debug("Framework re-registered with master.")
	if !s.initialised {
		driver.ReconcileTasks([]*mesosproto.TaskStatus{})
		s.initialised = true
	} else {
		s.Reconcile(driver)
	}
}
Example #6
// execute an explicit task reconciliation, as per http://mesos.apache.org/documentation/latest/reconciliation/
func (k *KubernetesScheduler) explicitlyReconcileTasks(driver bindings.SchedulerDriver, taskToSlave map[string]string, cancel <-chan struct{}) error {
	log.Info("explicit reconcile tasks")

	// tell mesos to send us the latest status updates for all the non-terminal tasks that we know about
	statusList := []*mesos.TaskStatus{}
	remaining := sets.KeySet(reflect.ValueOf(taskToSlave))
	for taskId, slaveId := range taskToSlave {
		if slaveId == "" {
			delete(taskToSlave, taskId)
			continue
		}
		statusList = append(statusList, &mesos.TaskStatus{
			TaskId:  mutil.NewTaskID(taskId),
			SlaveId: mutil.NewSlaveID(slaveId),
			State:   mesos.TaskState_TASK_RUNNING.Enum(), // req'd field, doesn't have to reflect reality
		})
	}

	select {
	case <-cancel:
		return reconciliationCancelledErr
	default:
		if _, err := driver.ReconcileTasks(statusList); err != nil {
			return err
		}
	}

	start := time.Now()
	first := true
	for backoff := 1 * time.Second; first || remaining.Len() > 0; backoff = backoff * 2 {
		first = false
		// nothing to do here other than wait for status updates..
		if backoff > k.schedcfg.ExplicitReconciliationMaxBackoff.Duration {
			backoff = k.schedcfg.ExplicitReconciliationMaxBackoff.Duration
		}
		select {
		case <-cancel:
			return reconciliationCancelledErr
		case <-time.After(backoff):
			for taskId := range remaining {
				if task, _ := k.taskRegistry.Get(taskId); task != nil && explicitTaskFilter(task) && task.UpdatedTime.Before(start) {
					// keep this task in remaining list
					continue
				}
				remaining.Delete(taskId)
			}
		}
	}
	return nil
}
Example #7
// Perform implicit reconciliation every 5 minutes
func (s *EtcdScheduler) PeriodicReconciler(driver scheduler.SchedulerDriver) {
	for {
		s.mut.RLock()
		state := s.state
		s.mut.RUnlock()
		if state == Mutable {
			_, err := driver.ReconcileTasks([]*mesos.TaskStatus{})
			if err != nil {
				log.Errorf("Error while calling ReconcileTasks: %s", err)
			}
		}
		time.Sleep(5 * time.Minute)
	}
}
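The loop above runs forever and cannot be stopped. A sketch of a stoppable variant (not from the original code) using a time.Ticker and a hypothetical shutdown channel:

func (s *EtcdScheduler) periodicReconcilerWithShutdown(driver scheduler.SchedulerDriver, shutdown <-chan struct{}) {
	ticker := time.NewTicker(5 * time.Minute)
	defer ticker.Stop()
	for {
		select {
		case <-shutdown:
			return
		case <-ticker.C:
			s.mut.RLock()
			state := s.state
			s.mut.RUnlock()
			if state != Mutable {
				continue
			}
			// An empty status list asks the master for implicit reconciliation.
			if _, err := driver.ReconcileTasks([]*mesos.TaskStatus{}); err != nil {
				log.Errorf("Error while calling ReconcileTasks: %s", err)
			}
		}
	}
}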
Example #8
func (s *MinerScheduler) StatusUpdate(driver sched.SchedulerDriver, status *mesos.TaskStatus) {
	log.Infoln("Status update: task", status.TaskId.GetValue(), " is in state ", status.State.Enum().String())
	// If the mining server failed, kill all daemons, since they will be trying to talk to the failed mining server
	if strings.Contains(status.GetTaskId().GetValue(), "server") {
		s.minerServerRunning = false

		// kill all tasks
		statuses := make([]*mesos.TaskStatus, 0)
		_, err := driver.ReconcileTasks(statuses)
		if err != nil {
			panic(err)
		}

		for _, status := range statuses {
			driver.KillTask(status.TaskId)
		}
	}
}
Example #9
func (r *Reconciler) reconcile(driver scheduler.SchedulerDriver, implicit bool) {
	if time.Since(r.reconcileTime) >= r.ReconcileDelay {
		if !r.tasks.IsReconciling() {
			r.reconciles = 0
		}
		r.reconciles++
		r.reconcileTime = time.Now()

		if r.reconciles > r.ReconcileMaxTries {
			for _, task := range r.tasks.GetWithFilter(func(task Task) bool {
				return task.Data().State == TaskStateReconciling
			}) {
				if task.Data().TaskID != "" {
					Logger.Info("Reconciling exceeded %d tries for task %s, sending killTask for task %s", r.ReconcileMaxTries, task.Data().ID, task.Data().TaskID)
					driver.KillTask(util.NewTaskID(task.Data().TaskID))

					task.Data().Reset()
				}
			}
		} else {
			if implicit {
				driver.ReconcileTasks(nil)
			} else {
				statuses := make([]*mesos.TaskStatus, 0)
				for _, task := range r.tasks.GetAll() {
					if task.Data().TaskID != "" {
						task.Data().State = TaskStateReconciling
						Logger.Info("Reconciling %d/%d task state for id %s, task id %s", r.reconciles, r.ReconcileMaxTries, task.Data().ID, task.Data().TaskID)
						statuses = append(statuses, util.NewTaskStatus(util.NewTaskID(task.Data().TaskID), mesos.TaskState_TASK_STAGING))
					}
				}
				driver.ReconcileTasks(statuses)
			}
		}
	}
}
Example #10
// execute task reconciliation, returns when r.done is closed. intended to run as a goroutine.
// if reconciliation is requested while another is in progress, the in-progress operation will be
// cancelled before the new reconciliation operation begins.
func (r *Reconciler) Run(driver bindings.SchedulerDriver) {
	var cancel, finished chan struct{}
requestLoop:
	for {
		select {
		case <-r.done:
			return
		default: // proceed
		}
		select {
		case <-r.implicit:
			metrics.ReconciliationRequested.WithLabelValues("implicit").Inc()
			select {
			case <-r.done:
				return
			case <-r.explicit:
				break // give preference to a pending request for explicit
			default: // continue
				// don't run implicit reconciliation while explicit is ongoing
				if finished != nil {
					select {
					case <-finished: // continue w/ implicit
					default:
						log.Infoln("skipping implicit reconcile because explicit reconcile is ongoing")
						continue requestLoop
					}
				}
				errOnce := proc.NewErrorOnce(r.done)
				errCh := r.Do(func() {
					var err error
					defer errOnce.Report(err)
					log.Infoln("implicit reconcile tasks")
					metrics.ReconciliationExecuted.WithLabelValues("implicit").Inc()
					if _, err = driver.ReconcileTasks([]*mesos.TaskStatus{}); err != nil {
						log.V(1).Infof("failed to request implicit reconciliation from mesos: %v", err)
					}
				})
				proc.OnError(errOnce.Send(errCh).Err(), func(err error) {
					log.Errorf("failed to run implicit reconciliation: %v", err)
				}, r.done)
				goto slowdown
			}
		case <-r.done:
			return
		case <-r.explicit: // continue
			metrics.ReconciliationRequested.WithLabelValues("explicit").Inc()
		}

		if cancel != nil {
			close(cancel)
			cancel = nil

			// play nice and wait for the prior operation to finish, complain
			// if it doesn't
			select {
			case <-r.done:
				return
			case <-finished: // noop, expected
			case <-time.After(r.explicitReconciliationAbortTimeout): // very unexpected
				log.Error("reconciler action failed to stop upon cancellation")
			}
		}
		// copy 'finished' to 'fin' here in case we end up with simultaneous go-routines,
		// if cancellation takes too long or fails - we don't want to close the same chan
		// more than once
		cancel = make(chan struct{})
		finished = make(chan struct{})
		go func(fin chan struct{}) {
			startedAt := time.Now()
			defer func() {
				metrics.ReconciliationLatency.Observe(metrics.InMicroseconds(time.Since(startedAt)))
			}()

			metrics.ReconciliationExecuted.WithLabelValues("explicit").Inc()
			defer close(fin)
			err := <-r.Action(driver, cancel)
			if err == reconciliationCancelledErr {
				metrics.ReconciliationCancelled.WithLabelValues("explicit").Inc()
				log.Infoln(err.Error())
			} else if err != nil {
				log.Errorf("reconciler action failed: %v", err)
			}
		}(finished)
	slowdown:
		// don't allow reconciliation to run very frequently, either explicit or implicit
		select {
		case <-r.done:
			return
		case <-time.After(r.cooldown): // noop
		}
	} // for
}
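The channels and fields that Run relies on are not shown here. A rough sketch of how they might be declared, inferred from the usage above (the embedded proc.Doer and the exact types are assumptions):

type ReconcilerAction func(driver bindings.SchedulerDriver, cancel <-chan struct{}) <-chan error

type Reconciler struct {
	proc.Doer // supplies Do(), which schedules work and returns an error channel

	Action   ReconcilerAction // explicit reconciliation, cancellable via its second argument
	explicit chan struct{}    // signalled once per explicit reconciliation request
	implicit chan struct{}    // signalled once per implicit reconciliation request
	done     <-chan struct{}  // closed when the scheduler shuts down

	cooldown                           time.Duration // minimum spacing between reconciliation rounds
	explicitReconciliationAbortTimeout time.Duration // how long to wait for a cancelled action to stop
}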
Example #11
func ReconcileTasks(driver sched.SchedulerDriver) *Reconcile {
	cancel := make(chan struct{})
	done := make(chan struct{})

	go func() {
		var (
			c     uint
			delay int
		)

		tasks, err := database.ListNonTerminalTasks()
		if err != nil {
			log.Errorf("Failed to list non-terminal tasks: %s", err)
			close(done)
			return
		}

		log.Infof("Trying to reconcile with %d task(s)", len(tasks))
		start := time.Now()

		for len(tasks) > 0 {
			select {
			case <-cancel:
				log.Info("Cancelling reconciliation job")
				close(done)
				return
			case <-time.After(time.Duration(delay) * time.Second):
				// Keep only tasks that have not yet received a status update since reconciliation started
				ntasks := []*types.EremeticTask{}
				for _, t := range tasks {
					nt, err := database.ReadTask(t.ID)
					if err != nil {
						log.Warnf("Task %s not found in database", t.ID)
						continue
					}
					if nt.LastUpdated().Before(start) {
						ntasks = append(ntasks, &nt)
					}
				}
				tasks = ntasks

				// Send reconciliation request
				if len(tasks) > 0 {
					var statuses []*mesos.TaskStatus
					for _, t := range tasks {
						statuses = append(statuses, &mesos.TaskStatus{
							State:   mesos.TaskState_TASK_STAGING.Enum(),
							TaskId:  &mesos.TaskID{Value: proto.String(t.ID)},
							SlaveId: &mesos.SlaveID{Value: proto.String(t.SlaveId)},
						})
					}
					log.Debugf("Sending reconciliation request #%d", c)
					driver.ReconcileTasks(statuses)
				}

				if delay < maxReconciliationDelay {
					delay = 10 << c
					if delay >= maxReconciliationDelay {
						delay = maxReconciliationDelay
					}
				}

				c += 1
			}
		}

		log.Info("Reconciliation done")
		close(done)
	}()

	return &Reconcile{
		cancel: cancel,
		done:   done,
	}
}
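The Reconcile value returned above is only shown being constructed. A minimal sketch of the type and a cancel helper, consistent with how the cancel and done channels are used in the goroutine (assumed, not part of the example):

type Reconcile struct {
	cancel chan struct{}
	done   chan struct{}
}

// Cancel aborts an in-flight reconciliation; the goroutine above observes the
// closed cancel channel, logs the cancellation, and closes done on its way out.
func (r *Reconcile) Cancel() {
	close(r.cancel)
}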
Example #12
func (s *EtcdScheduler) attemptMasterSync(driver scheduler.SchedulerDriver) {
	// Request that the master send us TaskStatus for live tasks.

	backoff := 1
	for retries := 0; retries < 5; retries++ {
		previousReconciliationInfo, err := s.reconciliationInfoFunc(
			s.ZkServers,
			s.ZkChroot,
			s.FrameworkName,
		)
		if err == nil {
			s.mut.Lock()
			s.reconciliationInfo = previousReconciliationInfo
			s.mut.Unlock()

			statuses := []*mesos.TaskStatus{}
			for taskID, slaveID := range previousReconciliationInfo {
				statuses = append(statuses, &mesos.TaskStatus{
					SlaveId: util.NewSlaveID(slaveID),
					TaskId:  util.NewTaskID(taskID),
					State:   mesos.TaskState_TASK_RUNNING.Enum(),
				})
			}

			// Here we do both implicit and explicit task reconciliation
			// in the off-chance that we were unable to persist a running
			// task in ZK after it started.
			_, err = driver.ReconcileTasks([]*mesos.TaskStatus{})
			if err != nil {
				log.Errorf("Error while calling ReconcileTasks: %s", err)
				continue
			}

			_, err = driver.ReconcileTasks(statuses)
			if err != nil {
				log.Errorf("Error while calling ReconcileTasks: %s", err)
			} else {
				// We want to allow some time for reconciled updates to arrive.
				err := s.waitForMasterSync()
				if err != nil {
					log.Error(err)
				} else {
					s.mut.Lock()
					log.Info("Scheduler transitioning to Mutable state.")
					s.state = Mutable
					s.mut.Unlock()
					return
				}
			}
		}
		log.Error(err)
		time.Sleep(time.Duration(backoff) * time.Second)
		backoff = int(math.Min(float64(backoff<<1), 8))
	}
	log.Error("Failed to synchronize with master!  " +
		"It is dangerous to continue at this point.  Dying.")
	if s.shutdown != nil {
		s.shutdown()
	}

}