Example #1
// implementation of scheduling plugin's Error func; see plugin/pkg/scheduler
func (k *errorHandler) handleSchedulingError(pod *api.Pod, schedulingErr error) {

	if schedulingErr == noSuchPodErr {
		log.V(2).Infof("Not rescheduling non-existent pod %v", pod.Name)
		return
	}

	log.Infof("Error scheduling %v: %v; retrying", pod.Name, schedulingErr)
	defer util.HandleCrash()

	// default upstream scheduler passes pod.Name as binding.PodID
	ctx := api.WithNamespace(api.NewDefaultContext(), pod.Namespace)
	podKey, err := podtask.MakePodKey(ctx, pod.Name)
	if err != nil {
		log.Errorf("Failed to construct pod key, aborting scheduling for pod %v: %v", pod.Name, err)
		return
	}

	k.backoff.GC()
	k.api.Lock()
	defer k.api.Unlock()

	switch task, state := k.api.tasks().ForPod(podKey); state {
	case podtask.StateUnknown:
		// if we don't have a mapping here any more then someone deleted the pod
		log.V(2).Infof("Could not resolve pod to task, aborting pod reschdule: %s", podKey)
		return

	case podtask.StatePending:
		if task.Has(podtask.Launched) {
			log.V(2).Infof("Skipping re-scheduling for already-launched pod %v", podKey)
			return
		}
		breakoutEarly := queue.BreakChan(nil)
		if schedulingErr == noSuitableOffersErr {
			log.V(3).Infof("adding backoff breakout handler for pod %v", podKey)
			breakoutEarly = queue.BreakChan(k.api.offers().Listen(podKey, func(offer *mesos.Offer) bool {
				k.api.Lock()
				defer k.api.Unlock()
				switch task, state := k.api.tasks().Get(task.ID); state {
				case podtask.StatePending:
					// Assess fitness of pod with the current offer. The scheduler normally
					// "backs off" when it can't find an offer that matches up with a pod.
					// The backoff period for a pod can terminate sooner if an offer becomes
					// available that matches up.
					return !task.Has(podtask.Launched) && k.api.algorithm().FitPredicate()(task, offer, nil)
				default:
					// no point in continuing to check for matching offers
					return true
				}
			}))
		}
		delay := k.backoff.Get(podKey)
		log.V(3).Infof("requeuing pod %v with delay %v", podKey, delay)
		k.qr.requeue(&Pod{Pod: pod, delay: &delay, notify: breakoutEarly})

	default:
		log.V(2).Infof("Task is no longer pending, aborting reschedule for pod %v", podKey)
	}
}
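
The interesting part of this handler is how a requeued pod waits out its backoff: the delay can be cut short when a matching offer shows up, via the BreakChan returned by the offer listener. Below is a minimal, self-contained sketch of that backoff-with-breakout idea using only the standard library; requeueAfter and its arguments are hypothetical stand-ins, not the scheduler's real API.

// A minimal sketch of the backoff-with-breakout idea; requeueAfter is a
// hypothetical stand-in, not the scheduler's real API.
package main

import (
	"fmt"
	"time"
)

// requeueAfter waits out the backoff delay, but returns early if the breakout
// channel fires (e.g. because an offer that fits the pod arrived).
func requeueAfter(delay time.Duration, breakout <-chan struct{}) string {
	select {
	case <-time.After(delay):
		return "requeued after full backoff"
	case <-breakout:
		return "requeued early: a matching offer appeared"
	}
}

func main() {
	breakout := make(chan struct{})
	go func() {
		// simulate an offer listener signalling a fit after 100ms
		time.Sleep(100 * time.Millisecond)
		close(breakout)
	}()
	fmt.Println(requeueAfter(5*time.Second, breakout))
}
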
Example #2
func (k *deleter) deleteOne(pod *Pod) error {
	ctx := api.WithNamespace(api.NewDefaultContext(), pod.Namespace)
	podKey, err := podtask.MakePodKey(ctx, pod.Name)
	if err != nil {
		return err
	}

	log.V(2).Infof("pod deleted: %v", podKey)

	// order is important here: we want to make sure we have the lock before
	// removing the pod from the scheduling queue. this makes the concurrent
	// execution of scheduler-error-handling and delete-handling easier to
	// reason about.
	k.api.Lock()
	defer k.api.Unlock()

	// prevent the scheduler from attempting to pop this; it's also possible that
	// it's concurrently being scheduled (somewhere between pod scheduling and
	// binding) - if so, then we'll end up removing it from taskRegistry which
	// will abort Bind()ing
	k.qr.dequeue(pod.GetUID())

	switch task, state := k.api.tasks().ForPod(podKey); state {
	case podtask.StateUnknown:
		log.V(2).Infof("Could not resolve pod '%s' to task id", podKey)
		return noSuchPodErr

	// determine if the task has already been launched to mesos; if not, then
	// cleanup is easier (unregister) since there's no state to sync
	case podtask.StatePending:
		if !task.Has(podtask.Launched) {
			// we've been invoked in between Schedule() and Bind()
			if task.HasAcceptedOffer() {
				task.Offer.Release()
				task.Reset()
				task.Set(podtask.Deleted)
				//TODO(jdef) probably want better handling here
				if err := k.api.tasks().Update(task); err != nil {
					return err
				}
			}
			k.api.tasks().Unregister(task)
			return nil
		}
		fallthrough

	case podtask.StateRunning:
		// signal to watchers that the related pod is going down
		task.Set(podtask.Deleted)
		if err := k.api.tasks().Update(task); err != nil {
			log.Errorf("failed to update task w/ Deleted status: %v", err)
		}
		return k.api.killTask(task.ID)

	default:
		log.Infof("cannot kill pod '%s': non-terminal task not found %v", podKey, task.ID)
		return noSuchTaskErr
	}
}
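
deleteOne branches on the task's lifecycle state: a pending task that never reached Mesos can simply be unregistered, while anything launched or running needs a kill. The sketch below mirrors that switch (including the fallthrough) with hypothetical stand-in types; it illustrates the control flow only, not the real task registry.

// A sketch of deleteOne's state switch with hypothetical stand-in types.
package main

import (
	"errors"
	"fmt"
)

type taskState int

const (
	statePending taskState = iota
	stateRunning
)

type task struct {
	id       string
	launched bool
}

var errNoSuchTask = errors.New("no such task")

// cleanup unregisters pending tasks that never reached Mesos and kills
// everything else that is still known to be alive.
func cleanup(t *task, state taskState, kill func(id string) error) error {
	switch state {
	case statePending:
		if !t.launched {
			fmt.Println("unregistering", t.id)
			return nil
		}
		fallthrough
	case stateRunning:
		return kill(t.id)
	default:
		return errNoSuchTask
	}
}

func main() {
	kill := func(id string) error { fmt.Println("killing", id); return nil }
	_ = cleanup(&task{id: "t1"}, statePending, kill)                 // unregistered
	_ = cleanup(&task{id: "t2", launched: true}, statePending, kill) // killed via fallthrough
}
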
Example #3
// this pod may be out of sync with respect to the API server registry:
//      this pod   |  apiserver registry
//    -------------|----------------------
//      host=.*    |  404           ; pod was deleted
//      host=.*    |  5xx           ; failed to sync, try again later?
//      host=""    |  host=""       ; perhaps no updates to process?
//      host=""    |  host="..."    ; pod has been scheduled and assigned, is there a task assigned? (check TaskIdKey in binding?)
//      host="..." |  host=""       ; pod is no longer scheduled, does it need to be re-queued?
//      host="..." |  host="..."    ; perhaps no updates to process?
//
// TODO(jdef) this needs an integration test
func (s *schedulingPlugin) reconcileTask(t *podtask.T) {
	log.V(1).Infof("reconcile pod %v, assigned to slave %q", t.Pod.Name, t.Spec.AssignedSlave)
	ctx := api.WithNamespace(api.NewDefaultContext(), t.Pod.Namespace)
	pod, err := s.client.Pods(api.NamespaceValue(ctx)).Get(t.Pod.Name)
	if err != nil {
		if errors.IsNotFound(err) {
			// attempt to delete
			if err = s.deleter.deleteOne(&Pod{Pod: &t.Pod}); err != nil && err != noSuchPodErr && err != noSuchTaskErr {
				log.Errorf("failed to delete pod: %v: %v", t.Pod.Name, err)
			}
		} else {
			//TODO(jdef) other errors should probably trigger a retry (w/ backoff).
			//For now, drop the pod on the floor
			log.Warning("aborting reconciliation for pod %v: %v", t.Pod.Name, err)
		}
		return
	}

	log.Infof("pod %v scheduled on %q according to apiserver", pod.Name, pod.Spec.NodeName)
	if t.Spec.AssignedSlave != pod.Spec.NodeName {
		if pod.Spec.NodeName == "" {
			// pod is unscheduled.
			// it's possible that we dropped the pod in the scheduler error handler
			// because of task misalignment with the pod (task.Has(podtask.Launched) == true)

			podKey, err := podtask.MakePodKey(ctx, pod.Name)
			if err != nil {
				log.Error(err)
				return
			}

			s.api.Lock()
			defer s.api.Unlock()

			if _, state := s.api.tasks().ForPod(podKey); state != podtask.StateUnknown {
				//TODO(jdef) reconcile the task
				log.Errorf("task already registered for pod %v", pod.Name)
				return
			}

			now := time.Now()
			log.V(3).Infof("reoffering pod %v", podKey)
			s.qr.reoffer(&Pod{
				Pod:      pod,
				deadline: &now,
			})
		} else {
			// pod is scheduled.
			// not sure how this happened behind our backs. attempt to reconstruct
			// at least a partial podtask.T record.
			//TODO(jdef) reconcile the task
			log.Errorf("pod already scheduled: %v", pod.Name)
		}
	} else {
		//TODO(jdef) for now, ignore the fact that the rest of the spec may be different
		//and assume that our knowledge of the pod aligns with that of the apiserver
		log.Error("pod reconciliation does not support updates; not yet implemented")
	}
}
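
The comment table at the top of reconcileTask is essentially a decision function over three inputs: the slave the task believes it is assigned to, the NodeName the apiserver reports, and whether the pod still exists upstream. The sketch below encodes that table as a tiny pure function; the inputs and return strings are hypothetical stand-ins for illustration, and the real code additionally manipulates the task registry and the queue.

// A sketch that encodes the reconciliation decision table as a pure function.
package main

import "fmt"

// reconcileAction decides what to do when the task's view of its assigned
// node diverges from what the apiserver reports for the pod.
func reconcileAction(assignedSlave, apiNodeName string, podFound bool) string {
	switch {
	case !podFound:
		return "pod deleted upstream: delete the local task"
	case assignedSlave == apiNodeName:
		return "views agree: nothing to reconcile (updates unsupported)"
	case apiNodeName == "":
		return "pod unscheduled upstream: re-offer it to the queue"
	default:
		return "pod bound behind our back: would require task reconstruction"
	}
}

func main() {
	fmt.Println(reconcileAction("slave-1", "", true))        // re-offer
	fmt.Println(reconcileAction("slave-1", "slave-1", true)) // nothing to do
	fmt.Println(reconcileAction("", "node-a", true))         // bound behind our back
	fmt.Println(reconcileAction("slave-1", "node-a", false)) // pod gone
}
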
Example #4
// Schedule implements the Scheduler interface of Kubernetes.
// It returns the name of the selected machine and an error, if any.
func (k *kubeScheduler) Schedule(pod *api.Pod, unused algorithm.MinionLister) (string, error) {
	log.Infof("Try to schedule pod %v\n", pod.Name)
	ctx := api.WithNamespace(api.NewDefaultContext(), pod.Namespace)

	// default upstream scheduler passes pod.Name as binding.PodID
	podKey, err := podtask.MakePodKey(ctx, pod.Name)
	if err != nil {
		return "", err
	}

	k.api.Lock()
	defer k.api.Unlock()

	switch task, state := k.api.tasks().ForPod(podKey); state {
	case podtask.StateUnknown:
		// There's a potential race here: a pod could have been yielded() and
		// then deleted before we get *here*.
		// We use meta to index the pod in the store since that's what k8s reflector does.
		podName, err := cache.MetaNamespaceKeyFunc(pod)
		if err != nil {
			log.Warningf("aborting Schedule, unable to understand pod object %+v", pod)
			return "", noSuchPodErr
		}
		if deleted := k.podUpdates.Poll(podName, queue.DELETE_EVENT); deleted {
			// avoid scheduling a pod that's been deleted between yieldPod() and Schedule()
			log.Infof("aborting Schedule, pod has been deleted %+v", pod)
			return "", noSuchPodErr
		}
		return k.doSchedule(k.api.tasks().Register(k.api.createPodTask(ctx, pod)))

	//TODO(jdef) it's possible that the pod state has diverged from what
	//we knew previously, we should probably update the task.Pod state here
	//before proceeding with scheduling
	case podtask.StatePending:
		if pod.UID != task.Pod.UID {
			// we're dealing with a brand new pod spec here, so the old one must have been
			// deleted -- and so our task store is out of sync w/ respect to reality
			//TODO(jdef) reconcile task
			return "", fmt.Errorf("task %v spec is out of sync with pod %v spec, aborting schedule", task.ID, pod.Name)
		} else if task.Has(podtask.Launched) {
			// the task has been marked as "launched" but the pod binding creation may have
			// failed in k8s; we let someone else handle it, probably the mesos task error handler
			return "", fmt.Errorf("task %s has already been launched, aborting schedule", task.ID)
		} else {
			return k.doSchedule(task, nil)
		}

	default:
		return "", fmt.Errorf("task %s is not pending, nothing to schedule", task.ID)
	}
}
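
Both Schedule and the error handler key everything by a namespace-qualified pod key (MakePodKey here, MetaNamespaceKeyFunc for the cache store), so that pods with the same name in different namespaces never collide in the task registry. The helper below is a hypothetical stand-in that only shows the shape of such a key; it is not the real MakePodKey, which derives the namespace from the request context.

// A hypothetical stand-in showing the shape of a namespace-qualified pod key.
package main

import "fmt"

func podKey(namespace, name string) string {
	if namespace == "" {
		namespace = "default"
	}
	return fmt.Sprintf("%s/%s", namespace, name)
}

func main() {
	// the upstream scheduler passes pod.Name alone; qualifying it with the
	// namespace keeps equally named pods in different namespaces distinct
	fmt.Println(podKey("", "nginx"))          // default/nginx
	fmt.Println(podKey("kube-system", "dns")) // kube-system/dns
}
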
Example #5
// implementation of scheduling plugin's Error func; see plugin/pkg/scheduler
func (k *errorHandler) Error(pod *api.Pod, schedulingErr error) {

	if schedulingErr == errors.NoSuchPodErr {
		log.V(2).Infof("Not rescheduling non-existent pod %v", pod.Name)
		return
	}

	log.Infof("Error scheduling %v: %v; retrying", pod.Name, schedulingErr)
	defer runtime.HandleCrash()

	// default upstream scheduler passes pod.Name as binding.PodID
	ctx := api.WithNamespace(api.NewDefaultContext(), pod.Namespace)
	podKey, err := podtask.MakePodKey(ctx, pod.Name)
	if err != nil {
		log.Errorf("Failed to construct pod key, aborting scheduling for pod %v: %v", pod.Name, err)
		return
	}

	k.backoff.GC()
	k.sched.Lock()
	defer k.sched.Unlock()

	switch task, state := k.sched.Tasks().ForPod(podKey); state {
	case podtask.StateUnknown:
		// if we don't have a mapping here any more then someone deleted the pod
		log.V(2).Infof("Could not resolve pod to task, aborting pod reschdule: %s", podKey)
		return

	case podtask.StatePending:
		if task.Has(podtask.Launched) {
			log.V(2).Infof("Skipping re-scheduling for already-launched pod %v", podKey)
			return
		}
		breakoutEarly := queue.BreakChan(nil)
		if schedulingErr == errors.NoSuitableOffersErr {
			log.V(3).Infof("adding backoff breakout handler for pod %v", podKey)
			breakoutEarly = k.newBreakChan(podKey)
		}
		delay := k.backoff.Get(podKey)
		log.V(3).Infof("requeuing pod %v with delay %v", podKey, delay)
		k.qr.Requeue(queuer.NewPod(pod, queuer.Delay(delay), queuer.Notify(breakoutEarly)))

	default:
		log.V(2).Infof("Task is no longer pending, aborting reschedule for pod %v", podKey)
	}
}
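
The only state this handler consults besides the task registry is the per-pod backoff (k.backoff.GC() and k.backoff.Get(podKey)). The sketch below shows one plausible shape for such a registry: exponentially growing, capped delays per key, with idle entries garbage-collected. Names and behavior are assumptions for illustration, not the real backoff package.

// One plausible shape for a per-key backoff registry (assumed, not the real
// package): Get doubles the delay per key up to a cap, GC drops idle keys.
package main

import (
	"fmt"
	"sync"
	"time"
)

type backoff struct {
	mu       sync.Mutex
	min, max time.Duration
	next     map[string]time.Duration
	lastUsed map[string]time.Time
}

func newBackoff(min, max time.Duration) *backoff {
	return &backoff{
		min:      min,
		max:      max,
		next:     map[string]time.Duration{},
		lastUsed: map[string]time.Time{},
	}
}

// Get returns the current delay for key and doubles it (up to max) for the
// next call.
func (b *backoff) Get(key string) time.Duration {
	b.mu.Lock()
	defer b.mu.Unlock()
	d, ok := b.next[key]
	if !ok {
		d = b.min
	}
	doubled := d * 2
	if doubled > b.max {
		doubled = b.max
	}
	b.next[key] = doubled
	b.lastUsed[key] = time.Now()
	return d
}

// GC drops keys that have been idle longer than maxIdle.
func (b *backoff) GC(maxIdle time.Duration) {
	b.mu.Lock()
	defer b.mu.Unlock()
	for k, t := range b.lastUsed {
		if time.Since(t) > maxIdle {
			delete(b.next, k)
			delete(b.lastUsed, k)
		}
	}
}

func main() {
	b := newBackoff(time.Second, 8*time.Second)
	for i := 0; i < 4; i++ {
		fmt.Println(b.Get("default/nginx")) // 1s, 2s, 4s, 8s
	}
}
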
Example #6
// Bind implements binding.Registry; it launches the pod-associated task in Mesos.
func (b *binder) Bind(binding *api.Binding) error {

	ctx := api.WithNamespace(api.NewContext(), binding.Namespace)

	// default upstream scheduler passes pod.Name as binding.Name
	podKey, err := podtask.MakePodKey(ctx, binding.Name)
	if err != nil {
		return err
	}

	b.sched.Lock()
	defer b.sched.Unlock()

	switch task, state := b.sched.Tasks().ForPod(podKey); state {
	case podtask.StatePending:
		return b.bind(ctx, binding, task)
	default:
		// in this case it's likely that the pod has been deleted between Schedule
		// and Bind calls
		log.Infof("No pending task for pod %s", podKey)
		return errors.NoSuchPodErr //TODO(jdef) this error is somewhat misleading since the task could be running?!
	}
}
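
Bind only proceeds when the task is still pending under the scheduler lock; otherwise the pod was most likely deleted between Schedule and Bind. The sketch below reproduces that race in miniature with hypothetical stand-in types: bind and remove contend for the same lock, and whichever loses observes the other's effect instead of acting on stale state.

// A miniature version of the Schedule/Bind vs. delete race, with hypothetical
// stand-in types.
package main

import (
	"fmt"
	"sync"
)

type registry struct {
	mu    sync.Mutex
	tasks map[string]string // podKey -> state
}

func (r *registry) bind(key string) error {
	r.mu.Lock()
	defer r.mu.Unlock()
	if r.tasks[key] != "pending" {
		return fmt.Errorf("no pending task for pod %s", key)
	}
	r.tasks[key] = "launched"
	return nil
}

func (r *registry) remove(key string) {
	r.mu.Lock()
	defer r.mu.Unlock()
	delete(r.tasks, key)
}

func main() {
	r := &registry{tasks: map[string]string{"default/nginx": "pending"}}
	var wg sync.WaitGroup
	wg.Add(2)
	go func() { defer wg.Done(); fmt.Println("bind:", r.bind("default/nginx")) }()
	go func() { defer wg.Done(); r.remove("default/nginx") }()
	wg.Wait()
}
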
// TestPlugin_LifeCycle creates the scheduler plugin with the config returned by the
// scheduler and plays through the whole life cycle of the plugin while creating,
// deleting, and failing pods.
func TestPlugin_LifeCycle(t *testing.T) {
	t.Skip("This test is flaky, see #11901")
	assert := &EventAssertions{*assert.New(t)}

	// create a fake pod watch. We use that below to submit new pods to the scheduler
	podListWatch := NewMockPodsListWatch(api.PodList{})

	// create fake apiserver
	testApiServer := NewTestServer(t, api.NamespaceDefault, podListWatch)
	defer testApiServer.server.Close()

	// create executor with some data for static pods if set
	executor := util.NewExecutorInfo(
		util.NewExecutorID("executor-id"),
		util.NewCommandInfo("executor-cmd"),
	)
	executor.Data = []byte{0, 1, 2}

	// create scheduler
	nodeStore := cache.NewStore(cache.MetaNamespaceKeyFunc)
	as := NewAllocationStrategy(
		podtask.DefaultPredicate,
		podtask.NewDefaultProcurement(mresource.DefaultDefaultContainerCPULimit, mresource.DefaultDefaultContainerMemLimit))
	testScheduler := New(Config{
		Executor: executor,
		Client:   client.NewOrDie(&client.Config{Host: testApiServer.server.URL, Version: testapi.Default.Version()}),
		Scheduler: NewFCFSPodScheduler(as, func(node string) *api.Node {
			obj, _, _ := nodeStore.GetByKey(node)
			if obj == nil {
				return nil
			}
			return obj.(*api.Node)
		}),
		Schedcfg: *schedcfg.CreateDefaultConfig(),
	})

	assert.NotNil(testScheduler.client, "client is nil")
	assert.NotNil(testScheduler.executor, "executor is nil")
	assert.NotNil(testScheduler.offers, "offer registry is nil")

	// create scheduler process
	schedulerProcess := ha.New(testScheduler)

	// get plugin config from it
	c := testScheduler.NewPluginConfig(schedulerProcess.Terminal(), http.DefaultServeMux, &podListWatch.ListWatch)
	assert.NotNil(c)

	// make events observable
	eventObserver := NewEventObserver()
	c.Recorder = eventObserver

	// create plugin
	p := NewPlugin(c).(*schedulingPlugin)
	assert.NotNil(p)

	// run plugin
	p.Run(schedulerProcess.Terminal())
	defer schedulerProcess.End()

	// init scheduler
	err := testScheduler.Init(schedulerProcess.Master(), p, http.DefaultServeMux)
	assert.NoError(err)

	// create mock mesos scheduler driver
	mockDriver := &joinableDriver{}
	mockDriver.On("Start").Return(mesos.Status_DRIVER_RUNNING, nil).Once()
	started := mockDriver.Upon()

	mAny := mock.AnythingOfType
	mockDriver.On("ReconcileTasks", mAny("[]*mesosproto.TaskStatus")).Return(mesos.Status_DRIVER_RUNNING, nil)
	mockDriver.On("SendFrameworkMessage", mAny("*mesosproto.ExecutorID"), mAny("*mesosproto.SlaveID"), mAny("string")).
		Return(mesos.Status_DRIVER_RUNNING, nil)

	type LaunchedTask struct {
		offerId  mesos.OfferID
		taskInfo *mesos.TaskInfo
	}
	launchedTasks := make(chan LaunchedTask, 1)
	launchTasksCalledFunc := func(args mock.Arguments) {
		offerIDs := args.Get(0).([]*mesos.OfferID)
		taskInfos := args.Get(1).([]*mesos.TaskInfo)
		assert.Equal(1, len(offerIDs))
		assert.Equal(1, len(taskInfos))
		launchedTasks <- LaunchedTask{
			offerId:  *offerIDs[0],
			taskInfo: taskInfos[0],
		}
	}
	mockDriver.On("LaunchTasks", mAny("[]*mesosproto.OfferID"), mAny("[]*mesosproto.TaskInfo"), mAny("*mesosproto.Filters")).
		Return(mesos.Status_DRIVER_RUNNING, nil).Run(launchTasksCalledFunc)
	mockDriver.On("DeclineOffer", mAny("*mesosproto.OfferID"), mAny("*mesosproto.Filters")).
		Return(mesos.Status_DRIVER_RUNNING, nil)

	// elect master with mock driver
	driverFactory := ha.DriverFactory(func() (bindings.SchedulerDriver, error) {
		return mockDriver, nil
	})
	schedulerProcess.Elect(driverFactory)
	elected := schedulerProcess.Elected()

	// driver will be started
	<-started

	// tell scheduler to be registered
	testScheduler.Registered(
		mockDriver,
		util.NewFrameworkID("kubernetes-id"),
		util.NewMasterInfo("master-id", (192<<24)+(168<<16)+(0<<8)+1, 5050),
	)

	// wait for being elected
	<-elected

	//TODO(jdef) refactor things above here into a test suite setup of some sort

	// fake new, unscheduled pod
	pod, i := NewTestPod()
	podListWatch.Add(pod, true) // notify watchers

	// wait for failedScheduling event because there is no offer
	assert.EventWithReason(eventObserver, "failedScheduling", "failedScheduling event not received")

	// add some matching offer
	offers := []*mesos.Offer{NewTestOffer(fmt.Sprintf("offer%d", i))}
	testScheduler.ResourceOffers(nil, offers)

	// and wait for scheduled pod
	assert.EventWithReason(eventObserver, "scheduled")
	select {
	case launchedTask := <-launchedTasks:
		// report back that the task has been staged, and then started by mesos
		testScheduler.StatusUpdate(mockDriver, newTaskStatusForTask(launchedTask.taskInfo, mesos.TaskState_TASK_STAGING))
		testScheduler.StatusUpdate(mockDriver, newTaskStatusForTask(launchedTask.taskInfo, mesos.TaskState_TASK_RUNNING))

		// check that ExecutorInfo.data has the static pod data
		assert.Len(launchedTask.taskInfo.Executor.Data, 3)

		// report back that the task has been lost
		mockDriver.AssertNumberOfCalls(t, "SendFrameworkMessage", 0)
		testScheduler.StatusUpdate(mockDriver, newTaskStatusForTask(launchedTask.taskInfo, mesos.TaskState_TASK_LOST))

		// and wait until the framework message is sent to the executor
		mockDriver.AssertNumberOfCalls(t, "SendFrameworkMessage", 1)

	case <-time.After(5 * time.Second):
		t.Fatalf("timed out waiting for launchTasks call")
	}

	// Launch a pod and wait until the scheduler driver is called
	schedulePodWithOffers := func(pod *api.Pod, offers []*mesos.Offer) (*api.Pod, *LaunchedTask, *mesos.Offer) {
		// wait for failedScheduling event because there is no offer
		assert.EventWithReason(eventObserver, "failedScheduling", "failedScheduling event not received")

		// supply a matching offer
		testScheduler.ResourceOffers(mockDriver, offers)

		// and wait to get scheduled
		assert.EventWithReason(eventObserver, "scheduled")

		// wait for driver.launchTasks call
		select {
		case launchedTask := <-launchedTasks:
			for _, offer := range offers {
				if offer.Id.GetValue() == launchedTask.offerId.GetValue() {
					return pod, &launchedTask, offer
				}
			}
			t.Fatalf("unknown offer used to start a pod")
			return nil, nil, nil
		case <-time.After(5 * time.Second):
			t.Fatal("timed out waiting for launchTasks")
			return nil, nil, nil
		}
	}
	// Launch a pod and wait until the scheduler driver is called
	launchPodWithOffers := func(pod *api.Pod, offers []*mesos.Offer) (*api.Pod, *LaunchedTask, *mesos.Offer) {
		podListWatch.Add(pod, true)
		return schedulePodWithOffers(pod, offers)
	}

	// Launch a pod, wait until the scheduler driver is called and report back that it is running
	startPodWithOffers := func(pod *api.Pod, offers []*mesos.Offer) (*api.Pod, *LaunchedTask, *mesos.Offer) {
		// notify about pod, offer resources and wait for scheduling
		pod, launchedTask, offer := launchPodWithOffers(pod, offers)
		if pod != nil {
			// report back status
			testScheduler.StatusUpdate(mockDriver, newTaskStatusForTask(launchedTask.taskInfo, mesos.TaskState_TASK_STAGING))
			testScheduler.StatusUpdate(mockDriver, newTaskStatusForTask(launchedTask.taskInfo, mesos.TaskState_TASK_RUNNING))
			return pod, launchedTask, offer
		}

		return nil, nil, nil
	}

	startTestPod := func() (*api.Pod, *LaunchedTask, *mesos.Offer) {
		pod, i := NewTestPod()
		offers := []*mesos.Offer{NewTestOffer(fmt.Sprintf("offer%d", i))}
		return startPodWithOffers(pod, offers)
	}

	// start another pod
	pod, launchedTask, _ := startTestPod()

	// mock driver.KillTask, should be invoked when a pod is deleted
	mockDriver.On("KillTask", mAny("*mesosproto.TaskID")).Return(mesos.Status_DRIVER_RUNNING, nil).Run(func(args mock.Arguments) {
		killedTaskId := *(args.Get(0).(*mesos.TaskID))
		assert.Equal(*launchedTask.taskInfo.TaskId, killedTaskId, "expected same TaskID as during launch")
	})
	killTaskCalled := mockDriver.Upon()

	// stop it again via the apiserver mock
	podListWatch.Delete(pod, true) // notify watchers

	// and wait for the driver killTask call with the correct TaskId
	select {
	case <-killTaskCalled:
		// report back that the task is finished
		testScheduler.StatusUpdate(mockDriver, newTaskStatusForTask(launchedTask.taskInfo, mesos.TaskState_TASK_FINISHED))

	case <-time.After(5 * time.Second):
		t.Fatal("timed out waiting for KillTask")
	}

	// start a pod on a given NodeName and check that it is scheduled to the right host
	pod, i = NewTestPod()
	pod.Spec.NodeName = "hostname1"
	offers = []*mesos.Offer{}
	for j := 0; j < 3; j++ {
		offer := NewTestOffer(fmt.Sprintf("offer%d_%d", i, j))
		hostname := fmt.Sprintf("hostname%d", j)
		offer.Hostname = &hostname
		offers = append(offers, offer)
	}

	_, _, usedOffer := startPodWithOffers(pod, offers)

	assert.Equal(offers[1].Id.GetValue(), usedOffer.Id.GetValue())
	assert.Equal(pod.Spec.NodeName, *usedOffer.Hostname)

	testScheduler.OfferRescinded(mockDriver, offers[0].Id)
	testScheduler.OfferRescinded(mockDriver, offers[2].Id)

	// start pods:
	// - which are failing while binding,
	// - leading to reconciliation
	// - with different states on the apiserver

	failPodFromExecutor := func(task *mesos.TaskInfo) {
		beforePodLookups := testApiServer.Stats(pod.Name)
		status := newTaskStatusForTask(task, mesos.TaskState_TASK_FAILED)
		message := messages.CreateBindingFailure
		status.Message = &message
		testScheduler.StatusUpdate(mockDriver, status)

		// wait until pod is looked up at the apiserver
		assertext.EventuallyTrue(t, time.Second, func() bool {
			return testApiServer.Stats(pod.Name) == beforePodLookups+1
		}, "expect that reconcileTask will access apiserver for pod %v", pod.Name)
	}

	launchTestPod := func() (*api.Pod, *LaunchedTask, *mesos.Offer) {
		pod, i := NewTestPod()
		offers := []*mesos.Offer{NewTestOffer(fmt.Sprintf("offer%d", i))}
		return launchPodWithOffers(pod, offers)
	}

	// 1. with pod deleted from the apiserver
	//    expected: pod is removed from internal task registry
	pod, launchedTask, _ = launchTestPod()
	podListWatch.Delete(pod, false) // not notifying the watchers
	failPodFromExecutor(launchedTask.taskInfo)

	podKey, _ := podtask.MakePodKey(api.NewDefaultContext(), pod.Name)
	assertext.EventuallyTrue(t, time.Second, func() bool {
		t, _ := p.api.tasks().ForPod(podKey)
		return t == nil
	})

	// 2. with pod still on the apiserver, not bound
	//    expected: pod is rescheduled
	pod, launchedTask, _ = launchTestPod()
	failPodFromExecutor(launchedTask.taskInfo)

	retryOffers := []*mesos.Offer{NewTestOffer("retry-offer")}
	schedulePodWithOffers(pod, retryOffers)

	// 3. with pod still on the apiserver, bound, notified via ListWatch
	//    expected: nothing; pod updates are not supported, cf. the ReconcileTask function
	pod, launchedTask, usedOffer = startTestPod()
	pod.Annotations = map[string]string{
		meta.BindingHostKey: *usedOffer.Hostname,
	}
	pod.Spec.NodeName = *usedOffer.Hostname
	podListWatch.Modify(pod, true) // notifying the watchers
	time.Sleep(time.Second / 2)
	failPodFromExecutor(launchedTask.taskInfo)
}
// TestScheduler_LifeCycle creates a scheduler plugin with the config returned by the
// scheduler and plays through the whole life cycle of the plugin while creating,
// deleting, and failing pods.
func TestScheduler_LifeCycle(t *testing.T) {
	assert := &EventAssertions{*assert.New(t)}
	lt := newLifecycleTest(t)
	defer lt.Close()

	// run plugin
	launchedTasks := lt.Start()
	defer lt.End()

	// fake new, unscheduled pod
	pod, i := NewTestPod()
	lt.podsListWatch.Add(pod, true) // notify watchers

	// wait for failedScheduling event because there is no offer
	assert.EventWithReason(lt.eventObs, controller.FailedScheduling, "failedScheduling event not received")

	// add some matching offer
	offers := []*mesos.Offer{NewTestOffer(fmt.Sprintf("offer%d", i))}
	lt.framework.ResourceOffers(nil, offers)

	// first offer is declined because node is not available yet
	lt.apiServer.WaitForNode("some_hostname")

	// add one more offer
	lt.framework.ResourceOffers(nil, offers)

	// and wait for scheduled pod
	assert.EventWithReason(lt.eventObs, controller.Scheduled)
	select {
	case launchedTask := <-launchedTasks:
		// report back that the task has been staged, and then started by mesos
		lt.framework.StatusUpdate(
			lt.driver,
			newTaskStatusForTask(launchedTask.taskInfo, mesos.TaskState_TASK_STAGING),
		)

		lt.framework.StatusUpdate(
			lt.driver,
			newTaskStatusForTask(launchedTask.taskInfo, mesos.TaskState_TASK_RUNNING),
		)

		// check that ExecutorInfo.data has the static pod data
		assert.Len(launchedTask.taskInfo.Executor.Data, 3)

		// report back that the task has been lost
		lt.driver.AssertNumberOfCalls(t, "SendFrameworkMessage", 0)

		lt.framework.StatusUpdate(
			lt.driver,
			newTaskStatusForTask(launchedTask.taskInfo, mesos.TaskState_TASK_LOST),
		)

		// and wait until the framework message is sent to the executor
		lt.driver.AssertNumberOfCalls(t, "SendFrameworkMessage", 1)

	case <-time.After(util.ForeverTestTimeout):
		t.Fatalf("timed out waiting for launchTasks call")
	}

	offeredNodes := make(map[string]struct{})

	// Launch a pod and wait until the scheduler driver is called
	schedulePodWithOffers := func(pod *api.Pod, offers []*mesos.Offer) (*api.Pod, *LaunchedTask, *mesos.Offer) {
		// wait for failedScheduling event because there is no offer
		assert.EventWithReason(lt.eventObs, controller.FailedScheduling, "failedScheduling event not received")

		// supply a matching offer
		lt.framework.ResourceOffers(lt.driver, offers)
		for _, offer := range offers {
			if _, ok := offeredNodes[offer.GetHostname()]; !ok {
				offeredNodes[offer.GetHostname()] = struct{}{}
				lt.apiServer.WaitForNode(offer.GetHostname())

				// reoffer since it must have been declined above
				lt.framework.ResourceOffers(lt.driver, []*mesos.Offer{offer})
			}
		}

		// and wait to get scheduled
		assert.EventWithReason(lt.eventObs, controller.Scheduled)

		// wait for driver.launchTasks call
		select {
		case launchedTask := <-launchedTasks:
			for _, offer := range offers {
				if offer.Id.GetValue() == launchedTask.offerId.GetValue() {
					return pod, &launchedTask, offer
				}
			}
			t.Fatalf("unknown offer used to start a pod")
			return nil, nil, nil
		case <-time.After(util.ForeverTestTimeout):
			t.Fatal("timed out waiting for launchTasks")
			return nil, nil, nil
		}
	}

	// Launch a pod and wait until the scheduler driver is called
	launchPodWithOffers := func(pod *api.Pod, offers []*mesos.Offer) (*api.Pod, *LaunchedTask, *mesos.Offer) {
		lt.podsListWatch.Add(pod, true)
		return schedulePodWithOffers(pod, offers)
	}

	// Launch a pod, wait until the scheduler driver is called and report back that it is running
	startPodWithOffers := func(pod *api.Pod, offers []*mesos.Offer) (*api.Pod, *LaunchedTask, *mesos.Offer) {
		// notify about pod, offer resources and wait for scheduling
		pod, launchedTask, offer := launchPodWithOffers(pod, offers)
		if pod != nil {
			// report back status
			lt.framework.StatusUpdate(
				lt.driver,
				newTaskStatusForTask(launchedTask.taskInfo, mesos.TaskState_TASK_STAGING),
			)
			lt.framework.StatusUpdate(
				lt.driver,
				newTaskStatusForTask(launchedTask.taskInfo, mesos.TaskState_TASK_RUNNING),
			)

			return pod, launchedTask, offer
		}

		return nil, nil, nil
	}

	startTestPod := func() (*api.Pod, *LaunchedTask, *mesos.Offer) {
		pod, i := NewTestPod()
		offers := []*mesos.Offer{NewTestOffer(fmt.Sprintf("offer%d", i))}
		return startPodWithOffers(pod, offers)
	}

	// start another pod
	pod, launchedTask, _ := startTestPod()

	// mock driver.KillTask, should be invoked when a pod is deleted
	lt.driver.On("KillTask",
		mock.AnythingOfType("*mesosproto.TaskID"),
	).Return(mesos.Status_DRIVER_RUNNING, nil).Run(func(args mock.Arguments) {
		killedTaskId := *(args.Get(0).(*mesos.TaskID))
		assert.Equal(*launchedTask.taskInfo.TaskId, killedTaskId, "expected same TaskID as during launch")
	})
	killTaskCalled := lt.driver.Upon()

	// stop it again via the apiserver mock
	lt.podsListWatch.Delete(pod, true) // notify watchers

	// and wait for the driver killTask call with the correct TaskId
	select {
	case <-killTaskCalled:
		// report back that the task is finished
		lt.framework.StatusUpdate(
			lt.driver,
			newTaskStatusForTask(launchedTask.taskInfo, mesos.TaskState_TASK_FINISHED),
		)

	case <-time.After(util.ForeverTestTimeout):
		t.Fatal("timed out waiting for KillTask")
	}

	// start a pod on a given NodeName and check that it is scheduled to the right host
	pod, i = NewTestPod()
	pod.Spec.NodeName = "hostname1"
	offers = []*mesos.Offer{}
	for j := 0; j < 3; j++ {
		offer := NewTestOffer(fmt.Sprintf("offer%d_%d", i, j))
		hostname := fmt.Sprintf("hostname%d", j)
		offer.Hostname = &hostname
		offers = append(offers, offer)
	}

	_, _, usedOffer := startPodWithOffers(pod, offers)

	assert.Equal(offers[1].Id.GetValue(), usedOffer.Id.GetValue())
	assert.Equal(pod.Spec.NodeName, *usedOffer.Hostname)

	lt.framework.OfferRescinded(lt.driver, offers[0].Id)
	lt.framework.OfferRescinded(lt.driver, offers[2].Id)

	// start pods:
	// - which are failing while binding,
	// - leading to reconciliation
	// - with different states on the apiserver

	failPodFromExecutor := func(task *mesos.TaskInfo) {
		beforePodLookups := lt.apiServer.Stats(pod.Name)
		status := newTaskStatusForTask(task, mesos.TaskState_TASK_FAILED)
		message := messages.CreateBindingFailure
		status.Message = &message
		lt.framework.StatusUpdate(lt.driver, status)

		// wait until pod is looked up at the apiserver
		assertext.EventuallyTrue(t, util.ForeverTestTimeout, func() bool {
			return lt.apiServer.Stats(pod.Name) == beforePodLookups+1
		}, "expect that reconcileTask will access apiserver for pod %v", pod.Name)
	}

	launchTestPod := func() (*api.Pod, *LaunchedTask, *mesos.Offer) {
		pod, i := NewTestPod()
		offers := []*mesos.Offer{NewTestOffer(fmt.Sprintf("offer%d", i))}
		return launchPodWithOffers(pod, offers)
	}

	// 1. with pod deleted from the apiserver
	//    expected: pod is removed from internal task registry
	pod, launchedTask, _ = launchTestPod()
	lt.podsListWatch.Delete(pod, false) // not notifying the watchers
	failPodFromExecutor(launchedTask.taskInfo)

	podKey, _ := podtask.MakePodKey(api.NewDefaultContext(), pod.Name)
	assertext.EventuallyTrue(t, util.ForeverTestTimeout, func() bool {
		t, _ := lt.sched.Tasks().ForPod(podKey)
		return t == nil
	})

	// 2. with pod still on the apiserver, not bound
	//    expected: pod is rescheduled
	pod, launchedTask, _ = launchTestPod()
	failPodFromExecutor(launchedTask.taskInfo)

	retryOffers := []*mesos.Offer{NewTestOffer("retry-offer")}
	schedulePodWithOffers(pod, retryOffers)

	// 3. with pod still on the apiserver, bound, notified via ListWatch
	//    expected: nothing; pod updates are not supported, cf. the ReconcileTask function
	pod, launchedTask, usedOffer = startTestPod()
	pod.Annotations = map[string]string{
		meta.BindingHostKey: *usedOffer.Hostname,
	}
	pod.Spec.NodeName = *usedOffer.Hostname
	lt.podsListWatch.Modify(pod, true) // notifying the watchers
	time.Sleep(time.Second / 2)
	failPodFromExecutor(launchedTask.taskInfo)
}
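
A pattern worth noting in both lifecycle tests: the mocked driver never asserts on LaunchTasks directly; its Run hook pushes the captured arguments into a buffered channel, and the test selects on that channel with a timeout. The snippet below isolates that capture-and-select pattern with hypothetical stand-in types.

// A stand-alone reduction of the capture-and-select test pattern; the types
// here are hypothetical stand-ins.
package main

import (
	"fmt"
	"time"
)

type launchedTask struct {
	offerID string
	taskID  string
}

func main() {
	launched := make(chan launchedTask, 1)

	// the mock driver's LaunchTasks hook would push the captured call here
	go func() {
		launched <- launchedTask{offerID: "offer1", taskID: "task1"}
	}()

	// the test waits for the call, failing if it never arrives
	select {
	case lt := <-launched:
		fmt.Println("launch observed:", lt.offerID, lt.taskID)
	case <-time.After(5 * time.Second):
		fmt.Println("timed out waiting for launchTasks")
	}
}
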