func (s *SchedulerServer) Run(hks hyperkube.Interface, _ []string) error {
	// get scheduler low-level config
	sc := schedcfg.CreateDefaultConfig()
	if s.SchedulerConfigFileName != "" {
		f, err := os.Open(s.SchedulerConfigFileName)
		if err != nil {
			log.Fatalf("Cannot open scheduler config file: %v", err)
		}

		err = sc.Read(bufio.NewReader(f))
		f.Close() // close promptly; Run blocks for the process lifetime
		if err != nil {
			log.Fatalf("Invalid scheduler config file: %v", err)
		}
	}

	schedulerProcess, driverFactory, etcdClient, eid := s.bootstrap(hks, sc)

	if s.EnableProfiling {
		profile.InstallHandler(s.mux)
	}

	// serve the HTTP interface, restarting the listener until the scheduler
	// process terminates
	go runtime.Until(func() {
		log.V(1).Info("Starting HTTP interface")
		log.Error(http.ListenAndServe(net.JoinHostPort(s.Address.String(), strconv.Itoa(s.Port)), s.mux))
	}, sc.HttpBindInterval.Duration, schedulerProcess.Terminal())

	if s.HA {
		validation := ha.ValidationFunc(validateLeadershipTransition)
		srv := ha.NewCandidate(schedulerProcess, driverFactory, validation)
		path := fmt.Sprintf(meta.DefaultElectionFormat, s.FrameworkName)
		sid := uid.New(eid.Group(), "").String()
		log.Infof("registering for election at %v with id %v", path, sid)
		go election.Notify(election.NewEtcdMasterElector(etcdClient), path, sid, srv, nil)
	} else {
		log.Infoln("self-electing in non-HA mode")
		schedulerProcess.Elect(driverFactory)
	}

	return s.awaitFailover(schedulerProcess, func() error { return s.failover(s.getDriver(), hks) })
}
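// The HTTP goroutine above relies on runtime.Until to restart the listener
// whenever ListenAndServe returns, until schedulerProcess.Terminal() closes.
// A minimal sketch of that contract (assuming wait.Until-like semantics; the
// name "until" and the body below are illustrative, not the real implementation):
//
//	func until(f func(), period time.Duration, stop <-chan struct{}) {
//		for {
//			select {
//			case <-stop: // already asked to stop: never run f again
//				return
//			default:
//			}
//			f() // run once; returns when the work (e.g. the listener) exits
//			select {
//			case <-stop: // stopping wins over waiting out the next period
//				return
//			case <-time.After(period):
//			}
//		}
//	}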
// Test to create the scheduler plugin with the config returned by the scheduler,
// and play through the whole life cycle of the plugin while creating pods, deleting
// and failing them.
func TestPlugin_LifeCycle(t *testing.T) {
	t.Skip("disabled due to flakiness; see #10795")

	assert := &EventAssertions{*assert.New(t)}

	// create a fake pod watch; we use it below to submit new pods to the scheduler
	podListWatch := NewMockPodsListWatch(api.PodList{})

	// create fake apiserver
	testApiServer := NewTestServer(t, api.NamespaceDefault, podListWatch)
	defer testApiServer.server.Close()

	// create executor with some data for static pods if set
	executor := util.NewExecutorInfo(
		util.NewExecutorID("executor-id"),
		util.NewCommandInfo("executor-cmd"),
	)
	executor.Data = []byte{0, 1, 2}

	// create scheduler
	testScheduler := New(Config{
		Executor:     executor,
		Client:       client.NewOrDie(&client.Config{Host: testApiServer.server.URL, Version: testapi.Version()}),
		ScheduleFunc: FCFSScheduleFunc,
		Schedcfg:     *schedcfg.CreateDefaultConfig(),
	})

	assert.NotNil(testScheduler.client, "client is nil")
	assert.NotNil(testScheduler.executor, "executor is nil")
	assert.NotNil(testScheduler.offers, "offer registry is nil")

	// create scheduler process
	schedulerProcess := ha.New(testScheduler)

	// get plugin config from it
	c := testScheduler.NewPluginConfig(schedulerProcess.Terminal(), http.DefaultServeMux, &podListWatch.ListWatch)
	assert.NotNil(c)

	// make events observable
	eventObserver := NewEventObserver()
	c.Recorder = eventObserver

	// create plugin
	p := NewPlugin(c)
	assert.NotNil(p)

	// run plugin
	p.Run(schedulerProcess.Terminal())
	defer schedulerProcess.End()

	// init scheduler
	err := testScheduler.Init(schedulerProcess.Master(), p, http.DefaultServeMux)
	assert.NoError(err)

	// create mock mesos scheduler driver
	mockDriver := &joinableDriver{}
	mockDriver.On("Start").Return(mesos.Status_DRIVER_RUNNING, nil).Once()
	started := mockDriver.Upon()

	mAny := mock.AnythingOfType
	mockDriver.On("ReconcileTasks", mAny("[]*mesosproto.TaskStatus")).Return(mesos.Status_DRIVER_RUNNING, nil)
	mockDriver.On("SendFrameworkMessage", mAny("*mesosproto.ExecutorID"), mAny("*mesosproto.SlaveID"), mAny("string")).
		Return(mesos.Status_DRIVER_RUNNING, nil)

	launchedTasks := make(chan *mesos.TaskInfo, 1)
	launchTasksCalledFunc := func(args mock.Arguments) {
		taskInfos := args.Get(1).([]*mesos.TaskInfo)
		assert.Equal(1, len(taskInfos))
		launchedTasks <- taskInfos[0]
	}
	mockDriver.On("LaunchTasks", mAny("[]*mesosproto.OfferID"), mAny("[]*mesosproto.TaskInfo"), mAny("*mesosproto.Filters")).
		Return(mesos.Status_DRIVER_RUNNING, nil).Run(launchTasksCalledFunc)

	// elect master with mock driver
	driverFactory := ha.DriverFactory(func() (bindings.SchedulerDriver, error) {
		return mockDriver, nil
	})
	schedulerProcess.Elect(driverFactory)
	elected := schedulerProcess.Elected()

	// wait for the driver to be started
	<-started

	// tell the scheduler it has been registered
	testScheduler.Registered(
		mockDriver,
		util.NewFrameworkID("kubernetes-id"),
		util.NewMasterInfo("master-id", (192<<24)+(168<<16)+(0<<8)+1, 5050), // 192.168.0.1 packed into a uint32
	)

	// wait until elected
	<-elected

	//TODO(jdef) refactor things above here into a test suite setup of some sort

	// fake new, unscheduled pod
	pod1 := NewTestPod(1)
	podListWatch.Add(pod1, true) // notify watchers

	// wait for failedScheduling event because there is no offer
	assert.EventWithReason(eventObserver, "failedScheduling", "failedScheduling event not received")

	// add some matching offer
	offers1 := []*mesos.Offer{NewTestOffer(1)}
	testScheduler.ResourceOffers(nil, offers1)

	// and wait for scheduled pod
	assert.EventWithReason(eventObserver, "scheduled")
	select {
	case launchedTask := <-launchedTasks:
		// report back that the task has been staged, and then started by mesos
		testScheduler.StatusUpdate(mockDriver, newTaskStatusForTask(launchedTask, mesos.TaskState_TASK_STAGING))
		testScheduler.StatusUpdate(mockDriver, newTaskStatusForTask(launchedTask, mesos.TaskState_TASK_RUNNING))

		// check that ExecutorInfo.data has the static pod data
		assert.Len(launchedTask.Executor.Data, 3)

		// report back that the task has been lost
		mockDriver.AssertNumberOfCalls(t, "SendFrameworkMessage", 0)
		testScheduler.StatusUpdate(mockDriver, newTaskStatusForTask(launchedTask, mesos.TaskState_TASK_LOST))

		// and wait until the framework message is sent to the executor
		mockDriver.AssertNumberOfCalls(t, "SendFrameworkMessage", 1)

	case <-time.After(5 * time.Second):
		t.Fatalf("timed out waiting for launchTasks call")
	}

	// start another pod
	podNum := 1
	startPod := func(offers []*mesos.Offer) (*api.Pod, *mesos.TaskInfo) {
		podNum++

		// create pod and matching offer
		pod := NewTestPod(podNum)
		podListWatch.Add(pod, true) // notify watchers
		testScheduler.ResourceOffers(mockDriver, offers)
		assert.EventWithReason(eventObserver, "scheduled")

		// wait for driver.launchTasks call
		select {
		case launchedTask := <-launchedTasks:
			testScheduler.StatusUpdate(mockDriver, newTaskStatusForTask(launchedTask, mesos.TaskState_TASK_STAGING))
			testScheduler.StatusUpdate(mockDriver, newTaskStatusForTask(launchedTask, mesos.TaskState_TASK_RUNNING))
			return pod, launchedTask

		case <-time.After(5 * time.Second):
			t.Fatal("timed out waiting for launchTasks")
			return nil, nil
		}
	}
	pod, launchedTask := startPod(offers1)

	// mock driver.KillTask, should be invoked when a pod is deleted
	mockDriver.On("KillTask", mAny("*mesosproto.TaskID")).Return(mesos.Status_DRIVER_RUNNING, nil).Run(func(args mock.Arguments) {
		killedTaskId := *(args.Get(0).(*mesos.TaskID))
		assert.Equal(*launchedTask.TaskId, killedTaskId, "expected same TaskID as during launch")
	})
	killTaskCalled := mockDriver.Upon()

	// stop it again via the apiserver mock
	podListWatch.Delete(pod, true) // notify watchers

	// and wait for the driver killTask call with the correct TaskId
	select {
	case <-killTaskCalled:
		// report back that the task is finished
		testScheduler.StatusUpdate(mockDriver, newTaskStatusForTask(launchedTask, mesos.TaskState_TASK_FINISHED))

	case <-time.After(5 * time.Second):
		t.Fatal("timed out waiting for KillTask")
	}

	// start pods:
	// - which fail while binding,
	// - leading to reconciliation,
	// - with different pod states on the apiserver
	failPodFromExecutor := func(task *mesos.TaskInfo) {
		// reads the current outer pod, which is reassigned before each call
		beforePodLookups := testApiServer.Stats(pod.Name)
		status := newTaskStatusForTask(task, mesos.TaskState_TASK_FAILED)
		message := messages.CreateBindingFailure
		status.Message = &message
		testScheduler.StatusUpdate(mockDriver, status)

		// wait until pod is looked up at the apiserver
		assertext.EventuallyTrue(t, time.Second, func() bool {
			return testApiServer.Stats(pod.Name) == beforePodLookups+1
		}, "expect that reconcilePod will access apiserver for pod %v", pod.Name)
	}

	// 1. with pod deleted from the apiserver
	pod, launchedTask = startPod(offers1)
	podListWatch.Delete(pod, false) // not notifying the watchers
	failPodFromExecutor(launchedTask)

	// 2. with pod still on the apiserver, not bound
	pod, launchedTask = startPod(offers1)
	failPodFromExecutor(launchedTask)

	// 3. with pod still on the apiserver, bound i.e. host!=""
	pod, launchedTask = startPod(offers1)
	pod.Spec.NodeName = *offers1[0].Hostname
	podListWatch.Modify(pod, false) // not notifying the watchers
	failPodFromExecutor(launchedTask)

	// 4. with pod still on the apiserver, bound i.e. host!="", notified via ListWatch
	pod, launchedTask = startPod(offers1)
	pod.Spec.NodeName = *offers1[0].Hostname
	podListWatch.Modify(pod, true) // notifying the watchers
	time.Sleep(time.Second / 2)
	failPodFromExecutor(launchedTask)
}
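// The status updates above are built by a newTaskStatusForTask helper that is
// not shown in this section. A minimal sketch of a compatible implementation,
// assuming mesos aliases the mesos-go mesosproto package; the real helper may
// populate additional fields (timestamp, source, executor ID):
//
//	func newTaskStatusForTask(task *mesos.TaskInfo, state mesos.TaskState) *mesos.TaskStatus {
//		return &mesos.TaskStatus{
//			TaskId:  task.TaskId,  // echo back the launched task's ID...
//			SlaveId: task.SlaveId, // ...and the slave it was placed on
//			State:   &state,       // the state transition being reported
//		}
//	}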