// New creates a new KubernetesScheduler func New(config Config) *KubernetesScheduler { var k *KubernetesScheduler k = &KubernetesScheduler{ schedcfg: &config.Schedcfg, RWMutex: new(sync.RWMutex), executor: config.Executor, executorGroup: uid.Parse(config.Executor.ExecutorId.GetValue()).Group(), PodScheduler: config.Scheduler, client: config.Client, etcdClient: config.EtcdClient, failoverTimeout: config.FailoverTimeout, reconcileInterval: config.ReconcileInterval, nodeRegistrator: node.NewRegistrator(config.Client, config.LookupNode), offers: offers.CreateRegistry(offers.RegistryConfig{ Compat: func(o *mesos.Offer) bool { // the node must be registered and have up-to-date labels n := config.LookupNode(o.GetHostname()) if n == nil || !node.IsUpToDate(n, node.SlaveAttributesToLabels(o.GetAttributes())) { return false } // the executor IDs must not identify a kubelet-executor with a group that doesn't match ours for _, eid := range o.GetExecutorIds() { execuid := uid.Parse(eid.GetValue()) if execuid.Name() == execcfg.DefaultInfoID && execuid.Group() != k.executorGroup { return false } } return true }, DeclineOffer: func(id string) <-chan error { errOnce := proc.NewErrorOnce(k.terminate) errOuter := k.asRegisteredMaster.Do(func() { var err error defer errOnce.Report(err) offerId := mutil.NewOfferID(id) filters := &mesos.Filters{} _, err = k.driver.DeclineOffer(offerId, filters) }) return errOnce.Send(errOuter).Err() }, // remember expired offers so that we can tell if a previously scheduler offer relies on one LingerTTL: config.Schedcfg.OfferLingerTTL.Duration, TTL: config.Schedcfg.OfferTTL.Duration, ListenerDelay: config.Schedcfg.ListenerDelay.Duration, }), slaveHostNames: slave.NewRegistry(), taskRegistry: podtask.NewInMemoryRegistry(), reconcileCooldown: config.ReconcileCooldown, registration: make(chan struct{}), asRegisteredMaster: proc.DoerFunc(func(proc.Action) <-chan error { return proc.ErrorChanf("cannot execute action with unregistered scheduler") }), } return k }
//test that when a slave is lost we remove all offers func TestSlave_Lost(t *testing.T) { assert := assert.New(t) // testFramework := &framework{ offers: offers.CreateRegistry(offers.RegistryConfig{ Compat: func(o *mesos.Offer) bool { return true }, // remember expired offers so that we can tell if a previously scheduler offer relies on one LingerTTL: schedcfg.DefaultOfferLingerTTL, TTL: schedcfg.DefaultOfferTTL, ListenerDelay: schedcfg.DefaultListenerDelay, }), slaveHostNames: newSlaveRegistry(), sched: mockScheduler(), } hostname := "h1" offer1 := &mesos.Offer{Id: util.NewOfferID("test1"), Hostname: &hostname, SlaveId: util.NewSlaveID(hostname)} offers1 := []*mesos.Offer{offer1} testFramework.ResourceOffers(nil, offers1) offer2 := &mesos.Offer{Id: util.NewOfferID("test2"), Hostname: &hostname, SlaveId: util.NewSlaveID(hostname)} offers2 := []*mesos.Offer{offer2} testFramework.ResourceOffers(nil, offers2) //add another offer from different slaveID hostname2 := "h2" offer3 := &mesos.Offer{Id: util.NewOfferID("test3"), Hostname: &hostname2, SlaveId: util.NewSlaveID(hostname2)} offers3 := []*mesos.Offer{offer3} testFramework.ResourceOffers(nil, offers3) //test precondition assert.Equal(3, getNumberOffers(testFramework.offers)) assert.Equal(2, len(testFramework.slaveHostNames.SlaveIDs())) //remove first slave testFramework.SlaveLost(nil, util.NewSlaveID(hostname)) //offers should be removed assert.Equal(1, getNumberOffers(testFramework.offers)) //slave hostnames should still be all present assert.Equal(2, len(testFramework.slaveHostNames.SlaveIDs())) //remove second slave testFramework.SlaveLost(nil, util.NewSlaveID(hostname2)) //offers should be removed assert.Equal(0, getNumberOffers(testFramework.offers)) //slave hostnames should still be all present assert.Equal(2, len(testFramework.slaveHostNames.SlaveIDs())) //try to remove non existing slave testFramework.SlaveLost(nil, util.NewSlaveID("notExist")) }
//test adding of ressource offer, should be added to offer registry and slavesf func TestResourceOffer_Add_Rescind(t *testing.T) { assert := assert.New(t) testFramework := &framework{ offers: offers.CreateRegistry(offers.RegistryConfig{ Compat: func(o *mesos.Offer) bool { return true }, DeclineOffer: func(offerId string) <-chan error { return proc.ErrorChan(nil) }, // remember expired offers so that we can tell if a previously scheduler offer relies on one LingerTTL: schedcfg.DefaultOfferLingerTTL, TTL: schedcfg.DefaultOfferTTL, ListenerDelay: schedcfg.DefaultListenerDelay, }), slaveHostNames: newSlaveRegistry(), sched: mockScheduler(), } hostname := "h1" offerID1 := util.NewOfferID("test1") offer1 := &mesos.Offer{Id: offerID1, Hostname: &hostname, SlaveId: util.NewSlaveID(hostname)} offers1 := []*mesos.Offer{offer1} testFramework.ResourceOffers(nil, offers1) assert.Equal(1, getNumberOffers(testFramework.offers)) //check slave hostname assert.Equal(1, len(testFramework.slaveHostNames.SlaveIDs())) //add another offer hostname2 := "h2" offer2 := &mesos.Offer{Id: util.NewOfferID("test2"), Hostname: &hostname2, SlaveId: util.NewSlaveID(hostname2)} offers2 := []*mesos.Offer{offer2} testFramework.ResourceOffers(nil, offers2) assert.Equal(2, getNumberOffers(testFramework.offers)) //check slave hostnames assert.Equal(2, len(testFramework.slaveHostNames.SlaveIDs())) //next whether offers can be rescinded testFramework.OfferRescinded(nil, offerID1) assert.Equal(1, getNumberOffers(testFramework.offers)) //next whether offers can be rescinded testFramework.OfferRescinded(nil, util.NewOfferID("test2")) //walk offers again and check it is removed from registry assert.Equal(0, getNumberOffers(testFramework.offers)) //remove non existing ID testFramework.OfferRescinded(nil, util.NewOfferID("notExist")) }
//test adding of ressource offer, should be added to offer registry and slaves func TestResourceOffer_Add(t *testing.T) { assert := assert.New(t) registrator := &mockRegistrator{cache.NewStore(cache.MetaNamespaceKeyFunc)} testFramework := &framework{ offers: offers.CreateRegistry(offers.RegistryConfig{ Compat: func(o *mesos.Offer) bool { return true }, DeclineOffer: func(offerId string) <-chan error { return proc.ErrorChan(nil) }, // remember expired offers so that we can tell if a previously scheduler offer relies on one LingerTTL: schedcfg.DefaultOfferLingerTTL, TTL: schedcfg.DefaultOfferTTL, ListenerDelay: schedcfg.DefaultListenerDelay, }), slaveHostNames: newSlaveRegistry(), nodeRegistrator: registrator, sched: mockScheduler(), } hostname := "h1" offerID1 := util.NewOfferID("test1") offer1 := &mesos.Offer{Id: offerID1, Hostname: &hostname, SlaveId: util.NewSlaveID(hostname)} offers1 := []*mesos.Offer{offer1} testFramework.ResourceOffers(nil, offers1) assert.Equal(1, len(registrator.store.List())) assert.Equal(1, getNumberOffers(testFramework.offers)) //check slave hostname assert.Equal(1, len(testFramework.slaveHostNames.SlaveIDs())) //add another offer hostname2 := "h2" offer2 := &mesos.Offer{Id: util.NewOfferID("test2"), Hostname: &hostname2, SlaveId: util.NewSlaveID(hostname2)} offers2 := []*mesos.Offer{offer2} testFramework.ResourceOffers(nil, offers2) //check it is stored in registry assert.Equal(2, getNumberOffers(testFramework.offers)) //check slave hostnames assert.Equal(2, len(testFramework.slaveHostNames.SlaveIDs())) }
//test we can handle different status updates, TODO check state transitions func TestStatus_Update(t *testing.T) { mockdriver := MockSchedulerDriver{} // setup expectations mockdriver.On("KillTask", util.NewTaskID("test-task-001")).Return(mesos.Status_DRIVER_RUNNING, nil) testFramework := &framework{ offers: offers.CreateRegistry(offers.RegistryConfig{ Compat: func(o *mesos.Offer) bool { return true }, // remember expired offers so that we can tell if a previously scheduler offer relies on one LingerTTL: schedcfg.DefaultOfferLingerTTL, TTL: schedcfg.DefaultOfferTTL, ListenerDelay: schedcfg.DefaultListenerDelay, }), slaveHostNames: newSlaveRegistry(), driver: &mockdriver, sched: mockScheduler(), } taskStatus_task_starting := util.NewTaskStatus( util.NewTaskID("test-task-001"), mesos.TaskState_TASK_RUNNING, ) testFramework.StatusUpdate(testFramework.driver, taskStatus_task_starting) taskStatus_task_running := util.NewTaskStatus( util.NewTaskID("test-task-001"), mesos.TaskState_TASK_RUNNING, ) testFramework.StatusUpdate(testFramework.driver, taskStatus_task_running) taskStatus_task_failed := util.NewTaskStatus( util.NewTaskID("test-task-001"), mesos.TaskState_TASK_FAILED, ) testFramework.StatusUpdate(testFramework.driver, taskStatus_task_failed) //assert that mock was invoked mockdriver.AssertExpectations(t) }
//test adding of ressource offer, should be added to offer registry and slavesf func TestResourceOffer_Add(t *testing.T) { assert := assert.New(t) testScheduler := &KubernetesScheduler{ offers: offers.CreateRegistry(offers.RegistryConfig{ Compat: func(o *mesos.Offer) bool { return true }, DeclineOffer: func(offerId string) <-chan error { return proc.ErrorChan(nil) }, // remember expired offers so that we can tell if a previously scheduler offer relies on one LingerTTL: schedcfg.DefaultOfferLingerTTL, TTL: schedcfg.DefaultOfferTTL, ListenerDelay: schedcfg.DefaultListenerDelay, }), slaves: newSlaveStorage(), } hostname := "h1" offerID1 := util.NewOfferID("test1") offer1 := &mesos.Offer{Id: offerID1, Hostname: &hostname, SlaveId: util.NewSlaveID(hostname)} offers1 := []*mesos.Offer{offer1} testScheduler.ResourceOffers(nil, offers1) assert.Equal(1, getNumberOffers(testScheduler.offers)) //check slave hostname assert.Equal(1, len(testScheduler.slaves.getSlaveIds())) //add another offer hostname2 := "h2" offer2 := &mesos.Offer{Id: util.NewOfferID("test2"), Hostname: &hostname2, SlaveId: util.NewSlaveID(hostname2)} offers2 := []*mesos.Offer{offer2} testScheduler.ResourceOffers(nil, offers2) //check it is stored in registry assert.Equal(2, getNumberOffers(testScheduler.offers)) //check slave hostnames assert.Equal(2, len(testScheduler.slaves.getSlaveIds())) }
//test when we loose connection to master we invalidate all cached offers func TestDisconnect(t *testing.T) { assert := assert.New(t) // testScheduler := &KubernetesScheduler{ offers: offers.CreateRegistry(offers.RegistryConfig{ Compat: func(o *mesos.Offer) bool { return true }, // remember expired offers so that we can tell if a previously scheduler offer relies on one LingerTTL: schedcfg.DefaultOfferLingerTTL, TTL: schedcfg.DefaultOfferTTL, ListenerDelay: schedcfg.DefaultListenerDelay, }), slaveHostNames: slave.NewRegistry(), } hostname := "h1" offer1 := &mesos.Offer{Id: util.NewOfferID("test1"), Hostname: &hostname, SlaveId: util.NewSlaveID(hostname)} offers1 := []*mesos.Offer{offer1} testScheduler.ResourceOffers(nil, offers1) offer2 := &mesos.Offer{Id: util.NewOfferID("test2"), Hostname: &hostname, SlaveId: util.NewSlaveID(hostname)} offers2 := []*mesos.Offer{offer2} testScheduler.ResourceOffers(nil, offers2) //add another offer from different slaveID hostname2 := "h2" offer3 := &mesos.Offer{Id: util.NewOfferID("test2"), Hostname: &hostname2, SlaveId: util.NewSlaveID(hostname2)} offers3 := []*mesos.Offer{offer3} testScheduler.ResourceOffers(nil, offers3) //disconnect testScheduler.Disconnected(nil) //all offers should be removed assert.Equal(0, getNumberOffers(testScheduler.offers)) //slave hostnames should still be all present assert.Equal(2, len(testScheduler.slaveHostNames.SlaveIDs())) }
// New creates a new Framework func New(config Config) Framework { var k *framework k = &framework{ schedulerConfig: &config.SchedulerConfig, RWMutex: new(sync.RWMutex), client: config.Client, failoverTimeout: config.FailoverTimeout, reconcileInterval: config.ReconcileInterval, nodeRegistrator: node.NewRegistrator(config.Client, config.LookupNode), executorId: config.ExecutorId, offers: offers.CreateRegistry(offers.RegistryConfig{ Compat: func(o *mesos.Offer) bool { // the node must be registered and have up-to-date labels n := config.LookupNode(o.GetHostname()) if n == nil || !node.IsUpToDate(n, node.SlaveAttributesToLabels(o.GetAttributes())) { return false } eids := len(o.GetExecutorIds()) switch { case eids > 1: // at most one executor id expected. More than one means that // the given node is seriously in trouble. return false case eids == 1: // the executor id must match, otherwise the running executor // is incompatible with the current scheduler configuration. if eid := o.GetExecutorIds()[0]; eid.GetValue() != config.ExecutorId.GetValue() { return false } } return true }, DeclineOffer: func(id string) <-chan error { errOnce := proc.NewErrorOnce(k.terminate) errOuter := k.asRegisteredMaster.Do(func() { var err error defer errOnce.Report(err) offerId := mutil.NewOfferID(id) filters := &mesos.Filters{} _, err = k.driver.DeclineOffer(offerId, filters) }) return errOnce.Send(errOuter).Err() }, // remember expired offers so that we can tell if a previously scheduler offer relies on one LingerTTL: config.SchedulerConfig.OfferLingerTTL.Duration, TTL: config.SchedulerConfig.OfferTTL.Duration, ListenerDelay: config.SchedulerConfig.ListenerDelay.Duration, }), slaveHostNames: newSlaveRegistry(), reconcileCooldown: config.ReconcileCooldown, registration: make(chan struct{}), asRegisteredMaster: proc.DoerFunc(func(proc.Action) <-chan error { return proc.ErrorChanf("cannot execute action with unregistered scheduler") }), storeFrameworkId: config.StoreFrameworkId, lookupNode: config.LookupNode, } return k }
func TestInMemoryRegistry_Update(t *testing.T) { assert := assert.New(t) // create offers registry ttl := time.Second / 4 config := offers.RegistryConfig{ DeclineOffer: func(offerId string) <-chan error { return proc.ErrorChan(nil) }, Compat: func(o *mesos.Offer) bool { return true }, TTL: ttl, LingerTTL: 2 * ttl, } storage := offers.CreateRegistry(config) // Add offer offerId := mesosutil.NewOfferID("foo") mesosOffer := &mesos.Offer{Id: offerId} storage.Add([]*mesos.Offer{mesosOffer}) offer, ok := storage.Get(offerId.GetValue()) assert.True(ok) // create registry registry := NewInMemoryRegistry() a, _ := fakePodTask("a") registry.Register(a.Clone(), nil) // here clone a because we change it below // state changes are ignored a.State = StateRunning err := registry.Update(a) assert.NoError(err) a_clone, _ := registry.Get(a.ID) assert.Equal(StatePending, a_clone.State) // offer is updated while pending a.Offer = offer err = registry.Update(a) assert.NoError(err) a_clone, _ = registry.Get(a.ID) assert.Equal(offer.Id(), a_clone.Offer.Id()) // spec is updated while pending a.Spec = Spec{SlaveID: "slave-1"} err = registry.Update(a) assert.NoError(err) a_clone, _ = registry.Get(a.ID) assert.Equal("slave-1", a_clone.Spec.SlaveID) // flags are updated while pending a.Flags[Launched] = struct{}{} err = registry.Update(a) assert.NoError(err) a_clone, _ = registry.Get(a.ID) _, found_launched := a_clone.Flags[Launched] assert.True(found_launched) // flags are updated while running registry.UpdateStatus(fakeStatusUpdate(a.ID, mesos.TaskState_TASK_RUNNING)) a.Flags[Bound] = struct{}{} err = registry.Update(a) assert.NoError(err) a_clone, _ = registry.Get(a.ID) _, found_launched = a_clone.Flags[Launched] assert.True(found_launched) _, found_bound := a_clone.Flags[Bound] assert.True(found_bound) // spec is ignored while running a.Spec = Spec{SlaveID: "slave-2"} err = registry.Update(a) assert.NoError(err) a_clone, _ = registry.Get(a.ID) assert.Equal("slave-1", a_clone.Spec.SlaveID) // error when finished registry.UpdateStatus(fakeStatusUpdate(a.ID, mesos.TaskState_TASK_FINISHED)) err = registry.Update(a) assert.Error(err) // update unknown task unknown_task, _ := fakePodTask("unknown-task") err = registry.Update(unknown_task) assert.Error(err) // update nil task err = registry.Update(nil) assert.Nil(err) }