// start expected to be guarded by eventLock func (driver *MesosSchedulerDriver) start() (mesos.Status, error) { select { case <-driver.started: return driver.status, errors.New("Unable to Start: driver has already been started once.") default: // proceed } log.Infoln("Starting the scheduler driver...") if driver.status != mesos.Status_DRIVER_NOT_STARTED { return driver.status, fmt.Errorf("Unable to Start, expecting driver status %s, but is %s:", mesos.Status_DRIVER_NOT_STARTED, driver.status) } // Start the messenger. if err := driver.messenger.Start(); err != nil { log.Errorf("Scheduler failed to start the messenger: %v\n", err) return driver.status, err } pid := driver.messenger.UPID() driver.self = &pid driver.status = mesos.Status_DRIVER_RUNNING close(driver.started) log.Infof("Mesos scheduler driver started with PID=%v", driver.self) listener := detector.OnMasterChanged(func(m *mesos.MasterInfo) { driver.messenger.Route(context.TODO(), driver.self, &mesos.InternalMasterChangeDetected{ Master: m, }) }) if driver.masterDetector != nil { // register with Detect() AFTER we have a self pid from the messenger, otherwise things get ugly // because our internal messaging depends on it. detector callbacks are routed over the messenger // bus, maintaining serial (concurrency-safe) callback execution. log.V(1).Infof("starting master detector %T: %+v", driver.masterDetector, driver.masterDetector) driver.masterDetector.Detect(listener) log.V(2).Infoln("master detector started") } return driver.status, nil }
"github.com/basho-labs/mesos-go/detector" mesos "github.com/basho-labs/mesos-go/mesosproto" "github.com/gogo/protobuf/proto" log "github.com/golang/glog" ) const ( // prefix for nodes listed at the ZK URL path nodePrefix = "info_" nodeJSONPrefix = "json.info_" defaultMinDetectorCyclePeriod = 1 * time.Second ) // reasonable default for a noop change listener var ignoreChanged = detector.OnMasterChanged(func(*mesos.MasterInfo) {}) type zkInterface interface { stopped() <-chan struct{} stop() data(string) ([]byte, error) watchChildren(string) (string, <-chan []string, <-chan error) } type infoCodec func(path, node string) (*mesos.MasterInfo, error) // Detector uses ZooKeeper to detect new leading master. type MasterDetector struct { client zkInterface leaderNode string
// single connector instance, it's internal connection to zk is flappy func TestMasterDetectorFlappyConnectionState(t *testing.T) { md, err := NewMasterDetector(zkurl) defer md.Cancel() assert.NoError(t, err) const ITERATIONS = 3 var wg sync.WaitGroup wg.Add(1 + ITERATIONS) // +1 for the initial snapshot that's sent for the first watch path := test_zk_path md.bootstrapFunc = func() error { if md.client != nil { return nil } log.V(1).Infoln("bootstrapping detector") defer log.V(1).Infoln("bootstrapping detector ..finished") children := []string{"info_0", "info_5", "info_10"} mocked, snaps, errs := newMockZkClient(children...) md.client = mocked md.minDetectorCyclePeriod = 10 * time.Millisecond // we don't have all day! mocked.On("data", fmt.Sprintf("%s/info_0", path)).Return(newTestMasterInfo(0), nil) // the first snapshot will be sent immediately and the detector will be awaiting en event. // cycle through some connected/disconnected events but maintain the same snapshot go func() { defer close(errs) for attempt := 0; attempt < ITERATIONS; attempt++ { // send an error, should cause the detector to re-issue a watch errs <- zk.ErrSessionExpired // the detection loop issues another watch, so send it a snapshot.. // send another snapshot snaps <- children } }() return nil } called := 0 lostMaster := make(chan struct{}) const EXPECTED_CALLS = (ITERATIONS * 2) + 2 // +1 for initial snapshot, +1 for final lost-leader (close(errs)) err = md.Detect(detector.OnMasterChanged(func(master *mesos.MasterInfo) { called++ log.V(3).Infof("detector invoked: called %d", called) switch { case called < EXPECTED_CALLS: if master != nil { wg.Done() assert.Equal(t, master.GetId(), "master(0)@localhost:5050") } case called == EXPECTED_CALLS: md.Cancel() defer close(lostMaster) assert.Nil(t, master) default: t.Errorf("unexpected notification call attempt %d", called) } })) assert.NoError(t, err) fatalAfter(t, 10*time.Second, wg.Wait, "Waited too long for new-master alerts") fatalOn(t, 3*time.Second, lostMaster, "Waited too long for lost master") select { case <-md.Done(): assert.Equal(t, EXPECTED_CALLS, called, "expected %d detection callbacks instead of %d", EXPECTED_CALLS, called) case <-time.After(time.Second * 10): panic("Waited too long for detector shutdown...") } }
func TestMasterDetector_multipleLeadershipChanges(t *testing.T) { md, err := NewMasterDetector(zkurl) defer md.Cancel() assert.NoError(t, err) leadershipChanges := [][]string{ {"info_014", "info_010", "info_005"}, {"info_005", "info_004", "info_022"}, {}, // indicates no master {"info_017", "info_099", "info_200"}, } ITERATIONS := len(leadershipChanges) // +1 for initial snapshot, +1 for final lost-leader (close(errs)) EXPECTED_CALLS := (ITERATIONS + 2) var wg sync.WaitGroup wg.Add(ITERATIONS) // +1 for the initial snapshot that's sent for the first watch, -1 because set 3 is empty path := test_zk_path md.bootstrapFunc = func() error { if md.client != nil { return nil } log.V(1).Infoln("bootstrapping detector") defer log.V(1).Infoln("bootstrapping detector ..finished") children := []string{"info_0", "info_5", "info_10"} mocked, snaps, errs := newMockZkClient(children...) md.client = mocked md.minDetectorCyclePeriod = 10 * time.Millisecond // we don't have all day! mocked.On("data", fmt.Sprintf("%s/info_0", path)).Return(newTestMasterInfo(0), nil) mocked.On("data", fmt.Sprintf("%s/info_005", path)).Return(newTestMasterInfo(5), nil) mocked.On("data", fmt.Sprintf("%s/info_004", path)).Return(newTestMasterInfo(4), nil) mocked.On("data", fmt.Sprintf("%s/info_017", path)).Return(newTestMasterInfo(17), nil) // the first snapshot will be sent immediately and the detector will be awaiting en event. // cycle through some connected/disconnected events but maintain the same snapshot go func() { defer close(errs) for attempt := 0; attempt < ITERATIONS; attempt++ { snaps <- leadershipChanges[attempt] } }() return nil } called := 0 lostMaster := make(chan struct{}) expectedLeaders := []int{0, 5, 4, 17} leaderIdx := 0 err = md.Detect(detector.OnMasterChanged(func(master *mesos.MasterInfo) { called++ log.V(3).Infof("detector invoked: called %d", called) switch { case called < EXPECTED_CALLS: if master != nil { expectedLeader := fmt.Sprintf("master(%d)@localhost:5050", expectedLeaders[leaderIdx]) assert.Equal(t, expectedLeader, master.GetId()) leaderIdx++ wg.Done() } case called == EXPECTED_CALLS: md.Cancel() defer close(lostMaster) assert.Nil(t, master) default: t.Errorf("unexpected notification call attempt %d", called) } })) assert.NoError(t, err) fatalAfter(t, 10*time.Second, wg.Wait, "Waited too long for new-master alerts") fatalOn(t, 3*time.Second, lostMaster, "Waited too long for lost master") select { case <-md.Done(): assert.Equal(t, EXPECTED_CALLS, called, "expected %d detection callbacks instead of %d", EXPECTED_CALLS, called) case <-time.After(time.Second * 10): panic("Waited too long for detector shutdown...") } }
func TestMasterDetectorChildrenChanged(t *testing.T) { md, err := NewMasterDetector(zkurl) defer md.Cancel() assert.NoError(t, err) path := test_zk_path snapDetected := make(chan struct{}) md.bootstrapFunc = func() error { if md.client != nil { return nil } log.V(1).Infoln("bootstrapping detector") defer log.V(1).Infoln("bootstrapping detector ..finished") mocked, _, errs := newMockZkClient("info_0", "info_5", "info_10") md.client = mocked md.minDetectorCyclePeriod = 10 * time.Millisecond // we don't have all day! mocked.On("data", fmt.Sprintf("%s/info_0", path)).Return(newTestMasterInfo(0), nil) // wait for the first child snapshot to be processed before signaling end-of-watch // (which is signalled by closing errs). go func() { defer close(errs) select { case <-snapDetected: case <-md.Done(): t.Errorf("detector died before child snapshot") } }() return nil } called := 0 lostMaster := make(chan struct{}) const expectedLeader = "master(0)@localhost:5050" err = md.Detect(detector.OnMasterChanged(func(master *mesos.MasterInfo) { //expect 2 calls in sequence: the first setting a master //and the second clearing it switch called++; called { case 1: defer close(snapDetected) assert.NotNil(t, master) assert.Equal(t, expectedLeader, master.GetId()) case 2: md.Cancel() defer close(lostMaster) assert.Nil(t, master) default: t.Errorf("unexpected notification call attempt %d", called) } })) assert.NoError(t, err) fatalOn(t, 10*time.Second, lostMaster, "Waited too long for lost master") select { case <-md.Done(): assert.Equal(t, 2, called, "expected 2 detection callbacks instead of %d", called) case <-time.After(time.Second * 10): panic("Waited too long for detector shutdown...") } }