Example #1
0
// start runs the driver's startup sequence: it starts the messenger, records
// the messenger's UPID as the driver's own PID, switches the status to
// DRIVER_RUNNING, closes driver.started and, when a master detector is
// configured, registers a master-change listener with it.
//
// It returns the current driver status, plus an error when the driver has
// already been started, is not in the DRIVER_NOT_STARTED state, or the
// messenger fails to start.
//
// start is expected to be guarded by eventLock.
func (driver *MesosSchedulerDriver) start() (mesos.Status, error) {
	// driver.started is closed further down on a successful start, so a
	// receive that does not block means start has already run once.
	select {
	case <-driver.started:
		return driver.status, errors.New("Unable to Start: driver has already been started once.")
	default:
		// not started yet, carry on
	}

	log.Infoln("Starting the scheduler driver...")

	if driver.status != mesos.Status_DRIVER_NOT_STARTED {
		return driver.status, fmt.Errorf("Unable to Start, expecting driver status %s, but is %s:", mesos.Status_DRIVER_NOT_STARTED, driver.status)
	}

	// Bring up the messenger first; its UPID becomes the driver's identity.
	if startErr := driver.messenger.Start(); startErr != nil {
		log.Errorf("Scheduler failed to start the messenger: %v\n", startErr)
		return driver.status, startErr
	}

	upid := driver.messenger.UPID()
	driver.self = &upid
	driver.status = mesos.Status_DRIVER_RUNNING
	close(driver.started)

	log.Infof("Mesos scheduler driver started with PID=%v", driver.self)

	// Master changes are turned into internal messages routed to ourselves.
	onMasterChange := detector.OnMasterChanged(func(info *mesos.MasterInfo) {
		changed := &mesos.InternalMasterChangeDetected{
			Master: info,
		}
		driver.messenger.Route(context.TODO(), driver.self, changed)
	})

	if driver.masterDetector == nil {
		return driver.status, nil
	}

	// register with Detect() AFTER we have a self pid from the messenger, otherwise things get ugly
	// because our internal messaging depends on it. detector callbacks are routed over the messenger
	// bus, maintaining serial (concurrency-safe) callback execution.
	log.V(1).Infof("starting master detector %T: %+v", driver.masterDetector, driver.masterDetector)
	driver.masterDetector.Detect(onMasterChange)
	log.V(2).Infoln("master detector started")

	return driver.status, nil
}
Example #2
0
	"github.com/basho-labs/mesos-go/detector"
	mesos "github.com/basho-labs/mesos-go/mesosproto"
	"github.com/gogo/protobuf/proto"
	log "github.com/golang/glog"
)

// Node-name prefixes and the default detector timing.
const (
	// prefix for nodes listed at the ZK URL path
	nodePrefix                    = "info_"
	// "json."-marked variant of nodePrefix.
	// NOTE(review): presumably names nodes whose data is JSON-encoded — confirm.
	nodeJSONPrefix                = "json.info_"
	// NOTE(review): presumably the default for the detector's
	// minDetectorCyclePeriod setting — confirm against the constructor.
	defaultMinDetectorCyclePeriod = 1 * time.Second
)

// ignoreChanged is a master-change listener whose callback does nothing;
// a reasonable default for a noop change listener.
var ignoreChanged = detector.OnMasterChanged(func(*mesos.MasterInfo) {})

// zkInterface is the set of client operations the detector relies on; it is
// the type of MasterDetector.client.
type zkInterface interface {
	// stopped returns a receive-only signal channel.
	// NOTE(review): presumably closed once the client has stopped — confirm.
	stopped() <-chan struct{}
	// stop asks the client to shut down (exact semantics are up to the implementation).
	stop()
	// data returns the raw bytes associated with the given string argument.
	// NOTE(review): the argument looks like a full node path — confirm.
	data(string) ([]byte, error)
	// watchChildren returns a string, a channel of child-name snapshots and a
	// channel of errors for the given argument.
	// NOTE(review): the meaning of the returned string is not visible here — confirm.
	watchChildren(string) (string, <-chan []string, <-chan error)
}

// infoCodec yields a MasterInfo (or an error) for a given path and node name.
// NOTE(review): presumably decodes the data stored at that node — confirm.
type infoCodec func(path, node string) (*mesos.MasterInfo, error)

// MasterDetector uses ZooKeeper to detect a new leading master.
type MasterDetector struct {
	client     zkInterface
	leaderNode string
Example #3
0
// TestMasterDetectorFlappyConnectionState uses a single detector instance whose
// internal connection to zk is flappy: the mocked client alternates between a
// session-expired error and an unchanged child snapshot. The listener must be
// invoked exactly expectedCalls times, the last of which reports a nil master.
func TestMasterDetectorFlappyConnectionState(t *testing.T) {
	md, err := NewMasterDetector(zkurl)
	// Stop here on a constructor failure: everything below dereferences md.
	if !assert.NoError(t, err) {
		return
	}
	defer md.Cancel()

	const iterations = 3
	var wg sync.WaitGroup
	wg.Add(1 + iterations) // +1 for the initial snapshot that's sent for the first watch
	path := test_zk_path

	md.bootstrapFunc = func() error {
		// bootstrap only once; later invocations keep the existing mock client
		if md.client != nil {
			return nil
		}
		log.V(1).Infoln("bootstrapping detector")
		defer log.V(1).Infoln("bootstrapping detector ..finished")

		children := []string{"info_0", "info_5", "info_10"}
		mocked, snaps, errs := newMockZkClient(children...)
		md.client = mocked
		md.minDetectorCyclePeriod = 10 * time.Millisecond // we don't have all day!

		mocked.On("data", fmt.Sprintf("%s/info_0", path)).Return(newTestMasterInfo(0), nil)

		// the first snapshot will be sent immediately and the detector will be awaiting an event.
		// cycle through some connected/disconnected events but maintain the same snapshot
		go func() {
			defer close(errs)
			for attempt := 0; attempt < iterations; attempt++ {
				// send an error, should cause the detector to re-issue a watch
				errs <- zk.ErrSessionExpired
				// the detection loop issues another watch, so send it a snapshot..
				// send another snapshot
				snaps <- children
			}
		}()
		return nil
	}

	called := 0
	lostMaster := make(chan struct{})
	const expectedCalls = (iterations * 2) + 2 // +1 for initial snapshot, +1 for final lost-leader (close(errs))
	err = md.Detect(detector.OnMasterChanged(func(master *mesos.MasterInfo) {
		called++
		log.V(3).Infof("detector invoked: called %d", called)
		switch {
		case called < expectedCalls:
			// only non-nil notifications count toward the wait group
			if master != nil {
				wg.Done()
				assert.Equal(t, "master(0)@localhost:5050", master.GetId())
			}
		case called == expectedCalls:
			md.Cancel()
			defer close(lostMaster)
			assert.Nil(t, master)
		default:
			t.Errorf("unexpected notification call attempt %d", called)
		}
	}))
	assert.NoError(t, err)

	fatalAfter(t, 10*time.Second, wg.Wait, "Waited too long for new-master alerts")
	fatalOn(t, 3*time.Second, lostMaster, "Waited too long for lost master")

	select {
	case <-md.Done():
		assert.Equal(t, expectedCalls, called, "expected %d detection callbacks instead of %d", expectedCalls, called)
	case <-time.After(time.Second * 10):
		// fail just this test instead of taking down the whole test binary
		t.Fatal("Waited too long for detector shutdown...")
	}
}
Example #4
0
// TestMasterDetector_multipleLeadershipChanges feeds the detector a sequence of
// differing child snapshots (one of them empty, meaning no master) and checks
// that the listener sees the expected leader for each non-empty snapshot,
// followed by a final nil master once the error channel is closed.
func TestMasterDetector_multipleLeadershipChanges(t *testing.T) {
	md, err := NewMasterDetector(zkurl)
	// Stop here on a constructor failure: everything below dereferences md.
	if !assert.NoError(t, err) {
		return
	}
	defer md.Cancel()

	leadershipChanges := [][]string{
		{"info_014", "info_010", "info_005"},
		{"info_005", "info_004", "info_022"},
		{}, // indicates no master
		{"info_017", "info_099", "info_200"},
	}

	iterations := len(leadershipChanges)

	// +1 for initial snapshot, +1 for final lost-leader (close(errs))
	expectedCalls := iterations + 2

	var wg sync.WaitGroup
	wg.Add(iterations) // +1 for the initial snapshot that's sent for the first watch, -1 because set 3 is empty
	path := test_zk_path

	md.bootstrapFunc = func() error {
		// bootstrap only once; later invocations keep the existing mock client
		if md.client != nil {
			return nil
		}
		log.V(1).Infoln("bootstrapping detector")
		defer log.V(1).Infoln("bootstrapping detector ..finished")

		children := []string{"info_0", "info_5", "info_10"}
		mocked, snaps, errs := newMockZkClient(children...)
		md.client = mocked
		md.minDetectorCyclePeriod = 10 * time.Millisecond // we don't have all day!

		mocked.On("data", fmt.Sprintf("%s/info_0", path)).Return(newTestMasterInfo(0), nil)
		mocked.On("data", fmt.Sprintf("%s/info_005", path)).Return(newTestMasterInfo(5), nil)
		mocked.On("data", fmt.Sprintf("%s/info_004", path)).Return(newTestMasterInfo(4), nil)
		mocked.On("data", fmt.Sprintf("%s/info_017", path)).Return(newTestMasterInfo(17), nil)

		// the first snapshot will be sent immediately and the detector will be awaiting an event.
		// feed each leadership change as a new child snapshot, then close errs.
		go func() {
			defer close(errs)
			for attempt := 0; attempt < iterations; attempt++ {
				snaps <- leadershipChanges[attempt]
			}
		}()
		return nil
	}

	called := 0
	lostMaster := make(chan struct{})
	// leader ids expected from the non-nil notifications, in order
	expectedLeaders := []int{0, 5, 4, 17}
	leaderIdx := 0
	err = md.Detect(detector.OnMasterChanged(func(master *mesos.MasterInfo) {
		called++
		log.V(3).Infof("detector invoked: called %d", called)
		switch {
		case called < expectedCalls:
			// nil notifications (no master) do not consume an expected leader
			if master != nil {
				expectedLeader := fmt.Sprintf("master(%d)@localhost:5050", expectedLeaders[leaderIdx])
				assert.Equal(t, expectedLeader, master.GetId())
				leaderIdx++
				wg.Done()
			}
		case called == expectedCalls:
			md.Cancel()
			defer close(lostMaster)
			assert.Nil(t, master)
		default:
			t.Errorf("unexpected notification call attempt %d", called)
		}
	}))
	assert.NoError(t, err)

	fatalAfter(t, 10*time.Second, wg.Wait, "Waited too long for new-master alerts")
	fatalOn(t, 3*time.Second, lostMaster, "Waited too long for lost master")

	select {
	case <-md.Done():
		assert.Equal(t, expectedCalls, called, "expected %d detection callbacks instead of %d", expectedCalls, called)
	case <-time.After(time.Second * 10):
		// fail just this test instead of taking down the whole test binary
		t.Fatal("Waited too long for detector shutdown...")
	}
}
Example #5
0
// TestMasterDetectorChildrenChanged checks the two-step notification sequence
// for a single child snapshot: first a master is reported, then, after the
// mocked error channel is closed, the master is cleared (nil).
func TestMasterDetectorChildrenChanged(t *testing.T) {
	md, err := NewMasterDetector(zkurl)
	// Stop here on a constructor failure: everything below dereferences md.
	if !assert.NoError(t, err) {
		return
	}
	defer md.Cancel()

	path := test_zk_path
	snapDetected := make(chan struct{})
	md.bootstrapFunc = func() error {
		// bootstrap only once; later invocations keep the existing mock client
		if md.client != nil {
			return nil
		}
		log.V(1).Infoln("bootstrapping detector")
		defer log.V(1).Infoln("bootstrapping detector ..finished")

		mocked, _, errs := newMockZkClient("info_0", "info_5", "info_10")
		md.client = mocked
		md.minDetectorCyclePeriod = 10 * time.Millisecond // we don't have all day!

		mocked.On("data", fmt.Sprintf("%s/info_0", path)).Return(newTestMasterInfo(0), nil)

		// wait for the first child snapshot to be processed before signaling end-of-watch
		// (which is signalled by closing errs).
		go func() {
			defer close(errs)
			select {
			case <-snapDetected:
			case <-md.Done():
				t.Errorf("detector died before child snapshot")
			}
		}()
		return nil
	}

	called := 0
	lostMaster := make(chan struct{})
	const expectedLeader = "master(0)@localhost:5050"
	err = md.Detect(detector.OnMasterChanged(func(master *mesos.MasterInfo) {
		//expect 2 calls in sequence: the first setting a master
		//and the second clearing it
		switch called++; called {
		case 1:
			defer close(snapDetected)
			assert.NotNil(t, master)
			assert.Equal(t, expectedLeader, master.GetId())
		case 2:
			md.Cancel()
			defer close(lostMaster)
			assert.Nil(t, master)
		default:
			t.Errorf("unexpected notification call attempt %d", called)
		}
	}))
	assert.NoError(t, err)

	fatalOn(t, 10*time.Second, lostMaster, "Waited too long for lost master")

	select {
	case <-md.Done():
		assert.Equal(t, 2, called, "expected 2 detection callbacks instead of %d", called)
	case <-time.After(time.Second * 10):
		// fail just this test instead of taking down the whole test binary
		t.Fatal("Waited too long for detector shutdown...")
	}
}