Example #1
func TestToyScheduler(t *testing.T) {
	var tasks = demand.Tasks{}
	tasks.Tasks = make([]*demand.Task, 1)

	task := demand.Task{Name: "anything", Demand: 8, Requested: 3}
	tasks.Tasks[0] = &task
	m := NewScheduler()


	log.Debugf("before start/stop: demand %d, requested %d, running %d", task.Demand, task.Requested, task.Running)
	err := m.StopStartTasks(&tasks)
	if err != nil {
		t.Fatalf("Error %v", err)
	log.Debugf("after start/stop: demand %d, requested %d, running %d", task.Demand, task.Requested, task.Running)

	if err != nil {
		t.Fatalf("Error. %v", err)
	} else if task.Requested != task.Demand {
		t.Fatalf("Requested should have been updated")

	err = m.CountAllTasks(&tasks)
	for name, task := range tasks.Tasks {
		if task.Running != task.Requested || task.Running != task.Demand {
			t.Fatalf("Task %s running is not what was requested or demanded", name)
		log.Debugf("after counting: demand %d, requested %d, running %d", task.Demand, task.Requested, task.Running)


Example #2
// GetDemand calculates demand for each task.
//
// On every tick of a ticker whose period is constGetDemandSleep milliseconds
// it starts one goroutine per task (each logs the task it is fetching a
// metric for), runs scalingCalculation over the task list and, when that
// reports a change, sends an empty struct on demandUpdate.
//
// NOTE(review): this excerpt is incomplete. Closing braces are absent, the
// goroutine literal is never invoked, and no gettingMetrics.Add or
// gettingMetrics.Wait call is visible even though every goroutine defers
// gettingMetrics.Done(). Confirm the synchronisation against the full source.
func (de *LocalEngine) GetDemand(tasks *demand.Tasks, demandUpdate chan struct{}) {
	// Tracks the per-task metric goroutines started on each tick.
	var gettingMetrics sync.WaitGroup

	// In this we need to collect the metrics, calculate demand, and trigger a demand update
	demandTimeout := time.NewTicker(constGetDemandSleep * time.Millisecond)
	for _ = range demandTimeout.C {
		log.Debug("Getting demand")

		for _, task := range tasks.Tasks {
			// task is passed as a parameter so each goroutine works on its own pointer.
			go func(task *demand.Task) {
				defer gettingMetrics.Done()
				log.Debugf("Getting metric for %s", task.Name)


		demandChanged := scalingCalculation(tasks)

		// Only signal the scheduler when the calculation altered some task's demand.
		if demandChanged {
			demandUpdate <- struct{}{}
Example #3
// CountAllTasks for the Toy scheduler simply reflects back what has been requested
func (t *ToyScheduler) CountAllTasks(running *demand.Tasks) error {
	defer running.Unlock()

	for _, task := range running.Tasks {
		task.Running = task.Requested
	return nil
Example #4
// StopStartTasks asks the scheduler to bring the number of running tasks up to task.Demand.
func (t *ToyScheduler) StopStartTasks(tasks *demand.Tasks) error {
	defer tasks.Unlock()

	for _, task := range tasks.Tasks {
		task.Requested = task.Demand
		log.Debugf("Toy scheduler setting Requested for %s to %d", task.Name, task.Requested)

	return nil
Example #5
// cleanup resets demand for all tasks to 0 before we quit
func cleanup(s scheduler.Scheduler, tasks *demand.Tasks) {
	for _, task := range tasks.Tasks {
		task.Demand = 0

	log.Debugf("Reset tasks to 0 for cleanup")
	err := s.StopStartTasks(tasks)
	if err != nil {
		log.Errorf("Failed to cleanup tasks. %v", err)
Example #6
// StopStartTasks by calling the Marathon scaling API.
func (m *MarathonScheduler) StopStartTasks(tasks *demand.Tasks) error {
	// Create tasks if there aren't enough of them, and stop them if there are too many
	var tooMany []*demand.Task
	var tooFew []*demand.Task
	var err error

	// Check we're not already backed off. This could easily happen if we get a demand update arrive while we are in the midst
	// of a previous backoff.
	if m.backoff.Waiting() {
		log.Debug("Backoff timer still running")
		return nil

	defer tasks.Unlock()

	// TODO: Consider checking the number running before we start & stop
	for _, task := range tasks.Tasks {
		if task.Demand > task.Requested {
			// There aren't enough of these containers yet
			tooFew = append(tooFew, task)
		if task.Demand < task.Requested {
			// there aren't enough of these containers yet
			tooMany = append(tooMany, task)

	// Concatentate the two lists - scale down first to free up resources
	tasksToScale := append(tooMany, tooFew...)
	for _, task := range tasksToScale {
		blocked, err := m.stopStartTask(task)
		if blocked {
			// Marathon can't make scale changes at the moment.
			// Trigger a new scaling operation by signalling a demandUpdate after a backoff delay
			err = m.backoff.Backoff(m.demandUpdate)
			return err

		if err != nil {
			log.Errorf("Couldn't scale %s: %v ", task.Name, err)
			return err

		// Clear any backoffs on success
		log.Debugf("Now have %s: %d", task.Name, task.Requested)

	return err
Example #7
// StopStartTasks creates containers if there aren't enough of them, and stops
// them if there are too many.
//
// Only tasks whose Requested count equals their Running count are considered,
// so a task that is still converging from an earlier request is left alone.
// Scale-downs are handled before scale-ups. The returned error is whatever
// the last c.stopTask call assigned to err.
//
// NOTE(review): this excerpt is incomplete. Closing braces are absent, no
// tasks.Lock() is visible to pair with the deferred Unlock, and the scale-up
// loop has no visible body. Confirm against the full source.
func (c *DockerScheduler) StopStartTasks(tasks *demand.Tasks) error {
	var tooMany []*demand.Task
	var tooFew []*demand.Task
	var diff int
	var err error

	defer tasks.Unlock()

	// TODO: Consider checking the number running before we start & stop
	// Don't do more scaling if this task is already changing
	for _, task := range tasks.Tasks {
		if task.Demand > task.Requested && task.Requested == task.Running {
			// There aren't enough of these containers yet
			tooFew = append(tooFew, task)

		if task.Demand < task.Requested && task.Requested == task.Running {
			// There are too many of these containers
			tooMany = append(tooMany, task)

	// Scale down first to free up resources
	for _, task := range tooMany {
		diff = task.Requested - task.Demand
		log.Infof("Stop %d of task %s", diff, task.Name)
		// One stopTask call per surplus container; a failure is logged and
		// left in err.
		for i := 0; i < diff; i++ {
			err = c.stopTask(task)
			if err != nil {
				log.Errorf("Couldn't stop %s: %v ", task.Name, err)

	// Now we can scale up
	for _, task := range tooFew {
		diff = task.Demand - task.Requested
		log.Infof("Start %d of task %s", diff, task.Name)
		// NOTE(review): nothing is visible inside this loop in the excerpt —
		// presumably one start call per missing container; verify.
		for i := 0; i < diff; i++ {

	// Don't return until all the scale tasks are complete
	return err
Example #8
// TestDockerScheduler builds a Docker scheduler, points its client at a local
// httptest server, and prepares a one-task fixture.
//
// NOTE(review): this excerpt is incomplete. The handler literal passed to
// http.HandlerFunc is never closed and no assertions are visible. Confirm
// against the full source.
func TestDockerScheduler(t *testing.T) {
	d := NewScheduler(true, "unix:///var/run/docker.sock")
	// Stand-in HTTP endpoint for the Docker API.
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {

	// Replace the scheduler's client with one that talks to the test server.
	// The error from docker.NewClient is discarded here.
	d.client, _ = docker.NewClient(server.URL)

	var task demand.Task
	task.Demand = 5
	task.Image = "microscaling/priority-1:latest"


	// TODO! Some Docker tests that mock out the Docker client

	// NOTE(review): make(..., 1) already creates one nil element, so the
	// append below puts &task at index 1 and leaves a nil *demand.Task at
	// index 0. Ranging over tasks.Tasks would dereference that nil entry;
	// make([]*demand.Task, 0, 1) is probably what was meant.
	var tasks demand.Tasks
	tasks.Tasks = make([]*demand.Task, 1)
	tasks.Tasks = append(tasks.Tasks, &task)
Example #9
// SendMetrics sends the current state of tasks to the API
func SendMetrics(ws *websocket.Conn, userID string, tasks *demand.Tasks) error {
	var err error
	var index int

	metrics := metrics{
		Tasks:     make([]taskMetrics, len(tasks.Tasks)),
		CreatedAt: time.Now().Unix(),

	for _, task := range tasks.Tasks {
		metrics.Tasks[index] = taskMetrics{App: task.Name, RunningCount: task.Running, PendingCount: task.Requested}

		if task.Metric != nil {
			metrics.Tasks[index].Metric = task.Metric.Current()


	payload := metricsPayload{
		User:    userID,
		Metrics: metrics,

	b, err := json.Marshal(payload)
	if err != nil {
		return fmt.Errorf("Failed to encode API json. %v", err)

	log.Debug("Sending metrics message")
	_, err = ws.Write(b)
	if err != nil {
		return fmt.Errorf("Failed to send metrics: %v", err)

	return err
Example #10
// CountAllTasks tells us how many instances of each task are currently running.
func (m *MarathonScheduler) CountAllTasks(running *demand.Tasks) error {
	var (
		err         error
		appsMessage AppsMessage

	defer running.Unlock()

	url := m.baseMarathonURL + "apps/"

	body, err := utils.GetJSON(url)
	if err != nil {
		log.Errorf("Error getting Marathon Apps %v", err)
		return err

	err = json.Unmarshal(body, &appsMessage)
	if err != nil {
		log.Errorf("Error %v unmarshalling from %s", err, string(body[:]))
		return err

	appCounts := make(map[string]int)

	// Remove leading slash from App IDs and set the instance counts.
	for _, app := range appsMessage.Apps {
		appCounts[strings.Replace(app.ID, "/", "", 1)] = app.Instances

	// Set running counts. Defaults to 0 if the App does not exist.
	tasks := running.Tasks
	for _, t := range tasks {
		t.Running = appCounts[t.Name]

	return err
Example #11
// TestServerMonitor checks that NewServerMonitor records the user ID it is
// given. It dials a local websocket test server (handler testServerMetrics)
// through utils.InitWebSocket to obtain the connection for the monitor.
//
// NOTE(review): closing braces are absent in this excerpt, and the two-task
// fixture below is populated but not used by any visible statement.
// Presumably a call that sends metrics was dropped; confirm against the full
// source.
func TestServerMonitor(t *testing.T) {
	var tasks demand.Tasks
	tasks.Tasks = make([]*demand.Task, 2)

	tasks.Tasks[0] = &demand.Task{Name: "priority1", Demand: 8, Requested: 3, Running: 4}
	tasks.Tasks[1] = &demand.Task{Name: "priority2", Demand: 2, Requested: 7, Running: 5}

	server := httptest.NewServer(websocket.Handler(testServerMetrics))
	serverAddr := server.Listener.Addr().String()

	ws, err := utils.InitWebSocket(serverAddr)
	if err != nil {
		t.Fatal("dialing", err)

	s := NewServerMonitor(ws, "hello")
	if s.userID != "hello" {
		t.Fatal("Didn't set userID")

Example #12
func TestSendMetrics(t *testing.T) {
	var tasks demand.Tasks
	tasks.Tasks = make([]*demand.Task, 2)

	tasks.Tasks[0] = &demand.Task{Name: "priority1", Demand: 8, Requested: 3, Running: 4}
	tasks.Tasks[1] = &demand.Task{Name: "priority2", Demand: 2, Requested: 7, Running: 5}

	globalT = t

	for testIndex = range tests {
		server := httptest.NewServer(websocket.Handler(testServerMetrics))
		serverAddr := server.Listener.Addr().String()

		ws, err := utils.InitWebSocket(serverAddr)
		if err != nil {
			t.Fatal("dialing", err)

		SendMetrics(ws, "hello", &tasks)

Example #13
func updateTasks(dp api.DemandPayload, tasks *demand.Tasks) (demandChanged bool) {
	demandChanged = false
	defer tasks.Unlock()

	for _, taskFromServer := range dp.Demand.Tasks {
		name := taskFromServer.App

		if existingTask, err := tasks.GetTask(name); err == nil {
			if existingTask.Demand != taskFromServer.DemandCount {
				demandChanged = true
			existingTask.Demand = taskFromServer.DemandCount
	return demandChanged
Example #14
// For this simple prototype, Microscaling sits in a loop checking for demand changes every X milliseconds
//
// main wires the pieces together: it loads settings, builds the scheduler and
// task list, seeds each task's Requested count from what is already running,
// opens the websocket, and then starts
//   - the demand engine (de.GetDemand), which signals on demandUpdate;
//   - a goroutine that calls s.StopStartTasks for every demandUpdate signal
//     and runs cleanup once that channel is closed;
//   - a ticker goroutine that refreshes running counts via s.CountAllTasks;
//   - when any monitors are configured, a ticker goroutine that sends metrics.
//
// Finally it polls tasks.Exited() on a ticker.
//
// NOTE(review): this excerpt is incomplete. Closing braces are absent, every
// setup failure is only logged with no visible return or exit, and nothing
// visibly reads from the closedown channel or ends the final loop. Confirm
// against the full source.
func main() {
	var err error
	var tasks *demand.Tasks

	st := getSettings()

	// Sending an empty struct on this channel triggers the scheduler to make updates
	demandUpdate := make(chan struct{}, 1)

	s, err := getScheduler(st, demandUpdate)
	if err != nil {
		log.Errorf("Failed to get scheduler: %v", err)

	tasks, err = getTasks(st)
	if err != nil {
		log.Errorf("Failed to get tasks: %v", err)

	// Let the scheduler know about the task types.
	for _, task := range tasks.Tasks {
		err = s.InitScheduler(task)
		if err != nil {
			log.Errorf("Failed to start task %s: %v", task.Name, err)

	// Check if there are already any of these containers running
	err = s.CountAllTasks(tasks)
	if err != nil {
		log.Errorf("Failed to count containers. %v", err)

	// Set the initial requested counts to match what's running
	// (name is the range key of tasks.Tasks; the entry is written back under
	// the same key after Requested is updated.)
	for name, task := range tasks.Tasks {
		task.Requested = task.Running
		tasks.Tasks[name] = task

	// Prepare for cleanup when we receive an interrupt
	closedown := make(chan os.Signal, 1)
	signal.Notify(closedown, os.Interrupt)
	signal.Notify(closedown, syscall.SIGTERM)

	// Open a web socket to the server TODO!! This won't always be necessary if we're not sending metrics & calculating demand locally
	ws, err := utils.InitWebSocket(st.microscalingAPI)
	if err != nil {
		log.Errorf("Failed to open web socket: %v", err)

	de, err := getDemandEngine(st, ws)
	if err != nil {
		log.Errorf("Failed to get demand engine: %v", err)

	go de.GetDemand(tasks, demandUpdate)

	// Handle demand updates
	// NOTE(review): err in this and the following goroutines is main's own
	// variable, so several goroutines assign to it without synchronisation —
	// looks like a data race; a goroutine-local err would avoid it.
	go func() {
		for range demandUpdate {
			err = s.StopStartTasks(tasks)
			if err != nil {
				log.Errorf("Failed to stop / start tasks. %v", err)

		// When the demandUpdate channel is closed, it's time to scale everything down to 0
		cleanup(s, tasks)

	// Periodically read the current state of tasks
	getMetricsTimeout := time.NewTicker(constGetMetricsTimeout * time.Millisecond)
	go func() {
		for _ = range getMetricsTimeout.C {
			// Find out how many instances of each task are running
			err = s.CountAllTasks(tasks)
			if err != nil {
				log.Errorf("Failed to count containers. %v", err)

	// Periodically send metrics to any monitors
	monitors := getMonitors(st, ws)
	if len(monitors) > 0 {
		sendMetricsTimeout := time.NewTicker(constSendMetricsTimeout * time.Millisecond)
		go func() {
			for _ = range sendMetricsTimeout.C {
				for _, m := range monitors {
					err = m.SendMetrics(tasks)
					if err != nil {
						log.Errorf("Failed to send metrics. %v", err)

	// When we're asked to close down, we don't want to handle demand updates any more
	log.Info("Clean up when ready")
	// Give the scheduler a chance to do any necessary cleanup
	// The demand engine is responsible for closing the demandUpdate channel so that we stop
	// doing scaling operations

	// Poll, at the same period as the metrics ticker, until the task list
	// reports that everything has exited.
	exitWaitTimeout := time.NewTicker(constGetMetricsTimeout * time.Millisecond)
	for _ = range exitWaitTimeout.C {
		if tasks.Exited() {
			log.Info("All finished")
Example #15
// scalingCalculation recomputes the Demand of every task in tasks and reports
// whether any Demand was changed.
//
// It works in three passes over tasks.Tasks:
//  1. record each task's IdealContainers as Running plus the delta its Target
//     derives from the current metric value;
//  2. scale down tasks whose ScaleDownCount() is negative, crediting the freed
//     containers to the available capacity from tasks.CheckCapacity();
//  3. scale up tasks whose ScaleUpCount() is positive, if necessary taking
//     capacity from tasks later in the list with a numerically higher
//     Priority, and capping the increase at what is available.
//
// NOTE(review): this excerpt is incomplete. Closing braces are absent and
// several guard conditions have no visible body (presumably `continue`), so
// the control flow described here should be confirmed against the full
// source.
func scalingCalculation(tasks *demand.Tasks) (demandChanged bool) {
	delta := 0
	demandChanged = false

	// Work out the ideal scale for all the services
	for _, t := range tasks.Tasks {
		t.IdealContainers = t.Running + t.Target.Delta(t.Metric.Current())
		log.Debugf("  [scale] ideal for %s priority %d would be %d. %d running, %d requested", t.Name, t.Priority, t.IdealContainers, t.Running, t.Requested)

	available := tasks.CheckCapacity()
	log.Debugf("  [scale] available space: %d", available)

	// Look for services we could scale down, in reverse priority order
	for _, t := range tasks.Tasks {
		if !t.IsScalable || t.Requested == t.MinContainers {
			// Can't scale this service down
			// NOTE(review): no statement is visible here — presumably `continue`.

		if t.Running != t.Requested {
			// There's a scale operation in progress
			log.Debugf("  [scale] %s already scaling: running %d, requested %d", t.Name, t.Running, t.Requested)

		// For scaling down, delta should be negative
		delta = t.ScaleDownCount()
		if delta < 0 {
			t.Demand = t.Running + delta
			demandChanged = true
			// delta is negative, so this grows the available capacity.
			available += (-delta)
			log.Debugf("  [scale] scaling %s down by %d", t.Name, delta)

	// Now look for tasks we need to scale up
	for p, t := range tasks.Tasks {
		// NOTE(review): this guard and the `delta <= 0` guard below have no
		// visible body — presumably `continue`.
		if !t.IsScalable {

		if t.Running != t.Requested {
			// There's a scale operation in progress
			log.Debugf("  [scale] %s already scaling: running %d, requested %d", t.Name, t.Running, t.Requested)

		delta = t.ScaleUpCount()
		if delta <= 0 {

		log.Debugf("  [scale]  would like to scale up %s by %d - available %d", t.Name, delta, available)

		if available < delta {
			// If this is a task that fills the remainder, there's no need to exceed capacity
			if !t.IsRemainder() {
				log.Debugf("  [scale] looking for %d additional capacity by scaling down:", delta-available)
				// Walk back from the end of the list towards (but not
				// including) the current task.
				// NOTE(review): index starts at len(tasks.Tasks) and no
				// decrement is visible before tasks.Tasks[index] is read; as
				// written that index is out of range and the loop condition
				// on index never changes. Presumably an `index--` was dropped
				// from this excerpt — verify.
				index := len(tasks.Tasks)
				freedCapacity := available
				for index > p+1 && freedCapacity < delta {
					// Kill off lower priority services if we need to
					lowerPriorityService := tasks.Tasks[index]
					if lowerPriorityService.Priority > t.Priority {
						log.Debugf("  [scale] looking for capacity from %s: running %d requested %d demand %d", lowerPriorityService.Name, lowerPriorityService.Running, lowerPriorityService.Requested, lowerPriorityService.Demand)
						scaleDownBy := lowerPriorityService.CanScaleDown()
						if scaleDownBy > 0 {
							// Take no more than is still needed.
							if scaleDownBy > (delta - freedCapacity) {
								scaleDownBy = delta - freedCapacity

							lowerPriorityService.Demand = lowerPriorityService.Running - scaleDownBy
							demandChanged = true
							log.Debugf("  [scale] Service %s priority %d scaling down %d", lowerPriorityService.Name, lowerPriorityService.Priority, -scaleDownBy)
							freedCapacity = freedCapacity + scaleDownBy

			// We might still not have enough capacity and we haven't waited for scale down to complete, so just scale up what's available now
			delta = available
			log.Debugf("  [scale] Can only scale %s by %d", t.Name, delta)

		if delta > 0 {
			demandChanged = true
			available -= delta
			// The cap compares the task's existing Demand, not Running+delta,
			// with MaxContainers.
			if t.Demand >= t.MaxContainers {
				log.Errorf("  [scale ] Limiting %s to its configured max %d", t.Name, t.MaxContainers)
				t.Demand = t.MaxContainers
			} else {
				log.Debugf("  [scale] Service %s scaling up %d", t.Name, delta)
				t.Demand = t.Running + delta
	return demandChanged
Example #16
// CountAllTasks checks how many of each task are running.
//
// It lists containers through the Docker client and reconciles them with the
// scheduler's per-task container records (c.taskContainers):
//  1. every task's Running count is reset to 0 and every known container is
//     marked as not updated;
//  2. each listed container carrying the labelMap label, and whose task
//     running.GetTask can find, has its record created if missing, its state
//     compared with the newly reported one, and is marked updated;
//  3. records that were not updated are deleted if their state is "removing"
//     or "exited"; any state other than "created", "starting" or "stopping"
//     is logged as bad.
//
// It returns an error if the containers cannot be listed.
//
// NOTE(review): this excerpt is incomplete. Closing braces are absent, no
// Lock() calls are visible to pair with the deferred Unlocks, and no visible
// statement raises t.Running after it is reset. Confirm against the full
// source.
func (c *DockerScheduler) CountAllTasks(running *demand.Tasks) error {
	// Docker Remote API https://docs.docker.com/reference/api/docker_remote_api_v1.20/
	// get /containers/json
	var err error
	var containers []docker.APIContainers
	containers, err = c.client.ListContainers(docker.ListContainersOptions{})
	if err != nil {
		return fmt.Errorf("Failed to list containers: %v", err)

	defer running.Unlock()
	defer c.Unlock()

	// Reset all the running counts to 0
	tasks := running.Tasks
	for _, t := range tasks {
		t.Running = 0

		// Clear the "seen in this pass" flag on every container we know about.
		for _, cc := range c.taskContainers[t.Name] {
			cc.updated = false

	var taskName string
	var present bool

	for i := range containers {
		labels := containers[i].Labels
		taskName, present = labels[labelMap]
		if present {
			// Only update tasks that are already in our task map - don't try to manage anything else
			// log.Debugf("Found a container with labels %v", labels)
			// This t and err are new variables that shadow the outer err.
			t, err := running.GetTask(taskName)
			if err != nil {
				log.Errorf("Received info about task %s that we're not managing", taskName)
			} else {
				newState := statusToState(containers[i].Status)
				// Containers are keyed by the first 12 characters of their ID.
				id := containers[i].ID[:12]
				thisContainer, ok := c.taskContainers[taskName][id]
				if !ok {
					log.Infof("We have no previous record of container %s, state %s", id, newState)
					thisContainer = &dockerContainer{}
					c.taskContainers[taskName][id] = thisContainer

				switch newState {
				case "running":
					// We could be moving from starting to running, or it could be a container that's totally new to us
					if thisContainer.state == "starting" || thisContainer.state == "" {
						thisContainer.state = newState
				case "removing":
					if thisContainer.state != "removing" {
						log.Errorf("Container %s is being removed, but we didn't terminate it", id)
				case "exited":
					// NOTE(review): this logs the same "is being removed"
					// text as the "removing" case — possibly a copy-paste;
					// confirm the intended wording.
					if thisContainer.state != "stopping" && thisContainer.state != "exited" {
						log.Errorf("Container %s is being removed, but we didn't terminate it", id)
				case "dead":
					if thisContainer.state != "dead" {
						log.Errorf("Container %s is dead", id)
					thisContainer.state = newState

				// Mark as seen so the sweep below keeps this record.
				thisContainer.updated = true

	// Sweep: anything not reported by Docker in this pass is either dropped or flagged.
	for _, task := range tasks {
		log.Debugf("  %s: internally running %d, requested %d", task.Name, task.Running, task.Requested)
		for id, cc := range c.taskContainers[task.Name] {
			log.Debugf("  %s - %s", id, cc.state)
			if !cc.updated {
				if cc.state == "removing" || cc.state == "exited" {
					log.Debugf("    Deleting %s", id)
					delete(c.taskContainers[task.Name], id)
				} else if cc.state != "created" && cc.state != "starting" && cc.state != "stopping" {
					log.Errorf("Bad state for container %s: %s", id, cc.state)

	return err