func TestToyScheduler(t *testing.T) {
	var tasks = demand.Tasks{}
	tasks.Tasks = make([]*demand.Task, 1)
	task := demand.Task{Name: "anything", Demand: 8, Requested: 3}
	tasks.Tasks[0] = &task

	m := NewScheduler()
	m.InitScheduler(&task)

	log.Debugf("before start/stop: demand %d, requested %d, running %d", task.Demand, task.Requested, task.Running)
	err := m.StopStartTasks(&tasks)
	if err != nil {
		t.Fatalf("Error %v", err)
	}

	log.Debugf("after start/stop: demand %d, requested %d, running %d", task.Demand, task.Requested, task.Running)
	if task.Requested != task.Demand {
		t.Fatalf("Requested should have been updated")
	}

	err = m.CountAllTasks(&tasks)
	if err != nil {
		t.Fatalf("Error %v", err)
	}

	for _, task := range tasks.Tasks {
		if task.Running != task.Requested || task.Running != task.Demand {
			t.Fatalf("Task %s running is not what was requested or demanded", task.Name)
		}
		log.Debugf("after counting: demand %d, requested %d, running %d", task.Demand, task.Requested, task.Running)
	}
}
// GetDemand collects the metrics for each task, calculates demand, and triggers
// a demand update whenever the calculated demand changes.
func (de *LocalEngine) GetDemand(tasks *demand.Tasks, demandUpdate chan struct{}) {
	var gettingMetrics sync.WaitGroup

	demandTimeout := time.NewTicker(constGetDemandSleep * time.Millisecond)
	for range demandTimeout.C {
		tasks.Lock()
		log.Debug("Getting demand")

		for _, task := range tasks.Tasks {
			gettingMetrics.Add(1)
			go func(task *demand.Task) {
				defer gettingMetrics.Done()
				log.Debugf("Getting metric for %s", task.Name)
				task.Metric.UpdateCurrent()
			}(task)
		}

		gettingMetrics.Wait()
		demandChanged := scalingCalculation(tasks)
		tasks.Unlock()

		if demandChanged {
			demandUpdate <- struct{}{}
		}
	}
}
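// GetDemand above relies on each task's Metric to report a current value. A
// minimal sketch of such a metric follows, assuming the demand.Metric
// interface is just the UpdateCurrent() and Current() methods used in this
// codebase; the constantMetric name is hypothetical, for illustration only.
type constantMetric struct {
	value int
}

// UpdateCurrent would normally poll an external source; this sketch is a no-op.
func (m *constantMetric) UpdateCurrent() {}

// Current returns the most recently collected value.
func (m *constantMetric) Current() int {
	return m.value
}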
// CountAllTasks for the Toy scheduler simply reflects back what has been requested
func (t *ToyScheduler) CountAllTasks(running *demand.Tasks) error {
	running.Lock()
	defer running.Unlock()

	for _, task := range running.Tasks {
		task.Running = task.Requested
	}
	return nil
}
// StopStartTasks asks the scheduler to bring the number of running tasks up to task.Demand.
func (t *ToyScheduler) StopStartTasks(tasks *demand.Tasks) error {
	tasks.Lock()
	defer tasks.Unlock()

	for _, task := range tasks.Tasks {
		task.Requested = task.Demand
		log.Debugf("Toy scheduler setting Requested for %s to %d", task.Name, task.Requested)
	}
	return nil
}
// cleanup resets demand for all tasks to 0 before we quit
func cleanup(s scheduler.Scheduler, tasks *demand.Tasks) {
	tasks.Lock()
	for _, task := range tasks.Tasks {
		task.Demand = 0
	}
	tasks.Unlock()
	log.Debug("Reset tasks to 0 for cleanup")

	err := s.StopStartTasks(tasks)
	if err != nil {
		log.Errorf("Failed to cleanup tasks. %v", err)
	}
}
// StopStartTasks by calling the Marathon scaling API.
func (m *MarathonScheduler) StopStartTasks(tasks *demand.Tasks) error {
	// Create tasks if there aren't enough of them, and stop them if there are too many
	var tooMany []*demand.Task
	var tooFew []*demand.Task

	// Check we're not already backed off. This can easily happen if a demand update
	// arrives while we are in the middle of a previous backoff.
	if m.backoff.Waiting() {
		log.Debug("Backoff timer still running")
		return nil
	}

	tasks.Lock()
	defer tasks.Unlock()

	// TODO: Consider checking the number running before we start & stop
	for _, task := range tasks.Tasks {
		if task.Demand > task.Requested {
			// There aren't enough of these containers yet
			tooFew = append(tooFew, task)
		}
		if task.Demand < task.Requested {
			// There are too many of these containers
			tooMany = append(tooMany, task)
		}
	}

	// Concatenate the two lists - scale down first to free up resources
	tasksToScale := append(tooMany, tooFew...)
	for _, task := range tasksToScale {
		blocked, err := m.stopStartTask(task)
		if blocked {
			// Marathon can't make scale changes at the moment.
			// Trigger a new scaling operation by signalling a demandUpdate after a backoff delay
			return m.backoff.Backoff(m.demandUpdate)
		}
		if err != nil {
			log.Errorf("Couldn't scale %s: %v ", task.Name, err)
			return err
		}

		// Clear any backoffs on success
		m.backoff.Reset()
		log.Debugf("Now have %s: %d", task.Name, task.Requested)
	}

	return nil
}
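// StopStartTasks above depends on a backoff helper with Waiting(), Backoff()
// and Reset() methods. The sketch below shows one plausible shape for that
// helper, assuming Backoff() should re-signal demandUpdate after a fixed
// delay; it is a hypothetical illustration, not the project's implementation.
type backoffSketch struct {
	sync.Mutex
	waiting bool
	delay   time.Duration
}

// Backoff schedules a demandUpdate signal after the delay so scaling is retried.
func (b *backoffSketch) Backoff(demandUpdate chan struct{}) error {
	b.Lock()
	b.waiting = true
	b.Unlock()

	time.AfterFunc(b.delay, func() {
		b.Lock()
		b.waiting = false
		b.Unlock()
		demandUpdate <- struct{}{}
	})
	return nil
}

// Waiting reports whether a retry is already scheduled.
func (b *backoffSketch) Waiting() bool {
	b.Lock()
	defer b.Unlock()
	return b.waiting
}

// Reset clears the backoff state after a successful scaling call.
func (b *backoffSketch) Reset() {
	b.Lock()
	b.waiting = false
	b.Unlock()
}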
// StopStartTasks creates containers if there aren't enough of them, and stops them if there are too many
func (c *DockerScheduler) StopStartTasks(tasks *demand.Tasks) error {
	var tooMany []*demand.Task
	var tooFew []*demand.Task
	var diff int
	var err error

	tasks.Lock()
	defer tasks.Unlock()

	// TODO: Consider checking the number running before we start & stop
	// Don't do more scaling if a task is already changing
	for _, task := range tasks.Tasks {
		if task.Demand > task.Requested && task.Requested == task.Running {
			// There aren't enough of these containers yet
			tooFew = append(tooFew, task)
		}
		if task.Demand < task.Requested && task.Requested == task.Running {
			// There are too many of these containers
			tooMany = append(tooMany, task)
		}
	}

	// Scale down first to free up resources
	for _, task := range tooMany {
		diff = task.Requested - task.Demand
		log.Infof("Stop %d of task %s", diff, task.Name)
		for i := 0; i < diff; i++ {
			err = c.stopTask(task)
			if err != nil {
				log.Errorf("Couldn't stop %s: %v ", task.Name, err)
			}
			task.Requested--
		}
	}

	// Now we can scale up
	for _, task := range tooFew {
		diff = task.Demand - task.Requested
		log.Infof("Start %d of task %s", diff, task.Name)
		for i := 0; i < diff; i++ {
			c.startTask(task)
			task.Requested++
		}
	}

	// Don't return until all the scale tasks are complete
	scaling.Wait()
	return err
}
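// StopStartTasks above blocks on scaling.Wait() before returning, which
// implies startTask and stopTask register their asynchronous work on a
// package-level sync.WaitGroup. A sketch of that pattern, under that
// assumption (startTaskSketch is a hypothetical name, not the real method):
var scaling sync.WaitGroup

func (c *DockerScheduler) startTaskSketch(task *demand.Task) {
	scaling.Add(1)
	go func() {
		defer scaling.Done()
		// Create and start a container for task.Image via c.client here.
	}()
}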
func TestDockerScheduler(t *testing.T) {
	d := NewScheduler(true, "unix:///var/run/docker.sock")
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
	}))
	d.client, _ = docker.NewClient(server.URL)

	var task demand.Task
	task.Demand = 5
	task.Image = "microscaling/priority-1:latest"
	d.InitScheduler(&task)
	d.startTask(&task)

	// TODO! Some Docker tests that mock out the Docker client

	var tasks demand.Tasks
	tasks.Tasks = []*demand.Task{&task}
	d.CountAllTasks(&tasks)
}
// SendMetrics sends the current state of tasks to the API
func SendMetrics(ws *websocket.Conn, userID string, tasks *demand.Tasks) error {
	var err error
	var index int

	metrics := metrics{
		Tasks:     make([]taskMetrics, len(tasks.Tasks)),
		CreatedAt: time.Now().Unix(),
	}

	tasks.Lock()
	for _, task := range tasks.Tasks {
		metrics.Tasks[index] = taskMetrics{App: task.Name, RunningCount: task.Running, PendingCount: task.Requested}
		if task.Metric != nil {
			metrics.Tasks[index].Metric = task.Metric.Current()
		}
		index++
	}
	tasks.Unlock()

	payload := metricsPayload{
		User:    userID,
		Metrics: metrics,
	}

	b, err := json.Marshal(payload)
	if err != nil {
		return fmt.Errorf("Failed to encode API json. %v", err)
	}

	log.Debug("Sending metrics message")
	_, err = ws.Write(b)
	if err != nil {
		return fmt.Errorf("Failed to send metrics: %v", err)
	}

	return err
}
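// A sketch of the payload types SendMetrics marshals, inferred from the field
// usage above; the JSON tags and the int type for Metric are assumptions.
type taskMetrics struct {
	App          string `json:"app"`
	RunningCount int    `json:"runningCount"`
	PendingCount int    `json:"pendingCount"`
	Metric       int    `json:"metric,omitempty"`
}

type metrics struct {
	Tasks     []taskMetrics `json:"tasks"`
	CreatedAt int64         `json:"createdAt"`
}

type metricsPayload struct {
	User    string  `json:"user"`
	Metrics metrics `json:"metrics"`
}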
// CountAllTasks tells us how many instances of each task are currently running.
func (m *MarathonScheduler) CountAllTasks(running *demand.Tasks) error {
	var (
		err         error
		appsMessage AppsMessage
	)

	running.Lock()
	defer running.Unlock()

	url := m.baseMarathonURL + "apps/"
	body, err := utils.GetJSON(url)
	if err != nil {
		log.Errorf("Error getting Marathon Apps %v", err)
		return err
	}

	err = json.Unmarshal(body, &appsMessage)
	if err != nil {
		log.Errorf("Error %v unmarshalling from %s", err, string(body))
		return err
	}

	appCounts := make(map[string]int)

	// Remove leading slash from App IDs and set the instance counts.
	for _, app := range appsMessage.Apps {
		appCounts[strings.Replace(app.ID, "/", "", 1)] = app.Instances
	}

	// Set running counts. Defaults to 0 if the App does not exist.
	tasks := running.Tasks
	for _, t := range tasks {
		t.Running = appCounts[t.Name]
	}

	return err
}
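// A sketch of the Marathon apps response types unmarshalled above, inferred
// from the fields used (app.ID, app.Instances). The JSON tags match the shape
// of Marathon's /v2/apps response, but the exact struct layout is an assumption.
type App struct {
	ID        string `json:"id"`
	Instances int    `json:"instances"`
}

type AppsMessage struct {
	Apps []App `json:"apps"`
}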
func TestServerMonitor(t *testing.T) {
	var tasks demand.Tasks
	tasks.Tasks = make([]*demand.Task, 2)
	tasks.Tasks[0] = &demand.Task{Name: "priority1", Demand: 8, Requested: 3, Running: 4}
	tasks.Tasks[1] = &demand.Task{Name: "priority2", Demand: 2, Requested: 7, Running: 5}

	server := httptest.NewServer(websocket.Handler(testServerMetrics))
	serverAddr := server.Listener.Addr().String()

	ws, err := utils.InitWebSocket(serverAddr)
	if err != nil {
		t.Fatal("dialing", err)
	}

	s := NewServerMonitor(ws, "hello")
	if s.userID != "hello" {
		t.Fatal("Didn't set userID")
	}

	s.SendMetrics(&tasks)

	ws.Close()
	server.Close()
}
func TestSendMetrics(t *testing.T) {
	var tasks demand.Tasks
	tasks.Tasks = make([]*demand.Task, 2)
	tasks.Tasks[0] = &demand.Task{Name: "priority1", Demand: 8, Requested: 3, Running: 4}
	tasks.Tasks[1] = &demand.Task{Name: "priority2", Demand: 2, Requested: 7, Running: 5}

	globalT = t
	for testIndex = range tests {
		server := httptest.NewServer(websocket.Handler(testServerMetrics))
		serverAddr := server.Listener.Addr().String()

		ws, err := utils.InitWebSocket(serverAddr)
		if err != nil {
			t.Fatal("dialing", err)
		}

		SendMetrics(ws, "hello", &tasks)

		ws.Close()
		server.Close()
	}
}
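// The tests above register testServerMetrics as the websocket handler. A
// minimal sketch of such a handler follows, assuming it only needs to receive
// the metrics message and report failures through globalT; the real handler
// presumably also checks the payload against tests[testIndex].
func testServerMetricsSketch(ws *websocket.Conn) {
	msg := make([]byte, 4096)
	n, err := ws.Read(msg)
	if err != nil {
		globalT.Errorf("Failed to read metrics message: %v", err)
		return
	}
	log.Debugf("Test server received: %s", msg[:n])
}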
func updateTasks(dp api.DemandPayload, tasks *demand.Tasks) (demandChanged bool) {
	demandChanged = false
	tasks.Lock()
	defer tasks.Unlock()

	for _, taskFromServer := range dp.Demand.Tasks {
		name := taskFromServer.App

		if existingTask, err := tasks.GetTask(name); err == nil {
			if existingTask.Demand != taskFromServer.DemandCount {
				demandChanged = true
			}
			existingTask.Demand = taskFromServer.DemandCount
		}
	}
	return demandChanged
}
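// A sketch of the api.DemandPayload shape consumed above, inferred from the
// fields used (dp.Demand.Tasks, App, DemandCount); the names of the nested
// types and the JSON tags are assumptions for illustration.
type taskDemand struct {
	App         string `json:"app"`
	DemandCount int    `json:"demandCount"`
}

type demandList struct {
	Tasks []taskDemand `json:"tasks"`
}

type DemandPayload struct {
	Demand demandList `json:"demand"`
}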
// For this simple prototype, Microscaling sits in a loop checking for demand changes every X milliseconds
func main() {
	var err error
	var tasks *demand.Tasks

	st := getSettings()

	// Sending an empty struct on this channel triggers the scheduler to make updates
	demandUpdate := make(chan struct{}, 1)

	s, err := getScheduler(st, demandUpdate)
	if err != nil {
		log.Errorf("Failed to get scheduler: %v", err)
		return
	}

	tasks, err = getTasks(st)
	if err != nil {
		log.Errorf("Failed to get tasks: %v", err)
		return
	}

	// Let the scheduler know about the task types.
	for _, task := range tasks.Tasks {
		err = s.InitScheduler(task)
		if err != nil {
			log.Errorf("Failed to start task %s: %v", task.Name, err)
			return
		}
	}

	// Check if there are already any of these containers running
	err = s.CountAllTasks(tasks)
	if err != nil {
		log.Errorf("Failed to count containers. %v", err)
	}

	// Set the initial requested counts to match what's running
	for _, task := range tasks.Tasks {
		task.Requested = task.Running
	}

	// Prepare for cleanup when we receive an interrupt
	closedown := make(chan os.Signal, 1)
	signal.Notify(closedown, os.Interrupt)
	signal.Notify(closedown, syscall.SIGTERM)

	// Open a web socket to the server. TODO!! This won't always be necessary if we're not sending metrics & calculating demand locally
	ws, err := utils.InitWebSocket(st.microscalingAPI)
	if err != nil {
		log.Errorf("Failed to open web socket: %v", err)
		return
	}

	de, err := getDemandEngine(st, ws)
	if err != nil {
		log.Errorf("Failed to get demand engine: %v", err)
		return
	}

	go de.GetDemand(tasks, demandUpdate)

	// Handle demand updates
	go func() {
		for range demandUpdate {
			err = s.StopStartTasks(tasks)
			if err != nil {
				log.Errorf("Failed to stop / start tasks. %v", err)
			}
		}

		// When the demandUpdate channel is closed, it's time to scale everything down to 0
		cleanup(s, tasks)
	}()

	// Periodically read the current state of tasks
	getMetricsTimeout := time.NewTicker(constGetMetricsTimeout * time.Millisecond)
	go func() {
		for range getMetricsTimeout.C {
			// Find out how many instances of each task are running
			err = s.CountAllTasks(tasks)
			if err != nil {
				log.Errorf("Failed to count containers. %v", err)
			}
		}
	}()

	// Periodically send metrics to any monitors
	monitors := getMonitors(st, ws)
	if len(monitors) > 0 {
		sendMetricsTimeout := time.NewTicker(constSendMetricsTimeout * time.Millisecond)
		go func() {
			for range sendMetricsTimeout.C {
				for _, m := range monitors {
					err = m.SendMetrics(tasks)
					if err != nil {
						log.Errorf("Failed to send metrics. %v", err)
					}
				}
			}
		}()
	}

	// When we're asked to close down, we don't want to handle demand updates any more
	<-closedown
	log.Info("Clean up when ready")

	// Give the scheduler a chance to do any necessary cleanup
	s.Cleanup()

	// The demand engine is responsible for closing the demandUpdate channel so that we stop
	// doing scaling operations
	de.StopDemand(demandUpdate)

	exitWaitTimeout := time.NewTicker(constGetMetricsTimeout * time.Millisecond)
	for range exitWaitTimeout.C {
		if tasks.Exited() {
			log.Info("All finished")
			break
		}
	}
}
func scalingCalculation(tasks *demand.Tasks) (demandChanged bool) {
	delta := 0
	demandChanged = false

	// Work out the ideal scale for all the services
	for _, t := range tasks.Tasks {
		t.IdealContainers = t.Running + t.Target.Delta(t.Metric.Current())
		log.Debugf(" [scale] ideal for %s priority %d would be %d. %d running, %d requested", t.Name, t.Priority, t.IdealContainers, t.Running, t.Requested)
	}

	available := tasks.CheckCapacity()
	log.Debugf(" [scale] available space: %d", available)

	// Look for services we could scale down, in reverse priority order
	tasks.PrioritySort(true)
	for _, t := range tasks.Tasks {
		if !t.IsScalable || t.Requested == t.MinContainers {
			// Can't scale this service down
			continue
		}

		if t.Running != t.Requested {
			// There's a scale operation in progress
			log.Debugf(" [scale] %s already scaling: running %d, requested %d", t.Name, t.Running, t.Requested)
			continue
		}

		// For scaling down, delta should be negative
		delta = t.ScaleDownCount()
		if delta < 0 {
			t.Demand = t.Running + delta
			demandChanged = true
			available += (-delta)
			log.Debugf(" [scale] scaling %s down by %d", t.Name, delta)
		}
	}

	// Now look for tasks we need to scale up
	tasks.PrioritySort(false)
	for p, t := range tasks.Tasks {
		if !t.IsScalable {
			continue
		}

		if t.Running != t.Requested {
			// There's a scale operation in progress
			log.Debugf(" [scale] %s already scaling: running %d, requested %d", t.Name, t.Running, t.Requested)
			continue
		}

		delta = t.ScaleUpCount()
		if delta <= 0 {
			continue
		}

		log.Debugf(" [scale] would like to scale up %s by %d - available %d", t.Name, delta, available)
		if available < delta {
			// If this is a task that fills the remainder, there's no need to exceed capacity
			if !t.IsRemainder() {
				log.Debugf(" [scale] looking for %d additional capacity by scaling down:", delta-available)
				index := len(tasks.Tasks)
				freedCapacity := available

				for index > p+1 && freedCapacity < delta {
					// Kill off lower priority services if we need to
					index--
					lowerPriorityService := tasks.Tasks[index]
					if lowerPriorityService.Priority > t.Priority {
						log.Debugf(" [scale] looking for capacity from %s: running %d requested %d demand %d", lowerPriorityService.Name, lowerPriorityService.Running, lowerPriorityService.Requested, lowerPriorityService.Demand)
						scaleDownBy := lowerPriorityService.CanScaleDown()
						if scaleDownBy > 0 {
							if scaleDownBy > (delta - freedCapacity) {
								scaleDownBy = delta - freedCapacity
							}
							lowerPriorityService.Demand = lowerPriorityService.Running - scaleDownBy
							demandChanged = true
							log.Debugf(" [scale] Service %s priority %d scaling down %d", lowerPriorityService.Name, lowerPriorityService.Priority, -scaleDownBy)
							freedCapacity = freedCapacity + scaleDownBy
						}
					}
				}
			}

			// We might still not have enough capacity, and we haven't waited for the scale
			// down to complete, so just scale up by what's available now
			delta = available
			log.Debugf(" [scale] Can only scale %s by %d", t.Name, delta)
		}

		if delta > 0 {
			demandChanged = true
			available -= delta
			if t.Running+delta >= t.MaxContainers {
				log.Errorf(" [scale] Limiting %s to its configured max %d", t.Name, t.MaxContainers)
				t.Demand = t.MaxContainers
			} else {
				log.Debugf(" [scale] Service %s scaling up %d", t.Name, delta)
				t.Demand = t.Running + delta
			}
		}
	}

	return demandChanged
}
// CountAllTasks checks how many of each task are running
func (c *DockerScheduler) CountAllTasks(running *demand.Tasks) error {
	// Docker Remote API https://docs.docker.com/reference/api/docker_remote_api_v1.20/
	// get /containers/json
	var err error
	var containers []docker.APIContainers

	containers, err = c.client.ListContainers(docker.ListContainersOptions{})
	if err != nil {
		return fmt.Errorf("Failed to list containers: %v", err)
	}

	running.Lock()
	defer running.Unlock()
	c.Lock()
	defer c.Unlock()

	// Reset all the running counts to 0
	tasks := running.Tasks
	for _, t := range tasks {
		t.Running = 0
		for _, cc := range c.taskContainers[t.Name] {
			cc.updated = false
		}
	}

	var taskName string
	var present bool

	for i := range containers {
		labels := containers[i].Labels
		taskName, present = labels[labelMap]
		if present {
			// Only update tasks that are already in our task map - don't try to manage anything else
			// log.Debugf("Found a container with labels %v", labels)
			t, err := running.GetTask(taskName)
			if err != nil {
				log.Errorf("Received info about task %s that we're not managing", taskName)
			} else {
				newState := statusToState(containers[i].Status)
				id := containers[i].ID[:12]

				thisContainer, ok := c.taskContainers[taskName][id]
				if !ok {
					log.Infof("We have no previous record of container %s, state %s", id, newState)
					thisContainer = &dockerContainer{}
					c.taskContainers[taskName][id] = thisContainer
				}

				switch newState {
				case "running":
					t.Running++
					// We could be moving from starting to running, or it could be a container that's totally new to us
					if thisContainer.state == "starting" || thisContainer.state == "" {
						thisContainer.state = newState
					}
				case "removing":
					if thisContainer.state != "removing" {
						log.Errorf("Container %s is being removed, but we didn't terminate it", id)
					}
				case "exited":
					if thisContainer.state != "stopping" && thisContainer.state != "exited" {
						log.Errorf("Container %s has exited, but we didn't stop it", id)
					}
				case "dead":
					if thisContainer.state != "dead" {
						log.Errorf("Container %s is dead", id)
					}
					thisContainer.state = newState
				}

				thisContainer.updated = true
			}
		}
	}

	for _, task := range tasks {
		log.Debugf(" %s: internally running %d, requested %d", task.Name, task.Running, task.Requested)
		for id, cc := range c.taskContainers[task.Name] {
			log.Debugf(" %s - %s", id, cc.state)
			if !cc.updated {
				if cc.state == "removing" || cc.state == "exited" {
					log.Debugf(" Deleting %s", id)
					delete(c.taskContainers[task.Name], id)
				} else if cc.state != "created" && cc.state != "starting" && cc.state != "stopping" {
					log.Errorf("Bad state for container %s: %s", id, cc.state)
				}
			}
		}
	}

	return err
}