// GetDemand calculates demand for each task
func (de *LocalEngine) GetDemand(tasks *demand.Tasks, demandUpdate chan struct{}) {
	var gettingMetrics sync.WaitGroup

	// On every tick we collect the metrics, calculate demand, and trigger a demand update
	demandTimeout := time.NewTicker(constGetDemandSleep * time.Millisecond)
	for range demandTimeout.C {
		tasks.Lock()
		log.Debug("Getting demand")

		for _, task := range tasks.Tasks {
			gettingMetrics.Add(1)
			go func(task *demand.Task) {
				defer gettingMetrics.Done()
				log.Debugf("Getting metric for %s", task.Name)
				task.Metric.UpdateCurrent()
			}(task)
		}

		gettingMetrics.Wait()
		demandChanged := scalingCalculation(tasks)
		tasks.Unlock()

		if demandChanged {
			demandUpdate <- struct{}{}
		}
	}
}
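// scalingCalculation is called above but not shown in this section. A minimal
// sketch of what it might do, assuming the caller already holds the tasks lock
// and that each task carries a hypothetical Target threshold to compare its
// metric against - the project's real calculation may well differ:
func scalingCalculationSketch(tasks *demand.Tasks) (demandChanged bool) {
	for _, task := range tasks.Tasks {
		oldDemand := task.Demand
		current := task.Metric.Current()
		if current > task.Target {
			// Metric is over target: ask for one more instance
			task.Demand = task.Requested + 1
		} else if current < task.Target && task.Requested > 0 {
			// Metric is under target: we can give one instance back
			task.Demand = task.Requested - 1
		}
		if task.Demand != oldDemand {
			demandChanged = true
		}
	}
	return demandChanged
}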
// CountAllTasks for the Toy scheduler simply reflects back what has been requested
func (t *ToyScheduler) CountAllTasks(running *demand.Tasks) error {
	running.Lock()
	defer running.Unlock()

	for _, task := range running.Tasks {
		task.Running = task.Requested
	}
	return nil
}
// StopStartTasks asks the scheduler to bring the number of running tasks up to task.Demand.
func (t *ToyScheduler) StopStartTasks(tasks *demand.Tasks) error {
	tasks.Lock()
	defer tasks.Unlock()

	for _, task := range tasks.Tasks {
		task.Requested = task.Demand
		log.Debugf("Toy scheduler setting Requested for %s to %d", task.Name, task.Requested)
	}
	return nil
}
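// A quick usage sketch for the Toy scheduler, which is handy in tests: set
// Demand on a task, ask the scheduler to act on it, and confirm CountAllTasks
// reflects the request straight back. The task construction is illustrative;
// the real demand.Task and demand.Tasks have more fields than shown here.
func ExampleToyScheduler() {
	toy := &ToyScheduler{}
	tasks := &demand.Tasks{
		Tasks: []*demand.Task{{Name: "priority1", Demand: 3}},
	}

	toy.StopStartTasks(tasks) // copies Demand into Requested
	toy.CountAllTasks(tasks)  // copies Requested into Running

	fmt.Println(tasks.Tasks[0].Running)
	// Output: 3
}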
// cleanup resets demand for all tasks to 0 before we quit
func cleanup(s scheduler.Scheduler, tasks *demand.Tasks) {
	tasks.Lock()
	for _, task := range tasks.Tasks {
		task.Demand = 0
	}
	tasks.Unlock()

	log.Debugf("Reset tasks to 0 for cleanup")
	err := s.StopStartTasks(tasks)
	if err != nil {
		log.Errorf("Failed to cleanup tasks. %v", err)
	}
}
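// Something has to call cleanup when the process is asked to stop. A minimal
// sketch of signal handling that might sit in main, assuming the scheduler and
// tasks are already set up; the function and channel names are illustrative,
// and it needs the os, os/signal and syscall imports:
func waitForShutdown(s scheduler.Scheduler, tasks *demand.Tasks) {
	closedown := make(chan os.Signal, 1)
	signal.Notify(closedown, os.Interrupt, syscall.SIGTERM)
	<-closedown

	// Scale everything to zero before exiting
	cleanup(s, tasks)
	os.Exit(0)
}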
// StopStartTasks by calling the Marathon scaling API.
func (m *MarathonScheduler) StopStartTasks(tasks *demand.Tasks) error {
	// Create tasks if there aren't enough of them, and stop them if there are too many
	var tooMany []*demand.Task
	var tooFew []*demand.Task
	var err error

	// Check we're not already backed off. This could easily happen if a demand update
	// arrives while we are in the midst of a previous backoff.
	if m.backoff.Waiting() {
		log.Debug("Backoff timer still running")
		return nil
	}

	tasks.Lock()
	defer tasks.Unlock()

	// TODO: Consider checking the number running before we start & stop
	for _, task := range tasks.Tasks {
		if task.Demand > task.Requested {
			// There aren't enough of these containers yet
			tooFew = append(tooFew, task)
		}

		if task.Demand < task.Requested {
			// There are too many of these containers
			tooMany = append(tooMany, task)
		}
	}

	// Concatenate the two lists - scale down first to free up resources
	tasksToScale := append(tooMany, tooFew...)
	for _, task := range tasksToScale {
		blocked, err := m.stopStartTask(task)
		if blocked {
			// Marathon can't make scale changes at the moment.
			// Trigger a new scaling operation by signalling a demandUpdate after a backoff delay
			err = m.backoff.Backoff(m.demandUpdate)
			return err
		}

		if err != nil {
			log.Errorf("Couldn't scale %s: %v ", task.Name, err)
			return err
		}

		// Clear any backoffs on success
		m.backoff.Reset()
		log.Debugf("Now have %s: %d", task.Name, task.Requested)
	}

	return err
}
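// The backoff helper used above (Waiting, Backoff, Reset) isn't shown in this
// section. A sketch of one plausible shape for it - doubling the delay on each
// consecutive failure - where the timings, field names, and the lack of locking
// are all assumptions made for brevity:
type expBackoff struct {
	attempts int
	waiting  bool
}

// Waiting reports whether a backoff timer is still running.
func (b *expBackoff) Waiting() bool { return b.waiting }

// Backoff waits for an exponentially growing delay, then re-triggers scaling
// by signalling on the demandUpdate channel.
func (b *expBackoff) Backoff(demandUpdate chan struct{}) error {
	b.waiting = true
	delay := time.Duration(1<<uint(b.attempts)) * time.Second
	b.attempts++
	time.AfterFunc(delay, func() {
		b.waiting = false
		demandUpdate <- struct{}{}
	})
	return nil
}

// Reset clears the delay after a successful scaling operation.
func (b *expBackoff) Reset() {
	b.attempts = 0
}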
// StopStartTasks creates containers if there aren't enough of them, and stops them if there are too many
func (c *DockerScheduler) StopStartTasks(tasks *demand.Tasks) error {
	var tooMany []*demand.Task
	var tooFew []*demand.Task
	var diff int
	var err error

	tasks.Lock()
	defer tasks.Unlock()

	// TODO: Consider checking the number running before we start & stop
	// Don't do more scaling if this task is already changing
	for _, task := range tasks.Tasks {
		if task.Demand > task.Requested && task.Requested == task.Running {
			// There aren't enough of these containers yet
			tooFew = append(tooFew, task)
		}

		if task.Demand < task.Requested && task.Requested == task.Running {
			// There are too many of these containers
			tooMany = append(tooMany, task)
		}
	}

	// Scale down first to free up resources
	for _, task := range tooMany {
		diff = task.Requested - task.Demand
		log.Infof("Stop %d of task %s", diff, task.Name)
		for i := 0; i < diff; i++ {
			err = c.stopTask(task)
			if err != nil {
				log.Errorf("Couldn't stop %s: %v ", task.Name, err)
			}
			task.Requested--
		}
	}

	// Now we can scale up
	for _, task := range tooFew {
		diff = task.Demand - task.Requested
		log.Infof("Start %d of task %s", diff, task.Name)
		for i := 0; i < diff; i++ {
			c.startTask(task)
			task.Requested++
		}
	}

	// Don't return until all the scale tasks are complete
	scaling.Wait()
	return err
}
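// stopTask, startTask, and the package-level scaling WaitGroup the code above
// waits on are not shown in this section. A sketch of how stopTask might work
// with go-dockerclient, assuming container IDs are tracked in taskContainers;
// the state strings match those used elsewhere in this code, but the container
// selection and stop timeout are illustrative. startTask would be the mirror
// image, calling CreateContainer and StartContainer in a tracked goroutine.
var scaling sync.WaitGroup

func (c *DockerScheduler) stopTaskSketch(task *demand.Task) error {
	// Pick any container we believe is running for this task
	var id string
	for containerID, cc := range c.taskContainers[task.Name] {
		if cc.state == "running" {
			id = containerID
			break
		}
	}
	if id == "" {
		return fmt.Errorf("no running container for %s", task.Name)
	}

	// Mark it as stopping so CountAllTasks doesn't flag the exit as unexpected
	c.taskContainers[task.Name][id].state = "stopping"

	scaling.Add(1)
	go func() {
		defer scaling.Done()
		if err := c.client.StopContainer(id, 10); err != nil {
			log.Errorf("Couldn't stop container %s: %v", id, err)
		}
	}()
	return nil
}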
// updateTasks applies a demand update received from the server, reporting whether any task's demand changed
func updateTasks(dp api.DemandPayload, tasks *demand.Tasks) (demandChanged bool) {
	demandChanged = false
	tasks.Lock()
	defer tasks.Unlock()

	for _, taskFromServer := range dp.Demand.Tasks {
		name := taskFromServer.App

		if existingTask, err := tasks.GetTask(name); err == nil {
			if existingTask.Demand != taskFromServer.DemandCount {
				demandChanged = true
			}
			existingTask.Demand = taskFromServer.DemandCount
		}
	}
	return demandChanged
}
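// updateTasks implies a wire format for api.DemandPayload. A sketch of structs
// that would unmarshal the two fields used above (App and DemandCount); the
// JSON tags are assumptions about the server's actual field names:
type DemandPayload struct {
	Demand Demand `json:"demand"`
}

type Demand struct {
	Tasks []TaskDemand `json:"tasks"`
}

type TaskDemand struct {
	App         string `json:"app"`
	DemandCount int    `json:"demandCount"`
}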
// SendMetrics sends the current state of tasks to the API
func SendMetrics(ws *websocket.Conn, userID string, tasks *demand.Tasks) error {
	var err error
	var index int

	metrics := metrics{
		Tasks:     make([]taskMetrics, len(tasks.Tasks)),
		CreatedAt: time.Now().Unix(),
	}

	tasks.Lock()
	for _, task := range tasks.Tasks {
		metrics.Tasks[index] = taskMetrics{App: task.Name, RunningCount: task.Running, PendingCount: task.Requested}
		if task.Metric != nil {
			metrics.Tasks[index].Metric = task.Metric.Current()
		}
		index++
	}
	tasks.Unlock()

	payload := metricsPayload{
		User:    userID,
		Metrics: metrics,
	}

	b, err := json.Marshal(payload)
	if err != nil {
		return fmt.Errorf("Failed to encode API json. %v", err)
	}

	log.Debug("Sending metrics message")
	_, err = ws.Write(b)
	if err != nil {
		return fmt.Errorf("Failed to send metrics: %v", err)
	}

	return err
}
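// SendMetrics relies on metrics, taskMetrics and metricsPayload types that are
// not shown in this section. A sketch of definitions consistent with how they
// are used above; the JSON tags, and Metric being an int, are assumptions:
type taskMetrics struct {
	App          string `json:"app"`
	RunningCount int    `json:"runningCount"`
	PendingCount int    `json:"pendingCount"`
	Metric       int    `json:"metric,omitempty"`
}

type metrics struct {
	Tasks     []taskMetrics `json:"tasks"`
	CreatedAt int64         `json:"createdAt"`
}

type metricsPayload struct {
	User    string  `json:"user"`
	Metrics metrics `json:"metrics"`
}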
// CountAllTasks tells us how many instances of each task are currently running.
func (m *MarathonScheduler) CountAllTasks(running *demand.Tasks) error {
	var (
		err         error
		appsMessage AppsMessage
	)

	running.Lock()
	defer running.Unlock()

	url := m.baseMarathonURL + "apps/"
	body, err := utils.GetJSON(url)
	if err != nil {
		log.Errorf("Error getting Marathon Apps %v", err)
		return err
	}

	err = json.Unmarshal(body, &appsMessage)
	if err != nil {
		log.Errorf("Error %v unmarshalling from %s", err, string(body[:]))
		return err
	}

	appCounts := make(map[string]int)

	// Remove leading slash from App IDs and set the instance counts.
	for _, app := range appsMessage.Apps {
		appCounts[strings.Replace(app.ID, "/", "", 1)] = app.Instances
	}

	// Set running counts. Defaults to 0 if the App does not exist.
	tasks := running.Tasks
	for _, t := range tasks {
		t.Running = appCounts[t.Name]
	}

	return err
}
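// AppsMessage mirrors the JSON that Marathon's apps endpoint returns. A sketch
// of just the fields CountAllTasks needs; Marathon's /v2/apps response carries
// many more fields, which json.Unmarshal will simply ignore:
type AppsMessage struct {
	Apps []App `json:"apps"`
}

type App struct {
	ID        string `json:"id"` // e.g. "/priority1", hence the leading-slash strip above
	Instances int    `json:"instances"`
}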
// CountAllTasks checks how many of each task are running
func (c *DockerScheduler) CountAllTasks(running *demand.Tasks) error {
	// Docker Remote API https://docs.docker.com/reference/api/docker_remote_api_v1.20/
	// get /containers/json
	var err error
	var containers []docker.APIContainers
	containers, err = c.client.ListContainers(docker.ListContainersOptions{})
	if err != nil {
		return fmt.Errorf("Failed to list containers: %v", err)
	}

	running.Lock()
	defer running.Unlock()
	c.Lock()
	defer c.Unlock()

	// Reset all the running counts to 0
	tasks := running.Tasks
	for _, t := range tasks {
		t.Running = 0
		for _, cc := range c.taskContainers[t.Name] {
			cc.updated = false
		}
	}

	var taskName string
	var present bool

	for i := range containers {
		labels := containers[i].Labels
		taskName, present = labels[labelMap]
		if present {
			// Only update tasks that are already in our task map - don't try to manage anything else
			// log.Debugf("Found a container with labels %v", labels)
			t, err := running.GetTask(taskName)
			if err != nil {
				log.Errorf("Received info about task %s that we're not managing", taskName)
			} else {
				newState := statusToState(containers[i].Status)
				id := containers[i].ID[:12]

				thisContainer, ok := c.taskContainers[taskName][id]
				if !ok {
					log.Infof("We have no previous record of container %s, state %s", id, newState)
					thisContainer = &dockerContainer{}
					c.taskContainers[taskName][id] = thisContainer
				}

				switch newState {
				case "running":
					t.Running++
					// We could be moving from starting to running, or it could be a container that's totally new to us
					if thisContainer.state == "starting" || thisContainer.state == "" {
						thisContainer.state = newState
					}
				case "removing":
					if thisContainer.state != "removing" {
						log.Errorf("Container %s is being removed, but we didn't terminate it", id)
					}
				case "exited":
					if thisContainer.state != "stopping" && thisContainer.state != "exited" {
						log.Errorf("Container %s has exited, but we didn't stop it", id)
					}
				case "dead":
					if thisContainer.state != "dead" {
						log.Errorf("Container %s is dead", id)
					}
					thisContainer.state = newState
				}
				thisContainer.updated = true
			}
		}
	}

	for _, task := range tasks {
		log.Debugf("  %s: internally running %d, requested %d", task.Name, task.Running, task.Requested)
		for id, cc := range c.taskContainers[task.Name] {
			log.Debugf("  %s - %s", id, cc.state)
			if !cc.updated {
				if cc.state == "removing" || cc.state == "exited" {
					log.Debugf("  Deleting %s", id)
					delete(c.taskContainers[task.Name], id)
				} else if cc.state != "created" && cc.state != "starting" && cc.state != "stopping" {
					log.Errorf("Bad state for container %s: %s", id, cc.state)
				}
			}
		}
	}

	return err
}
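// statusToState condenses the human-readable Status string from the Docker
// list API into the short states the switch above expects. A sketch based on
// the status formats Docker is known to return ("Up 2 minutes", "Exited (0) 5
// seconds ago", "Removal In Progress", "Dead", "Created"); the exact prefix
// matching is an assumption:
func statusToStateSketch(status string) string {
	switch {
	case strings.HasPrefix(status, "Up"):
		return "running"
	case strings.HasPrefix(status, "Exited"):
		return "exited"
	case strings.HasPrefix(status, "Removal In Progress"):
		return "removing"
	case strings.HasPrefix(status, "Dead"):
		return "dead"
	case strings.HasPrefix(status, "Created"):
		return "created"
	default:
		return strings.ToLower(status)
	}
}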