// monitorReachability is a hostMonitoringFunc responsible for seeing if
// hosts are reachable or not. returns a slice of any errors that occur
func monitorReachability(settings *evergreen.Settings) []error {

    evergreen.Logger.Logf(slogger.INFO, "Running reachability checks...")

    // used to store any errors that occur
    var errors []error

    // fetch all hosts that have not been checked recently
    // (> 10 minutes ago)
    threshold := time.Now().Add(-ReachabilityCheckInterval)
    hosts, err := host.Find(host.ByNotMonitoredSince(threshold))
    if err != nil {
        errors = append(errors, fmt.Errorf("error finding hosts not"+
            " monitored recently: %v", err))
        return errors
    }

    // check all of the hosts. continue on error so that other hosts can be
    // checked successfully
    for _, host := range hosts {
        if err := checkHostReachability(host, settings); err != nil {
            errors = append(errors, fmt.Errorf("error checking reachability"+
                " for host %v: %v", host.Id, err))
            continue
        }
    }

    evergreen.Logger.Logf(slogger.INFO, "Finished running host reachability checks")

    return errors
}
// flagDecommissionedHosts is a hostFlaggingFunc to get all hosts which should
// be terminated because they are decommissioned
func flagDecommissionedHosts(d []distro.Distro, s *evergreen.Settings) ([]host.Host, error) {
    hosts, err := host.Find(host.IsDecommissioned)
    if err != nil {
        return nil, fmt.Errorf("error finding decommissioned hosts: %v", err)
    }
    return hosts, nil
}
// monitorReachability is a hostMonitoringFunc responsible for seeing if
// hosts are reachable or not. returns a slice of any errors that occur
func monitorReachability(settings *evergreen.Settings) []error {
    evergreen.Logger.Logf(slogger.INFO, "Running reachability checks...")

    // used to store any errors that occur
    var errors []error

    // fetch all hosts that have not been checked recently
    // (> 10 minutes ago)
    threshold := time.Now().Add(-ReachabilityCheckInterval)
    hosts, err := host.Find(host.ByNotMonitoredSince(threshold))
    if err != nil {
        errors = append(errors, fmt.Errorf("error finding hosts not monitored recently: %v", err))
        return errors
    }

    workers := NumReachabilityWorkers
    if len(hosts) < workers {
        workers = len(hosts)
    }

    wg := sync.WaitGroup{}
    wg.Add(workers)

    hostsChan := make(chan host.Host, workers)
    errChan := make(chan error, workers)

    for i := 0; i < workers; i++ {
        go func() {
            defer wg.Done()
            for host := range hostsChan {
                if err := checkHostReachability(host, settings); err != nil {
                    errChan <- err
                }
            }
        }()
    }

    errDone := make(chan struct{})
    go func() {
        defer close(errDone)
        for err := range errChan {
            errors = append(errors, fmt.Errorf("error checking reachability: %v", err))
        }
    }()

    // feed the hosts to the workers; errors are collected by the goroutine
    // above so that the remaining hosts can still be checked
    for _, host := range hosts {
        hostsChan <- host
    }
    close(hostsChan)
    wg.Wait()
    close(errChan)
    <-errDone

    return errors
}
// Call out to the embedded CloudManager to spawn hosts. Takes in a map of
// distro -> number of hosts to spawn for the distro.
// Returns a map of distro -> hosts spawned, and an error if one occurs.
func (s *Scheduler) spawnHosts(newHostsNeeded map[string]int) (
    map[string][]host.Host, error) {

    // loop over the distros, spawning up the appropriate number of hosts
    // for each distro
    hostsSpawnedPerDistro := make(map[string][]host.Host)
    for distroId, numHostsToSpawn := range newHostsNeeded {

        if numHostsToSpawn == 0 {
            continue
        }

        hostsSpawnedPerDistro[distroId] = make([]host.Host, 0, numHostsToSpawn)
        for i := 0; i < numHostsToSpawn; i++ {
            d, err := distro.FindOne(distro.ById(distroId))
            if err != nil {
                evergreen.Logger.Logf(slogger.ERROR, "Failed to find distro '%v': %v", distroId, err)
                continue
            }

            allDistroHosts, err := host.Find(host.ByDistroId(distroId))
            if err != nil {
                evergreen.Logger.Logf(slogger.ERROR, "Error getting hosts for distro %v: %v", distroId, err)
                continue
            }

            if len(allDistroHosts) >= d.PoolSize {
                evergreen.Logger.Logf(slogger.ERROR, "Already at max (%v) hosts for distro '%v'",
                    d.PoolSize, distroId)
                continue
            }

            cloudManager, err := providers.GetCloudManager(d.Provider, s.Settings)
            if err != nil {
                evergreen.Logger.Errorf(slogger.ERROR, "Error getting cloud manager for distro: %v", err)
                continue
            }

            hostOptions := cloud.HostOptions{
                UserName: evergreen.User,
                UserHost: false,
            }
            newHost, err := cloudManager.SpawnInstance(d, hostOptions)
            if err != nil {
                evergreen.Logger.Errorf(slogger.ERROR, "Error spawning instance: %v", err)
                continue
            }
            hostsSpawnedPerDistro[distroId] = append(hostsSpawnedPerDistro[distroId], *newHost)
        }

        // if none were spawned successfully, drop the distro from the map
        if len(hostsSpawnedPerDistro[distroId]) == 0 {
            delete(hostsSpawnedPerDistro, distroId)
        }
    }
    return hostsSpawnedPerDistro, nil
}
// FindAvailableHosts finds all hosts available to have a task run on them.
// It fetches hosts from the database whose status is "running" and who have
// no task currently being run on them.
func (self *DBHostFinder) FindAvailableHosts() ([]host.Host, error) {
    // find and return any hosts not currently running a task
    availableHosts, err := host.Find(host.IsAvailableAndFree)
    if err != nil {
        return nil, err
    }
    return availableHosts, nil
}
// flagUnreachableHosts is a hostFlaggingFunc to get all hosts which should
// be terminated because they are unreachable
func flagUnreachableHosts(d []distro.Distro, s *evergreen.Settings) ([]host.Host, error) {
    threshold := time.Now().Add(-1 * UnreachableCutoff)
    hosts, err := host.Find(host.ByUnreachableBefore(threshold))
    if err != nil {
        return nil, fmt.Errorf("error finding hosts unreachable since before %v: %v", threshold, err)
    }
    return hosts, nil
}
// FindAvailableHostsForDistro finds all hosts of a certain distro
// available to have a task run on them.
func (self *DBHostFinder) FindAvailableHostsForDistro(d string) ([]host.Host, error) {
    // find and return any hosts not currently running a task
    availableHosts, err := host.Find(host.ByAvailableForDistro(d))
    if err != nil {
        return nil, err
    }
    return availableHosts, nil
}
func TestCreateHostBuckets(t *testing.T) {
    testutil.HandleTestingErr(db.ClearCollections(host.Collection), t, "couldnt reset host")

    Convey("With a starting time and a minute bucket size and inserting dynamic hosts with different time frames", t, func() {
        now := time.Now()
        bucketSize := time.Duration(10) * time.Second

        // -20 -> 20
        beforeStartHost := host.Host{Id: "beforeStartHost", CreationTime: now.Add(time.Duration(-20) * time.Second),
            TerminationTime: now.Add(time.Duration(20) * time.Second), Provider: "ec2"}
        So(beforeStartHost.Insert(), ShouldBeNil)

        // 80 -> 120
        afterEndHost := host.Host{Id: "afterEndHost", CreationTime: now.Add(time.Duration(80) * time.Second),
            TerminationTime: now.Add(time.Duration(120) * time.Second), Provider: "ec2"}
        So(afterEndHost.Insert(), ShouldBeNil)

        // 20 -> 40
        h1 := host.Host{Id: "h1", CreationTime: now.Add(time.Duration(20) * time.Second),
            TerminationTime: now.Add(time.Duration(40) * time.Second), Provider: "ec2"}
        So(h1.Insert(), ShouldBeNil)

        // 10 -> 80
        h2 := host.Host{Id: "h2", CreationTime: now.Add(time.Duration(10) * time.Second),
            TerminationTime: now.Add(time.Duration(80) * time.Second), Provider: "ec2"}
        So(h2.Insert(), ShouldBeNil)

        // 20 ->
        h3 := host.Host{Id: "h3", CreationTime: now.Add(time.Duration(20) * time.Second),
            TerminationTime: util.ZeroTime, Provider: "ec2", Status: evergreen.HostRunning}
        So(h3.Insert(), ShouldBeNil)

        // 5 -> 7
        sameBucket := host.Host{Id: "sameBucket", CreationTime: now.Add(time.Duration(5) * time.Second),
            TerminationTime: now.Add(time.Duration(7) * time.Second), Provider: "ec2"}
        So(sameBucket.Insert(), ShouldBeNil)

        // 5 -> 30
        h4 := host.Host{Id: "h4", CreationTime: now.Add(time.Duration(5) * time.Second),
            TerminationTime: now.Add(time.Duration(30) * time.Second), Provider: "ec2"}
        So(h4.Insert(), ShouldBeNil)

        Convey("for three buckets of 10 seconds, should only retrieve pertinent host docs", func() {
            endTime := now.Add(time.Duration(30) * time.Second)
            hosts, err := host.Find(host.ByDynamicWithinTime(now, endTime))
            So(err, ShouldBeNil)
            So(len(hosts), ShouldEqual, 6)
            frameBounds := FrameBounds{
                StartTime:     now,
                EndTime:       endTime,
                BucketSize:    bucketSize,
                NumberBuckets: 3,
            }
            Convey("should create the correct buckets and bucket time accordingly", func() {
                buckets, errors := CreateHostBuckets(hosts, frameBounds)
                So(errors, ShouldBeEmpty)
                So(len(buckets), ShouldEqual, 3)
                So(int(buckets[0].TotalTime.Seconds()), ShouldEqual, 17)
                So(int(buckets[1].TotalTime.Seconds()), ShouldEqual, 30)
                So(int(math.Ceil(buckets[2].TotalTime.Seconds())), ShouldEqual, 40)
            })
        })
    })
}
// flagExpiredHosts is a hostFlaggingFunc to get all user-spawned hosts
// that have expired
func flagExpiredHosts(d []distro.Distro, s *evergreen.Settings) ([]host.Host, error) {
    // fetch the expired hosts
    hosts, err := host.Find(host.ByExpiredSince(time.Now()))
    if err != nil {
        return nil, fmt.Errorf("error finding expired spawned hosts: %v", err)
    }
    return hosts, nil
}
// flagUnprovisionedHosts is a hostFlaggingFunc to get all hosts that are
// taking too long to provision
func flagUnprovisionedHosts(d []distro.Distro, s *evergreen.Settings) ([]host.Host, error) {
    // fetch all hosts that are taking too long to provision
    threshold := time.Now().Add(-ProvisioningCutoff)
    hosts, err := host.Find(host.ByUnprovisionedSince(threshold))
    if err != nil {
        return nil, fmt.Errorf("error finding unprovisioned hosts: %v", err)
    }
    return hosts, nil
}
// flagProvisioningFailedHosts is a hostFlaggingFunc to get all hosts
// whose provisioning failed
func flagProvisioningFailedHosts(d []distro.Distro, s *evergreen.Settings) ([]host.Host, error) {
    // fetch all hosts whose provisioning failed
    hosts, err := host.Find(host.IsProvisioningFailure)
    if err != nil {
        return nil, fmt.Errorf("error finding hosts whose provisioning"+
            " failed: %v", err)
    }
    return hosts, nil
}
func (uis *UIServer) modifyHosts(w http.ResponseWriter, r *http.Request) {
    _ = MustHaveUser(r)

    opts := &uiParams{}
    err := util.ReadJSONInto(r.Body, opts)
    if err != nil {
        http.Error(w, err.Error(), http.StatusBadRequest)
        return
    }

    hostIds := opts.HostIds
    if len(hostIds) == 1 && strings.TrimSpace(hostIds[0]) == "" {
        http.Error(w, "No host IDs found in request", http.StatusBadRequest)
        return
    }

    // fetch all relevant hosts
    hosts, err := host.Find(host.ByIds(hostIds))
    if err != nil {
        uis.LoggedError(w, r, http.StatusInternalServerError, fmt.Errorf("Error finding hosts: %v", err))
        return
    }
    if len(hosts) == 0 {
        http.Error(w, "No matching hosts found.", http.StatusBadRequest)
        return
    }

    // determine what action needs to be taken
    switch opts.Action {
    case "updateStatus":
        newStatus := opts.Status
        if !util.SliceContains(validUpdateToStatuses, newStatus) {
            http.Error(w, fmt.Sprintf("Invalid status: %v", opts.Status), http.StatusBadRequest)
            return
        }
        numHostsUpdated := 0
        for _, host := range hosts {
            err := host.SetStatus(newStatus)
            if err != nil {
                uis.LoggedError(w, r, http.StatusInternalServerError, fmt.Errorf("Error updating host: %v", err))
                return
            }
            numHostsUpdated++
        }
        msg := NewSuccessFlash(fmt.Sprintf("%v host(s) status successfully updated to '%v'",
            numHostsUpdated, newStatus))
        PushFlash(uis.CookieStore, r, w, msg)
        return
    default:
        http.Error(w, fmt.Sprintf("Unrecognized action: %v", opts.Action), http.StatusBadRequest)
        return
    }
}
// slowProvisioningWarnings is a notificationBuilder to build any necessary
// warnings about hosts that are taking a long time to provision
func slowProvisioningWarnings(settings *evergreen.Settings) ([]notification, error) {

    evergreen.Logger.Logf(slogger.INFO, "Building warnings for hosts taking a long"+
        " time to provision...")

    if settings.Notify.SMTP == nil {
        return []notification{}, fmt.Errorf("no notification emails configured")
    }

    // fetch all hosts that are taking too long to provision
    threshold := time.Now().Add(-slowProvisioningThreshold)
    hosts, err := host.Find(host.ByUnprovisionedSince(threshold))
    if err != nil {
        return nil, fmt.Errorf("error finding unprovisioned hosts: %v", err)
    }

    // the list of warning notifications that will be returned
    warnings := []notification{}

    for _, h := range hosts {

        // if a warning has been sent for the host, skip it
        if h.Notifications[slowProvisioningWarning] {
            continue
        }

        evergreen.Logger.Logf(slogger.INFO, "Slow-provisioning warning needs to"+
            " be sent for host %v", h.Id)

        // build the notification
        hostNotification := notification{
            recipient: settings.Notify.SMTP.AdminEmail[0],
            subject:   fmt.Sprintf("Host %v taking a long time to provision", h.Id),
            message:   fmt.Sprintf("See %v/ui/host/%v", settings.Ui.Url, h.Id),
            threshold: slowProvisioningWarning,
            host:      h,
            callback: func(h host.Host, s string) error {
                return h.SetExpirationNotification(s)
            },
        }

        // add it to the final list
        warnings = append(warnings, hostNotification)
    }

    evergreen.Logger.Logf(slogger.INFO, "Built %v warnings about hosts taking a"+
        " long time to provision", len(warnings))

    return warnings, nil
}
// CreateAllHostUtilizationBuckets aggregates each bucket by creating a time frame given the number of days back
// and the granularity wanted (ie. days, minutes, seconds, hours) all in seconds. It returns a list of Host utilization
// information for each bucket.
func CreateAllHostUtilizationBuckets(daysBack, granularity int) ([]HostUtilizationBucket, error) {
    bounds := CalculateBounds(daysBack, granularity)

    // find non-static hosts
    dynamicHosts, err := host.Find(host.ByDynamicWithinTime(bounds.StartTime, bounds.EndTime))
    if err != nil {
        return nil, err
    }

    // find static hosts
    staticHosts, err := host.Find(host.AllStatic)
    if err != nil {
        return nil, err
    }

    dynamicBuckets, _ := CreateHostBuckets(dynamicHosts, bounds)
    staticBuckets, _ := CreateHostBuckets(staticHosts, bounds)

    tasks, err := task.Find(task.ByTimeRun(bounds.StartTime, bounds.EndTime).WithFields(task.StartTimeKey, task.FinishTimeKey, task.HostIdKey))
    if err != nil {
        return nil, err
    }

    oldTasks, err := task.FindOld(task.ByTimeRun(bounds.StartTime, bounds.EndTime))
    if err != nil {
        return nil, err
    }

    taskBuckets, _ := CreateTaskBuckets(tasks, oldTasks, bounds)

    bucketData := []HostUtilizationBucket{}
    for i, staticBucket := range staticBuckets {
        b := HostUtilizationBucket{
            StaticHost:  staticBucket.TotalTime,
            DynamicHost: dynamicBuckets[i].TotalTime,
            Task:        taskBuckets[i].TotalTime,
            StartTime:   bounds.StartTime.Add(time.Duration(i) * bounds.BucketSize),
            EndTime:     bounds.StartTime.Add(time.Duration(i+1) * bounds.BucketSize),
        }
        bucketData = append(bucketData, b)
    }
    return bucketData, nil
}
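// Illustrative sketch (not part of the original source): one way a caller
// might use CreateAllHostUtilizationBuckets to summarize the last week of
// host usage with hourly buckets (granularity is given in seconds). The
// reportHostUtilization name and the log wording are hypothetical.
func reportHostUtilization() error {
    const daysBack = 7
    const granularitySeconds = 60 * 60 // one-hour buckets

    buckets, err := CreateAllHostUtilizationBuckets(daysBack, granularitySeconds)
    if err != nil {
        return fmt.Errorf("error creating host utilization buckets: %v", err)
    }

    // each bucket carries static host, dynamic host, and task time for its window
    for _, b := range buckets {
        evergreen.Logger.Logf(slogger.INFO, "%v - %v: static %v, dynamic %v, task %v",
            b.StartTime, b.EndTime, b.StaticHost, b.DynamicHost, b.Task)
    }
    return nil
}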
func (uis *UIServer) getSpawnedHosts(w http.ResponseWriter, r *http.Request) {
    user := MustHaveUser(r)

    hosts, err := host.Find(host.ByUserWithRunningStatus(user.Username()))
    if err != nil {
        uis.LoggedError(w, r, http.StatusInternalServerError,
            fmt.Errorf("Error finding running hosts for user %v: %v", user.Username(), err))
        return
    }

    uis.WriteJSON(w, http.StatusOK, hosts)
}
// returns info on all of the hosts spawned by a user
func (as *APIServer) hostsInfoForUser(w http.ResponseWriter, r *http.Request) {
    vars := mux.Vars(r)
    user := vars["user"]

    hosts, err := host.Find(host.ByUserWithUnterminatedStatus(user))
    if err != nil {
        as.LoggedError(w, r, http.StatusInternalServerError, err)
        return
    }

    as.WriteJSON(w, http.StatusOK, spawnResponse{Hosts: hosts})
}
// flagExcessHosts is a hostFlaggingFunc to get all hosts that push their
// distros over the specified max hosts
func flagExcessHosts(distros []distro.Distro, s *evergreen.Settings) ([]host.Host, error) {

    // will ultimately contain all the hosts that can be terminated
    excessHosts := []host.Host{}

    // figure out the excess hosts for each distro
    for _, d := range distros {

        // fetch any hosts for the distro that count towards max hosts
        allHostsForDistro, err := host.Find(host.ByDistroId(d.Id))
        if err != nil {
            return nil, fmt.Errorf("error fetching hosts for distro %v: %v", d.Id, err)
        }

        // if there are more than the specified max hosts, then terminate
        // some, if they are not running tasks
        numExcessHosts := len(allHostsForDistro) - d.PoolSize
        if numExcessHosts > 0 {

            // track how many hosts for the distro are terminated
            counter := 0

            for _, host := range allHostsForDistro {

                // if the host was not dynamically spun up (and thus cannot
                // be terminated), skip it
                canTerminate, err := hostCanBeTerminated(host, s)
                if err != nil {
                    return nil, fmt.Errorf("error checking if host %v can be terminated: %v", host.Id, err)
                }
                if !canTerminate {
                    continue
                }

                // if the host is not running a task, it can be
                // safely terminated
                if host.RunningTask == "" {
                    excessHosts = append(excessHosts, host)
                    counter++
                }

                // break if we've marked enough to be terminated
                if counter == numExcessHosts {
                    break
                }
            }

            evergreen.Logger.Logf(slogger.INFO, "Found %v excess hosts for distro %v", counter, d.Id)
        }
    }

    return excessHosts, nil
}
func getHostsData(includeSpawnedHosts bool) (*hostsData, error) {
    data := &hostsData{}

    // get all of the hosts
    var dbHosts []host.Host
    var err error
    if includeSpawnedHosts {
        dbHosts, err = host.Find(host.IsRunning)
    } else {
        dbHosts, err = host.Find(host.ByUserWithRunningStatus(evergreen.User))
    }
    if err != nil {
        return nil, err
    }

    // convert the hosts to the ui models
    uiHosts := make([]uiHost, len(dbHosts))
    for idx, dbHost := range dbHosts {
        // we only need the distro id for the hosts page
        dbHost.Distro = distro.Distro{Id: dbHost.Distro.Id}
        host := uiHost{
            Host:        dbHost,
            RunningTask: nil,
        }
        uiHosts[idx] = host

        // get the task running on this host
        task, err := model.FindTask(dbHost.RunningTask)
        if err != nil {
            return nil, err
        }
        if task != nil {
            uiHosts[idx].RunningTask = task
        }
    }

    data.Hosts = uiHosts
    return data, nil
}
// flagIdleHosts is a hostFlaggingFunc to get all hosts which have spent too
// long without running a task
func flagIdleHosts(d []distro.Distro, s *evergreen.Settings) ([]host.Host, error) {

    // will ultimately contain all of the hosts determined to be idle
    idleHosts := []host.Host{}

    // fetch all hosts not currently running a task
    freeHosts, err := host.Find(host.IsFree)
    if err != nil {
        return nil, fmt.Errorf("error finding free hosts: %v", err)
    }

    // go through the hosts, and see if they have idled long enough to
    // be terminated
    for _, host := range freeHosts {

        // ask the host how long it has been idle
        idleTime := host.IdleTime()

        // get a cloud manager for the host
        cloudManager, err := providers.GetCloudManager(host.Provider, s)
        if err != nil {
            return nil, fmt.Errorf("error getting cloud manager for host %v: %v", host.Id, err)
        }

        // if the host was not dynamically spun up (and thus cannot be
        // terminated), skip it
        canTerminate, err := hostCanBeTerminated(host, s)
        if err != nil {
            return nil, fmt.Errorf("error checking if host %v can be terminated: %v", host.Id, err)
        }
        if !canTerminate {
            continue
        }

        // ask how long until the next payment for the host
        tilNextPayment := cloudManager.TimeTilNextPayment(&host)

        // current determinants for idle:
        //  idle for at least 15 minutes and
        //  less than 5 minutes til next payment
        if idleTime >= 15*time.Minute && tilNextPayment <= 5*time.Minute {
            idleHosts = append(idleHosts, host)
        }
    }

    return idleHosts, nil
}
// spawnHostExpirationWarnings is a notificationBuilder to build any necessary
// warnings about hosts that will be expiring soon (but haven't expired yet)
func spawnHostExpirationWarnings(settings *evergreen.Settings) ([]notification, error) {

    evergreen.Logger.Logf(slogger.INFO, "Building spawned host expiration"+
        " warnings...")

    // sanity check, since the thresholds are supplied in code
    if len(spawnWarningThresholds) == 0 {
        evergreen.Logger.Logf(slogger.WARN, "there are no currently set warning"+
            " thresholds for spawned hosts - users will not receive emails"+
            " warning them of imminent host expiration")
        return nil, nil
    }

    // assumed to be the first warning threshold (the least recent one)
    firstWarningThreshold := spawnWarningThresholds[len(spawnWarningThresholds)-1]

    // find all spawned hosts that have passed at least one warning threshold
    now := time.Now()
    thresholdTime := now.Add(firstWarningThreshold)
    hosts, err := host.Find(host.ByExpiringBetween(now, thresholdTime))
    if err != nil {
        return nil, fmt.Errorf("error finding spawned hosts that will be"+
            " expiring soon: %v", err)
    }

    // the eventual list of warning notifications to be sent
    warnings := []notification{}

    for _, h := range hosts {

        // figure out the most recent expiration notification threshold the host
        // has crossed
        threshold := lastWarningThresholdCrossed(&h)

        // for keying into the host's notifications map
        thresholdKey := strconv.Itoa(int(threshold.Minutes()))

        // if a notification has already been sent for the threshold for this
        // host, skip it
        if h.Notifications[thresholdKey] {
            continue
        }

        evergreen.Logger.Logf(slogger.INFO, "Warning needs to be sent for threshold"+
            " '%v' for host %v", thresholdKey, h.Id)

        // we need to send a notification for the threshold for this host
        hostNotification := notification{
            recipient: h.StartedBy,
            subject:   fmt.Sprintf("%v host termination reminder", h.Distro.Id),
            message: fmt.Sprintf("Your %v host with id %v will be terminated"+
                " at %v, in %v minutes. Visit %v to extend its lifetime.",
                h.Distro.Id, h.Id,
                h.ExpirationTime.Format(time.RFC850),
                h.ExpirationTime.Sub(time.Now()),
                settings.Ui.Url+"/ui/spawn"),
            threshold: thresholdKey,
            host:      h,
            callback: func(h host.Host, thresholdKey string) error {
                return h.SetExpirationNotification(thresholdKey)
            },
        }

        // add it to the list
        warnings = append(warnings, hostNotification)
    }

    evergreen.Logger.Logf(slogger.INFO, "Built %v warnings about imminently"+
        " expiring hosts", len(warnings))

    return warnings, nil
}
// setupReadyHosts runs the distro setup script of all hosts that are up and reachable.
func (init *HostInit) setupReadyHosts() error {
    // set SSH timeout duration
    if timeoutSecs := init.Settings.HostInit.SSHTimeoutSeconds; timeoutSecs <= 0 {
        evergreen.Logger.Logf(slogger.WARN, "SSH timeout set to %vs (<= 0s), using %vs instead",
            timeoutSecs, SSHTimeoutSeconds)
    } else {
        SSHTimeoutSeconds = timeoutSecs
    }

    // find all hosts in the uninitialized state
    uninitializedHosts, err := host.Find(host.IsUninitialized)
    if err != nil {
        return fmt.Errorf("error fetching uninitialized hosts: %v", err)
    }

    evergreen.Logger.Logf(slogger.DEBUG, "There are %v uninitialized hosts",
        len(uninitializedHosts))

    // used for making sure we don't exit before a setup script is done
    wg := &sync.WaitGroup{}

    for _, h := range uninitializedHosts {

        // check whether or not the host is ready for its setup script to be run
        ready, err := init.IsHostReady(&h)
        if err != nil {
            evergreen.Logger.Logf(slogger.ERROR, "Error checking host %v for readiness: %v", h.Id, err)
            continue
        }

        // if the host isn't ready (for instance, it might not be up yet), skip it
        if !ready {
            evergreen.Logger.Logf(slogger.DEBUG, "Host %v not ready for setup", h.Id)
            continue
        }

        evergreen.Logger.Logf(slogger.INFO, "Running setup script for host %v", h.Id)

        // kick off the setup, in its own goroutine, so pending setups don't have
        // to wait for it to finish
        wg.Add(1)
        go func(h host.Host) {
            defer wg.Done()

            if err := init.ProvisionHost(&h); err != nil {
                evergreen.Logger.Logf(slogger.ERROR, "Error provisioning host %v: %v", h.Id, err)

                // notify the admins of the failure
                subject := fmt.Sprintf("%v Evergreen provisioning failure on %v",
                    notify.ProvisionFailurePreface, h.Distro.Id)
                hostLink := fmt.Sprintf("%v/host/%v", init.Settings.Ui.Url, h.Id)
                message := fmt.Sprintf("Provisioning failed on %v host -- %v: see %v",
                    h.Distro.Id, h.Id, hostLink)
                if err := notify.NotifyAdmins(subject, message, init.Settings); err != nil {
                    evergreen.Logger.Errorf(slogger.ERROR, "Error sending email: %v", err)
                }
            }
        }(h)
    }

    // let all setup routines finish
    wg.Wait()

    return nil
}
// Validate returns an instance of BadOptionsErr if the SpawnOptions object contains invalid
// data, SpawnLimitErr if the user is already at the spawned host limit, or some other untyped
// instance of Error if something fails during validation.
func (sm Spawn) Validate(so Options) error {
    d, err := distro.FindOne(distro.ById(so.Distro))
    if err != nil {
        return BadOptionsErr{fmt.Sprintf("Invalid dist %v", so.Distro)}
    }

    if !d.SpawnAllowed {
        return BadOptionsErr{fmt.Sprintf("Spawning not allowed for dist %v", so.Distro)}
    }

    // if the user already has too many active spawned hosts, deny the request
    activeSpawnedHosts, err := host.Find(host.ByUserWithRunningStatus(so.UserName))
    if err != nil {
        return fmt.Errorf("Error occurred finding user's current hosts: %v", err)
    }

    if len(activeSpawnedHosts) >= MaxPerUser {
        return SpawnLimitErr
    }

    // validate public key
    rsa := "ssh-rsa"
    dss := "ssh-dss"
    isRSA := strings.HasPrefix(so.PublicKey, rsa)
    isDSS := strings.HasPrefix(so.PublicKey, dss)
    if !isRSA && !isDSS {
        return BadOptionsErr{"key does not start with ssh-rsa or ssh-dss"}
    }

    sections := strings.Split(so.PublicKey, " ")
    if len(sections) < 2 {
        keyType := rsa
        if sections[0] == dss {
            keyType = dss
        }
        return BadOptionsErr{fmt.Sprintf("missing space after '%v'", keyType)}
    }

    // check for valid base64
    if _, err = base64.StdEncoding.DecodeString(sections[1]); err != nil {
        return BadOptionsErr{"key contains invalid base64 string"}
    }

    if d.UserData.File != "" {
        if strings.TrimSpace(so.UserData) == "" {
            return BadOptionsErr{}
        }

        var err error
        switch d.UserData.Validate {
        case distro.UserDataFormatFormURLEncoded:
            _, err = url.ParseQuery(so.UserData)
        case distro.UserDataFormatJSON:
            var out map[string]interface{}
            err = json.Unmarshal([]byte(so.UserData), &out)
        case distro.UserDataFormatYAML:
            var out map[string]interface{}
            err = yaml.Unmarshal([]byte(so.UserData), &out)
        }
        if err != nil {
            return BadOptionsErr{fmt.Sprintf("invalid %v: %v", d.UserData.Validate, err)}
        }
    }
    return nil
}
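// Illustrative sketch (not part of the original source): how a caller might
// build Options and interpret Validate's error types before spawning a host.
// The surrounding function, the distro id, and the key literal are
// hypothetical; the Options field names match the ones Validate reads above.
func validateSpawnRequest(spawner Spawn, userName, publicKey string) error {
    opts := Options{
        Distro:    "ubuntu1404-test", // hypothetical distro id
        UserName:  userName,
        PublicKey: publicKey, // e.g. "ssh-rsa AAAAB3NzaC1yc2E... user@host"
    }
    if err := spawner.Validate(opts); err != nil {
        if _, ok := err.(BadOptionsErr); ok {
            return fmt.Errorf("invalid spawn options: %v", err)
        }
        // covers SpawnLimitErr and any untyped validation failure
        return err
    }
    return nil
}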
// Schedule all of the tasks to be run. Works by finding all of the tasks that
// are ready to be run, splitting them by distro, prioritizing them, and saving
// the per-distro queues. Then determines the number of new hosts to spin up
// for each distro, and spins them up.
func (s *Scheduler) Schedule() error {
    // make sure the correct static hosts are in the database
    evergreen.Logger.Logf(slogger.INFO, "Updating static hosts...")
    err := model.UpdateStaticHosts(s.Settings)
    if err != nil {
        return fmt.Errorf("error updating static hosts: %v", err)
    }

    // find all tasks ready to be run
    evergreen.Logger.Logf(slogger.INFO, "Finding runnable tasks...")
    runnableTasks, err := s.FindRunnableTasks()
    if err != nil {
        return fmt.Errorf("Error finding runnable tasks: %v", err)
    }

    evergreen.Logger.Logf(slogger.INFO, "There are %v tasks ready to be run", len(runnableTasks))

    // split the tasks by distro
    tasksByDistro, taskRunDistros, err := s.splitTasksByDistro(runnableTasks)
    if err != nil {
        return fmt.Errorf("Error splitting tasks by distro to run on: %v", err)
    }

    // load in all of the distros
    distros, err := distro.Find(distro.All)
    if err != nil {
        return fmt.Errorf("Error finding distros: %v", err)
    }

    // get the expected run duration of all runnable tasks
    taskExpectedDuration, err := s.GetExpectedDurations(runnableTasks)
    if err != nil {
        return fmt.Errorf("Error getting expected task durations: %v", err)
    }

    distroInputChan := make(chan distroSchedulerInput, len(distros))

    // put all of the needed input for the distro scheduler into a channel to be read by the
    // distro scheduling loop
    for distroId, task := range tasksByDistro {
        distroInputChan <- distroSchedulerInput{
            distroId:               distroId,
            runnableTasksForDistro: task,
        }
    }
    // close the channel to signal that the loop reading from it can terminate
    close(distroInputChan)

    workers := runtime.NumCPU()

    wg := sync.WaitGroup{}
    wg.Add(workers)

    // make a channel to collect all of the function results from scheduling the distros
    distroSchedulerResultChan := make(chan *distroSchedulerResult)

    // for each worker, create a new goroutine
    for i := 0; i < workers; i++ {
        go func() {
            defer wg.Done()
            // read the inputs for scheduling this distro
            for d := range distroInputChan {
                // schedule the distro
                res := s.scheduleDistro(d.distroId, d.runnableTasksForDistro, taskExpectedDuration)
                if res.err != nil {
                    evergreen.Logger.Logf(slogger.ERROR, "%v", res.err)
                }
                // write the results out to a results channel
                distroSchedulerResultChan <- res
            }
        }()
    }

    // initialize a map of scheduler events
    schedulerEvents := map[string]event.TaskQueueInfo{}

    // prioritize the tasks, one distro at a time
    taskQueueItems := make(map[string][]model.TaskQueueItem)

    // collect the results of scheduling each distro; keep only the first error
    // and continue draining so the workers never block on the results channel
    var errResult error
    collectorDone := make(chan struct{})
    go func() {
        defer close(collectorDone)
        for res := range distroSchedulerResultChan {
            if res.err != nil {
                if errResult == nil {
                    errResult = fmt.Errorf("error scheduling tasks on distro %v: %v", res.distroId, res.err)
                }
                continue
            }
            schedulerEvents[res.distroId] = res.schedulerEvent
            taskQueueItems[res.distroId] = res.taskQueueItem
        }
    }()

    // wait for the distro scheduler goroutines to complete
    wg.Wait()

    // the wait group has terminated, so the results channel can be closed,
    // signaling the collector goroutine that it can terminate its loop
    close(distroSchedulerResultChan)
    <-collectorDone

    if errResult != nil {
        return errResult
    }

    // split distros by name
    distrosByName := make(map[string]distro.Distro)
    for _, d := range distros {
        distrosByName[d.Id] = d
    }

    // fetch all hosts, split by distro
    allHosts, err := host.Find(host.IsLive)
    if err != nil {
        return fmt.Errorf("Error finding live hosts: %v", err)
    }

    // figure out all hosts we have up - per distro
    hostsByDistro := make(map[string][]host.Host)
    for _, liveHost := range allHosts {
        hostsByDistro[liveHost.Distro.Id] = append(hostsByDistro[liveHost.Distro.Id], liveHost)
    }

    // add the length of the host lists of hosts that are running to the event log.
    for distroId, hosts := range hostsByDistro {
        taskQueueInfo := schedulerEvents[distroId]
        taskQueueInfo.NumHostsRunning = len(hosts)
        schedulerEvents[distroId] = taskQueueInfo
    }

    // construct the data that will be needed by the host allocator
    hostAllocatorData := HostAllocatorData{
        existingDistroHosts:  hostsByDistro,
        distros:              distrosByName,
        taskQueueItems:       taskQueueItems,
        taskRunDistros:       taskRunDistros,
        projectTaskDurations: taskExpectedDuration,
    }

    // figure out how many new hosts we need
    newHostsNeeded, err := s.NewHostsNeeded(hostAllocatorData, s.Settings)
    if err != nil {
        return fmt.Errorf("Error determining how many new hosts are needed: %v", err)
    }

    // spawn up the hosts
    hostsSpawned, err := s.spawnHosts(newHostsNeeded)
    if err != nil {
        return fmt.Errorf("Error spawning new hosts: %v", err)
    }

    if len(hostsSpawned) != 0 {
        evergreen.Logger.Logf(slogger.INFO, "Hosts spawned (%v total), by distro: ", len(hostsSpawned))
        for distro, hosts := range hostsSpawned {
            evergreen.Logger.Logf(slogger.INFO, "  %v ->", distro)
            for _, host := range hosts {
                evergreen.Logger.Logf(slogger.INFO, "    %v", host.Id)
            }
            taskQueueInfo := schedulerEvents[distro]
            taskQueueInfo.NumHostsRunning += len(hosts)
            schedulerEvents[distro] = taskQueueInfo
        }
    } else {
        evergreen.Logger.Logf(slogger.INFO, "No new hosts spawned")
    }

    for d, t := range schedulerEvents {
        eventLog := event.SchedulerEventData{
            ResourceType:  event.ResourceTypeScheduler,
            TaskQueueInfo: t,
            DistroId:      d,
        }
        event.LogSchedulerEvent(eventLog)
    }

    return nil
}
func (uis *UIServer) allTaskQueues(w http.ResponseWriter, r *http.Request) {
    projCtx := MustHaveProjectContext(r)

    taskQueues, err := model.FindAllTaskQueues()
    if err != nil {
        uis.LoggedError(w, r, http.StatusInternalServerError,
            fmt.Errorf("Error finding task queues: %v", err))
        return
    }

    // cached map of version id to relevant patch
    cachedPatches := map[string]*patch.Patch{}

    // convert the task queues to the ui versions
    uiTaskQueues := []uiTaskQueue{}
    for _, tQ := range taskQueues {
        asUI := uiTaskQueue{
            Distro: tQ.Distro,
            Queue:  []uiTaskQueueItem{},
        }
        if len(tQ.Queue) == 0 {
            uiTaskQueues = append(uiTaskQueues, asUI)
            continue
        }

        // convert the individual task queue items
        taskIds := []string{}
        for _, item := range tQ.Queue {

            // cache the ids, for fetching the tasks from the db
            taskIds = append(taskIds, item.Id)
            queueItemAsUI := uiTaskQueueItem{
                Id:                  item.Id,
                DisplayName:         item.DisplayName,
                BuildVariant:        item.BuildVariant,
                RevisionOrderNumber: item.RevisionOrderNumber,
                Requester:           item.Requester,
                Revision:            item.Revision,
                Project:             item.Project,
            }
            asUI.Queue = append(asUI.Queue, queueItemAsUI)
        }

        // find all the relevant tasks
        tasks, err := task.Find(task.ByIds(taskIds).WithFields(task.VersionKey, task.BuildIdKey))
        if err != nil {
            msg := fmt.Sprintf("Error finding tasks: %v", err)
            evergreen.Logger.Errorf(slogger.ERROR, msg)
            http.Error(w, msg, http.StatusInternalServerError)
            return
        }

        // store all of the version and build ids in the relevant task queue
        // items
        for _, task := range tasks {
            // this sucks, but it's because we're not guaranteed the order out
            // of the db
            for idx, queueItemAsUI := range asUI.Queue {
                if queueItemAsUI.Id == task.Id {
                    queueItemAsUI.Version = task.Version
                    queueItemAsUI.Build = task.BuildId
                    asUI.Queue[idx] = queueItemAsUI
                }
            }
        }

        // add all of the necessary patch info into the relevant task queue
        // items
        for idx, queueItemAsUI := range asUI.Queue {
            if queueItemAsUI.Requester == evergreen.PatchVersionRequester {
                // fetch the patch, if necessary
                var p *patch.Patch
                var ok bool
                if p, ok = cachedPatches[queueItemAsUI.Version]; !ok {
                    p, err = patch.FindOne(
                        patch.ByVersion(queueItemAsUI.Version).WithFields(patch.AuthorKey),
                    )
                    if err != nil {
                        msg := fmt.Sprintf("Error finding patch: %v", err)
                        evergreen.Logger.Errorf(slogger.ERROR, msg)
                        http.Error(w, msg, http.StatusInternalServerError)
                        return
                    }
                    if p == nil {
                        msg := fmt.Sprintf("Couldn't find patch for version %v", queueItemAsUI.Version)
                        evergreen.Logger.Errorf(slogger.ERROR, msg)
                        http.Error(w, msg, http.StatusInternalServerError)
                        return
                    }
                    cachedPatches[queueItemAsUI.Version] = p
                }
                queueItemAsUI.User = p.Author
                asUI.Queue[idx] = queueItemAsUI
            }
        }

        uiTaskQueues = append(uiTaskQueues, asUI)
    }

    // add other useful statistics to view alongside queue
    idleHosts, err := host.Find(host.IsIdle)
    if err != nil {
        msg := fmt.Sprintf("Error finding idle hosts: %v", err)
        evergreen.Logger.Errorf(slogger.ERROR, msg)
        http.Error(w, msg, http.StatusInternalServerError)
        return
    }
    activeHosts, err := host.Find(host.IsLive)
    if err != nil {
        msg := fmt.Sprintf("Error finding active hosts: %v", err)
        evergreen.Logger.Errorf(slogger.ERROR, msg)
        http.Error(w, msg, http.StatusInternalServerError)
        return
    }
    idleStaticHostsCount := 0
    for _, host := range idleHosts {
        if host.Provider == evergreen.HostTypeStatic {
            idleStaticHostsCount++
        }
    }
    activeStaticHostsCount := 0
    for _, host := range activeHosts {
        if host.Provider == evergreen.HostTypeStatic {
            activeStaticHostsCount++
        }
    }
    hostStats := uiHostStatistics{
        ActiveHosts:       len(activeHosts),
        ActiveStaticHosts: activeStaticHostsCount,
        IdleHosts:         len(idleHosts),
        IdleStaticHosts:   idleStaticHostsCount,
    }

    uis.WriteHTML(w, http.StatusOK, struct {
        ProjectData projectContext
        User        *user.DBUser
        Flashes     []interface{}
        Data        uiResourceInfo
    }{projCtx, GetUser(r), []interface{}{}, uiResourceInfo{uiTaskQueues, hostStats}},
        "base", "task_queues.html", "base_angular.html", "menu.html")
}
// spawnHostExpirationWarnings is a notificationBuilder to build any necessary
// warnings about hosts that will be expiring soon (but haven't expired yet)
func spawnHostExpirationWarnings(settings *evergreen.Settings) ([]notification, error) {
    evergreen.Logger.Logf(slogger.INFO, "Building spawned host expiration"+
        " warnings...")

    // sanity check, since the thresholds are supplied in code
    if len(spawnWarningThresholds) == 0 {
        evergreen.Logger.Logf(slogger.WARN, "there are no currently set warning"+
            " thresholds for spawned hosts - users will not receive emails"+
            " warning them of imminent host expiration")
        return nil, nil
    }

    // assumed to be the first warning threshold (the least recent one)
    firstWarningThreshold := spawnWarningThresholds[len(spawnWarningThresholds)-1]

    // find all spawned hosts that have passed at least one warning threshold
    now := time.Now()
    thresholdTime := now.Add(firstWarningThreshold)
    hosts, err := host.Find(host.ByExpiringBetween(now, thresholdTime))
    if err != nil {
        return nil, fmt.Errorf("error finding spawned hosts that will be"+
            " expiring soon: %v", err)
    }

    // the eventual list of warning notifications to be sent
    warnings := []notification{}

    for _, h := range hosts {

        // figure out the most recent expiration notification threshold the host
        // has crossed
        threshold := lastWarningThresholdCrossed(&h)

        // for keying into the host's notifications map
        thresholdKey := strconv.Itoa(int(threshold.Minutes()))

        // if a notification has already been sent for the threshold for this
        // host, skip it
        if h.Notifications[thresholdKey] {
            continue
        }

        evergreen.Logger.Logf(slogger.INFO, "Warning needs to be sent for threshold"+
            " '%v' for host %v", thresholdKey, h.Id)

        // fetch information about the user we are notifying
        userToNotify, err := user.FindOne(user.ById(h.StartedBy))
        if err != nil {
            return nil, fmt.Errorf("error finding user to notify by Id %v: %v", h.StartedBy, err)
        }

        // if we didn't find a user (in the case of testing) set the timezone to ""
        // to avoid triggering a nil pointer exception
        timezone := ""
        if userToNotify != nil {
            timezone = userToNotify.Settings.Timezone
        }

        var expirationTimeFormatted string
        // use our fetched information to load proper time zone to notify the user with
        // (if time zone is empty, defaults to UTC)
        loc, err := time.LoadLocation(timezone)
        if err != nil {
            evergreen.Logger.Logf(slogger.ERROR, "Error loading timezone for email format with user_id %v: %v",
                userToNotify.Id, err)
            expirationTimeFormatted = h.ExpirationTime.Format(time.RFC1123)
        } else {
            expirationTimeFormatted = h.ExpirationTime.In(loc).Format(time.RFC1123)
        }

        // we need to send a notification for the threshold for this host
        hostNotification := notification{
            recipient: h.StartedBy,
            subject:   fmt.Sprintf("%v host termination reminder", h.Distro.Id),
            message: fmt.Sprintf("Your %v host with id %v will be terminated"+
                " at %v, in %v minutes. Visit %v to extend its lifetime.",
                h.Distro.Id, h.Id,
                expirationTimeFormatted,
                h.ExpirationTime.Sub(time.Now()),
                settings.Ui.Url+"/ui/spawn"),
            threshold: thresholdKey,
            host:      h,
            callback: func(h host.Host, thresholdKey string) error {
                return h.SetExpirationNotification(thresholdKey)
            },
        }

        // add it to the list
        warnings = append(warnings, hostNotification)
    }

    evergreen.Logger.Logf(slogger.INFO, "Built %v warnings about imminently"+
        " expiring hosts", len(warnings))

    return warnings, nil
}
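// Illustrative sketch (not part of the original source): the threshold-to-key
// mapping used above stores each warning threshold under its minute count in
// Host.Notifications, so a two-hour threshold keys as "120". The example
// threshold values below are hypothetical.
var exampleSpawnWarningThresholds = []time.Duration{
    2 * time.Hour,  // most imminent warning
    12 * time.Hour, // least recent warning; the last element serves as firstWarningThreshold
}

func exampleThresholdKey(threshold time.Duration) string {
    return strconv.Itoa(int(threshold.Minutes())) // 2h -> "120", 12h -> "720"
}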
// run all monitoring functions
func RunAllMonitoring(settings *evergreen.Settings) error {

    // load in all of the distros
    distros, err := distro.Find(db.Q{})
    if err != nil {
        return fmt.Errorf("error finding distros: %v", err)
    }

    // fetch the project refs, which we will use to get all of the projects
    projectRefs, err := model.FindAllProjectRefs()
    if err != nil {
        return fmt.Errorf("error loading in project refs: %v", err)
    }

    // turn the project refs into a map of the project id -> project
    projects := map[string]model.Project{}
    for _, ref := range projectRefs {
        // only monitor projects that are enabled
        if !ref.Enabled {
            continue
        }
        project, err := model.FindProject("", &ref)

        // continue on error to stop the whole monitoring process from
        // being held up
        if err != nil {
            evergreen.Logger.Logf(slogger.ERROR, "error finding project %v: %v", ref.Identifier, err)
            continue
        }

        if project == nil {
            evergreen.Logger.Logf(slogger.ERROR, "no project entry found for"+
                " ref %v", ref.Identifier)
            continue
        }

        projects[project.Identifier] = *project
    }

    // initialize the task monitor
    taskMonitor := &TaskMonitor{
        flaggingFuncs: defaultTaskFlaggingFuncs,
    }

    // clean up any necessary tasks
    errs := taskMonitor.CleanupTasks(projects)
    for _, err := range errs {
        evergreen.Logger.Logf(slogger.ERROR, "Error cleaning up tasks: %v", err)
    }

    // initialize the host monitor
    hostMonitor := &HostMonitor{
        flaggingFuncs:   defaultHostFlaggingFuncs,
        monitoringFuncs: defaultHostMonitoringFuncs,
    }

    // clean up any necessary hosts
    errs = hostMonitor.CleanupHosts(distros, settings)
    for _, err := range errs {
        evergreen.Logger.Logf(slogger.ERROR, "Error cleaning up hosts: %v", err)
    }

    // run monitoring checks
    errs = hostMonitor.RunMonitoringChecks(settings)
    for _, err := range errs {
        evergreen.Logger.Logf(slogger.ERROR, "Error running host monitoring checks: %v", err)
    }

    // initialize the notifier
    notifier := &Notifier{
        notificationBuilders: defaultNotificationBuilders,
    }

    // send notifications
    errs = notifier.Notify(settings)
    for _, err := range errs {
        evergreen.Logger.Logf(slogger.ERROR, "Error sending notifications: %v", err)
    }

    // Do alerts for spawnhosts - collect all hosts expiring in the next 12 hours.
    // The trigger logic will filter out any hosts that aren't in a notification window,
    // or that have already had alerts sent.
    now := time.Now()
    thresholdTime := now.Add(12 * time.Hour)
    expiringSoonHosts, err := host.Find(host.ByExpiringBetween(now, thresholdTime))
    if err != nil {
        return err
    }

    for _, h := range expiringSoonHosts {
        err := alerts.RunSpawnWarningTriggers(&h)
        if err != nil {
            evergreen.Logger.Logf(slogger.ERROR, "Error queueing alert: %v", err)
        }
    }

    return nil
}
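// Illustrative sketch (not part of the original source): RunAllMonitoring does
// a single pass, so a runner could drive it on a fixed interval. The interval,
// the runMonitorLoop name, and the stop channel are hypothetical.
func runMonitorLoop(settings *evergreen.Settings, stop <-chan struct{}) {
    ticker := time.NewTicker(5 * time.Minute)
    defer ticker.Stop()
    for {
        select {
        case <-stop:
            return
        case <-ticker.C:
            if err := RunAllMonitoring(settings); err != nil {
                // log and keep going; one failed pass should not stop the loop
                evergreen.Logger.Logf(slogger.ERROR, "Error running monitoring pass: %v", err)
            }
        }
    }
}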
// Schedule all of the tasks to be run. Works by finding all of the tasks that
// are ready to be run, splitting them by distro, prioritizing them, and saving
// the per-distro queues. Then determines the number of new hosts to spin up
// for each distro, and spins them up.
func (self *Scheduler) Schedule() error {
    // make sure the correct static hosts are in the database
    evergreen.Logger.Logf(slogger.INFO, "Updating static hosts...")
    err := model.UpdateStaticHosts(self.Settings)
    if err != nil {
        return fmt.Errorf("error updating static hosts: %v", err)
    }

    // find all tasks ready to be run
    evergreen.Logger.Logf(slogger.INFO, "Finding runnable tasks...")
    runnableTasks, err := self.FindRunnableTasks()
    if err != nil {
        return fmt.Errorf("Error finding runnable tasks: %v", err)
    }

    evergreen.Logger.Logf(slogger.INFO, "There are %v tasks ready to be run", len(runnableTasks))

    // split the tasks by distro
    tasksByDistro, taskRunDistros, err := self.splitTasksByDistro(runnableTasks)
    if err != nil {
        return fmt.Errorf("Error splitting tasks by distro to run on: %v", err)
    }

    // load in all of the distros
    distros, err := distro.Find(distro.All)
    if err != nil {
        return fmt.Errorf("Error finding distros: %v", err)
    }

    taskIdToMinQueuePos := make(map[string]int)

    // get the expected run duration of all runnable tasks
    taskExpectedDuration, err := self.GetExpectedDurations(runnableTasks)
    if err != nil {
        return fmt.Errorf("Error getting expected task durations: %v", err)
    }

    // prioritize the tasks, one distro at a time
    taskQueueItems := make(map[string][]model.TaskQueueItem)
    for _, d := range distros {

        runnableTasksForDistro := tasksByDistro[d.Id]

        evergreen.Logger.Logf(slogger.INFO, "Prioritizing %v tasks for distro %v...",
            len(runnableTasksForDistro), d.Id)

        prioritizedTasks, err := self.PrioritizeTasks(self.Settings, runnableTasksForDistro)
        if err != nil {
            return fmt.Errorf("Error prioritizing tasks: %v", err)
        }

        // Update the running minimums of queue position
        // The value is 1-based primarily so that we can differentiate between
        // no value and being first in a queue
        for i, prioritizedTask := range prioritizedTasks {
            minQueuePos, ok := taskIdToMinQueuePos[prioritizedTask.Id]
            if ok {
                taskIdToMinQueuePos[prioritizedTask.Id] =
                    int(math.Min(float64(minQueuePos), float64(i+1)))
            } else {
                taskIdToMinQueuePos[prioritizedTask.Id] = i + 1
            }
        }

        // persist the queue of tasks
        evergreen.Logger.Logf(slogger.INFO, "Saving task queue for distro %v...", d.Id)
        queuedTasks, err := self.PersistTaskQueue(d.Id, prioritizedTasks, taskExpectedDuration)
        if err != nil {
            return fmt.Errorf("Error saving task queue: %v", err)
        }

        // track scheduled time for prioritized tasks
        err = model.SetTasksScheduledTime(prioritizedTasks, time.Now())
        if err != nil {
            return fmt.Errorf("Error setting scheduled time for prioritized "+
                "tasks: %v", err)
        }

        taskQueueItems[d.Id] = queuedTasks
    }

    err = model.UpdateMinQueuePos(taskIdToMinQueuePos)
    if err != nil {
        return fmt.Errorf("Error updating tasks with queue positions: %v", err)
    }

    // split distros by name
    distrosByName := make(map[string]distro.Distro)
    for _, d := range distros {
        distrosByName[d.Id] = d
    }

    // fetch all hosts, split by distro
    allHosts, err := host.Find(host.IsLive)
    if err != nil {
        return fmt.Errorf("Error finding live hosts: %v", err)
    }

    // figure out all hosts we have up - per distro
    hostsByDistro := make(map[string][]host.Host)
    for _, liveHost := range allHosts {
        hostsByDistro[liveHost.Distro.Id] = append(hostsByDistro[liveHost.Distro.Id], liveHost)
    }

    // construct the data that will be needed by the host allocator
    hostAllocatorData := HostAllocatorData{
        existingDistroHosts:  hostsByDistro,
        distros:              distrosByName,
        taskQueueItems:       taskQueueItems,
        taskRunDistros:       taskRunDistros,
        projectTaskDurations: taskExpectedDuration,
    }

    // figure out how many new hosts we need
    newHostsNeeded, err := self.NewHostsNeeded(hostAllocatorData, self.Settings)
    if err != nil {
        return fmt.Errorf("Error determining how many new hosts are needed: %v", err)
    }

    // spawn up the hosts
    hostsSpawned, err := self.spawnHosts(newHostsNeeded)
    if err != nil {
        return fmt.Errorf("Error spawning new hosts: %v", err)
    }

    if len(hostsSpawned) != 0 {
        evergreen.Logger.Logf(slogger.INFO, "Hosts spawned (%v total), by distro: ", len(hostsSpawned))
        for distro, hosts := range hostsSpawned {
            evergreen.Logger.Logf(slogger.INFO, "  %v ->", distro)
            for _, host := range hosts {
                evergreen.Logger.Logf(slogger.INFO, "    %v", host.Id)
            }
        }
    } else {
        evergreen.Logger.Logf(slogger.INFO, "No new hosts spawned")
    }

    return nil
}