func (s *Scheduler) findBestHost(formation *Formation, typ string) (utils.HostClient, error) { log := logger.New("fn", "findBestHost", "app.id", formation.App.ID, "release.id", formation.Release.ID, "job.type", typ) log.Info("getting host list") hosts, err := s.getHosts() if err != nil { log.Error("error getting host list", "err", err) return nil, err } counts := s.jobs.GetHostJobCounts(formation.key(), typ) var minCount int = math.MaxInt32 var hostID string for _, host := range hosts { count, ok := counts[host.ID()] if !ok || count < minCount { minCount = count hostID = host.ID() } } if hostID == "" { return nil, fmt.Errorf("Unable to find a host out of %d host(s)", len(hosts)) } log.Info(fmt.Sprintf("using host with least %s jobs", typ), "host.id", hostID) return s.Host(hostID) }
func (s *Scheduler) SyncHosts() (err error) { log := s.logger.New("fn", "SyncHosts") log.Info("syncing hosts") defer func() { if err != nil { // try again soon time.AfterFunc(100*time.Millisecond, s.triggerSyncHosts) } }() hosts, err := s.Hosts() if err != nil { log.Error("error getting hosts", "err", err) return err } known := make(map[string]struct{}) var followErr error for _, host := range hosts { known[host.ID()] = struct{}{} h, err := s.followHost(host) if err == nil { // make sure no jobs are pending which needn't be s.maybeStartPendingTagJobs(h) } else { log.Error("error following host", "host.id", host.ID(), "err", err) // finish the sync before returning the error followErr = err } } // mark any hosts as unhealthy which are not returned from s.Hosts() // and are not explicitly shutdown for id, host := range s.hosts { if _, ok := known[id]; !ok && !host.shutdown { s.markHostAsUnhealthy(host) } } if followErr != nil { return followErr } // return an error to trigger another sync if no hosts were found if len(hosts) == 0 { log.Error(ErrNoHosts.Error()) return ErrNoHosts } return nil }
func (s *Scheduler) SyncHosts() (err error) { log := logger.New("fn", "SyncHosts") log.Info("syncing hosts") defer func() { if err != nil { // try again soon time.AfterFunc(100*time.Millisecond, s.triggerSyncHosts) } }() hosts, err := s.Hosts() if err != nil { log.Error("error getting hosts", "err", err) return err } known := make(map[string]struct{}) var followErr error for _, host := range hosts { known[host.ID()] = struct{}{} if err := s.followHost(host); err != nil { log.Error("error following host", "host.id", host.ID(), "err", err) // finish the sync before returning the error followErr = err } } // mark any hosts as unhealthy which are not returned from s.Hosts() for id, host := range s.hosts { if _, ok := known[id]; !ok { s.markHostAsUnhealthy(host) } } if followErr != nil { return followErr } // return an error to trigger another sync if no hosts were found if len(hosts) == 0 { e := "no hosts found" log.Error(e) return errors.New(e) } return nil }
func (s *Scheduler) startJob(req *JobRequest) (err error) { log := logger.New("fn", "startJob", "job.type", req.Type) log.Info("starting job", "job.restarts", req.restarts, "request.attempts", req.attempts) s.jobs.SetState(req.JobID, JobStateStopped) // We'll be changing the content of the job, including the job ID, // so we need to copy it to prevent it from getting stale in s.jobs newReq := req.Clone() newReq.HostID = "" newReq.JobID = random.UUID() newReq.state = JobStateRequesting defer func() { if err != nil { if newReq.attempts >= maxJobAttempts { log.Error("error starting job, max job attempts reached", "err", err) } else { log.Error("error starting job, trying again", "err", err) newReq.attempts++ s.jobs[newReq.JobID] = newReq.Job time.AfterFunc(jobAttemptInterval, func() { s.jobRequests <- newReq }) } } else { s.jobs[newReq.JobID] = newReq.Job } }() log.Info("determining best host for job") host, err := s.findBestHost(newReq.Formation, newReq.Type) if err != nil { log.Error("error determining best host for job", "err", err) return err } hostID := host.ID() newReq.HostID = hostID config := jobConfig(newReq, hostID) newReq.JobID = config.ID // Provision a data volume on the host if needed. if newReq.needsVolume() { log.Info("provisioning volume") if err := utils.ProvisionVolume(host, config); err != nil { log.Error("error provisioning volume", "err", err) return err } } log.Info("requesting host to add job", "host.id", hostID, "job.id", config.ID) if err := host.AddJob(config); err != nil { log.Error("error requesting host to add job", "err", err) return err } return nil }