Example #1
0
// findBestHost returns a client for the host that is currently running the
// fewest jobs of type typ for the given formation.
//
// The host list comes from s.getHosts and the per-host job counts from
// s.jobs.GetHostJobCounts. A host with no entry in the counts map is treated
// as running zero jobs and always becomes the current candidate. An error is
// returned when the host list cannot be fetched, when no host could be
// selected, or when s.Host fails for the chosen ID.
func (s *Scheduler) findBestHost(formation *Formation, typ string) (utils.HostClient, error) {
	log := logger.New("fn", "findBestHost", "app.id", formation.App.ID, "release.id", formation.Release.ID, "job.type", typ)

	log.Info("getting host list")
	hosts, err := s.getHosts()
	if err != nil {
		log.Error("error getting host list", "err", err)
		return nil, err
	}

	jobCounts := s.jobs.GetHostJobCounts(formation.key(), typ)

	// Scan every host, keeping the one with the lowest count seen so far.
	bestID := ""
	fewest := math.MaxInt32
	for _, candidate := range hosts {
		running, tracked := jobCounts[candidate.ID()]
		if tracked && running >= fewest {
			// Not better than the current candidate.
			continue
		}
		// Either the host is untracked (running is the zero value) or it
		// has strictly fewer jobs than the previous best.
		fewest = running
		bestID = candidate.ID()
	}

	if bestID == "" {
		return nil, fmt.Errorf("Unable to find a host out of %d host(s)", len(hosts))
	}

	log.Info(fmt.Sprintf("using host with least %s jobs", typ), "host.id", bestID)
	return s.Host(bestID)
}
Example #2
0
// SyncHosts reconciles the scheduler's view of hosts with the list returned
// by s.Hosts.
//
// Every returned host is followed via s.followHost; on success any pending
// tag jobs for it are given a chance to start. Hosts present in s.hosts but
// absent from the returned list are marked unhealthy unless they are flagged
// as shut down. The sync always runs to completion before a follow error is
// reported (only the last one is returned). ErrNoHosts is returned when the
// list is empty.
//
// Whenever a non-nil error is returned, another sync is scheduled through
// s.triggerSyncHosts after 100ms.
func (s *Scheduler) SyncHosts() (err error) {
	log := s.logger.New("fn", "SyncHosts")
	log.Info("syncing hosts")

	// err is the named result, so this observes whatever error is returned.
	defer func() {
		if err == nil {
			return
		}
		// try again soon
		time.AfterFunc(100*time.Millisecond, s.triggerSyncHosts)
	}()

	hosts, err := s.Hosts()
	if err != nil {
		log.Error("error getting hosts", "err", err)
		return err
	}

	seen := make(map[string]struct{})
	var lastFollowErr error
	for _, host := range hosts {
		seen[host.ID()] = struct{}{}

		followed, ferr := s.followHost(host)
		if ferr != nil {
			log.Error("error following host", "host.id", host.ID(), "err", ferr)
			// remember the failure but keep syncing the remaining hosts
			lastFollowErr = ferr
			continue
		}

		// make sure no jobs are pending which needn't be
		s.maybeStartPendingTagJobs(followed)
	}

	// Any tracked host that s.Hosts() did not return is considered
	// unhealthy, except for hosts that were explicitly shut down.
	for id, host := range s.hosts {
		if _, present := seen[id]; present {
			continue
		}
		if host.shutdown {
			continue
		}
		s.markHostAsUnhealthy(host)
	}

	switch {
	case lastFollowErr != nil:
		return lastFollowErr
	case len(hosts) == 0:
		// an error here triggers another sync via the deferred retry
		log.Error(ErrNoHosts.Error())
		return ErrNoHosts
	}
	return nil
}
Example #3
0
// SyncHosts reconciles the scheduler's view of hosts with the list returned
// by s.Hosts.
//
// Every returned host is followed via s.followHost, and any host present in
// s.hosts but absent from the returned list is marked unhealthy. The sync
// always runs to completion before a follow error is reported (only the last
// one is returned). A "no hosts found" error is returned when the list is
// empty.
//
// Whenever a non-nil error is returned, another sync is scheduled through
// s.triggerSyncHosts after 100ms.
func (s *Scheduler) SyncHosts() (err error) {
	log := logger.New("fn", "SyncHosts")
	log.Info("syncing hosts")

	// err is the named result, so this observes whatever error is returned.
	defer func() {
		if err == nil {
			return
		}
		// try again soon
		time.AfterFunc(100*time.Millisecond, s.triggerSyncHosts)
	}()

	hosts, err := s.Hosts()
	if err != nil {
		log.Error("error getting hosts", "err", err)
		return err
	}

	seen := make(map[string]struct{})
	var lastFollowErr error
	for _, host := range hosts {
		seen[host.ID()] = struct{}{}

		ferr := s.followHost(host)
		if ferr == nil {
			continue
		}
		log.Error("error following host", "host.id", host.ID(), "err", ferr)
		// remember the failure but keep syncing the remaining hosts
		lastFollowErr = ferr
	}

	// Any tracked host that s.Hosts() did not return is considered unhealthy.
	for id, host := range s.hosts {
		if _, present := seen[id]; present {
			continue
		}
		s.markHostAsUnhealthy(host)
	}

	switch {
	case lastFollowErr != nil:
		return lastFollowErr
	case len(hosts) == 0:
		// an error here triggers another sync via the deferred retry
		noHosts := errors.New("no hosts found")
		log.Error(noHosts.Error())
		return noHosts
	}
	return nil
}
Example #4
0
// startJob attempts to start the job described by req on the best available
// host.
//
// The original request's job is first marked JobStateStopped, then a clone
// with a fresh job ID is used for the new attempt. The clone is placed on the
// host chosen by s.findBestHost, a volume is provisioned when
// needsVolume reports true, and the host is asked to add the job.
//
// On return, a deferred handler records the outcome: on success the clone's
// job is stored in s.jobs; on failure the clone is stored and re-queued on
// s.jobRequests after jobAttemptInterval, unless its attempt count has
// already reached maxJobAttempts, in which case the error is only logged.
// The error from the failing step is returned to the caller.
func (s *Scheduler) startJob(req *JobRequest) (err error) {
	log := logger.New("fn", "startJob", "job.type", req.Type)
	log.Info("starting job", "job.restarts", req.restarts, "request.attempts", req.attempts)
	s.jobs.SetState(req.JobID, JobStateStopped)

	// Work on a copy: the job's fields (including its ID) are about to be
	// rewritten, and the entry already held in s.jobs must not change
	// underneath it.
	attempt := req.Clone()
	attempt.HostID = ""
	attempt.JobID = random.UUID()
	attempt.state = JobStateRequesting

	// err is the named result, so this sees the error of whichever step
	// failed, and the job ID as it stands at return time.
	defer func() {
		switch {
		case err == nil:
			s.jobs[attempt.JobID] = attempt.Job
		case attempt.attempts >= maxJobAttempts:
			log.Error("error starting job, max job attempts reached", "err", err)
		default:
			log.Error("error starting job, trying again", "err", err)
			attempt.attempts++
			s.jobs[attempt.JobID] = attempt.Job
			time.AfterFunc(jobAttemptInterval, func() {
				s.jobRequests <- attempt
			})
		}
	}()

	log.Info("determining best host for job")
	host, err := s.findBestHost(attempt.Formation, attempt.Type)
	if err != nil {
		log.Error("error determining best host for job", "err", err)
		return err
	}
	hostID := host.ID()
	attempt.HostID = hostID

	// The job config determines the final job ID.
	config := jobConfig(attempt, hostID)
	attempt.JobID = config.ID

	// Provision a data volume on the host if needed.
	if attempt.needsVolume() {
		log.Info("provisioning volume")
		if volErr := utils.ProvisionVolume(host, config); volErr != nil {
			log.Error("error provisioning volume", "err", volErr)
			return volErr
		}
	}

	log.Info("requesting host to add job", "host.id", hostID, "job.id", config.ID)
	if addErr := host.AddJob(config); addErr != nil {
		log.Error("error requesting host to add job", "err", addErr)
		return addErr
	}
	return nil
}