Example #1
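// StartJob attempts to place the job in the cluster, provisioning a data
// volume on the chosen host when the job needs one, and retries failed
// attempts up to maxJobAttempts.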
func (s *Scheduler) StartJob(job *Job) {
	log := logger.New("fn", "StartJob", "app.id", job.AppID, "release.id", job.ReleaseID, "job.type", job.Type)
	log.Info("starting job")

	for attempt := 0; attempt < maxJobAttempts; attempt++ {
		if attempt > 0 {
			time.Sleep(jobAttemptInterval)
		}
		log.Info("placing job in the cluster")
		config, host, err := s.PlaceJob(job)
		if err == ErrNotLeader {
			log.Warn("not starting job as not leader")
			return
		} else if err != nil {
			log.Error("error placing job in the cluster", "err", err)
			continue
		}

		if job.needsVolume() {
			log.Info("provisioning data volume", "host.id", host.ID)
			if err := utils.ProvisionVolume(host.client, config); err != nil {
				log.Error("error provisioning volume", "err", err)
				continue
			}
		}

		log.Info("adding job to the cluster", "host.id", host.ID, "job.id", config.ID)
		if err := host.client.AddJob(config); err != nil {
			log.Error("error adding job to the cluster", "err", err)
			continue
		}
		return
	}
	log.Error(fmt.Sprintf("error starting job after %d attempts", maxJobAttempts))
}
Example #2
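// TestVolumeDeleteOnStop runs a job with a provisioned volume twice, once
// with DeleteOnStop set and once without, and checks that the volume is
// destroyed after the job's cleanup event only in the first case.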
func (s *HostSuite) TestVolumeDeleteOnStop(t *c.C) {
	hosts, err := s.clusterClient(t).Hosts()
	t.Assert(err, c.IsNil)
	t.Assert(hosts, c.Not(c.HasLen), 0)
	h := hosts[0]

	// stream job events so we can wait for cleanup events
	events := make(chan *host.Event)
	stream, err := h.StreamEvents("all", events)
	t.Assert(err, c.IsNil)
	defer stream.Close()
	waitCleanup := func(jobID string) {
		timeout := time.After(30 * time.Second)
		for {
			select {
			case event := <-events:
				if event.JobID == jobID && event.Event == host.JobEventCleanup {
					return
				}
			case <-timeout:
				t.Fatal("timed out waiting for cleanup event")
			}
		}
	}

	for _, deleteOnStop := range []bool{true, false} {
		job := &host.Job{
			Config: host.ContainerConfig{
				Args:       []string{"sh", "-c", "ls -d /foo"},
				DisableLog: true,
			},
		}

		// provision a volume
		req := &ct.VolumeReq{Path: "/foo", DeleteOnStop: deleteOnStop}
		vol, err := utils.ProvisionVolume(req, h, job)
		t.Assert(err, c.IsNil)
		defer h.DestroyVolume(vol.ID)

		// run the job
		cmd := exec.JobUsingCluster(s.clusterClient(t), s.createArtifact(t, "test-apps"), job)
		cmd.HostID = h.ID()
		out, err := cmd.CombinedOutput()
		t.Assert(err, c.IsNil)
		t.Assert(string(out), c.Equals, "/foo\n")

		// wait for a cleanup event
		waitCleanup(job.ID)

		// check if the volume was deleted or not
		vol, err = h.GetVolume(vol.ID)
		if deleteOnStop {
			t.Assert(hh.IsObjectNotFoundError(err), c.Equals, true)
		} else {
			t.Assert(err, c.IsNil)
		}
	}
}
Example #3
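// startJob clones the request, picks the best host for it, provisions a data
// volume when required and asks the host to add the job; on failure the
// request is requeued after jobAttemptInterval until maxJobAttempts is reached.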
func (s *Scheduler) startJob(req *JobRequest) (err error) {
	log := logger.New("fn", "startJob", "job.type", req.Type)
	log.Info("starting job", "job.restarts", req.restarts, "request.attempts", req.attempts)
	s.jobs.SetState(req.JobID, JobStateStopped)
	// We'll be changing the content of the job, including the job ID,
	// so we need to copy it to prevent it from getting stale in s.jobs
	newReq := req.Clone()
	newReq.HostID = ""
	newReq.JobID = random.UUID()
	newReq.state = JobStateRequesting
	defer func() {
		if err != nil {
			if newReq.attempts >= maxJobAttempts {
				log.Error("error starting job, max job attempts reached", "err", err)
			} else {
				log.Error("error starting job, trying again", "err", err)
				newReq.attempts++
				s.jobs[newReq.JobID] = newReq.Job
				time.AfterFunc(jobAttemptInterval, func() {
					s.jobRequests <- newReq
				})
			}
		} else {
			s.jobs[newReq.JobID] = newReq.Job
		}
	}()

	log.Info("determining best host for job")
	host, err := s.findBestHost(newReq.Formation, newReq.Type)
	if err != nil {
		log.Error("error determining best host for job", "err", err)
		return err
	}
	hostID := host.ID()
	newReq.HostID = hostID

	config := jobConfig(newReq, hostID)
	newReq.JobID = config.ID

	// Provision a data volume on the host if needed.
	if newReq.needsVolume() {
		log.Info("provisioning volume")
		if err := utils.ProvisionVolume(host, config); err != nil {
			log.Error("error provisioning volume", "err", err)
			return err
		}
	}

	log.Info("requesting host to add job", "host.id", hostID, "job.id", config.ID)
	if err := host.AddJob(config); err != nil {
		log.Error("error requesting host to add job", "err", err)
		return err
	}
	return nil
}
Example #4
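// StartJob places the job in the cluster, backing off between attempts in
// 500ms increments (capped at 30s) and provisioning a data volume on the
// chosen host when the job needs one.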
func (s *Scheduler) StartJob(job *Job) {
	log := s.logger.New("fn", "StartJob", "app.id", job.AppID, "release.id", job.ReleaseID, "job.type", job.Type)
	log.Info("starting job")

	for attempt := 0; ; attempt++ {
		if attempt > 0 {
			// when making multiple attempts, backoff in increments
			// of 500ms (capped at 30s)
			delay := 500 * time.Millisecond * time.Duration(attempt)
			if delay > 30*time.Second {
				delay = 30 * time.Second
			}
			log.Info(fmt.Sprintf("failed to start job after %d attempts, waiting %s before trying again", attempt, delay))
			time.Sleep(delay)
		}

		log.Info("placing job in the cluster")
		config, host, err := s.PlaceJob(job)
		if err == ErrNotLeader {
			log.Warn("not starting job as not leader")
			return
		} else if err == ErrNoHostsMatchTags {
			log.Warn("unable to place job as tags don't match any hosts")
			return
		} else if err == ErrJobNotPending {
			log.Warn("unable to place job as it is no longer pending")
			return
		} else if err != nil {
			log.Error("error placing job in the cluster", "err", err)
			continue
		}

		if job.needsVolume() {
			log.Info("provisioning data volume", "host.id", host.ID)
			if err := utils.ProvisionVolume(host.client, config); err != nil {
				log.Error("error provisioning volume", "err", err)
				continue
			}
		}

		log.Info("adding job to the cluster", "host.id", host.ID, "job.id", config.ID)
		if err := host.client.AddJob(config); err != nil {
			log.Error("error adding job to the cluster", "err", err)
			continue
		}
		return
	}
}
Example #5
File: main.go Project: kgrz/flynn
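// start launches a job of the given type, picking the host with the fewest
// jobs of that type when hostID is empty, provisioning a data volume when the
// process type requires one and rolling back bookkeeping if AddJob fails.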
func (f *Formation) start(typ string, hostID string) (job *Job, err error) {
	if hostID == "" {
		hosts := f.c.hosts.List()
		if len(hosts) == 0 {
			return nil, errors.New("no hosts found")
		}
		sh := make(sortHosts, 0, len(hosts))
		for _, host := range hosts {
			count := 0
			for k := range f.jobs[typ] {
				if k.hostID == host.ID() {
					count++
				}
			}
			sh = append(sh, sortHost{host.ID(), count})
		}
		sh.Sort()
		hostID = sh[0].HostID
	}
	h := f.c.hosts.Get(hostID)
	if h == nil {
		return nil, fmt.Errorf("unknown host %q", hostID)
	}

	config := f.jobConfig(typ, h.ID())

	// Provision a data volume on the host if needed.
	if f.Release.Processes[typ].Data {
		if err := utils.ProvisionVolume(h, config); err != nil {
			return nil, err
		}
	}

	job = f.jobs.Add(typ, h.ID(), config.ID)
	job.Formation = f
	f.c.jobs.Add(job)

	if err := h.AddJob(config); err != nil {
		f.jobs.Remove(job)
		f.c.jobs.Remove(config.ID, h.ID())
		return nil, err
	}
	return job, nil
}
Example #6
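// Start runs the command as a job on a cluster host: it picks a host if none
// is set, builds the job config, provisions the requested volumes, attaches
// to stdio or streams job events, and finally adds the job to the host.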
func (c *Cmd) Start() error {
	if c.started {
		return errors.New("exec: already started")
	}
	c.done = make(chan struct{})
	c.started = true
	if c.host == nil && c.cluster == nil {
		// NewClient does not return an error here, so no error check is needed.
		c.cluster = cluster.NewClient()
		c.closeCluster = true
	}

	if c.HostID == "" {
		hosts, err := c.cluster.Hosts()
		if err != nil {
			return err
		}
		if len(hosts) == 0 {
			return errors.New("exec: no hosts found")
		}
		host := schedutil.PickHost(hosts)
		c.HostID = host.ID()
		c.host = host
	}

	// Use the pre-defined host.Job configuration if provided;
	// otherwise generate one from the fields on exec.Cmd that mirror stdlib's os.exec.
	if c.Job == nil {
		c.Job = &host.Job{
			Config: host.ContainerConfig{
				Args:  c.Args,
				TTY:   c.TTY,
				Env:   c.Env,
				Stdin: c.Stdin != nil || c.stdinPipe != nil,
			},
			Metadata: c.Meta,
		}
		// if attaching to stdout / stderr, avoid round tripping the
		// streams via on-disk log files.
		if c.Stdout != nil || c.Stderr != nil {
			c.Job.Config.DisableLog = true
		}
	}
	if c.Job.ID == "" {
		c.Job.ID = cluster.GenerateJobID(c.HostID, "")
	}

	if c.host == nil {
		var err error
		c.host, err = c.cluster.Host(c.HostID)
		if err != nil {
			return err
		}
	}

	for _, vol := range c.Volumes {
		if _, err := utils.ProvisionVolume(vol, c.host, c.Job); err != nil {
			return err
		}
	}

	utils.SetupMountspecs(c.Job, []*ct.Artifact{c.ImageArtifact})

	if c.Stdout != nil || c.Stderr != nil || c.Stdin != nil || c.stdinPipe != nil {
		req := &host.AttachReq{
			JobID:  c.Job.ID,
			Height: c.TermHeight,
			Width:  c.TermWidth,
			Flags:  host.AttachFlagStream,
		}
		if c.Stdout != nil {
			req.Flags |= host.AttachFlagStdout
		}
		if c.Stderr != nil {
			req.Flags |= host.AttachFlagStderr
		}
		if c.Job.Config.Stdin {
			req.Flags |= host.AttachFlagStdin
		}
		var err error
		c.attachClient, err = c.host.Attach(req, true)
		if err != nil {
			c.close()
			return err
		}
	}

	if c.stdinPipe != nil {
		c.stdinPipe.set(writeCloseCloser{c.attachClient})
	} else if c.Stdin != nil {
		go func() {
			io.Copy(c.attachClient, c.Stdin)
			c.attachClient.CloseWrite()
		}()
	}

	if c.attachClient == nil {
		c.eventChan = make(chan *host.Event)
		var err error
		c.eventStream, err = c.host.StreamEvents(c.Job.ID, c.eventChan)
		if err != nil {
			return err
		}
	}

	go func() {
		defer close(c.done)
		if c.attachClient != nil {
			c.exitStatus, c.streamErr = c.attachClient.Receive(c.Stdout, c.Stderr)
		} else {
		outer:
			for e := range c.eventChan {
				switch e.Event {
				case "stop":
					c.exitStatus = *e.Job.ExitStatus
					break outer
				case "error":
					c.streamErr = errors.New(*e.Job.Error)
					break outer
				}
			}
			c.eventStream.Close()
			if c.streamErr == nil {
				c.streamErr = c.eventStream.Err()
			}
		}
	}()

	return c.host.AddJob(c.Job)
}
Example #7
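// Run bootstraps an app: it provisions its resources, spreads the requested
// process counts over shuffled hosts and starts a job for each, provisioning
// a data volume for process types that declare Data.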
func (a *RunAppAction) Run(s *State) error {
	if a.AppStep != "" {
		data, err := getAppStep(s, a.AppStep)
		if err != nil {
			return err
		}
		a.App = data.App
		procs := a.Processes
		a.ExpandedFormation = data.ExpandedFormation
		a.Processes = procs
	}
	as := &RunAppState{
		ExpandedFormation: a.ExpandedFormation,
		Resources:         make([]*resource.Resource, 0, len(a.Resources)),
		Providers:         make([]*ct.Provider, 0, len(a.Resources)),
	}
	s.StepData[a.ID] = as

	if a.App == nil {
		a.App = &ct.App{}
	}
	if a.App.ID == "" {
		a.App.ID = random.UUID()
	}
	if a.ImageArtifact == nil {
		return errors.New("bootstrap: artifact must be set")
	}
	if a.ImageArtifact.ID == "" {
		a.ImageArtifact.ID = random.UUID()
	}
	if a.Release == nil {
		return errors.New("bootstrap: release must be set")
	}
	if a.Release.ID == "" {
		a.Release.ID = random.UUID()
	}
	a.Release.ArtifactIDs = []string{a.ImageArtifact.ID}
	if a.Release.Env == nil {
		a.Release.Env = make(map[string]string)
	}
	interpolateRelease(s, a.Release)

	for _, p := range a.Resources {
		u, err := url.Parse(p.URL)
		if err != nil {
			return err
		}
		lookupDiscoverdURLHost(s, u, time.Second)
		res, err := resource.Provision(u.String(), nil)
		if err != nil {
			return err
		}
		as.Providers = append(as.Providers, p)
		as.Resources = append(as.Resources, res)
		for k, v := range res.Env {
			a.Release.Env[k] = v
		}
	}

	for typ, count := range a.Processes {
		if s.Singleton && count > 1 {
			a.Processes[typ] = 1
			count = 1
		}
		hosts := s.ShuffledHosts()
		if a.ExpandedFormation.Release.Processes[typ].Omni {
			count = len(hosts)
		}
		for i := 0; i < count; i++ {
			host := hosts[i%len(hosts)]
			config := utils.JobConfig(a.ExpandedFormation, typ, host.ID(), "")
			hostresource.SetDefaults(&config.Resources)
			if a.ExpandedFormation.Release.Processes[typ].Data {
				if err := utils.ProvisionVolume(host, config); err != nil {
					return err
				}
			}
			if err := startJob(s, host, config); err != nil {
				return err
			}
		}
	}

	return nil
}
Example #8
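// FixPostgres restarts missing postgres jobs using the state stored in
// discoverd, provisioning a volume when a second instance has to be started,
// and then waits for postgres to come up read-write.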
func (f *ClusterFixer) FixPostgres() error {
	f.l.Info("checking postgres")
	service := discoverd.NewService("postgres")
	leader, _ := service.Leader()
	if leader == nil || leader.Addr == "" {
		f.l.Info("no running postgres leader")
		leader = nil
	} else {
		f.l.Info("found running postgres leader")
	}
	instances, _ := service.Instances()
	f.l.Info(fmt.Sprintf("found %d running postgres instances", len(instances)))

	f.l.Info("getting postgres status")
	var status *pgmanager.Status
	if leader != nil && leader.Addr != "" {
		client := pgmanager.NewClient(leader.Addr)
		var err error
		status, err = client.Status()
		if err != nil {
			f.l.Error("error getting status from postgres leader", "error", err)
		}
	}
	if status != nil && status.Postgres.ReadWrite {
		f.l.Info("postgres claims to be read-write")
		return nil
	}

	f.l.Info("getting postgres service metadata")
	meta, err := discoverd.NewService("postgres").GetMeta()
	if err != nil {
		return fmt.Errorf("error getting postgres state from discoverd: %s", err)
	}

	var state pgstate.State
	if err := json.Unmarshal(meta.Data, &state); err != nil {
		return fmt.Errorf("error decoding postgres state: %s", err)
	}
	if state.Primary == nil {
		return fmt.Errorf("no primary in postgres state")
	}

	f.l.Info("getting postgres primary job info", "job.id", state.Primary.Meta["FLYNN_JOB_ID"])
	job, host, err := f.GetJob(state.Primary.Meta["FLYNN_JOB_ID"])
	if err != nil {
		if state.Sync != nil {
			f.l.Error("unable to get primary job info", "error", err)
			f.l.Info("getting postgres sync job info", "job.id", state.Sync.Meta["FLYNN_JOB_ID"])
			job, host, err = f.GetJob(state.Sync.Meta["FLYNN_JOB_ID"])
			if err != nil {
				return fmt.Errorf("unable to get postgres primary or sync job details: %s", err)
			}
		} else {
			return fmt.Errorf("unable to get postgres primary job details: %s", err)
		}
	}

	if leader != nil && state.Singleton {
		return fmt.Errorf("postgres leader is running in singleton mode, unable to fix")
	}

	waitForInstance := func(jobID string) (func() (string, error), error) {
		watchCh := make(chan *discoverd.Event)
		upCh := make(chan string)
		stream, err := service.Watch(watchCh)
		if err != nil {
			return nil, fmt.Errorf("error watching discoverd service: %s", err)
		}
		go func() {
			var current bool
			for event := range watchCh {
				if event.Kind == discoverd.EventKindCurrent {
					current = true
					continue
				}
				if !current || event.Kind != discoverd.EventKindUp {
					continue
				}
				if event.Instance.Meta["FLYNN_JOB_ID"] == jobID {
					upCh <- event.Instance.Addr
				}
			}
		}()
		return func() (string, error) {
			f.l.Info("waiting for postgres instance to start", "job.id", jobID)
			defer stream.Close()
			select {
			case addr := <-upCh:
				return addr, nil
			case <-time.After(time.Minute):
				return "", fmt.Errorf("timed out waiting for postgres instance to come up")
			}
		}, nil
	}

	var wait func() (string, error)
	have := len(instances)
	want := 2
	if state.Singleton {
		want = 1
	}
	if have >= want {
		return fmt.Errorf("already have enough postgres instances, unable to fix")
	}
	f.l.Info("attempting to start missing postgres jobs", "want", want, "have", have)
	if leader == nil {
		// if no postgres, attempt to start
		job.ID = cluster.GenerateJobID(host.ID(), "")
		f.FixJobEnv(job)
		f.l.Info("starting postgres primary job", "job.id", job.ID)
		wait, err = waitForInstance(job.ID)
		if err != nil {
			return err
		}
		if err := host.AddJob(job); err != nil {
			return fmt.Errorf("error starting postgres primary job on %s: %s", host.ID(), err)
		}
		have++
	}
	if want > have {
		// if not enough postgres instances, start another
		var secondHost *cluster.Host
		for _, h := range f.hosts {
			if h.ID() != host.ID() {
				secondHost = h
				break
			}
		}
		if secondHost == nil {
			// if there are no other hosts, use the same one we put the primary on
			secondHost = host
		}
		job.ID = cluster.GenerateJobID(secondHost.ID(), "")
		f.FixJobEnv(job)
		f.l.Info("starting second postgres job", "job.id", job.ID)
		if wait == nil {
			wait, err = waitForInstance(job.ID)
			if err != nil {
				return err
			}
		}
		if err := utils.ProvisionVolume(secondHost, job); err != nil {
			return fmt.Errorf("error creating postgres volume on %s: %s", secondHost.ID(), err)
		}
		if err := secondHost.AddJob(job); err != nil {
			return fmt.Errorf("error starting additional postgres job on %s: %s", secondHost.ID(), err)
		}
	}

	if wait != nil {
		addr, err := wait()
		if err != nil {
			return err
		}
		if leader != nil {
			addr = leader.Addr
		}
		f.l.Info("waiting for postgres to come up read-write")
		return pgmanager.NewClient(addr).WaitForReadWrite(5 * time.Minute)
	}
	return nil
}
Example #9
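// Run bootstraps an app against the cluster client: it provisions resources,
// sorts the hosts and starts the requested number of jobs per process type,
// provisioning a data volume where the release requires one.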
func (a *RunAppAction) Run(s *State) error {
	if a.AppStep != "" {
		data, err := getAppStep(s, a.AppStep)
		if err != nil {
			return err
		}
		a.App = data.App
		procs := a.Processes
		a.ExpandedFormation = data.ExpandedFormation
		a.Processes = procs
	}
	as := &RunAppState{
		ExpandedFormation: a.ExpandedFormation,
		Resources:         make([]*resource.Resource, 0, len(a.Resources)),
		Providers:         make([]*ct.Provider, 0, len(a.Resources)),
	}
	s.StepData[a.ID] = as

	if a.App == nil {
		a.App = &ct.App{}
	}
	if a.App.ID == "" {
		a.App.ID = random.UUID()
	}
	if a.Artifact == nil {
		return errors.New("bootstrap: artifact must be set")
	}
	if a.Artifact.ID == "" {
		a.Artifact.ID = random.UUID()
	}
	if a.Release == nil {
		return errors.New("bootstrap: release must be set")
	}
	if a.Release.ID == "" {
		a.Release.ID = random.UUID()
	}
	a.Release.ArtifactID = a.Artifact.ID
	if a.Release.Env == nil {
		a.Release.Env = make(map[string]string)
	}
	interpolateRelease(s, a.Release)

	for _, p := range a.Resources {
		u, err := url.Parse(p.URL)
		if err != nil {
			return err
		}
		lookupDiscoverdURLHost(u, time.Second)
		res, err := resource.Provision(u.String(), nil)
		if err != nil {
			return err
		}
		as.Providers = append(as.Providers, p)
		as.Resources = append(as.Resources, res)
		for k, v := range res.Env {
			a.Release.Env[k] = v
		}
	}

	cc, err := s.ClusterClient()
	if err != nil {
		return err
	}
	for typ, count := range a.Processes {
		if s.Singleton && count > 1 {
			a.Processes[typ] = 1
			count = 1
		}
		hosts, err := cc.ListHosts()
		if err != nil {
			return err
		}
		sort.Sort(schedutil.HostSlice(hosts))
		for i := 0; i < count; i++ {
			hostID := hosts[i%len(hosts)].ID
			config := utils.JobConfig(a.ExpandedFormation, typ, hostID)
			if a.ExpandedFormation.Release.Processes[typ].Data {
				if err := utils.ProvisionVolume(cc, hostID, config); err != nil {
					return err
				}
			}
			job, err := startJob(s, hostID, config)
			if err != nil {
				return err
			}
			as.Jobs = append(as.Jobs, *job)
		}
	}

	return nil
}
Example #10
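// FixSirenia repairs a sirenia-managed service: it stops unassigned instances,
// restarts the primary and sync jobs recorded in the discoverd state
// (provisioning a fresh data volume when the sync job must be recreated) and
// waits for the cluster to come up read-write.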
func (f *ClusterFixer) FixSirenia(svc string) error {
	log := f.l.New("fn", "FixSirenia", "service", svc)

	service := discoverd.NewService(svc)
	instances, _ := service.Instances()
	leader, _ := service.Leader()

	log.Info("getting service metadata")
	meta, err := service.GetMeta()
	if err != nil {
		return fmt.Errorf("error getting sirenia state from discoverd: %s", err)
	}

	var state state.State
	if err := json.Unmarshal(meta.Data, &state); err != nil {
		return fmt.Errorf("error decoding state: %s", err)
	}
	if state.Primary == nil {
		return fmt.Errorf("no primary in sirenia state")
	}

	log.Info("getting primary job info", "job.id", state.Primary.Meta["FLYNN_JOB_ID"])
	primaryJob, primaryHost, err := f.GetJob(state.Primary.Meta["FLYNN_JOB_ID"])
	if err != nil {
		log.Error("unable to get primary job info")
	}
	var syncJob *host.Job
	var syncHost *cluster.Host
	if state.Sync != nil {
		log.Info("getting sync job info", "job.id", state.Sync.Meta["FLYNN_JOB_ID"])
		syncJob, syncHost, err = f.GetJob(state.Sync.Meta["FLYNN_JOB_ID"])
		if err != nil {
			log.Error("unable to get sync job info")
		}
	}

	waitForInstance := func(jobID string) (func() (string, error), error) {
		watchCh := make(chan *discoverd.Event)
		upCh := make(chan string)
		stream, err := service.Watch(watchCh)
		if err != nil {
			return nil, fmt.Errorf("error watching discoverd service: %s", err)
		}
		go func() {
			var current bool
			for event := range watchCh {
				if event.Kind == discoverd.EventKindCurrent {
					current = true
					continue
				}
				if !current || event.Kind != discoverd.EventKindUp {
					continue
				}
				if event.Instance.Meta["FLYNN_JOB_ID"] == jobID {
					upCh <- event.Instance.Addr
				}
			}
		}()
		return func() (string, error) {
			log.Info("waiting for instance to start", "job.id", jobID)
			defer stream.Close()
			select {
			case addr := <-upCh:
				return addr, nil
			case <-time.After(time.Minute):
				return "", fmt.Errorf("timed out waiting for sirenia instance to come up")
			}
		}, nil
	}

	log.Info("terminating unassigned sirenia instances")
outer:
	for _, i := range instances {
		if i.Addr == state.Primary.Addr || (state.Sync != nil && i.Addr == state.Sync.Addr) {
			continue
		}
		for _, a := range state.Async {
			if i.Addr == a.Addr {
				continue outer
			}
		}
		// job not assigned in state, attempt to terminate it
		if jobID, ok := i.Meta["FLYNN_JOB_ID"]; ok {
			hostID, err := cluster.ExtractHostID(jobID)
			if err != nil {
				log.Error("error extracting host id from jobID", "jobID", jobID, "err", err)
			}
			h := f.Host(hostID)
			if h != nil {
				if err := h.StopJob(jobID); err != nil {
					log.Error("error stopping unassigned sirenia job", "jobID", jobID)
				}
			} else {
				log.Error("host not found", "hostID", hostID)
			}
		}
	}

	isRunning := func(addr string) bool {
		for _, i := range instances {
			if i.Addr == addr {
				return true
			}
		}
		return false
	}

	// if the leader isn't currently running then start it using primaryJob/primaryHost
	var wait func() (string, error)
	if !isRunning(state.Primary.Addr) {
		// if we don't have info about the primary job attempt to promote the sync
		if primaryJob == nil {
			if syncJob != nil {
				// set primary job to sync
				primaryJob = syncJob
				primaryHost = syncHost

				// nil out sync job now so we can re-allocate it.
				syncJob = nil
				syncHost = nil
			} else {
				return fmt.Errorf("neither primary or sync job info available")
			}
		}

		primaryJob.ID = cluster.GenerateJobID(primaryHost.ID(), "")
		f.FixJobEnv(primaryJob)
		log.Info("starting primary job", "job.id", primaryJob.ID)
		wait, err = waitForInstance(primaryJob.ID)
		if err != nil {
			return err
		}
		if err := primaryHost.AddJob(primaryJob); err != nil {
			return fmt.Errorf("error starting primary job on %s: %s", primaryHost.ID(), err)
		}
	}
	if !state.Singleton && !isRunning(state.Sync.Addr) {
		if syncHost == nil {
			for _, h := range f.hosts {
				if h.ID() != primaryHost.ID() {
					syncHost = h
					break
				}
			}
			if syncHost == nil {
				// if there are no other hosts, use the same one we put the primary on
				syncHost = primaryHost
			}
		}
		// if we don't have a sync job then copy the primary job
		// and provision a new volume
		if syncJob == nil {
			syncJob = primaryJob
			vol := &ct.VolumeReq{Path: "/data"}
			if _, err := utils.ProvisionVolume(vol, syncHost, syncJob); err != nil {
				return fmt.Errorf("error creating volume on %s: %s", syncHost.ID(), err)
			}
		}
		syncJob.ID = cluster.GenerateJobID(syncHost.ID(), "")
		f.FixJobEnv(syncJob)
		log.Info("starting sync job", "job.id", syncJob.ID)
		if wait == nil {
			wait, err = waitForInstance(syncJob.ID)
			if err != nil {
				return err
			}
		}
		if err := syncHost.AddJob(syncJob); err != nil {
			return fmt.Errorf("error starting additional job on %s: %s", syncHost.ID(), err)
		}
	}

	if wait != nil {
		addr, err := wait()
		if err != nil {
			return err
		}
		if leader != nil && leader.Addr != "" {
			addr = leader.Addr
		}
		log.Info("waiting for cluster to come up read-write", "addr", addr)
		return sirenia.NewClient(addr).WaitForReadWrite(5 * time.Minute)
	}
	return nil
}
Example #11
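// RunJob runs a one-off job for an app: it builds the job from the release
// and request, provisions a /data volume when requested, optionally attaches
// to the job over the flynn-attach protocol and adds the job to a random host.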
func (c *controllerAPI) RunJob(ctx context.Context, w http.ResponseWriter, req *http.Request) {
	var newJob ct.NewJob
	if err := httphelper.DecodeJSON(req, &newJob); err != nil {
		respondWithError(w, err)
		return
	}

	if err := schema.Validate(newJob); err != nil {
		respondWithError(w, err)
		return
	}

	data, err := c.releaseRepo.Get(newJob.ReleaseID)
	if err != nil {
		respondWithError(w, err)
		return
	}
	release := data.(*ct.Release)
	var artifactIDs []string
	if len(newJob.ArtifactIDs) > 0 {
		artifactIDs = newJob.ArtifactIDs
	} else if len(release.ArtifactIDs) > 0 {
		artifactIDs = release.ArtifactIDs
	} else {
		httphelper.ValidationError(w, "release.ArtifactIDs", "cannot be empty")
		return
	}

	artifacts := make([]*ct.Artifact, len(artifactIDs))
	artifactList, err := c.artifactRepo.ListIDs(artifactIDs...)
	if err != nil {
		respondWithError(w, err)
		return
	}
	for i, id := range artifactIDs {
		artifacts[i] = artifactList[id]
	}

	var entrypoint ct.ImageEntrypoint
	if e := utils.GetEntrypoint(artifacts, ""); e != nil {
		entrypoint = *e
	}

	attach := strings.Contains(req.Header.Get("Upgrade"), "flynn-attach/0")

	hosts, err := c.clusterClient.Hosts()
	if err != nil {
		respondWithError(w, err)
		return
	}
	if len(hosts) == 0 {
		respondWithError(w, errors.New("no hosts found"))
		return
	}
	client := hosts[random.Math.Intn(len(hosts))]

	uuid := random.UUID()
	hostID := client.ID()
	id := cluster.GenerateJobID(hostID, uuid)
	app := c.getApp(ctx)
	env := make(map[string]string, len(entrypoint.Env)+len(release.Env)+len(newJob.Env)+4)
	env["FLYNN_APP_ID"] = app.ID
	env["FLYNN_RELEASE_ID"] = release.ID
	env["FLYNN_PROCESS_TYPE"] = ""
	env["FLYNN_JOB_ID"] = id
	for k, v := range entrypoint.Env {
		env[k] = v
	}
	if newJob.ReleaseEnv {
		for k, v := range release.Env {
			env[k] = v
		}
	}
	for k, v := range newJob.Env {
		env[k] = v
	}
	metadata := make(map[string]string, len(newJob.Meta)+3)
	for k, v := range newJob.Meta {
		metadata[k] = v
	}
	metadata["flynn-controller.app"] = app.ID
	metadata["flynn-controller.app_name"] = app.Name
	metadata["flynn-controller.release"] = release.ID
	job := &host.Job{
		ID:       id,
		Metadata: metadata,
		Config: host.ContainerConfig{
			Args:       entrypoint.Args,
			Env:        env,
			WorkingDir: entrypoint.WorkingDir,
			Uid:        entrypoint.Uid,
			Gid:        entrypoint.Gid,
			TTY:        newJob.TTY,
			Stdin:      attach,
			DisableLog: newJob.DisableLog,
		},
		Resources: newJob.Resources,
		Partition: string(newJob.Partition),
	}
	resource.SetDefaults(&job.Resources)
	if len(newJob.Args) > 0 {
		job.Config.Args = newJob.Args
	}
	utils.SetupMountspecs(job, artifacts)

	// provision data volume if required
	if newJob.Data {
		vol := &ct.VolumeReq{Path: "/data", DeleteOnStop: true}
		if _, err := utils.ProvisionVolume(vol, client, job); err != nil {
			respondWithError(w, err)
			return
		}
	}

	var attachClient cluster.AttachClient
	if attach {
		attachReq := &host.AttachReq{
			JobID:  job.ID,
			Flags:  host.AttachFlagStdout | host.AttachFlagStderr | host.AttachFlagStdin | host.AttachFlagStream,
			Height: uint16(newJob.Lines),
			Width:  uint16(newJob.Columns),
		}
		attachClient, err = client.Attach(attachReq, true)
		if err != nil {
			respondWithError(w, fmt.Errorf("attach failed: %s", err.Error()))
			return
		}
		defer attachClient.Close()
	}

	if err := client.AddJob(job); err != nil {
		respondWithError(w, fmt.Errorf("schedule failed: %s", err.Error()))
		return
	}

	if attach {
		// TODO(titanous): This Wait could block indefinitely if something goes
		// wrong, a context should be threaded in that cancels if the client
		// goes away.
		if err := attachClient.Wait(); err != nil {
			respondWithError(w, fmt.Errorf("attach wait failed: %s", err.Error()))
			return
		}
		w.Header().Set("Connection", "upgrade")
		w.Header().Set("Upgrade", "flynn-attach/0")
		w.WriteHeader(http.StatusSwitchingProtocols)
		conn, _, err := w.(http.Hijacker).Hijack()
		if err != nil {
			panic(err)
		}
		defer conn.Close()

		done := make(chan struct{}, 2)
		cp := func(to io.Writer, from io.Reader) {
			io.Copy(to, from)
			done <- struct{}{}
		}
		go cp(conn, attachClient.Conn())
		go cp(attachClient.Conn(), conn)

		// Wait for one of the connections to be closed or interrupted. EOF is
		// framed inside the attach protocol, so a read/write error indicates
		// that we're done and should clean up.
		<-done

		return
	} else {
		httphelper.JSON(w, 200, &ct.Job{
			ID:        job.ID,
			UUID:      uuid,
			HostID:    hostID,
			ReleaseID: newJob.ReleaseID,
			Args:      newJob.Args,
		})
	}
}