func (s *Scheduler) StartJob(job *Job) {
	log := logger.New("fn", "StartJob", "app.id", job.AppID, "release.id", job.ReleaseID, "job.type", job.Type)
	log.Info("starting job")

	for attempt := 0; attempt < maxJobAttempts; attempt++ {
		if attempt > 0 {
			time.Sleep(jobAttemptInterval)
		}
		log.Info("placing job in the cluster")
		config, host, err := s.PlaceJob(job)
		if err == ErrNotLeader {
			log.Warn("not starting job as not leader")
			return
		} else if err != nil {
			log.Error("error placing job in the cluster", "err", err)
			continue
		}
		if job.needsVolume() {
			log.Info("provisioning data volume", "host.id", host.ID)
			if err := utils.ProvisionVolume(host.client, config); err != nil {
				log.Error("error provisioning volume", "err", err)
				continue
			}
		}
		log.Info("adding job to the cluster", "host.id", host.ID, "job.id", config.ID)
		if err := host.client.AddJob(config); err != nil {
			log.Error("error adding job to the cluster", "err", err)
			continue
		}
		return
	}
	log.Error(fmt.Sprintf("error starting job after %d attempts", maxJobAttempts))
}
func (s *HostSuite) TestVolumeDeleteOnStop(t *c.C) {
	hosts, err := s.clusterClient(t).Hosts()
	t.Assert(err, c.IsNil)
	t.Assert(hosts, c.Not(c.HasLen), 0)
	h := hosts[0]

	// stream job events so we can wait for cleanup events
	events := make(chan *host.Event)
	stream, err := h.StreamEvents("all", events)
	t.Assert(err, c.IsNil)
	defer stream.Close()
	waitCleanup := func(jobID string) {
		timeout := time.After(30 * time.Second)
		for {
			select {
			case event := <-events:
				if event.JobID == jobID && event.Event == host.JobEventCleanup {
					return
				}
			case <-timeout:
				t.Fatal("timed out waiting for cleanup event")
			}
		}
	}

	for _, deleteOnStop := range []bool{true, false} {
		job := &host.Job{
			Config: host.ContainerConfig{
				Args:       []string{"sh", "-c", "ls -d /foo"},
				DisableLog: true,
			},
		}

		// provision a volume
		req := &ct.VolumeReq{Path: "/foo", DeleteOnStop: deleteOnStop}
		vol, err := utils.ProvisionVolume(req, h, job)
		t.Assert(err, c.IsNil)
		defer h.DestroyVolume(vol.ID)

		// run the job
		cmd := exec.JobUsingCluster(s.clusterClient(t), s.createArtifact(t, "test-apps"), job)
		cmd.HostID = h.ID()
		out, err := cmd.CombinedOutput()
		t.Assert(err, c.IsNil)
		t.Assert(string(out), c.Equals, "/foo\n")

		// wait for a cleanup event
		waitCleanup(job.ID)

		// check if the volume was deleted or not
		vol, err = h.GetVolume(vol.ID)
		if deleteOnStop {
			t.Assert(hh.IsObjectNotFoundError(err), c.Equals, true)
		} else {
			t.Assert(err, c.IsNil)
		}
	}
}
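// The sketch below restates the provisioning pattern from the test above
// outside the test harness: provision a volume that is deleted when the job
// stops, then run a one-off job on that host and return its output. The
// *cluster.Client and *ct.Artifact parameter types are assumptions inferred
// from how the surrounding snippets use cluster.NewClient and the artifact
// helpers; only calls that appear elsewhere in this section are used.
func runJobWithVolume(clusterClient *cluster.Client, h *cluster.Host, artifact *ct.Artifact) ([]byte, error) {
	job := &host.Job{
		Config: host.ContainerConfig{
			Args:       []string{"sh", "-c", "ls -d /foo"},
			DisableLog: true,
		},
	}

	// provision a volume mounted at /foo that is removed once the job stops
	req := &ct.VolumeReq{Path: "/foo", DeleteOnStop: true}
	if _, err := utils.ProvisionVolume(req, h, job); err != nil {
		return nil, err
	}

	// run the job on the host the volume was provisioned on
	cmd := exec.JobUsingCluster(clusterClient, artifact, job)
	cmd.HostID = h.ID()
	return cmd.CombinedOutput()
}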
func (s *Scheduler) startJob(req *JobRequest) (err error) {
	log := logger.New("fn", "startJob", "job.type", req.Type)
	log.Info("starting job", "job.restarts", req.restarts, "request.attempts", req.attempts)
	s.jobs.SetState(req.JobID, JobStateStopped)
	// We'll be changing the content of the job, including the job ID,
	// so we need to copy it to prevent it from getting stale in s.jobs
	newReq := req.Clone()
	newReq.HostID = ""
	newReq.JobID = random.UUID()
	newReq.state = JobStateRequesting
	defer func() {
		if err != nil {
			if newReq.attempts >= maxJobAttempts {
				log.Error("error starting job, max job attempts reached", "err", err)
			} else {
				log.Error("error starting job, trying again", "err", err)
				newReq.attempts++
				s.jobs[newReq.JobID] = newReq.Job
				time.AfterFunc(jobAttemptInterval, func() {
					s.jobRequests <- newReq
				})
			}
		} else {
			s.jobs[newReq.JobID] = newReq.Job
		}
	}()

	log.Info("determining best host for job")
	host, err := s.findBestHost(newReq.Formation, newReq.Type)
	if err != nil {
		log.Error("error determining best host for job", "err", err)
		return err
	}

	hostID := host.ID()
	newReq.HostID = hostID
	config := jobConfig(newReq, hostID)
	newReq.JobID = config.ID

	// Provision a data volume on the host if needed.
	if newReq.needsVolume() {
		log.Info("provisioning volume")
		if err := utils.ProvisionVolume(host, config); err != nil {
			log.Error("error provisioning volume", "err", err)
			return err
		}
	}

	log.Info("requesting host to add job", "host.id", hostID, "job.id", config.ID)
	if err := host.AddJob(config); err != nil {
		log.Error("error requesting host to add job", "err", err)
		return err
	}
	return nil
}
func (s *Scheduler) StartJob(job *Job) {
	log := s.logger.New("fn", "StartJob", "app.id", job.AppID, "release.id", job.ReleaseID, "job.type", job.Type)
	log.Info("starting job")

	for attempt := 0; ; attempt++ {
		if attempt > 0 {
			// when making multiple attempts, backoff in increments
			// of 500ms (capped at 30s)
			delay := 500 * time.Millisecond * time.Duration(attempt)
			if delay > 30*time.Second {
				delay = 30 * time.Second
			}
			log.Info(fmt.Sprintf("failed to start job after %d attempts, waiting %s before trying again", attempt, delay))
			time.Sleep(delay)
		}

		log.Info("placing job in the cluster")
		config, host, err := s.PlaceJob(job)
		if err == ErrNotLeader {
			log.Warn("not starting job as not leader")
			return
		} else if err == ErrNoHostsMatchTags {
			log.Warn("unable to place job as tags don't match any hosts")
			return
		} else if err == ErrJobNotPending {
			log.Warn("unable to place job as it is no longer pending")
			return
		} else if err != nil {
			log.Error("error placing job in the cluster", "err", err)
			continue
		}

		if job.needsVolume() {
			log.Info("provisioning data volume", "host.id", host.ID)
			if err := utils.ProvisionVolume(host.client, config); err != nil {
				log.Error("error provisioning volume", "err", err)
				continue
			}
		}

		log.Info("adding job to the cluster", "host.id", host.ID, "job.id", config.ID)
		if err := host.client.AddJob(config); err != nil {
			log.Error("error adding job to the cluster", "err", err)
			continue
		}
		return
	}
}
func (f *Formation) start(typ string, hostID string) (job *Job, err error) {
	if hostID == "" {
		hosts := f.c.hosts.List()
		if len(hosts) == 0 {
			return nil, errors.New("no hosts found")
		}
		sh := make(sortHosts, 0, len(hosts))
		for _, host := range hosts {
			count := 0
			for k := range f.jobs[typ] {
				if k.hostID == host.ID() {
					count++
				}
			}
			sh = append(sh, sortHost{host.ID(), count})
		}
		sh.Sort()
		hostID = sh[0].HostID
	}
	h := f.c.hosts.Get(hostID)
	if h == nil {
		return nil, fmt.Errorf("unknown host %q", hostID)
	}

	config := f.jobConfig(typ, h.ID())

	// Provision a data volume on the host if needed.
	if f.Release.Processes[typ].Data {
		if err := utils.ProvisionVolume(h, config); err != nil {
			return nil, err
		}
	}

	job = f.jobs.Add(typ, h.ID(), config.ID)
	job.Formation = f
	f.c.jobs.Add(job)

	if err := h.AddJob(config); err != nil {
		f.jobs.Remove(job)
		f.c.jobs.Remove(config.ID, h.ID())
		return nil, err
	}
	return job, nil
}
func (c *Cmd) Start() error {
	if c.started {
		return errors.New("exec: already started")
	}
	c.done = make(chan struct{})
	c.started = true
	if c.host == nil && c.cluster == nil {
		var err error
		c.cluster = cluster.NewClient()
		if err != nil {
			return err
		}
		c.closeCluster = true
	}
	if c.HostID == "" {
		hosts, err := c.cluster.Hosts()
		if err != nil {
			return err
		}
		if len(hosts) == 0 {
			return errors.New("exec: no hosts found")
		}
		host := schedutil.PickHost(hosts)
		c.HostID = host.ID()
		c.host = host
	}

	// Use the pre-defined host.Job configuration if provided;
	// otherwise generate one from the fields on exec.Cmd that
	// mirror the stdlib's os/exec.
	if c.Job == nil {
		c.Job = &host.Job{
			Config: host.ContainerConfig{
				Args:  c.Args,
				TTY:   c.TTY,
				Env:   c.Env,
				Stdin: c.Stdin != nil || c.stdinPipe != nil,
			},
			Metadata: c.Meta,
		}
		// if attaching to stdout / stderr, avoid round tripping the
		// streams via on-disk log files.
		if c.Stdout != nil || c.Stderr != nil {
			c.Job.Config.DisableLog = true
		}
	}
	if c.Job.ID == "" {
		c.Job.ID = cluster.GenerateJobID(c.HostID, "")
	}

	if c.host == nil {
		var err error
		c.host, err = c.cluster.Host(c.HostID)
		if err != nil {
			return err
		}
	}

	for _, vol := range c.Volumes {
		if _, err := utils.ProvisionVolume(vol, c.host, c.Job); err != nil {
			return err
		}
	}

	utils.SetupMountspecs(c.Job, []*ct.Artifact{c.ImageArtifact})

	if c.Stdout != nil || c.Stderr != nil || c.Stdin != nil || c.stdinPipe != nil {
		req := &host.AttachReq{
			JobID:  c.Job.ID,
			Height: c.TermHeight,
			Width:  c.TermWidth,
			Flags:  host.AttachFlagStream,
		}
		if c.Stdout != nil {
			req.Flags |= host.AttachFlagStdout
		}
		if c.Stderr != nil {
			req.Flags |= host.AttachFlagStderr
		}
		if c.Job.Config.Stdin {
			req.Flags |= host.AttachFlagStdin
		}
		var err error
		c.attachClient, err = c.host.Attach(req, true)
		if err != nil {
			c.close()
			return err
		}
	}

	if c.stdinPipe != nil {
		c.stdinPipe.set(writeCloseCloser{c.attachClient})
	} else if c.Stdin != nil {
		go func() {
			io.Copy(c.attachClient, c.Stdin)
			c.attachClient.CloseWrite()
		}()
	}

	if c.attachClient == nil {
		c.eventChan = make(chan *host.Event)
		var err error
		c.eventStream, err = c.host.StreamEvents(c.Job.ID, c.eventChan)
		if err != nil {
			return err
		}
	}

	go func() {
		defer close(c.done)
		if c.attachClient != nil {
			c.exitStatus, c.streamErr = c.attachClient.Receive(c.Stdout, c.Stderr)
		} else {
		outer:
			for e := range c.eventChan {
				switch e.Event {
				case "stop":
					c.exitStatus = *e.Job.ExitStatus
					break outer
				case "error":
					c.streamErr = errors.New(*e.Job.Error)
					break outer
				}
			}
			c.eventStream.Close()
			if c.streamErr == nil {
				c.streamErr = c.eventStream.Err()
			}
		}
	}()

	return c.host.AddJob(c.Job)
}
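// Complementing the Start method above, this is a hedged sketch of wiring a
// data volume through the Cmd.Volumes field, which Start passes entry by
// entry to utils.ProvisionVolume before adding the job to the host. The
// []*ct.VolumeReq element type and the runPgDump wrapper are assumptions;
// JobUsingCluster and CombinedOutput appear in the test earlier in this section.
func runPgDump(clusterClient *cluster.Client, artifact *ct.Artifact) ([]byte, error) {
	cmd := exec.JobUsingCluster(clusterClient, artifact, &host.Job{
		Config: host.ContainerConfig{
			Args:       []string{"pg_dump", "--help"},
			DisableLog: true,
		},
	})
	// Start provisions this volume on the chosen host before calling AddJob.
	cmd.Volumes = []*ct.VolumeReq{{Path: "/data", DeleteOnStop: true}}
	return cmd.CombinedOutput()
}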
func (a *RunAppAction) Run(s *State) error {
	if a.AppStep != "" {
		data, err := getAppStep(s, a.AppStep)
		if err != nil {
			return err
		}
		a.App = data.App
		procs := a.Processes
		a.ExpandedFormation = data.ExpandedFormation
		a.Processes = procs
	}
	as := &RunAppState{
		ExpandedFormation: a.ExpandedFormation,
		Resources:         make([]*resource.Resource, 0, len(a.Resources)),
		Providers:         make([]*ct.Provider, 0, len(a.Resources)),
	}
	s.StepData[a.ID] = as

	if a.App == nil {
		a.App = &ct.App{}
	}
	if a.App.ID == "" {
		a.App.ID = random.UUID()
	}
	if a.ImageArtifact == nil {
		return errors.New("bootstrap: artifact must be set")
	}
	if a.ImageArtifact.ID == "" {
		a.ImageArtifact.ID = random.UUID()
	}
	if a.Release == nil {
		return errors.New("bootstrap: release must be set")
	}
	if a.Release.ID == "" {
		a.Release.ID = random.UUID()
	}
	a.Release.ArtifactIDs = []string{a.ImageArtifact.ID}
	if a.Release.Env == nil {
		a.Release.Env = make(map[string]string)
	}
	interpolateRelease(s, a.Release)

	for _, p := range a.Resources {
		u, err := url.Parse(p.URL)
		if err != nil {
			return err
		}
		lookupDiscoverdURLHost(s, u, time.Second)
		res, err := resource.Provision(u.String(), nil)
		if err != nil {
			return err
		}
		as.Providers = append(as.Providers, p)
		as.Resources = append(as.Resources, res)
		for k, v := range res.Env {
			a.Release.Env[k] = v
		}
	}

	for typ, count := range a.Processes {
		if s.Singleton && count > 1 {
			a.Processes[typ] = 1
			count = 1
		}
		hosts := s.ShuffledHosts()
		if a.ExpandedFormation.Release.Processes[typ].Omni {
			count = len(hosts)
		}
		for i := 0; i < count; i++ {
			host := hosts[i%len(hosts)]
			config := utils.JobConfig(a.ExpandedFormation, typ, host.ID(), "")
			hostresource.SetDefaults(&config.Resources)
			if a.ExpandedFormation.Release.Processes[typ].Data {
				if err := utils.ProvisionVolume(host, config); err != nil {
					return err
				}
			}
			if err := startJob(s, host, config); err != nil {
				return err
			}
		}
	}

	return nil
}
func (f *ClusterFixer) FixPostgres() error {
	f.l.Info("checking postgres")
	service := discoverd.NewService("postgres")
	leader, _ := service.Leader()
	if leader == nil || leader.Addr == "" {
		f.l.Info("no running postgres leader")
		leader = nil
	} else {
		f.l.Info("found running postgres leader")
	}
	instances, _ := service.Instances()
	f.l.Info(fmt.Sprintf("found %d running postgres instances", len(instances)))

	f.l.Info("getting postgres status")
	var status *pgmanager.Status
	if leader != nil && leader.Addr != "" {
		client := pgmanager.NewClient(leader.Addr)
		var err error
		status, err = client.Status()
		if err != nil {
			f.l.Error("error getting status from postgres leader", "error", err)
		}
	}
	if status != nil && status.Postgres.ReadWrite {
		f.l.Info("postgres claims to be read-write")
		return nil
	}

	f.l.Info("getting postgres service metadata")
	meta, err := discoverd.NewService("postgres").GetMeta()
	if err != nil {
		return fmt.Errorf("error getting postgres state from discoverd: %s", err)
	}

	var state pgstate.State
	if err := json.Unmarshal(meta.Data, &state); err != nil {
		return fmt.Errorf("error decoding postgres state: %s", err)
	}
	if state.Primary == nil {
		return fmt.Errorf("no primary in postgres state")
	}

	f.l.Info("getting postgres primary job info", "job.id", state.Primary.Meta["FLYNN_JOB_ID"])
	job, host, err := f.GetJob(state.Primary.Meta["FLYNN_JOB_ID"])
	if err != nil {
		if state.Sync != nil {
			f.l.Error("unable to get primary job info", "error", err)
			f.l.Info("getting postgres sync job info", "job.id", state.Sync.Meta["FLYNN_JOB_ID"])
			job, host, err = f.GetJob(state.Sync.Meta["FLYNN_JOB_ID"])
			if err != nil {
				return fmt.Errorf("unable to get postgres primary or sync job details: %s", err)
			}
		} else {
			return fmt.Errorf("unable to get postgres primary job details: %s", err)
		}
	}

	if leader != nil && state.Singleton {
		return fmt.Errorf("postgres leader is running in singleton mode, unable to fix")
	}

	waitForInstance := func(jobID string) (func() (string, error), error) {
		watchCh := make(chan *discoverd.Event)
		upCh := make(chan string)
		stream, err := service.Watch(watchCh)
		if err != nil {
			return nil, fmt.Errorf("error watching discoverd service: %s", err)
		}
		go func() {
			var current bool
			for event := range watchCh {
				if event.Kind == discoverd.EventKindCurrent {
					current = true
					continue
				}
				if !current || event.Kind != discoverd.EventKindUp {
					continue
				}
				if event.Instance.Meta["FLYNN_JOB_ID"] == jobID {
					upCh <- event.Instance.Addr
				}
			}
		}()
		return func() (string, error) {
			f.l.Info("waiting for postgres instance to start", "job.id", jobID)
			defer stream.Close()
			select {
			case addr := <-upCh:
				return addr, nil
			case <-time.After(time.Minute):
				return "", fmt.Errorf("timed out waiting for postgres instance to come up")
			}
		}, nil
	}

	var wait func() (string, error)
	have := len(instances)
	want := 2
	if state.Singleton {
		want = 1
	}
	if have >= want {
		return fmt.Errorf("already have enough postgres instances, unable to fix")
	}
	f.l.Info("attempting to start missing postgres jobs", "want", want, "have", have)
	if leader == nil {
		// if no postgres, attempt to start
		job.ID = cluster.GenerateJobID(host.ID(), "")
		f.FixJobEnv(job)
		f.l.Info("starting postgres primary job", "job.id", job.ID)
		wait, err = waitForInstance(job.ID)
		if err != nil {
			return err
		}
		if err := host.AddJob(job); err != nil {
			return fmt.Errorf("error starting postgres primary job on %s: %s", host.ID(), err)
		}
		have++
	}
	if want > have {
		// if not enough postgres instances, start another
		var secondHost *cluster.Host
		for _, h := range f.hosts {
			if h.ID() != host.ID() {
				secondHost = h
				break
			}
		}
		if secondHost == nil {
			// if there are no other hosts, use the same one we put the primary on
			secondHost = host
		}
		job.ID = cluster.GenerateJobID(secondHost.ID(), "")
		f.FixJobEnv(job)
		f.l.Info("starting second postgres job", "job.id", job.ID)
		if wait == nil {
			wait, err = waitForInstance(job.ID)
			if err != nil {
				return err
			}
		}
		if err := utils.ProvisionVolume(secondHost, job); err != nil {
			return fmt.Errorf("error creating postgres volume on %s: %s", secondHost.ID(), err)
		}
		if err := secondHost.AddJob(job); err != nil {
			return fmt.Errorf("error starting additional postgres job on %s: %s", secondHost.ID(), err)
		}
	}

	if wait != nil {
		addr, err := wait()
		if err != nil {
			return err
		}
		if leader != nil {
			addr = leader.Addr
		}
		f.l.Info("waiting for postgres to come up read-write")
		return pgmanager.NewClient(addr).WaitForReadWrite(5 * time.Minute)
	}
	return nil
}
func (a *RunAppAction) Run(s *State) error {
	if a.AppStep != "" {
		data, err := getAppStep(s, a.AppStep)
		if err != nil {
			return err
		}
		a.App = data.App
		procs := a.Processes
		a.ExpandedFormation = data.ExpandedFormation
		a.Processes = procs
	}
	as := &RunAppState{
		ExpandedFormation: a.ExpandedFormation,
		Resources:         make([]*resource.Resource, 0, len(a.Resources)),
		Providers:         make([]*ct.Provider, 0, len(a.Resources)),
	}
	s.StepData[a.ID] = as

	if a.App == nil {
		a.App = &ct.App{}
	}
	if a.App.ID == "" {
		a.App.ID = random.UUID()
	}
	if a.Artifact == nil {
		return errors.New("bootstrap: artifact must be set")
	}
	if a.Artifact.ID == "" {
		a.Artifact.ID = random.UUID()
	}
	if a.Release == nil {
		return errors.New("bootstrap: release must be set")
	}
	if a.Release.ID == "" {
		a.Release.ID = random.UUID()
	}
	a.Release.ArtifactID = a.Artifact.ID
	if a.Release.Env == nil {
		a.Release.Env = make(map[string]string)
	}
	interpolateRelease(s, a.Release)

	for _, p := range a.Resources {
		u, err := url.Parse(p.URL)
		if err != nil {
			return err
		}
		lookupDiscoverdURLHost(u, time.Second)
		res, err := resource.Provision(u.String(), nil)
		if err != nil {
			return err
		}
		as.Providers = append(as.Providers, p)
		as.Resources = append(as.Resources, res)
		for k, v := range res.Env {
			a.Release.Env[k] = v
		}
	}

	cc, err := s.ClusterClient()
	if err != nil {
		return err
	}
	for typ, count := range a.Processes {
		if s.Singleton && count > 1 {
			a.Processes[typ] = 1
			count = 1
		}
		hosts, err := cc.ListHosts()
		if err != nil {
			return err
		}
		sort.Sort(schedutil.HostSlice(hosts))
		for i := 0; i < count; i++ {
			hostID := hosts[i%len(hosts)].ID
			config := utils.JobConfig(a.ExpandedFormation, typ, hostID)
			if a.ExpandedFormation.Release.Processes[typ].Data {
				if err := utils.ProvisionVolume(cc, hostID, config); err != nil {
					return err
				}
			}
			job, err := startJob(s, hostID, config)
			if err != nil {
				return err
			}
			as.Jobs = append(as.Jobs, *job)
		}
	}

	return nil
}
func (f *ClusterFixer) FixSirenia(svc string) error {
	log := f.l.New("fn", "FixSirenia", "service", svc)

	service := discoverd.NewService(svc)
	instances, _ := service.Instances()
	leader, _ := service.Leader()

	log.Info("getting service metadata")
	meta, err := service.GetMeta()
	if err != nil {
		return fmt.Errorf("error getting sirenia state from discoverd: %s", err)
	}

	var state state.State
	if err := json.Unmarshal(meta.Data, &state); err != nil {
		return fmt.Errorf("error decoding state: %s", err)
	}
	if state.Primary == nil {
		return fmt.Errorf("no primary in sirenia state")
	}

	log.Info("getting primary job info", "job.id", state.Primary.Meta["FLYNN_JOB_ID"])
	primaryJob, primaryHost, err := f.GetJob(state.Primary.Meta["FLYNN_JOB_ID"])
	if err != nil {
		log.Error("unable to get primary job info")
	}
	var syncJob *host.Job
	var syncHost *cluster.Host
	if state.Sync != nil {
		log.Info("getting sync job info", "job.id", state.Sync.Meta["FLYNN_JOB_ID"])
		syncJob, syncHost, err = f.GetJob(state.Sync.Meta["FLYNN_JOB_ID"])
		if err != nil {
			log.Error("unable to get sync job info")
		}
	}

	waitForInstance := func(jobID string) (func() (string, error), error) {
		watchCh := make(chan *discoverd.Event)
		upCh := make(chan string)
		stream, err := service.Watch(watchCh)
		if err != nil {
			return nil, fmt.Errorf("error watching discoverd service: %s", err)
		}
		go func() {
			var current bool
			for event := range watchCh {
				if event.Kind == discoverd.EventKindCurrent {
					current = true
					continue
				}
				if !current || event.Kind != discoverd.EventKindUp {
					continue
				}
				if event.Instance.Meta["FLYNN_JOB_ID"] == jobID {
					upCh <- event.Instance.Addr
				}
			}
		}()
		return func() (string, error) {
			log.Info("waiting for instance to start", "job.id", jobID)
			defer stream.Close()
			select {
			case addr := <-upCh:
				return addr, nil
			case <-time.After(time.Minute):
				return "", fmt.Errorf("timed out waiting for sirenia instance to come up")
			}
		}, nil
	}

	log.Info("terminating unassigned sirenia instances")
outer:
	for _, i := range instances {
		if i.Addr == state.Primary.Addr || (state.Sync != nil && i.Addr == state.Sync.Addr) {
			continue
		}
		for _, a := range state.Async {
			if i.Addr == a.Addr {
				continue outer
			}
		}
		// job not assigned in state, attempt to terminate it
		if jobID, ok := i.Meta["FLYNN_JOB_ID"]; ok {
			hostID, err := cluster.ExtractHostID(jobID)
			if err != nil {
				log.Error("error extracting host id from jobID", "jobID", jobID, "err", err)
			}
			h := f.Host(hostID)
			if h != nil {
				if err := h.StopJob(jobID); err != nil {
					log.Error("error stopping unassigned sirenia job", "jobID", jobID)
				}
			} else {
				log.Error("host not found", "hostID", hostID)
			}
		}
	}

	isRunning := func(addr string) bool {
		for _, i := range instances {
			if i.Addr == addr {
				return true
			}
		}
		return false
	}

	// if the leader isn't currently running then start it using primaryJob/primaryHost
	var wait func() (string, error)
	if !isRunning(state.Primary.Addr) {
		// if we don't have info about the primary job attempt to promote the sync
		if primaryJob == nil {
			if syncJob != nil {
				// set primary job to sync
				primaryJob = syncJob
				primaryHost = syncHost

				// nil out sync job now so we can re-allocate it.
				syncJob = nil
				syncHost = nil
			} else {
				return fmt.Errorf("neither primary or sync job info available")
			}
		}

		primaryJob.ID = cluster.GenerateJobID(primaryHost.ID(), "")
		f.FixJobEnv(primaryJob)
		log.Info("starting primary job", "job.id", primaryJob.ID)
		wait, err = waitForInstance(primaryJob.ID)
		if err != nil {
			return err
		}
		if err := primaryHost.AddJob(primaryJob); err != nil {
			return fmt.Errorf("error starting primary job on %s: %s", primaryHost.ID(), err)
		}
	}

	if !state.Singleton && !isRunning(state.Sync.Addr) {
		if syncHost == nil {
			for _, h := range f.hosts {
				if h.ID() != primaryHost.ID() {
					syncHost = h
					break
				}
			}
			if syncHost == nil {
				// if there are no other hosts, use the same one we put the primary on
				syncHost = primaryHost
			}
		}
		// if we don't have a sync job then copy the primary job
		// and provision a new volume
		if syncJob == nil {
			syncJob = primaryJob
			vol := &ct.VolumeReq{Path: "/data"}
			if _, err := utils.ProvisionVolume(vol, syncHost, syncJob); err != nil {
				return fmt.Errorf("error creating volume on %s: %s", syncHost.ID(), err)
			}
		}
		syncJob.ID = cluster.GenerateJobID(syncHost.ID(), "")
		f.FixJobEnv(syncJob)
		log.Info("starting sync job", "job.id", syncJob.ID)
		if wait == nil {
			wait, err = waitForInstance(syncJob.ID)
			if err != nil {
				return err
			}
		}
		if err := syncHost.AddJob(syncJob); err != nil {
			return fmt.Errorf("error starting additional job on %s: %s", syncHost.ID(), err)
		}
	}

	if wait != nil {
		addr, err := wait()
		if err != nil {
			return err
		}
		if leader != nil && leader.Addr != "" {
			addr = leader.Addr
		}
		log.Info("waiting for cluster to come up read-write", "addr", addr)
		return sirenia.NewClient(addr).WaitForReadWrite(5 * time.Minute)
	}

	return nil
}
func (c *controllerAPI) RunJob(ctx context.Context, w http.ResponseWriter, req *http.Request) {
	var newJob ct.NewJob
	if err := httphelper.DecodeJSON(req, &newJob); err != nil {
		respondWithError(w, err)
		return
	}
	if err := schema.Validate(newJob); err != nil {
		respondWithError(w, err)
		return
	}

	data, err := c.releaseRepo.Get(newJob.ReleaseID)
	if err != nil {
		respondWithError(w, err)
		return
	}
	release := data.(*ct.Release)
	var artifactIDs []string
	if len(newJob.ArtifactIDs) > 0 {
		artifactIDs = newJob.ArtifactIDs
	} else if len(release.ArtifactIDs) > 0 {
		artifactIDs = release.ArtifactIDs
	} else {
		httphelper.ValidationError(w, "release.ArtifactIDs", "cannot be empty")
		return
	}

	artifacts := make([]*ct.Artifact, len(artifactIDs))
	artifactList, err := c.artifactRepo.ListIDs(artifactIDs...)
	if err != nil {
		respondWithError(w, err)
		return
	}
	for i, id := range artifactIDs {
		artifacts[i] = artifactList[id]
	}

	var entrypoint ct.ImageEntrypoint
	if e := utils.GetEntrypoint(artifacts, ""); e != nil {
		entrypoint = *e
	}

	attach := strings.Contains(req.Header.Get("Upgrade"), "flynn-attach/0")

	hosts, err := c.clusterClient.Hosts()
	if err != nil {
		respondWithError(w, err)
		return
	}
	if len(hosts) == 0 {
		respondWithError(w, errors.New("no hosts found"))
		return
	}
	client := hosts[random.Math.Intn(len(hosts))]

	uuid := random.UUID()
	hostID := client.ID()
	id := cluster.GenerateJobID(hostID, uuid)
	app := c.getApp(ctx)
	env := make(map[string]string, len(entrypoint.Env)+len(release.Env)+len(newJob.Env)+4)
	env["FLYNN_APP_ID"] = app.ID
	env["FLYNN_RELEASE_ID"] = release.ID
	env["FLYNN_PROCESS_TYPE"] = ""
	env["FLYNN_JOB_ID"] = id
	for k, v := range entrypoint.Env {
		env[k] = v
	}
	if newJob.ReleaseEnv {
		for k, v := range release.Env {
			env[k] = v
		}
	}
	for k, v := range newJob.Env {
		env[k] = v
	}
	metadata := make(map[string]string, len(newJob.Meta)+3)
	for k, v := range newJob.Meta {
		metadata[k] = v
	}
	metadata["flynn-controller.app"] = app.ID
	metadata["flynn-controller.app_name"] = app.Name
	metadata["flynn-controller.release"] = release.ID
	job := &host.Job{
		ID:       id,
		Metadata: metadata,
		Config: host.ContainerConfig{
			Args:       entrypoint.Args,
			Env:        env,
			WorkingDir: entrypoint.WorkingDir,
			Uid:        entrypoint.Uid,
			Gid:        entrypoint.Gid,
			TTY:        newJob.TTY,
			Stdin:      attach,
			DisableLog: newJob.DisableLog,
		},
		Resources: newJob.Resources,
		Partition: string(newJob.Partition),
	}
	resource.SetDefaults(&job.Resources)
	if len(newJob.Args) > 0 {
		job.Config.Args = newJob.Args
	}
	utils.SetupMountspecs(job, artifacts)

	// provision data volume if required
	if newJob.Data {
		vol := &ct.VolumeReq{Path: "/data", DeleteOnStop: true}
		if _, err := utils.ProvisionVolume(vol, client, job); err != nil {
			respondWithError(w, err)
			return
		}
	}

	var attachClient cluster.AttachClient
	if attach {
		attachReq := &host.AttachReq{
			JobID:  job.ID,
			Flags:  host.AttachFlagStdout | host.AttachFlagStderr | host.AttachFlagStdin | host.AttachFlagStream,
			Height: uint16(newJob.Lines),
			Width:  uint16(newJob.Columns),
		}
		attachClient, err = client.Attach(attachReq, true)
		if err != nil {
			respondWithError(w, fmt.Errorf("attach failed: %s", err.Error()))
			return
		}
		defer attachClient.Close()
	}

	if err := client.AddJob(job); err != nil {
		respondWithError(w, fmt.Errorf("schedule failed: %s", err.Error()))
		return
	}

	if attach {
		// TODO(titanous): This Wait could block indefinitely if something goes
		// wrong, a context should be threaded in that cancels if the client
		// goes away.
		if err := attachClient.Wait(); err != nil {
			respondWithError(w, fmt.Errorf("attach wait failed: %s", err.Error()))
			return
		}
		w.Header().Set("Connection", "upgrade")
		w.Header().Set("Upgrade", "flynn-attach/0")
		w.WriteHeader(http.StatusSwitchingProtocols)
		conn, _, err := w.(http.Hijacker).Hijack()
		if err != nil {
			panic(err)
		}
		defer conn.Close()

		done := make(chan struct{}, 2)
		cp := func(to io.Writer, from io.Reader) {
			io.Copy(to, from)
			done <- struct{}{}
		}
		go cp(conn, attachClient.Conn())
		go cp(attachClient.Conn(), conn)

		// Wait for one of the connections to be closed or interrupted. EOF is
		// framed inside the attach protocol, so a read/write error indicates
		// that we're done and should clean up.
		<-done

		return
	} else {
		httphelper.JSON(w, 200, &ct.Job{
			ID:        job.ID,
			UUID:      uuid,
			HostID:    hostID,
			ReleaseID: newJob.ReleaseID,
			Args:      newJob.Args,
		})
	}
}
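// Sketch of a request value that exercises the RunJob handler above. The
// field names mirror how the handler reads ct.NewJob; the release ID is a
// placeholder and the args are illustrative, not taken from the source.
func exampleRunJobRequest() *ct.NewJob {
	return &ct.NewJob{
		ReleaseID:  "<release-id>",          // placeholder, not a real ID
		Args:       []string{"ls", "/data"}, // overrides the image entrypoint args
		ReleaseEnv: true,                    // merge the release's env into the job env
		DisableLog: true,
		Data:       true, // handler provisions a /data volume with DeleteOnStop
	}
}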