// TestMigrateJobStates checks that migrating to ID 9 does not break existing
// job records
func (MigrateSuite) TestMigrateJobStates(c *C) {
	db := setupTestDB(c, "controllertest_migrate_job_states")
	m := &testMigrator{c: c, db: db}

	// start from ID 7
	m.migrateTo(7)

	// insert a job
	hostID := "host1"
	uuid := random.UUID()
	jobID := cluster.GenerateJobID(hostID, uuid)
	appID := random.UUID()
	releaseID := random.UUID()
	c.Assert(db.Exec(`INSERT INTO apps (app_id, name) VALUES ($1, $2)`, appID, "migrate-app"), IsNil)
	c.Assert(db.Exec(`INSERT INTO releases (release_id) VALUES ($1)`, releaseID), IsNil)
	c.Assert(db.Exec(`INSERT INTO job_cache (job_id, app_id, release_id, state) VALUES ($1, $2, $3, $4)`, jobID, appID, releaseID, "up"), IsNil)

	// migrate to 8 and check job states are still constrained
	m.migrateTo(8)
	err := db.Exec(`UPDATE job_cache SET state = 'foo' WHERE job_id = $1`, jobID)
	c.Assert(err, NotNil)
	if !postgres.IsPostgresCode(err, postgres.ForeignKeyViolation) {
		c.Fatalf("expected postgres foreign key violation, got %s", err)
	}

	// migrate to 9 and check job IDs are correct, pending state is valid
	m.migrateTo(9)
	var clusterID, dbUUID, dbHostID string
	c.Assert(db.QueryRow("SELECT cluster_id, job_id, host_id FROM job_cache WHERE cluster_id = $1", jobID).Scan(&clusterID, &dbUUID, &dbHostID), IsNil)
	c.Assert(clusterID, Equals, jobID)
	c.Assert(dbUUID, Equals, uuid)
	c.Assert(dbHostID, Equals, hostID)
	c.Assert(db.Exec(`UPDATE job_cache SET state = 'pending' WHERE job_id = $1`, uuid), IsNil)
}
func (f *ClusterFixer) StartAppJob(app, typ, service string) ([]*discoverd.Instance, error) {
	f.l.Info(fmt.Sprintf("no %s %s process running, getting release details from hosts", app, typ))
	releases := f.FindAppReleaseJobs(app, typ)
	if len(releases) == 0 {
		return nil, fmt.Errorf("didn't find any %s %s release jobs", app, typ)
	}

	// get a job template from the first release
	var job *host.Job
	for _, job = range releases[0] {
		break
	}
	host := f.hosts[0]
	job.ID = cluster.GenerateJobID(host.ID(), "")

	// provision new temporary volumes
	for i, v := range job.Config.Volumes {
		if v.DeleteOnStop {
			f.l.Info(fmt.Sprintf("provisioning volume for %s %s job", app, typ), "job.id", job.ID, "release", job.Metadata["flynn-controller.release"])
			vol, err := host.CreateVolume("default")
			if err != nil {
				return nil, fmt.Errorf("error provisioning volume for %s %s job: %s", app, typ, err)
			}
			job.Config.Volumes[i].VolumeID = vol.ID
		}
	}

	f.FixJobEnv(job)

	// run it on the host
	f.l.Info(fmt.Sprintf("starting %s %s job", app, typ), "job.id", job.ID, "release", job.Metadata["flynn-controller.release"])
	if err := host.AddJob(job); err != nil {
		return nil, fmt.Errorf("error starting %s %s job: %s", app, typ, err)
	}
	f.l.Info("waiting for job to start")
	return discoverd.GetInstances(service, time.Minute)
}
func (s *S) TestJobGet(c *C) {
	app := s.createTestApp(c, &ct.App{Name: "job-get"})
	release := s.createTestRelease(c, &ct.Release{})
	s.createTestFormation(c, &ct.Formation{ReleaseID: release.ID, AppID: app.ID})
	uuid := random.UUID()
	hostID := "host0"
	jobID := cluster.GenerateJobID(hostID, uuid)
	s.createTestJob(c, &ct.Job{
		ID:        jobID,
		UUID:      uuid,
		HostID:    hostID,
		AppID:     app.ID,
		ReleaseID: release.ID,
		Type:      "web",
		State:     ct.JobStateStarting,
		Meta:      map[string]string{"some": "info"},
	})

	// test getting the job with both the job ID and the UUID
	for _, id := range []string{jobID, uuid} {
		job, err := s.c.GetJob(app.ID, id)
		c.Assert(err, IsNil)
		c.Assert(job.ID, Equals, jobID)
		c.Assert(job.UUID, Equals, uuid)
		c.Assert(job.HostID, Equals, hostID)
		c.Assert(job.AppID, Equals, app.ID)
		c.Assert(job.ReleaseID, Equals, release.ID)
		c.Assert(job.Meta, DeepEquals, map[string]string{"some": "info"})
	}
}
func JobConfig(f *ct.ExpandedFormation, name, hostID string, uuid string) *host.Job {
	t := f.Release.Processes[name]
	env := make(map[string]string, len(f.Release.Env)+len(t.Env)+4)
	for k, v := range f.Release.Env {
		env[k] = v
	}
	for k, v := range t.Env {
		env[k] = v
	}
	id := cluster.GenerateJobID(hostID, uuid)
	env["FLYNN_APP_ID"] = f.App.ID
	env["FLYNN_APP_NAME"] = f.App.Name
	env["FLYNN_RELEASE_ID"] = f.Release.ID
	env["FLYNN_PROCESS_TYPE"] = name
	env["FLYNN_JOB_ID"] = id
	metadata := make(map[string]string, len(f.App.Meta)+4)
	for k, v := range f.App.Meta {
		metadata[k] = v
	}
	metadata["flynn-controller.app"] = f.App.ID
	metadata["flynn-controller.app_name"] = f.App.Name
	metadata["flynn-controller.release"] = f.Release.ID
	metadata["flynn-controller.formation"] = "true"
	metadata["flynn-controller.type"] = name
	job := &host.Job{
		ID:       id,
		Metadata: metadata,
		Config: host.ContainerConfig{
			Cmd:         t.Cmd,
			Env:         env,
			HostNetwork: t.HostNetwork,
		},
		Resurrect: t.Resurrect,
		Resources: t.Resources,
	}
	if f.App.Meta["flynn-system-app"] == "true" {
		job.Partition = "system"
	}
	if len(t.Entrypoint) > 0 {
		job.Config.Entrypoint = t.Entrypoint
	}
	if f.ImageArtifact != nil {
		job.ImageArtifact = f.ImageArtifact.HostArtifact()
	}
	if len(f.FileArtifacts) > 0 {
		job.FileArtifacts = make([]*host.Artifact, len(f.FileArtifacts))
		for i, artifact := range f.FileArtifacts {
			job.FileArtifacts[i] = artifact.HostArtifact()
		}
	}
	job.Config.Ports = make([]host.Port, len(t.Ports))
	for i, p := range t.Ports {
		job.Config.Ports[i].Proto = p.Proto
		job.Config.Ports[i].Port = p.Port
		job.Config.Ports[i].Service = p.Service
	}
	return job
}
func (f *ClusterFixer) FixFlannel() error {
	f.l.Info("checking flannel")

	flannelJobs := make(map[string]*host.Job, len(f.hosts))
	for _, h := range f.hosts {
		jobs, err := h.ListJobs()
		if err != nil {
			return fmt.Errorf("error getting jobs list from %s: %s", h.ID(), err)
		}
		for _, j := range jobs {
			if j.Status != host.StatusRunning ||
				j.Job.Metadata["flynn-controller.app_name"] != "flannel" ||
				j.Job.Metadata["flynn-controller.type"] != "app" {
				continue
			}
			flannelJobs[h.ID()] = j.Job
			break
		}
	}
	if len(flannelJobs) == len(f.hosts) {
		f.l.Info("flannel looks good")
		return nil
	}

	var job *host.Job
	if len(flannelJobs) == 0 {
		f.l.Info("flannel not running, starting it on each host")
		releases := f.FindAppReleaseJobs("flannel", "app")
		if len(releases) == 0 {
			return fmt.Errorf("didn't find flannel release jobs")
		}
		for _, j := range releases[0] {
			job = j
			break
		}
	} else {
		f.l.Info("flannel is not running on each host, starting missing jobs")
		for _, job = range flannelJobs {
			break
		}
	}

	for _, h := range f.hosts {
		if _, ok := flannelJobs[h.ID()]; ok {
			continue
		}
		job.ID = cluster.GenerateJobID(h.ID(), "")
		f.FixJobEnv(job)
		if err := h.AddJob(job); err != nil {
			return fmt.Errorf("error starting flannel job: %s", err)
		}
		f.l.Info("started flannel job", "job.id", job.ID)
	}

	f.l.Info("flannel fix complete")
	return nil
}
func (s *S) TestKillJob(c *C) {
	app := s.createTestApp(c, &ct.App{Name: "killjob"})
	hostID := fakeHostID()
	jobID := cluster.GenerateJobID(hostID)
	hc := tu.NewFakeHostClient(hostID)
	s.cc.AddHost(hc)

	c.Assert(s.c.DeleteJob(app.ID, jobID), IsNil)
	c.Assert(hc.IsStopped(jobID), Equals, true)
}
func (c *FakeHostClient) CrashJob(uuid string) error {
	c.jobsMtx.Lock()
	defer c.jobsMtx.Unlock()
	id := cluster.GenerateJobID(c.hostID, uuid)
	c.stopped[id] = true
	job, ok := c.Jobs[id]
	if ok {
		job.Status = host.StatusCrashed
		c.Jobs[id] = job
		return c.stop(id)
	} else {
		return ct.NotFoundError{Resource: id}
	}
}
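// The call sites above and below all lean on the same pair of helpers from the
// cluster package: GenerateJobID, which derives a host-scoped job ID from a
// host ID and a job UUID (generating a UUID when the second argument is empty;
// older snippets use a single-argument variant), and ExtractHostID, which
// recovers the host ID from such an ID. The following is only a minimal,
// self-contained sketch of that contract as the callers appear to assume it —
// the package name, the "-" separator, and the error handling are assumptions,
// not the real package's implementation.
package clusterid

import (
	"crypto/rand"
	"fmt"
	"strings"
)

// GenerateJobID returns a host-scoped job ID such as "host1-<uuid>". When
// uuid is empty a random UUID is generated, matching callers that pass "".
func GenerateJobID(hostID, uuid string) string {
	if uuid == "" {
		uuid = randomUUID()
	}
	return fmt.Sprintf("%s-%s", hostID, uuid)
}

// ExtractHostID returns the host ID portion of an ID produced by
// GenerateJobID, assuming host IDs themselves contain no "-".
func ExtractHostID(id string) (string, error) {
	parts := strings.SplitN(id, "-", 2)
	if len(parts) != 2 || parts[0] == "" {
		return "", fmt.Errorf("invalid job ID %q", id)
	}
	return parts[0], nil
}

// randomUUID returns a random UUID-style string (illustrative only).
func randomUUID() string {
	b := make([]byte, 16)
	rand.Read(b)
	return fmt.Sprintf("%x-%x-%x-%x-%x", b[:4], b[4:6], b[6:8], b[8:10], b[10:])
}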
func JobConfig(f *ct.ExpandedFormation, name, hostID string) *host.Job {
	t := f.Release.Processes[name]
	env := make(map[string]string, len(f.Release.Env)+len(t.Env)+4)
	for k, v := range f.Release.Env {
		env[k] = v
	}
	for k, v := range t.Env {
		env[k] = v
	}
	id := cluster.GenerateJobID(hostID)
	env["FLYNN_APP_ID"] = f.App.ID
	env["FLYNN_APP_NAME"] = f.App.Name
	env["FLYNN_RELEASE_ID"] = f.Release.ID
	env["FLYNN_PROCESS_TYPE"] = name
	env["FLYNN_JOB_ID"] = id
	job := &host.Job{
		ID: id,
		Metadata: map[string]string{
			"flynn-controller.app":      f.App.ID,
			"flynn-controller.app_name": f.App.Name,
			"flynn-controller.release":  f.Release.ID,
			"flynn-controller.type":     name,
		},
		Artifact: host.Artifact{
			Type: f.Artifact.Type,
			URI:  f.Artifact.URI,
		},
		Config: host.ContainerConfig{
			Cmd:         t.Cmd,
			Env:         env,
			HostNetwork: t.HostNetwork,
		},
		Resurrect: t.Resurrect,
		Resources: t.Resources,
	}
	if len(t.Entrypoint) > 0 {
		job.Config.Entrypoint = t.Entrypoint
	}
	job.Config.Ports = make([]host.Port, len(t.Ports))
	for i, p := range t.Ports {
		job.Config.Ports[i].Proto = p.Proto
		job.Config.Ports[i].Port = p.Port
		job.Config.Ports[i].Service = p.Service
	}
	return job
}
func (f *ClusterFixer) StartAppJob(app, typ, service string) ([]*discoverd.Instance, error) {
	f.l.Info(fmt.Sprintf("no %s %s process running, getting release details from hosts", app, typ))
	releases := f.FindAppReleaseJobs(app, typ)
	if len(releases) == 0 {
		return nil, fmt.Errorf("didn't find any %s %s release jobs", app, typ)
	}
	// get a job template from the first release
	var job *host.Job
	for _, job = range releases[0] {
		break
	}
	job.ID = cluster.GenerateJobID(f.hosts[0].ID(), "")
	f.FixJobEnv(job)
	// run it on a host
	f.l.Info(fmt.Sprintf("starting %s %s job", app, typ), "job.id", job.ID, "release", job.Metadata["flynn-controller.release"])
	if err := f.hosts[0].AddJob(job); err != nil {
		return nil, fmt.Errorf("error starting %s %s job: %s", app, typ, err)
	}
	f.l.Info("waiting for job to start")
	return discoverd.GetInstances(service, time.Minute)
}
func (f *ClusterFixer) FixDiscoverd() error {
	f.l.Info("ensuring discoverd is running on all hosts")
	releases := f.FindAppReleaseJobs("discoverd", "app")
	if len(releases) == 0 {
		return fmt.Errorf("didn't find any discoverd release jobs")
	}
outer:
	for hostID, job := range releases[0] {
		for _, h := range f.hosts {
			if h.ID() != hostID {
				continue
			}
			// check if discoverd is already running on this host
			jobs, err := h.ListJobs()
			if err != nil {
				return fmt.Errorf("error listing jobs on %s: %s", h.ID(), err)
			}
			for _, j := range jobs {
				if j.Status == host.StatusRunning &&
					j.Job.Metadata["flynn-controller.app_name"] == "discoverd" &&
					j.Job.Metadata["flynn-controller.type"] == "app" {
					continue outer
				}
			}
			job.ID = cluster.GenerateJobID(h.ID(), "")
			f.FixJobEnv(job)
			if err := h.AddJob(job); err != nil {
				return fmt.Errorf("error starting discoverd on %s: %s", h.ID(), err)
			}
			f.l.Info("started discoverd instance", "job.id", job.ID)
			break
		}
	}
	return nil
}
func (s *S) TestKillJob(c *C) {
	app := s.createTestApp(c, &ct.App{Name: "killjob"})
	release := s.createTestRelease(c, &ct.Release{})
	hostID := fakeHostID()
	uuid := random.UUID()
	jobID := cluster.GenerateJobID(hostID, uuid)
	s.createTestJob(c, &ct.Job{
		ID:        jobID,
		UUID:      uuid,
		HostID:    hostID,
		AppID:     app.ID,
		ReleaseID: release.ID,
		Type:      "web",
		State:     ct.JobStateStarting,
		Meta:      map[string]string{"some": "info"},
	})
	hc := tu.NewFakeHostClient(hostID, false)
	hc.AddJob(&host.Job{ID: jobID})
	s.cc.AddHost(hc)

	err := s.c.DeleteJob(app.ID, jobID)
	c.Assert(err, IsNil)
	c.Assert(hc.IsStopped(jobID), Equals, true)
}
func (c *controllerAPI) RunJob(ctx context.Context, w http.ResponseWriter, req *http.Request) {
	var newJob ct.NewJob
	if err := httphelper.DecodeJSON(req, &newJob); err != nil {
		respondWithError(w, err)
		return
	}
	if err := schema.Validate(newJob); err != nil {
		respondWithError(w, err)
		return
	}

	data, err := c.releaseRepo.Get(newJob.ReleaseID)
	if err != nil {
		respondWithError(w, err)
		return
	}
	release := data.(*ct.Release)
	var artifactIDs []string
	if len(newJob.ArtifactIDs) > 0 {
		artifactIDs = newJob.ArtifactIDs
	} else if len(release.ArtifactIDs) > 0 {
		artifactIDs = release.ArtifactIDs
	} else {
		httphelper.ValidationError(w, "release.ArtifactIDs", "cannot be empty")
		return
	}

	artifacts := make([]*ct.Artifact, len(artifactIDs))
	artifactList, err := c.artifactRepo.ListIDs(artifactIDs...)
	if err != nil {
		respondWithError(w, err)
		return
	}
	for i, id := range artifactIDs {
		artifacts[i] = artifactList[id]
	}

	var entrypoint ct.ImageEntrypoint
	if e := utils.GetEntrypoint(artifacts, ""); e != nil {
		entrypoint = *e
	}

	attach := strings.Contains(req.Header.Get("Upgrade"), "flynn-attach/0")

	hosts, err := c.clusterClient.Hosts()
	if err != nil {
		respondWithError(w, err)
		return
	}
	if len(hosts) == 0 {
		respondWithError(w, errors.New("no hosts found"))
		return
	}
	client := hosts[random.Math.Intn(len(hosts))]

	uuid := random.UUID()
	hostID := client.ID()
	id := cluster.GenerateJobID(hostID, uuid)
	app := c.getApp(ctx)
	env := make(map[string]string, len(entrypoint.Env)+len(release.Env)+len(newJob.Env)+4)
	env["FLYNN_APP_ID"] = app.ID
	env["FLYNN_RELEASE_ID"] = release.ID
	env["FLYNN_PROCESS_TYPE"] = ""
	env["FLYNN_JOB_ID"] = id
	for k, v := range entrypoint.Env {
		env[k] = v
	}
	if newJob.ReleaseEnv {
		for k, v := range release.Env {
			env[k] = v
		}
	}
	for k, v := range newJob.Env {
		env[k] = v
	}
	metadata := make(map[string]string, len(newJob.Meta)+3)
	for k, v := range newJob.Meta {
		metadata[k] = v
	}
	metadata["flynn-controller.app"] = app.ID
	metadata["flynn-controller.app_name"] = app.Name
	metadata["flynn-controller.release"] = release.ID
	job := &host.Job{
		ID:       id,
		Metadata: metadata,
		Config: host.ContainerConfig{
			Args:       entrypoint.Args,
			Env:        env,
			WorkingDir: entrypoint.WorkingDir,
			Uid:        entrypoint.Uid,
			Gid:        entrypoint.Gid,
			TTY:        newJob.TTY,
			Stdin:      attach,
			DisableLog: newJob.DisableLog,
		},
		Resources: newJob.Resources,
		Partition: string(newJob.Partition),
	}
	resource.SetDefaults(&job.Resources)
	if len(newJob.Args) > 0 {
		job.Config.Args = newJob.Args
	}
	utils.SetupMountspecs(job, artifacts)

	// provision data volume if required
	if newJob.Data {
		vol := &ct.VolumeReq{Path: "/data", DeleteOnStop: true}
		if _, err := utils.ProvisionVolume(vol, client, job); err != nil {
			respondWithError(w, err)
			return
		}
	}

	var attachClient cluster.AttachClient
	if attach {
		attachReq := &host.AttachReq{
			JobID:  job.ID,
			Flags:  host.AttachFlagStdout | host.AttachFlagStderr | host.AttachFlagStdin | host.AttachFlagStream,
			Height: uint16(newJob.Lines),
			Width:  uint16(newJob.Columns),
		}
		attachClient, err = client.Attach(attachReq, true)
		if err != nil {
			respondWithError(w, fmt.Errorf("attach failed: %s", err.Error()))
			return
		}
		defer attachClient.Close()
	}

	if err := client.AddJob(job); err != nil {
		respondWithError(w, fmt.Errorf("schedule failed: %s", err.Error()))
		return
	}

	if attach {
		// TODO(titanous): This Wait could block indefinitely if something goes
		// wrong, a context should be threaded in that cancels if the client
		// goes away.
		if err := attachClient.Wait(); err != nil {
			respondWithError(w, fmt.Errorf("attach wait failed: %s", err.Error()))
			return
		}
		w.Header().Set("Connection", "upgrade")
		w.Header().Set("Upgrade", "flynn-attach/0")
		w.WriteHeader(http.StatusSwitchingProtocols)
		conn, _, err := w.(http.Hijacker).Hijack()
		if err != nil {
			panic(err)
		}
		defer conn.Close()

		done := make(chan struct{}, 2)
		cp := func(to io.Writer, from io.Reader) {
			io.Copy(to, from)
			done <- struct{}{}
		}
		go cp(conn, attachClient.Conn())
		go cp(attachClient.Conn(), conn)

		// Wait for one of the connections to be closed or interrupted. EOF is
		// framed inside the attach protocol, so a read/write error indicates
		// that we're done and should clean up.
		<-done

		return
	} else {
		httphelper.JSON(w, 200, &ct.Job{
			ID:        job.ID,
			UUID:      uuid,
			HostID:    hostID,
			ReleaseID: newJob.ReleaseID,
			Args:      newJob.Args,
		})
	}
}
func (c *controllerAPI) RunJob(ctx context.Context, w http.ResponseWriter, req *http.Request) {
	var newJob ct.NewJob
	if err := httphelper.DecodeJSON(req, &newJob); err != nil {
		respondWithError(w, err)
		return
	}
	if err := schema.Validate(newJob); err != nil {
		respondWithError(w, err)
		return
	}

	data, err := c.releaseRepo.Get(newJob.ReleaseID)
	if err != nil {
		respondWithError(w, err)
		return
	}
	release := data.(*ct.Release)
	data, err = c.artifactRepo.Get(release.ArtifactID)
	if err != nil {
		respondWithError(w, err)
		return
	}
	artifact := data.(*ct.Artifact)
	attach := strings.Contains(req.Header.Get("Upgrade"), "flynn-attach/0")

	hosts, err := c.clusterClient.Hosts()
	if err != nil {
		respondWithError(w, err)
		return
	}
	if len(hosts) == 0 {
		respondWithError(w, errors.New("no hosts found"))
		return
	}
	client := hosts[random.Math.Intn(len(hosts))]

	id := cluster.GenerateJobID(client.ID(), "")
	app := c.getApp(ctx)
	env := make(map[string]string, len(release.Env)+len(newJob.Env)+4)
	env["FLYNN_APP_ID"] = app.ID
	env["FLYNN_RELEASE_ID"] = release.ID
	env["FLYNN_PROCESS_TYPE"] = ""
	env["FLYNN_JOB_ID"] = id
	if newJob.ReleaseEnv {
		for k, v := range release.Env {
			env[k] = v
		}
	}
	for k, v := range newJob.Env {
		env[k] = v
	}
	metadata := make(map[string]string, len(newJob.Meta)+3)
	for k, v := range newJob.Meta {
		metadata[k] = v
	}
	metadata["flynn-controller.app"] = app.ID
	metadata["flynn-controller.app_name"] = app.Name
	metadata["flynn-controller.release"] = release.ID
	job := &host.Job{
		ID:       id,
		Metadata: metadata,
		Artifact: host.Artifact{
			Type: artifact.Type,
			URI:  artifact.URI,
		},
		Config: host.ContainerConfig{
			Cmd:        newJob.Cmd,
			Env:        env,
			TTY:        newJob.TTY,
			Stdin:      attach,
			DisableLog: newJob.DisableLog,
		},
		Resources: newJob.Resources,
	}
	resource.SetDefaults(&job.Resources)
	if len(newJob.Entrypoint) > 0 {
		job.Config.Entrypoint = newJob.Entrypoint
	}

	var attachClient cluster.AttachClient
	if attach {
		attachReq := &host.AttachReq{
			JobID:  job.ID,
			Flags:  host.AttachFlagStdout | host.AttachFlagStderr | host.AttachFlagStdin | host.AttachFlagStream,
			Height: uint16(newJob.Lines),
			Width:  uint16(newJob.Columns),
		}
		attachClient, err = client.Attach(attachReq, true)
		if err != nil {
			respondWithError(w, fmt.Errorf("attach failed: %s", err.Error()))
			return
		}
		defer attachClient.Close()
	}

	if err := client.AddJob(job); err != nil {
		respondWithError(w, fmt.Errorf("schedule failed: %s", err.Error()))
		return
	}

	if attach {
		if err := attachClient.Wait(); err != nil {
			respondWithError(w, fmt.Errorf("attach wait failed: %s", err.Error()))
			return
		}
		w.Header().Set("Connection", "upgrade")
		w.Header().Set("Upgrade", "flynn-attach/0")
		w.WriteHeader(http.StatusSwitchingProtocols)
		conn, _, err := w.(http.Hijacker).Hijack()
		if err != nil {
			panic(err)
		}
		defer conn.Close()

		done := make(chan struct{}, 2)
		cp := func(to io.Writer, from io.Reader) {
			io.Copy(to, from)
			done <- struct{}{}
		}
		go cp(conn, attachClient.Conn())
		go cp(attachClient.Conn(), conn)
		<-done
		<-done
		return
	} else {
		httphelper.JSON(w, 200, &ct.Job{
			ID:        job.ID,
			ReleaseID: newJob.ReleaseID,
			Cmd:       newJob.Cmd,
		})
	}
}
func (f *ClusterFixer) FixPostgres() error {
	f.l.Info("checking postgres")
	service := discoverd.NewService("postgres")
	leader, _ := service.Leader()
	if leader == nil || leader.Addr == "" {
		f.l.Info("no running postgres leader")
		leader = nil
	} else {
		f.l.Info("found running postgres leader")
	}
	instances, _ := service.Instances()
	f.l.Info(fmt.Sprintf("found %d running postgres instances", len(instances)))

	f.l.Info("getting postgres status")
	var status *pgmanager.Status
	if leader != nil && leader.Addr != "" {
		client := pgmanager.NewClient(leader.Addr)
		var err error
		status, err = client.Status()
		if err != nil {
			f.l.Error("error getting status from postgres leader", "error", err)
		}
	}
	if status != nil && status.Postgres.ReadWrite {
		f.l.Info("postgres claims to be read-write")
		return nil
	}

	f.l.Info("getting postgres service metadata")
	meta, err := discoverd.NewService("postgres").GetMeta()
	if err != nil {
		return fmt.Errorf("error getting postgres state from discoverd: %s", err)
	}

	var state pgstate.State
	if err := json.Unmarshal(meta.Data, &state); err != nil {
		return fmt.Errorf("error decoding postgres state: %s", err)
	}
	if state.Primary == nil {
		return fmt.Errorf("no primary in postgres state")
	}

	f.l.Info("getting postgres primary job info", "job.id", state.Primary.Meta["FLYNN_JOB_ID"])
	job, host, err := f.GetJob(state.Primary.Meta["FLYNN_JOB_ID"])
	if err != nil {
		if state.Sync != nil {
			f.l.Error("unable to get primary job info", "error", err)
			f.l.Info("getting postgres sync job info", "job.id", state.Sync.Meta["FLYNN_JOB_ID"])
			job, host, err = f.GetJob(state.Sync.Meta["FLYNN_JOB_ID"])
			if err != nil {
				return fmt.Errorf("unable to get postgres primary or sync job details: %s", err)
			}
		} else {
			return fmt.Errorf("unable to get postgres primary job details: %s", err)
		}
	}

	if leader != nil && state.Singleton {
		return fmt.Errorf("postgres leader is running in singleton mode, unable to fix")
	}

	waitForInstance := func(jobID string) (func() (string, error), error) {
		watchCh := make(chan *discoverd.Event)
		upCh := make(chan string)
		stream, err := service.Watch(watchCh)
		if err != nil {
			return nil, fmt.Errorf("error watching discoverd service: %s", err)
		}
		go func() {
			var current bool
			for event := range watchCh {
				if event.Kind == discoverd.EventKindCurrent {
					current = true
					continue
				}
				if !current || event.Kind != discoverd.EventKindUp {
					continue
				}
				if event.Instance.Meta["FLYNN_JOB_ID"] == jobID {
					upCh <- event.Instance.Addr
				}
			}
		}()
		return func() (string, error) {
			f.l.Info("waiting for postgres instance to start", "job.id", jobID)
			defer stream.Close()
			select {
			case addr := <-upCh:
				return addr, nil
			case <-time.After(time.Minute):
				return "", fmt.Errorf("timed out waiting for postgres instance to come up")
			}
		}, nil
	}

	var wait func() (string, error)
	have := len(instances)
	want := 2
	if state.Singleton {
		want = 1
	}
	if have >= want {
		return fmt.Errorf("already have enough postgres instances, unable to fix")
	}
	f.l.Info("attempting to start missing postgres jobs", "want", want, "have", have)
	if leader == nil {
		// if no postgres, attempt to start
		job.ID = cluster.GenerateJobID(host.ID(), "")
		f.FixJobEnv(job)
		f.l.Info("starting postgres primary job", "job.id", job.ID)
		wait, err = waitForInstance(job.ID)
		if err != nil {
			return err
		}
		if err := host.AddJob(job); err != nil {
			return fmt.Errorf("error starting postgres primary job on %s: %s", host.ID(), err)
		}
		have++
	}
	if want > have {
		// if not enough postgres instances, start another
		var secondHost *cluster.Host
		for _, h := range f.hosts {
			if h.ID() != host.ID() {
				secondHost = h
				break
			}
		}
		if secondHost == nil {
			// if there are no other hosts, use the same one we put the primary on
			secondHost = host
		}
		job.ID = cluster.GenerateJobID(secondHost.ID(), "")
		f.FixJobEnv(job)
		f.l.Info("starting second postgres job", "job.id", job.ID)
		if wait == nil {
			wait, err = waitForInstance(job.ID)
			if err != nil {
				return err
			}
		}
		if err := utils.ProvisionVolume(secondHost, job); err != nil {
			return fmt.Errorf("error creating postgres volume on %s: %s", secondHost.ID(), err)
		}
		if err := secondHost.AddJob(job); err != nil {
			return fmt.Errorf("error starting additional postgres job on %s: %s", secondHost.ID(), err)
		}
	}

	if wait != nil {
		addr, err := wait()
		if err != nil {
			return err
		}
		if leader != nil {
			addr = leader.Addr
		}
		f.l.Info("waiting for postgres to come up read-write")
		return pgmanager.NewClient(addr).WaitForReadWrite(5 * time.Minute)
	}
	return nil
}
/*
Restore prior state from the save location defined at construction time. If
the state save file is empty, nothing is loaded, and no error is returned.
*/
func (s *State) Restore(backend Backend, buffers host.LogBuffers) (func(), error) {
	if err := s.Acquire(); err != nil {
		return nil, err
	}
	defer s.Release()

	s.backend = backend

	var resurrect []*host.Job
	if err := s.stateDB.View(func(tx *bolt.Tx) error {
		jobsBucket := tx.Bucket([]byte("jobs"))
		backendJobsBucket := tx.Bucket([]byte("backend-jobs"))
		backendGlobalBucket := tx.Bucket([]byte("backend-global"))
		persistentBucket := tx.Bucket([]byte("persistent-jobs"))

		// restore jobs
		if err := jobsBucket.ForEach(func(k, v []byte) error {
			job := &host.ActiveJob{}
			if err := json.Unmarshal(v, job); err != nil {
				return err
			}
			if job.CreatedAt.IsZero() {
				job.CreatedAt = time.Now()
			}
			s.jobs[string(k)] = job
			return nil
		}); err != nil {
			return err
		}

		// hand opaque blobs back to backend so it can do its restore
		backendJobsBlobs := make(map[string][]byte)
		if err := backendJobsBucket.ForEach(func(k, v []byte) error {
			backendJobsBlobs[string(k)] = v
			return nil
		}); err != nil {
			return err
		}
		backendGlobalBlob := backendGlobalBucket.Get([]byte("backend"))
		if err := backend.UnmarshalState(s.jobs, backendJobsBlobs, backendGlobalBlob, buffers); err != nil {
			return err
		}

		// resurrect any persistent jobs which are not running
		if err := persistentBucket.ForEach(func(k, v []byte) error {
			for _, job := range s.jobs {
				if job.Job.ID == string(v) && !backend.JobExists(job.Job.ID) {
					resurrect = append(resurrect, job.Job)
				}
			}
			return nil
		}); err != nil {
			return err
		}

		return nil
	}); err != nil && err != io.EOF {
		return nil, fmt.Errorf("could not restore from host persistence db: %s", err)
	}

	return func() {
		if len(resurrect) == 0 {
			return
		}
		var wg sync.WaitGroup
		wg.Add(len(resurrect))
		for _, job := range resurrect {
			go func(job *host.Job) {
				// generate a new job id, this is a new job
				newJob := job.Dup()
				newJob.ID = cluster.GenerateJobID(s.id, "")
				if _, ok := newJob.Config.Env["FLYNN_JOB_ID"]; ok {
					newJob.Config.Env["FLYNN_JOB_ID"] = newJob.ID
				}
				log.Printf("resurrecting %s as %s", job.ID, newJob.ID)
				s.AddJob(newJob)
				backend.Run(newJob, nil, nil)
				wg.Done()
			}(job)
		}
		wg.Wait()
	}, nil
}
func (f *ClusterFixer) FixSirenia(svc string) error {
	log := f.l.New("fn", "FixSirenia", "service", svc)

	service := discoverd.NewService(svc)
	instances, _ := service.Instances()
	leader, _ := service.Leader()

	log.Info("getting service metadata")
	meta, err := service.GetMeta()
	if err != nil {
		return fmt.Errorf("error getting sirenia state from discoverd: %s", err)
	}

	var state state.State
	if err := json.Unmarshal(meta.Data, &state); err != nil {
		return fmt.Errorf("error decoding state: %s", err)
	}
	if state.Primary == nil {
		return fmt.Errorf("no primary in sirenia state")
	}

	log.Info("getting primary job info", "job.id", state.Primary.Meta["FLYNN_JOB_ID"])
	primaryJob, primaryHost, err := f.GetJob(state.Primary.Meta["FLYNN_JOB_ID"])
	if err != nil {
		log.Error("unable to get primary job info")
	}
	var syncJob *host.Job
	var syncHost *cluster.Host
	if state.Sync != nil {
		log.Info("getting sync job info", "job.id", state.Sync.Meta["FLYNN_JOB_ID"])
		syncJob, syncHost, err = f.GetJob(state.Sync.Meta["FLYNN_JOB_ID"])
		if err != nil {
			log.Error("unable to get sync job info")
		}
	}

	waitForInstance := func(jobID string) (func() (string, error), error) {
		watchCh := make(chan *discoverd.Event)
		upCh := make(chan string)
		stream, err := service.Watch(watchCh)
		if err != nil {
			return nil, fmt.Errorf("error watching discoverd service: %s", err)
		}
		go func() {
			var current bool
			for event := range watchCh {
				if event.Kind == discoverd.EventKindCurrent {
					current = true
					continue
				}
				if !current || event.Kind != discoverd.EventKindUp {
					continue
				}
				if event.Instance.Meta["FLYNN_JOB_ID"] == jobID {
					upCh <- event.Instance.Addr
				}
			}
		}()
		return func() (string, error) {
			log.Info("waiting for instance to start", "job.id", jobID)
			defer stream.Close()
			select {
			case addr := <-upCh:
				return addr, nil
			case <-time.After(time.Minute):
				return "", fmt.Errorf("timed out waiting for sirenia instance to come up")
			}
		}, nil
	}

	log.Info("terminating unassigned sirenia instances")
outer:
	for _, i := range instances {
		if i.Addr == state.Primary.Addr || (state.Sync != nil && i.Addr == state.Sync.Addr) {
			continue
		}
		for _, a := range state.Async {
			if i.Addr == a.Addr {
				continue outer
			}
		}
		// job not assigned in state, attempt to terminate it
		if jobID, ok := i.Meta["FLYNN_JOB_ID"]; ok {
			hostID, err := cluster.ExtractHostID(jobID)
			if err != nil {
				log.Error("error extracting host id from jobID", "jobID", jobID, "err", err)
			}
			h := f.Host(hostID)
			if h != nil {
				if err := h.StopJob(jobID); err != nil {
					log.Error("error stopping unassigned sirenia job", "jobID", jobID)
				}
			} else {
				log.Error("host not found", "hostID", hostID)
			}
		}
	}

	isRunning := func(addr string) bool {
		for _, i := range instances {
			if i.Addr == addr {
				return true
			}
		}
		return false
	}

	// if the leader isn't currently running then start it using primaryJob/primaryHost
	var wait func() (string, error)
	if !isRunning(state.Primary.Addr) {
		// if we don't have info about the primary job attempt to promote the sync
		if primaryJob == nil {
			if syncJob != nil {
				// set primary job to sync
				primaryJob = syncJob
				primaryHost = syncHost

				// nil out sync job now so we can re-allocate it.
				syncJob = nil
				syncHost = nil
			} else {
				return fmt.Errorf("neither primary or sync job info available")
			}
		}

		primaryJob.ID = cluster.GenerateJobID(primaryHost.ID(), "")
		f.FixJobEnv(primaryJob)
		log.Info("starting primary job", "job.id", primaryJob.ID)
		wait, err = waitForInstance(primaryJob.ID)
		if err != nil {
			return err
		}
		if err := primaryHost.AddJob(primaryJob); err != nil {
			return fmt.Errorf("error starting primary job on %s: %s", primaryHost.ID(), err)
		}
	}

	if !state.Singleton && !isRunning(state.Sync.Addr) {
		if syncHost == nil {
			for _, h := range f.hosts {
				if h.ID() != primaryHost.ID() {
					syncHost = h
					break
				}
			}
			if syncHost == nil {
				// if there are no other hosts, use the same one we put the primary on
				syncHost = primaryHost
			}
		}

		// if we don't have a sync job then copy the primary job
		// and provision a new volume
		if syncJob == nil {
			syncJob = primaryJob
			vol := &ct.VolumeReq{Path: "/data"}
			if _, err := utils.ProvisionVolume(vol, syncHost, syncJob); err != nil {
				return fmt.Errorf("error creating volume on %s: %s", syncHost.ID(), err)
			}
		}
		syncJob.ID = cluster.GenerateJobID(syncHost.ID(), "")
		f.FixJobEnv(syncJob)

		log.Info("starting sync job", "job.id", syncJob.ID)
		if wait == nil {
			wait, err = waitForInstance(syncJob.ID)
			if err != nil {
				return err
			}
		}
		if err := syncHost.AddJob(syncJob); err != nil {
			return fmt.Errorf("error starting additional job on %s: %s", syncHost.ID(), err)
		}
	}

	if wait != nil {
		addr, err := wait()
		if err != nil {
			return err
		}
		if leader != nil && leader.Addr != "" {
			addr = leader.Addr
		}
		log.Info("waiting for cluster to come up read-write", "addr", addr)
		return sirenia.NewClient(addr).WaitForReadWrite(5 * time.Minute)
	}

	return nil
}
func (TestSuite) TestMultipleHosts(c *C) {
	hosts := newTestHosts()
	fakeCluster := newTestCluster(hosts)
	s := newTestScheduler(c, fakeCluster, true)

	// use incremental job IDs so we can find them easily in s.jobs
	var jobID uint64
	s.generateJobUUID = func() string {
		return fmt.Sprintf("job%d", atomic.AddUint64(&jobID, 1))
	}

	s.maxHostChecks = 1
	go s.Run()
	defer s.Stop()

	assertJobs := func(expected map[string]*Job) {
		jobs := s.Jobs()
		c.Assert(jobs, HasLen, len(expected))
		for id, job := range expected {
			actual, ok := jobs[id]
			if !ok {
				c.Fatalf("%s does not exist in s.jobs", id)
			}
			c.Assert(actual.Type, Equals, job.Type)
			c.Assert(actual.state, Equals, job.state)
			c.Assert(actual.HostID, Equals, job.HostID)
		}
	}

	c.Log("Initialize the cluster with 1 host and wait for a job to start on it.")
	s.waitJobStart()
	assertJobs(map[string]*Job{
		"job1": {Type: "web", state: JobStateStarting, HostID: testHostID},
	})

	c.Log("Add a host to the cluster, then create a new app, artifact, release, and associated formation.")
	h2 := NewFakeHostClient("host2")
	fakeCluster.AddHost(h2)
	hosts[h2.ID()] = h2

	app := &ct.App{ID: "test-app-2", Name: "test-app-2"}
	artifact := &ct.Artifact{ID: "test-artifact-2"}
	processes := map[string]int{"omni": 1}
	release := NewReleaseOmni("test-release-2", artifact, processes, true)

	c.Log("Add the formation to the controller. Wait for formation change and job start on both hosts.")
	s.CreateApp(app)
	s.CreateArtifact(artifact)
	s.CreateRelease(release)
	s.PutFormation(&ct.Formation{AppID: app.ID, ReleaseID: release.ID, Processes: processes})
	s.waitFormationChange()
	s.waitJobStart()
	s.waitJobStart()
	assertJobs(map[string]*Job{
		"job1": {Type: "web", state: JobStateStarting, HostID: "host1"},
		"job2": {Type: "omni", state: JobStateStarting, HostID: "host1"},
		"job3": {Type: "omni", state: JobStateStarting, HostID: "host2"},
	})

	assertHostJobs := func(host *FakeHostClient, ids ...string) {
		jobs, err := host.ListJobs()
		c.Assert(err, IsNil)
		c.Assert(jobs, HasLen, len(ids))
		for _, id := range ids {
			id = cluster.GenerateJobID(host.ID(), id)
			job, ok := jobs[id]
			if !ok {
				c.Fatalf("%s missing job with ID %s", host.ID(), id)
			}
			c.Assert(job.Job.ID, Equals, id)
		}
	}
	h1 := hosts[testHostID]
	assertHostJobs(h1, "job1", "job2")
	assertHostJobs(h2, "job3")

	h3 := NewFakeHostClient("host3")
	c.Log("Add a host, wait for omni job start on that host.")
	fakeCluster.AddHost(h3)
	s.waitJobStart()
	assertJobs(map[string]*Job{
		"job1": {Type: "web", state: JobStateStarting, HostID: "host1"},
		"job2": {Type: "omni", state: JobStateStarting, HostID: "host1"},
		"job3": {Type: "omni", state: JobStateStarting, HostID: "host2"},
		"job4": {Type: "omni", state: JobStateStarting, HostID: "host3"},
	})
	assertHostJobs(h3, "job4")

	c.Log("Crash one of the omni jobs, and wait for it to restart")
	h3.CrashJob("job4")
	s.waitJobStop()
	s.waitJobStart()
	s.waitRectify()
	assertJobs(map[string]*Job{
		"job1": {Type: "web", state: JobStateStarting, HostID: "host1"},
		"job2": {Type: "omni", state: JobStateStarting, HostID: "host1"},
		"job3": {Type: "omni", state: JobStateStarting, HostID: "host2"},
		"job4": {Type: "omni", state: JobStateStopped, HostID: "host3"},
		"job5": {Type: "omni", state: JobStateStarting, HostID: "host3"},
	})
	assertHostJobs(h3, "job5")

	c.Log("Unbalance the omni jobs, wait for them to be re-balanced")

	// pause the scheduler so we can unbalance the jobs without it trying
	// to rectify the situation
	s.Pause()

	// move host3's job to host2
	id := cluster.GenerateJobID(h3.ID(), "job5")
	job, err := h3.GetJob(id)
	c.Assert(err, IsNil)
	newJob := job.Job.Dup()
	newJob.ID = cluster.GenerateJobID(h2.ID(), s.generateJobUUID())
	h2.AddJob(newJob)
	err = h3.StopJob(id)
	c.Assert(err, IsNil)

	// resume the scheduler and check it moves the job back to host3
	s.Resume()
	s.waitRectify()
	s.waitJobStart()
	assertJobs(map[string]*Job{
		"job1": {Type: "web", state: JobStateStarting, HostID: "host1"},
		"job2": {Type: "omni", state: JobStateStarting, HostID: "host1"},
		"job3": {Type: "omni", state: JobStateStarting, HostID: "host2"},
		"job4": {Type: "omni", state: JobStateStopped, HostID: "host3"},
		"job5": {Type: "omni", state: JobStateStopped, HostID: "host3"},
		"job6": {Type: "omni", state: JobStateStopped, HostID: "host2"},
		"job7": {Type: "omni", state: JobStateStarting, HostID: "host3"},
	})

	c.Logf("Remove one of the hosts. Ensure the cluster recovers correctly (hosts=%v)", hosts)
	h3.Healthy = false
	fakeCluster.SetHosts(hosts)
	s.waitFormationSync()
	s.waitRectify()
	assertJobs(map[string]*Job{
		"job1": {Type: "web", state: JobStateStarting, HostID: "host1"},
		"job2": {Type: "omni", state: JobStateStarting, HostID: "host1"},
		"job3": {Type: "omni", state: JobStateStarting, HostID: "host2"},
		"job4": {Type: "omni", state: JobStateStopped, HostID: "host3"},
		"job5": {Type: "omni", state: JobStateStopped, HostID: "host3"},
		"job6": {Type: "omni", state: JobStateStopped, HostID: "host2"},
		"job7": {Type: "omni", state: JobStateStopped, HostID: "host3"},
	})
	assertHostJobs(h1, "job1", "job2")
	assertHostJobs(h2, "job3")

	c.Logf("Remove another host. Ensure the cluster recovers correctly (hosts=%v)", hosts)
	h1.Healthy = false
	fakeCluster.RemoveHost(testHostID)
	s.waitFormationSync()
	s.waitRectify()
	s.waitJobStart()
	assertJobs(map[string]*Job{
		"job1": {Type: "web", state: JobStateStopped, HostID: "host1"},
		"job2": {Type: "omni", state: JobStateStopped, HostID: "host1"},
		"job3": {Type: "omni", state: JobStateStarting, HostID: "host2"},
		"job4": {Type: "omni", state: JobStateStopped, HostID: "host3"},
		"job5": {Type: "omni", state: JobStateStopped, HostID: "host3"},
		"job6": {Type: "omni", state: JobStateStopped, HostID: "host2"},
		"job7": {Type: "omni", state: JobStateStopped, HostID: "host3"},
		"job8": {Type: "web", state: JobStateStarting, HostID: "host2"},
	})
	assertHostJobs(h2, "job3", "job8")
}
/*
Restore prior state from the save location defined at construction time. If
the state save file is empty, nothing is loaded, and no error is returned.
*/
func (s *State) Restore(backend Backend, buffers host.LogBuffers) (func(), error) {
	if err := s.Acquire(); err != nil {
		return nil, err
	}
	defer s.Release()

	s.backend = backend

	var resurrect []*host.ActiveJob
	if err := s.stateDB.View(func(tx *bolt.Tx) error {
		jobsBucket := tx.Bucket([]byte("jobs"))
		backendJobsBucket := tx.Bucket([]byte("backend-jobs"))
		backendGlobalBucket := tx.Bucket([]byte("backend-global"))
		resurrectionBucket := tx.Bucket([]byte("resurrection-jobs"))

		// restore jobs
		if err := jobsBucket.ForEach(func(k, v []byte) error {
			job := &host.ActiveJob{}
			if err := json.Unmarshal(v, job); err != nil {
				return err
			}
			if job.ContainerID != "" {
				s.containers[job.ContainerID] = job
			}
			s.jobs[string(k)] = job
			return nil
		}); err != nil {
			return err
		}

		// hand opaque blobs back to backend so it can do its restore
		backendJobsBlobs := make(map[string][]byte)
		if err := backendJobsBucket.ForEach(func(k, v []byte) error {
			backendJobsBlobs[string(k)] = v
			return nil
		}); err != nil {
			return err
		}
		backendGlobalBlob := backendGlobalBucket.Get([]byte("backend"))
		if err := backend.UnmarshalState(s.jobs, backendJobsBlobs, backendGlobalBlob, buffers); err != nil {
			return err
		}

		if resurrectionBucket == nil {
			s.mtx.Lock()
			for _, job := range s.jobs {
				// if there was an unclean shutdown, we resurrect all jobs marked
				// Resurrect that were running at shutdown and are no longer running.
				if job.Job.Resurrect && job.Status != host.StatusRunning {
					resurrect = append(resurrect, job)
				}
			}
			s.mtx.Unlock()
		} else {
			defer tx.DeleteBucket([]byte("resurrection-jobs"))
			if err := resurrectionBucket.ForEach(func(k, v []byte) error {
				job := &host.ActiveJob{}
				if err := json.Unmarshal(v, job); err != nil {
					return err
				}
				resurrect = append(resurrect, job)
				return nil
			}); err != nil {
				return err
			}
		}

		return nil
	}); err != nil && err != io.EOF {
		return nil, fmt.Errorf("could not restore from host persistence db: %s", err)
	}

	return func() {
		var wg sync.WaitGroup
		wg.Add(len(resurrect))
		for _, job := range resurrect {
			go func(job *host.ActiveJob) {
				// generate a new job id, this is a new job
				newID := cluster.GenerateJobID(s.id, "")
				log.Printf("resurrecting %s as %s", job.Job.ID, newID)
				job.Job.ID = newID
				config := &RunConfig{
					// TODO(titanous): Use Job instead of ActiveJob in
					// resurrection bucket once InternalIP is not used.
					// TODO(titanous): Passing the IP is a hack, remove it once the
					// postgres appliance doesn't use it to calculate its ID in the
					// state machine.
					IP: net.ParseIP(job.InternalIP),
				}
				backend.Run(job.Job, config)
				wg.Done()
			}(job)
		}
		wg.Wait()
	}, nil
}
func JobConfig(f *ct.ExpandedFormation, name, hostID string, uuid string) *host.Job {
	t := f.Release.Processes[name]

	var entrypoint ct.ImageEntrypoint
	if e := GetEntrypoint(f.Artifacts, name); e != nil {
		entrypoint = *e
	}

	env := make(map[string]string, len(entrypoint.Env)+len(f.Release.Env)+len(t.Env)+5)
	for k, v := range entrypoint.Env {
		env[k] = v
	}
	for k, v := range f.Release.Env {
		env[k] = v
	}
	for k, v := range t.Env {
		env[k] = v
	}
	id := cluster.GenerateJobID(hostID, uuid)
	env["FLYNN_APP_ID"] = f.App.ID
	env["FLYNN_APP_NAME"] = f.App.Name
	env["FLYNN_RELEASE_ID"] = f.Release.ID
	env["FLYNN_PROCESS_TYPE"] = name
	env["FLYNN_JOB_ID"] = id

	metadata := make(map[string]string, len(f.App.Meta)+5)
	for k, v := range f.App.Meta {
		metadata[k] = v
	}
	metadata["flynn-controller.app"] = f.App.ID
	metadata["flynn-controller.app_name"] = f.App.Name
	metadata["flynn-controller.release"] = f.Release.ID
	metadata["flynn-controller.formation"] = "true"
	metadata["flynn-controller.type"] = name

	job := &host.Job{
		ID:       id,
		Metadata: metadata,
		Config: host.ContainerConfig{
			Args:        entrypoint.Args,
			Env:         env,
			WorkingDir:  entrypoint.WorkingDir,
			Uid:         entrypoint.Uid,
			Gid:         entrypoint.Gid,
			HostNetwork: t.HostNetwork,
		},
		Resurrect: t.Resurrect,
		Resources: t.Resources,
	}
	if len(t.Args) > 0 {
		job.Config.Args = t.Args
	}
	// job.Config.Args may be empty if restoring from an old backup which
	// still uses the deprecated Entrypoint / Cmd fields
	if len(job.Config.Args) == 0 {
		job.Config.Args = append(t.DeprecatedEntrypoint, t.DeprecatedCmd...)
	}
	SetupMountspecs(job, f.Artifacts)
	if f.App.Meta["flynn-system-app"] == "true" {
		job.Partition = "system"
	}
	job.Config.Ports = make([]host.Port, len(t.Ports))
	for i, p := range t.Ports {
		job.Config.Ports[i].Proto = p.Proto
		job.Config.Ports[i].Port = p.Port
		job.Config.Ports[i].Service = p.Service
	}
	return job
}
func (TestSuite) TestMultipleHosts(c *C) {
	hosts := newTestHosts()
	host1 := hosts[testHostID]
	fakeCluster := newTestCluster(hosts)
	s := newTestScheduler(c, fakeCluster, true, nil)

	// use incremental job IDs so we can find them easily in s.jobs
	var jobID uint64
	s.generateJobUUID = func() string {
		return fmt.Sprintf("job%d", atomic.AddUint64(&jobID, 1))
	}

	s.maxHostChecks = 1
	go s.Run()
	defer s.Stop()

	// assertJobs checks that hosts have expected jobs based on their type
	// and current state
	type hostJobs map[utils.HostClient][]*Job
	assertJobs := func(expected hostJobs) {
		// get a sorted list of scheduler jobs per host to compare
		// against the expected list
		actual := make(map[string]sortJobs)
		for _, job := range s.InternalState().Jobs {
			actual[job.HostID] = append(actual[job.HostID], job)
		}
		for _, jobs := range actual {
			jobs.SortReverse()
		}

		for host, jobs := range expected {
			actual := actual[host.ID()]
			if len(actual) != len(jobs) {
				c.Fatalf("expected %s to have %d jobs, got %d", host.ID(), len(jobs), len(actual))
			}
			for i, job := range jobs {
				j := actual[i]
				c.Assert(j.Type, Equals, job.Type)
				c.Assert(j.State, Equals, job.State)

				// check the host has the job if it is running (stopped
				// jobs are removed from the host)
				if job.State != JobStateStarting {
					continue
				}
				id := cluster.GenerateJobID(host.ID(), j.ID)
				hostJob, err := host.GetJob(id)
				c.Assert(err, IsNil)
				c.Assert(hostJob.Job.ID, Equals, id)
			}
		}
	}

	c.Log("Initialize the cluster with 1 host and wait for a job to start on it.")
	s.waitJobStart()
	assertJobs(hostJobs{
		host1: {
			{Type: "web", State: JobStateStarting},
		},
	})

	c.Log("Add a host to the cluster, then create a new app, artifact, release, and associated formation.")
	host2 := NewFakeHostClient("host2", true)
	fakeCluster.AddHost(host2)
	hosts[host2.ID()] = host2

	app := &ct.App{ID: "test-app-2", Name: "test-app-2"}
	artifact := &ct.Artifact{ID: "test-artifact-2"}
	processes := map[string]int{"omni": 1}
	release := NewReleaseOmni("test-release-2", artifact, processes, true)

	c.Log("Add the formation to the controller. Wait for formation change and job start on both hosts.")
	s.CreateApp(app)
	s.CreateArtifact(artifact)
	s.CreateRelease(release)
	s.PutFormation(&ct.Formation{AppID: app.ID, ReleaseID: release.ID, Processes: processes})
	s.waitJobStart()
	s.waitJobStart()
	assertJobs(hostJobs{
		host1: {
			{Type: "web", State: JobStateStarting},
			{Type: "omni", State: JobStateStarting},
		},
		host2: {
			{Type: "omni", State: JobStateStarting},
		},
	})

	host3 := NewFakeHostClient("host3", true)
	c.Log("Add a host, wait for omni job start on that host.")
	fakeCluster.AddHost(host3)
	s.waitJobStart()
	assertJobs(hostJobs{
		host1: {
			{Type: "web", State: JobStateStarting},
			{Type: "omni", State: JobStateStarting},
		},
		host2: {
			{Type: "omni", State: JobStateStarting},
		},
		host3: {
			{Type: "omni", State: JobStateStarting},
		},
	})

	c.Log("Crash one of the omni jobs, and wait for it to restart")
	host3.CrashJob("job4")
	s.waitJobStop()
	s.waitJobStart()
	assertJobs(hostJobs{
		host1: {
			{Type: "web", State: JobStateStarting},
			{Type: "omni", State: JobStateStarting},
		},
		host2: {
			{Type: "omni", State: JobStateStarting},
		},
		host3: {
			{Type: "omni", State: JobStateStopped},
			{Type: "omni", State: JobStateStarting},
		},
	})

	c.Log("Unbalance the omni jobs, wait for them to be re-balanced")

	// pause the scheduler so we can unbalance the jobs without it trying
	// to rectify the situation
	s.Pause()

	// move host2's job to host3
	var job *host.ActiveJob
	jobs, err := host2.ListJobs()
	for _, j := range jobs {
		if j.Status == host.StatusStarting {
			job = &j
			break
		}
	}
	if job == nil {
		c.Fatal("could not find host2's omni job")
	}
	newJob := job.Job.Dup()
	newJob.ID = cluster.GenerateJobID(host3.ID(), s.generateJobUUID())
	host3.AddJob(newJob)
	err = host2.StopJob(job.Job.ID)
	c.Assert(err, IsNil)

	// resume the scheduler and check it moves the job back to host2
	s.Resume()
	s.waitJobStart()
	s.waitJobStart()
	assertJobs(hostJobs{
		host1: {
			{Type: "web", State: JobStateStarting},
			{Type: "omni", State: JobStateStarting},
		},
		host2: {
			{Type: "omni", State: JobStateStopped},
			{Type: "omni", State: JobStateStarting},
		},
		host3: {
			{Type: "omni", State: JobStateStopped},
			{Type: "omni", State: JobStateStarting},
			{Type: "omni", State: JobStateStopped},
		},
	})

	c.Logf("Remove one of the hosts. Ensure the cluster recovers correctly (hosts=%v)", hosts)
	host3.Healthy = false
	fakeCluster.SetHosts(hosts)
	s.waitFormationSync()
	s.waitRectify()
	assertJobs(hostJobs{
		host1: {
			{Type: "web", State: JobStateStarting},
			{Type: "omni", State: JobStateStarting},
		},
		host2: {
			{Type: "omni", State: JobStateStopped},
			{Type: "omni", State: JobStateStarting},
		},
		host3: {
			{Type: "omni", State: JobStateStopped},
			{Type: "omni", State: JobStateStopped},
			{Type: "omni", State: JobStateStopped},
		},
	})

	c.Logf("Remove another host. Ensure the cluster recovers correctly (hosts=%v)", hosts)
	host1.(*FakeHostClient).Healthy = false
	fakeCluster.RemoveHost(host1.ID())
	s.waitFormationSync()
	s.waitJobStart()
	assertJobs(hostJobs{
		host1: {
			{Type: "web", State: JobStateStopped},
			{Type: "omni", State: JobStateStopped},
		},
		host2: {
			{Type: "omni", State: JobStateStopped},
			{Type: "omni", State: JobStateStarting},
			{Type: "web", State: JobStateStarting},
		},
		host3: {
			{Type: "omni", State: JobStateStopped},
			{Type: "omni", State: JobStateStopped},
			{Type: "omni", State: JobStateStopped},
		},
	})
}
func (c *controllerAPI) RunJob(ctx context.Context, w http.ResponseWriter, req *http.Request) {
	var newJob ct.NewJob
	if err := httphelper.DecodeJSON(req, &newJob); err != nil {
		respondWithError(w, err)
		return
	}
	if err := schema.Validate(newJob); err != nil {
		respondWithError(w, err)
		return
	}

	data, err := c.releaseRepo.Get(newJob.ReleaseID)
	if err != nil {
		respondWithError(w, err)
		return
	}
	release := data.(*ct.Release)
	if release.ImageArtifactID() == "" {
		httphelper.ValidationError(w, "release.ImageArtifact", "must be set")
		return
	}
	attach := strings.Contains(req.Header.Get("Upgrade"), "flynn-attach/0")

	hosts, err := c.clusterClient.Hosts()
	if err != nil {
		respondWithError(w, err)
		return
	}
	if len(hosts) == 0 {
		respondWithError(w, errors.New("no hosts found"))
		return
	}
	client := hosts[random.Math.Intn(len(hosts))]

	uuid := random.UUID()
	hostID := client.ID()
	id := cluster.GenerateJobID(hostID, uuid)
	app := c.getApp(ctx)
	env := make(map[string]string, len(release.Env)+len(newJob.Env)+4)
	env["FLYNN_APP_ID"] = app.ID
	env["FLYNN_RELEASE_ID"] = release.ID
	env["FLYNN_PROCESS_TYPE"] = ""
	env["FLYNN_JOB_ID"] = id
	if newJob.ReleaseEnv {
		for k, v := range release.Env {
			env[k] = v
		}
	}
	for k, v := range newJob.Env {
		env[k] = v
	}
	metadata := make(map[string]string, len(newJob.Meta)+3)
	for k, v := range newJob.Meta {
		metadata[k] = v
	}
	metadata["flynn-controller.app"] = app.ID
	metadata["flynn-controller.app_name"] = app.Name
	metadata["flynn-controller.release"] = release.ID
	job := &host.Job{
		ID:       id,
		Metadata: metadata,
		Config: host.ContainerConfig{
			Env:        env,
			TTY:        newJob.TTY,
			Stdin:      attach,
			DisableLog: newJob.DisableLog,
		},
		Resources: newJob.Resources,
	}
	resource.SetDefaults(&job.Resources)
	if len(newJob.Args) > 0 {
		job.Config.Args = newJob.Args
	}
	if len(release.ArtifactIDs) > 0 {
		artifacts, err := c.artifactRepo.ListIDs(release.ArtifactIDs...)
		if err != nil {
			respondWithError(w, err)
			return
		}
		job.ImageArtifact = artifacts[release.ImageArtifactID()].HostArtifact()
		job.FileArtifacts = make([]*host.Artifact, len(release.FileArtifactIDs()))
		for i, id := range release.FileArtifactIDs() {
			job.FileArtifacts[i] = artifacts[id].HostArtifact()
		}
	}

	// ensure slug apps use /runner/init
	if release.IsGitDeploy() && (len(job.Config.Args) == 0 || job.Config.Args[0] != "/runner/init") {
		job.Config.Args = append([]string{"/runner/init"}, job.Config.Args...)
	}

	var attachClient cluster.AttachClient
	if attach {
		attachReq := &host.AttachReq{
			JobID:  job.ID,
			Flags:  host.AttachFlagStdout | host.AttachFlagStderr | host.AttachFlagStdin | host.AttachFlagStream,
			Height: uint16(newJob.Lines),
			Width:  uint16(newJob.Columns),
		}
		attachClient, err = client.Attach(attachReq, true)
		if err != nil {
			respondWithError(w, fmt.Errorf("attach failed: %s", err.Error()))
			return
		}
		defer attachClient.Close()
	}

	if err := client.AddJob(job); err != nil {
		respondWithError(w, fmt.Errorf("schedule failed: %s", err.Error()))
		return
	}

	if attach {
		// TODO(titanous): This Wait could block indefinitely if something goes
		// wrong, a context should be threaded in that cancels if the client
		// goes away.
		if err := attachClient.Wait(); err != nil {
			respondWithError(w, fmt.Errorf("attach wait failed: %s", err.Error()))
			return
		}
		w.Header().Set("Connection", "upgrade")
		w.Header().Set("Upgrade", "flynn-attach/0")
		w.WriteHeader(http.StatusSwitchingProtocols)
		conn, _, err := w.(http.Hijacker).Hijack()
		if err != nil {
			panic(err)
		}
		defer conn.Close()

		done := make(chan struct{}, 2)
		cp := func(to io.Writer, from io.Reader) {
			io.Copy(to, from)
			done <- struct{}{}
		}
		go cp(conn, attachClient.Conn())
		go cp(attachClient.Conn(), conn)

		// Wait for one of the connections to be closed or interrupted. EOF is
		// framed inside the attach protocol, so a read/write error indicates
		// that we're done and should clean up.
		<-done

		return
	} else {
		httphelper.JSON(w, 200, &ct.Job{
			ID:        job.ID,
			UUID:      uuid,
			HostID:    hostID,
			ReleaseID: newJob.ReleaseID,
			Args:      newJob.Args,
		})
	}
}
func (c *Cmd) Start() error {
	if c.started {
		return errors.New("exec: already started")
	}
	c.done = make(chan struct{})
	c.started = true
	if c.host == nil && c.cluster == nil {
		var err error
		c.cluster = cluster.NewClient()
		if err != nil {
			return err
		}
		c.closeCluster = true
	}

	if c.HostID == "" {
		hosts, err := c.cluster.Hosts()
		if err != nil {
			return err
		}
		if len(hosts) == 0 {
			return errors.New("exec: no hosts found")
		}
		host := schedutil.PickHost(hosts)
		c.HostID = host.ID()
		c.host = host
	}

	// Use the pre-defined host.Job configuration if provided;
	// otherwise generate one from the fields on exec.Cmd that mirror stdlib's os.exec.
	if c.Job == nil {
		c.Job = &host.Job{
			ImageArtifact: &c.ImageArtifact,
			Config: host.ContainerConfig{
				Args:  c.Args,
				TTY:   c.TTY,
				Env:   c.Env,
				Stdin: c.Stdin != nil || c.stdinPipe != nil,
			},
			Metadata: c.Meta,
		}
		// if attaching to stdout / stderr, avoid round tripping the
		// streams via on-disk log files.
		if c.Stdout != nil || c.Stderr != nil {
			c.Job.Config.DisableLog = true
		}
	} else {
		c.Job.ImageArtifact = &c.ImageArtifact
	}
	if c.Job.ID == "" {
		c.Job.ID = cluster.GenerateJobID(c.HostID, "")
	}

	if c.host == nil {
		var err error
		c.host, err = c.cluster.Host(c.HostID)
		if err != nil {
			return err
		}
	}

	if c.Stdout != nil || c.Stderr != nil || c.Stdin != nil || c.stdinPipe != nil {
		req := &host.AttachReq{
			JobID:  c.Job.ID,
			Height: c.TermHeight,
			Width:  c.TermWidth,
			Flags:  host.AttachFlagStream,
		}
		if c.Stdout != nil {
			req.Flags |= host.AttachFlagStdout
		}
		if c.Stderr != nil {
			req.Flags |= host.AttachFlagStderr
		}
		if c.Job.Config.Stdin {
			req.Flags |= host.AttachFlagStdin
		}
		var err error
		c.attachClient, err = c.host.Attach(req, true)
		if err != nil {
			c.close()
			return err
		}
	}

	if c.stdinPipe != nil {
		c.stdinPipe.set(writeCloseCloser{c.attachClient})
	} else if c.Stdin != nil {
		go func() {
			io.Copy(c.attachClient, c.Stdin)
			c.attachClient.CloseWrite()
		}()
	}

	if c.attachClient == nil {
		c.eventChan = make(chan *host.Event)
		var err error
		c.eventStream, err = c.host.StreamEvents(c.Job.ID, c.eventChan)
		if err != nil {
			return err
		}
	}

	go func() {
		defer close(c.done)
		if c.attachClient != nil {
			c.exitStatus, c.streamErr = c.attachClient.Receive(c.Stdout, c.Stderr)
		} else {
		outer:
			for e := range c.eventChan {
				switch e.Event {
				case "stop":
					c.exitStatus = *e.Job.ExitStatus
					break outer
				case "error":
					c.streamErr = errors.New(*e.Job.Error)
					break outer
				}
			}
			c.eventStream.Close()
			if c.streamErr == nil {
				c.streamErr = c.eventStream.Err()
			}
		}
	}()

	return c.host.AddJob(c.Job)
}
/*
Restore prior state from the save location defined at construction time. If
the state save file is empty, nothing is loaded, and no error is returned.
*/
func (s *State) Restore(backend Backend, buffers host.LogBuffers) (func(), error) {
	if err := s.Acquire(); err != nil {
		return nil, err
	}
	defer s.Release()

	s.backend = backend

	var resurrect []*host.Job
	if err := s.stateDB.View(func(tx *bolt.Tx) error {
		jobsBucket := tx.Bucket([]byte("jobs"))
		backendJobsBucket := tx.Bucket([]byte("backend-jobs"))
		backendGlobalBucket := tx.Bucket([]byte("backend-global"))
		persistentBucket := tx.Bucket([]byte("persistent-jobs"))

		// restore jobs
		if err := jobsBucket.ForEach(func(k, v []byte) error {
			job := &host.ActiveJob{}
			if err := json.Unmarshal(v, job); err != nil {
				return err
			}
			if job.ContainerID != "" {
				s.containers[job.ContainerID] = job
			}
			s.jobs[string(k)] = job
			return nil
		}); err != nil {
			return err
		}

		// hand opaque blobs back to backend so it can do its restore
		backendJobsBlobs := make(map[string][]byte)
		if err := backendJobsBucket.ForEach(func(k, v []byte) error {
			backendJobsBlobs[string(k)] = v
			return nil
		}); err != nil {
			return err
		}
		backendGlobalBlob := backendGlobalBucket.Get([]byte("backend"))
		if err := backend.UnmarshalState(s.jobs, backendJobsBlobs, backendGlobalBlob, buffers); err != nil {
			return err
		}

		s.mtx.Lock()
		if err := persistentBucket.ForEach(func(k, v []byte) error {
			for _, job := range s.jobs {
				if job.Job.ID == string(v) {
					resurrect = append(resurrect, job.Job)
				}
			}
			return nil
		}); err != nil {
			return err
		}
		s.mtx.Unlock()

		return nil
	}); err != nil && err != io.EOF {
		return nil, fmt.Errorf("could not restore from host persistence db: %s", err)
	}

	return func() {
		var wg sync.WaitGroup
		wg.Add(len(resurrect))
		for _, job := range resurrect {
			go func(job *host.Job) {
				// generate a new job id, this is a new job
				newJob := job.Dup()
				newJob.ID = cluster.GenerateJobID(s.id, "")
				log.Printf("resurrecting %s as %s", job.ID, newJob.ID)
				s.AddJob(newJob)
				backend.Run(newJob, nil)
				wg.Done()
			}(job)
		}
		wg.Wait()
	}, nil
}