Example #1
// TestMigrateJobStates checks that migrating to ID 9 does not break existing
// job records
func (MigrateSuite) TestMigrateJobStates(c *C) {
	db := setupTestDB(c, "controllertest_migrate_job_states")
	m := &testMigrator{c: c, db: db}

	// start from ID 7
	m.migrateTo(7)

	// insert a job
	hostID := "host1"
	uuid := random.UUID()
	jobID := cluster.GenerateJobID(hostID, uuid)
	appID := random.UUID()
	releaseID := random.UUID()
	c.Assert(db.Exec(`INSERT INTO apps (app_id, name) VALUES ($1, $2)`, appID, "migrate-app"), IsNil)
	c.Assert(db.Exec(`INSERT INTO releases (release_id) VALUES ($1)`, releaseID), IsNil)
	c.Assert(db.Exec(`INSERT INTO job_cache (job_id, app_id, release_id, state) VALUES ($1, $2, $3, $4)`, jobID, appID, releaseID, "up"), IsNil)

	// migrate to 8 and check job states are still constrained
	m.migrateTo(8)
	err := db.Exec(`UPDATE job_cache SET state = 'foo' WHERE job_id = $1`, jobID)
	c.Assert(err, NotNil)
	if !postgres.IsPostgresCode(err, postgres.ForeignKeyViolation) {
		c.Fatalf("expected postgres foreign key violation, got %s", err)
	}

	// migrate to 9, check the job ID is split correctly and that the pending state is valid
	m.migrateTo(9)
	var clusterID, dbUUID, dbHostID string
	c.Assert(db.QueryRow("SELECT cluster_id, job_id, host_id FROM job_cache WHERE cluster_id = $1", jobID).Scan(&clusterID, &dbUUID, &dbHostID), IsNil)
	c.Assert(clusterID, Equals, jobID)
	c.Assert(dbUUID, Equals, uuid)
	c.Assert(dbHostID, Equals, hostID)
	c.Assert(db.Exec(`UPDATE job_cache SET state = 'pending' WHERE job_id = $1`, uuid), IsNil)
}
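The migration above relies on being able to split the composite job ID back into a host ID and a UUID. Below is a minimal sketch of the assumed convention (host ID and UUID joined by a dash, with a random UUID generated when the caller passes an empty string, as several examples below do); this is illustrative, not the canonical pkg/cluster source:

package cluster

import "github.com/flynn/flynn/pkg/random"

// GenerateJobID joins a host ID and a job UUID into the composite ID
// "<hostID>-<uuid>". An empty uuid is replaced with a random one, which
// is why the fixers below can call GenerateJobID(hostID, "") for new jobs.
func GenerateJobID(hostID, uuid string) string {
	if uuid == "" {
		uuid = random.UUID()
	}
	return hostID + "-" + uuid
}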
Example #2
func (f *ClusterFixer) StartAppJob(app, typ, service string) ([]*discoverd.Instance, error) {
	f.l.Info(fmt.Sprintf("no %s %s process running, getting release details from hosts", app, typ))
	releases := f.FindAppReleaseJobs(app, typ)
	if len(releases) == 0 {
		return nil, fmt.Errorf("didn't find any %s %s release jobs", app, typ)
	}

	// get a job template from the first release
	var job *host.Job
	for _, job = range releases[0] {
		break
	}
	h := f.hosts[0]
	job.ID = cluster.GenerateJobID(h.ID(), "")
	// provision new temporary volumes
	for i, v := range job.Config.Volumes {
		if v.DeleteOnStop {
			f.l.Info(fmt.Sprintf("provisioning volume for %s %s job", app, typ), "job.id", job.ID, "release", job.Metadata["flynn-controller.release"])
			vol, err := h.CreateVolume("default")
			if err != nil {
				return nil, fmt.Errorf("error provisioning volume for %s %s job: %s", app, typ, err)
			}
			job.Config.Volumes[i].VolumeID = vol.ID
		}
	}
	f.FixJobEnv(job)
	// run it on the host
	f.l.Info(fmt.Sprintf("starting %s %s job", app, typ), "job.id", job.ID, "release", job.Metadata["flynn-controller.release"])
	if err := h.AddJob(job); err != nil {
		return nil, fmt.Errorf("error starting %s %s job: %s", app, typ, err)
	}
	f.l.Info("waiting for job to start")
	return discoverd.GetInstances(service, time.Minute)
}
Example #3
func (s *S) TestJobGet(c *C) {
	app := s.createTestApp(c, &ct.App{Name: "job-get"})
	release := s.createTestRelease(c, &ct.Release{})
	s.createTestFormation(c, &ct.Formation{ReleaseID: release.ID, AppID: app.ID})
	uuid := random.UUID()
	hostID := "host0"
	jobID := cluster.GenerateJobID(hostID, uuid)
	s.createTestJob(c, &ct.Job{
		ID:        jobID,
		UUID:      uuid,
		HostID:    hostID,
		AppID:     app.ID,
		ReleaseID: release.ID,
		Type:      "web",
		State:     ct.JobStateStarting,
		Meta:      map[string]string{"some": "info"},
	})

	// test getting the job with both the job ID and the UUID
	for _, id := range []string{jobID, uuid} {
		job, err := s.c.GetJob(app.ID, id)
		c.Assert(err, IsNil)
		c.Assert(job.ID, Equals, jobID)
		c.Assert(job.UUID, Equals, uuid)
		c.Assert(job.HostID, Equals, hostID)
		c.Assert(job.AppID, Equals, app.ID)
		c.Assert(job.ReleaseID, Equals, release.ID)
		c.Assert(job.Meta, DeepEquals, map[string]string{"some": "info"})
	}
}
Example #4
func JobConfig(f *ct.ExpandedFormation, name, hostID string, uuid string) *host.Job {
	t := f.Release.Processes[name]
	env := make(map[string]string, len(f.Release.Env)+len(t.Env)+4)
	for k, v := range f.Release.Env {
		env[k] = v
	}
	for k, v := range t.Env {
		env[k] = v
	}
	id := cluster.GenerateJobID(hostID, uuid)
	env["FLYNN_APP_ID"] = f.App.ID
	env["FLYNN_APP_NAME"] = f.App.Name
	env["FLYNN_RELEASE_ID"] = f.Release.ID
	env["FLYNN_PROCESS_TYPE"] = name
	env["FLYNN_JOB_ID"] = id
	metadata := make(map[string]string, len(f.App.Meta)+4)
	for k, v := range f.App.Meta {
		metadata[k] = v
	}
	metadata["flynn-controller.app"] = f.App.ID
	metadata["flynn-controller.app_name"] = f.App.Name
	metadata["flynn-controller.release"] = f.Release.ID
	metadata["flynn-controller.formation"] = "true"
	metadata["flynn-controller.type"] = name
	job := &host.Job{
		ID:       id,
		Metadata: metadata,
		Config: host.ContainerConfig{
			Cmd:         t.Cmd,
			Env:         env,
			HostNetwork: t.HostNetwork,
		},
		Resurrect: t.Resurrect,
		Resources: t.Resources,
	}
	if f.App.Meta["flynn-system-app"] == "true" {
		job.Partition = "system"
	}
	if len(t.Entrypoint) > 0 {
		job.Config.Entrypoint = t.Entrypoint
	}
	if f.ImageArtifact != nil {
		job.ImageArtifact = f.ImageArtifact.HostArtifact()
	}
	if len(f.FileArtifacts) > 0 {
		job.FileArtifacts = make([]*host.Artifact, len(f.FileArtifacts))
		for i, artifact := range f.FileArtifacts {
			job.FileArtifacts[i] = artifact.HostArtifact()
		}
	}
	job.Config.Ports = make([]host.Port, len(t.Ports))
	for i, p := range t.Ports {
		job.Config.Ports[i].Proto = p.Proto
		job.Config.Ports[i].Port = p.Port
		job.Config.Ports[i].Service = p.Service
	}
	return job
}
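A hedged usage sketch for JobConfig: a scheduler that has already picked a host generates the UUID up front so it can track the job before placement, then submits the built job. The startProcess wrapper is hypothetical; h.ID(), h.AddJob and random.UUID() follow their usage elsewhere in this section:

// startProcess is a hypothetical wrapper showing a typical JobConfig
// call site: generate the UUID first, build the job for the chosen
// host, then place it there.
func startProcess(f *ct.ExpandedFormation, typ string, h *cluster.Host) (string, error) {
	uuid := random.UUID()
	job := JobConfig(f, typ, h.ID(), uuid)
	if err := h.AddJob(job); err != nil {
		return "", err
	}
	return job.ID, nil
}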
Example #5
func (f *ClusterFixer) FixFlannel() error {
	f.l.Info("checking flannel")

	flannelJobs := make(map[string]*host.Job, len(f.hosts))
	for _, h := range f.hosts {
		jobs, err := h.ListJobs()
		if err != nil {
			return fmt.Errorf("error getting jobs list from %s: %s", h.ID(), err)
		}
		for _, j := range jobs {
			if j.Status != host.StatusRunning ||
				j.Job.Metadata["flynn-controller.app_name"] != "flannel" ||
				j.Job.Metadata["flynn-controller.type"] != "app" {
				continue
			}
			flannelJobs[h.ID()] = j.Job
			break
		}
	}
	if len(flannelJobs) == len(f.hosts) {
		f.l.Info("flannel looks good")
		return nil
	}

	var job *host.Job
	if len(flannelJobs) == 0 {
		f.l.Info("flannel not running, starting it on each host")
		releases := f.FindAppReleaseJobs("flannel", "app")
		if len(releases) == 0 {
			return fmt.Errorf("didn't find flannel release jobs")
		}
		for _, j := range releases[0] {
			job = j
			break
		}
	} else {
		f.l.Info("flannel is not running on each host, starting missing jobs")
		for _, job = range flannelJobs {
			break
		}
	}

	for _, h := range f.hosts {
		if _, ok := flannelJobs[h.ID()]; ok {
			continue
		}
		job.ID = cluster.GenerateJobID(h.ID(), "")
		f.FixJobEnv(job)
		if err := h.AddJob(job); err != nil {
			return fmt.Errorf("error starting flannel job: %s", err)
		}
		f.l.Info("started flannel job", "job.id", job.ID)
	}

	f.l.Info("flannel fix complete")

	return nil
}
Example #6
func (s *S) TestKillJob(c *C) {
	app := s.createTestApp(c, &ct.App{Name: "killjob"})
	hostID := fakeHostID()
	jobID := cluster.GenerateJobID(hostID)
	hc := tu.NewFakeHostClient(hostID)
	s.cc.AddHost(hc)

	c.Assert(s.c.DeleteJob(app.ID, jobID), IsNil)
	c.Assert(hc.IsStopped(jobID), Equals, true)
}
Example #7
func (c *FakeHostClient) CrashJob(uuid string) error {
	c.jobsMtx.Lock()
	defer c.jobsMtx.Unlock()
	id := cluster.GenerateJobID(c.hostID, uuid)
	c.stopped[id] = true
	job, ok := c.Jobs[id]
	if ok {
		job.Status = host.StatusCrashed
		c.Jobs[id] = job
		return c.stop(id)
	} else {
		return ct.NotFoundError{Resource: id}
	}
}
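Because the fake host prepends its own host ID, a test can crash a job knowing only the UUID half (TestMultipleHosts below does this with h3.CrashJob("job4")). Here is a small sketch of that flow, assuming the two-argument NewFakeHostClient form used in Example #11; the IDs are made up:

// testCrashByUUID is a hypothetical test showing that CrashJob takes
// just the UUID: the fake host rebuilds the composite ID before looking
// the job up and marking it crashed.
func (s *S) testCrashByUUID(c *C) {
	hc := tu.NewFakeHostClient("host1", false)
	hc.AddJob(&host.Job{ID: cluster.GenerateJobID("host1", "some-uuid")})
	c.Assert(hc.CrashJob("some-uuid"), IsNil)
}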
Example #8
func JobConfig(f *ct.ExpandedFormation, name, hostID string) *host.Job {
	t := f.Release.Processes[name]
	env := make(map[string]string, len(f.Release.Env)+len(t.Env)+4)
	for k, v := range f.Release.Env {
		env[k] = v
	}
	for k, v := range t.Env {
		env[k] = v
	}
	id := cluster.GenerateJobID(hostID)
	env["FLYNN_APP_ID"] = f.App.ID
	env["FLYNN_APP_NAME"] = f.App.Name
	env["FLYNN_RELEASE_ID"] = f.Release.ID
	env["FLYNN_PROCESS_TYPE"] = name
	env["FLYNN_JOB_ID"] = id
	job := &host.Job{
		ID: id,
		Metadata: map[string]string{
			"flynn-controller.app":      f.App.ID,
			"flynn-controller.app_name": f.App.Name,
			"flynn-controller.release":  f.Release.ID,
			"flynn-controller.type":     name,
		},
		Artifact: host.Artifact{
			Type: f.Artifact.Type,
			URI:  f.Artifact.URI,
		},
		Config: host.ContainerConfig{
			Cmd:         t.Cmd,
			Env:         env,
			HostNetwork: t.HostNetwork,
		},
		Resurrect: t.Resurrect,
		Resources: t.Resources,
	}
	if len(t.Entrypoint) > 0 {
		job.Config.Entrypoint = t.Entrypoint
	}
	job.Config.Ports = make([]host.Port, len(t.Ports))
	for i, p := range t.Ports {
		job.Config.Ports[i].Proto = p.Proto
		job.Config.Ports[i].Port = p.Port
		job.Config.Ports[i].Service = p.Service
	}
	return job
}
Example #9
func (f *ClusterFixer) StartAppJob(app, typ, service string) ([]*discoverd.Instance, error) {
	f.l.Info(fmt.Sprintf("no %s %s process running, getting release details from hosts", app, typ))
	releases := f.FindAppReleaseJobs(app, typ)
	if len(releases) == 0 {
		return nil, fmt.Errorf("didn't find any %s %s release jobs", app, typ)
	}

	// get a job template from the first release
	var job *host.Job
	for _, job = range releases[0] {
		break
	}
	job.ID = cluster.GenerateJobID(f.hosts[0].ID(), "")
	f.FixJobEnv(job)
	// run it on a host
	f.l.Info(fmt.Sprintf("starting %s %s job", app, typ), "job.id", job.ID, "release", job.Metadata["flynn-controller.release"])
	if err := f.hosts[0].AddJob(job); err != nil {
		return nil, fmt.Errorf("error starting %s %s job: %s", app, typ, err)
	}
	f.l.Info("waiting for job to start")
	return discoverd.GetInstances(service, time.Minute)
}
Example #10
func (f *ClusterFixer) FixDiscoverd() error {
	f.l.Info("ensuring discoverd is running on all hosts")
	releases := f.FindAppReleaseJobs("discoverd", "app")
	if len(releases) == 0 {
		return fmt.Errorf("didn't find any discoverd release jobs")
	}
outer:
	for hostID, job := range releases[0] {
		for _, h := range f.hosts {
			if h.ID() != hostID {
				continue
			}

			// check if discoverd is already running on this host
			jobs, err := h.ListJobs()
			if err != nil {
				return fmt.Errorf("error listing jobs on %s: %s", h.ID(), err)
			}
			for _, j := range jobs {
				if j.Status == host.StatusRunning &&
					j.Job.Metadata["flynn-controller.app_name"] == "discoverd" &&
					j.Job.Metadata["flynn-controller.type"] == "app" {
					continue outer
				}
			}

			job.ID = cluster.GenerateJobID(h.ID(), "")
			f.FixJobEnv(job)
			if err := h.AddJob(job); err != nil {
				return fmt.Errorf("error starting discoverd on %s: %s", h.ID(), err)
			}
			f.l.Info("started discoverd instance", "job.id", job.ID)
			break
		}
	}
	return nil
}
Example #11
func (s *S) TestKillJob(c *C) {
	app := s.createTestApp(c, &ct.App{Name: "killjob"})
	release := s.createTestRelease(c, &ct.Release{})
	hostID := fakeHostID()
	uuid := random.UUID()
	jobID := cluster.GenerateJobID(hostID, uuid)
	s.createTestJob(c, &ct.Job{
		ID:        jobID,
		UUID:      uuid,
		HostID:    hostID,
		AppID:     app.ID,
		ReleaseID: release.ID,
		Type:      "web",
		State:     ct.JobStateStarting,
		Meta:      map[string]string{"some": "info"},
	})
	hc := tu.NewFakeHostClient(hostID, false)
	hc.AddJob(&host.Job{ID: jobID})
	s.cc.AddHost(hc)

	err := s.c.DeleteJob(app.ID, jobID)
	c.Assert(err, IsNil)
	c.Assert(hc.IsStopped(jobID), Equals, true)
}
Example #12
func (c *controllerAPI) RunJob(ctx context.Context, w http.ResponseWriter, req *http.Request) {
	var newJob ct.NewJob
	if err := httphelper.DecodeJSON(req, &newJob); err != nil {
		respondWithError(w, err)
		return
	}

	if err := schema.Validate(newJob); err != nil {
		respondWithError(w, err)
		return
	}

	data, err := c.releaseRepo.Get(newJob.ReleaseID)
	if err != nil {
		respondWithError(w, err)
		return
	}
	release := data.(*ct.Release)
	var artifactIDs []string
	if len(newJob.ArtifactIDs) > 0 {
		artifactIDs = newJob.ArtifactIDs
	} else if len(release.ArtifactIDs) > 0 {
		artifactIDs = release.ArtifactIDs
	} else {
		httphelper.ValidationError(w, "release.ArtifactIDs", "cannot be empty")
		return
	}

	artifacts := make([]*ct.Artifact, len(artifactIDs))
	artifactList, err := c.artifactRepo.ListIDs(artifactIDs...)
	if err != nil {
		respondWithError(w, err)
		return
	}
	for i, id := range artifactIDs {
		artifacts[i] = artifactList[id]
	}

	var entrypoint ct.ImageEntrypoint
	if e := utils.GetEntrypoint(artifacts, ""); e != nil {
		entrypoint = *e
	}

	attach := strings.Contains(req.Header.Get("Upgrade"), "flynn-attach/0")

	hosts, err := c.clusterClient.Hosts()
	if err != nil {
		respondWithError(w, err)
		return
	}
	if len(hosts) == 0 {
		respondWithError(w, errors.New("no hosts found"))
		return
	}
	client := hosts[random.Math.Intn(len(hosts))]

	uuid := random.UUID()
	hostID := client.ID()
	id := cluster.GenerateJobID(hostID, uuid)
	app := c.getApp(ctx)
	env := make(map[string]string, len(entrypoint.Env)+len(release.Env)+len(newJob.Env)+4)
	env["FLYNN_APP_ID"] = app.ID
	env["FLYNN_RELEASE_ID"] = release.ID
	env["FLYNN_PROCESS_TYPE"] = ""
	env["FLYNN_JOB_ID"] = id
	for k, v := range entrypoint.Env {
		env[k] = v
	}
	if newJob.ReleaseEnv {
		for k, v := range release.Env {
			env[k] = v
		}
	}
	for k, v := range newJob.Env {
		env[k] = v
	}
	metadata := make(map[string]string, len(newJob.Meta)+3)
	for k, v := range newJob.Meta {
		metadata[k] = v
	}
	metadata["flynn-controller.app"] = app.ID
	metadata["flynn-controller.app_name"] = app.Name
	metadata["flynn-controller.release"] = release.ID
	job := &host.Job{
		ID:       id,
		Metadata: metadata,
		Config: host.ContainerConfig{
			Args:       entrypoint.Args,
			Env:        env,
			WorkingDir: entrypoint.WorkingDir,
			Uid:        entrypoint.Uid,
			Gid:        entrypoint.Gid,
			TTY:        newJob.TTY,
			Stdin:      attach,
			DisableLog: newJob.DisableLog,
		},
		Resources: newJob.Resources,
		Partition: string(newJob.Partition),
	}
	resource.SetDefaults(&job.Resources)
	if len(newJob.Args) > 0 {
		job.Config.Args = newJob.Args
	}
	utils.SetupMountspecs(job, artifacts)

	// provision data volume if required
	if newJob.Data {
		vol := &ct.VolumeReq{Path: "/data", DeleteOnStop: true}
		if _, err := utils.ProvisionVolume(vol, client, job); err != nil {
			respondWithError(w, err)
			return
		}
	}

	var attachClient cluster.AttachClient
	if attach {
		attachReq := &host.AttachReq{
			JobID:  job.ID,
			Flags:  host.AttachFlagStdout | host.AttachFlagStderr | host.AttachFlagStdin | host.AttachFlagStream,
			Height: uint16(newJob.Lines),
			Width:  uint16(newJob.Columns),
		}
		attachClient, err = client.Attach(attachReq, true)
		if err != nil {
			respondWithError(w, fmt.Errorf("attach failed: %s", err.Error()))
			return
		}
		defer attachClient.Close()
	}

	if err := client.AddJob(job); err != nil {
		respondWithError(w, fmt.Errorf("schedule failed: %s", err.Error()))
		return
	}

	if attach {
		// TODO(titanous): This Wait could block indefinitely if something goes
		// wrong, a context should be threaded in that cancels if the client
		// goes away.
		if err := attachClient.Wait(); err != nil {
			respondWithError(w, fmt.Errorf("attach wait failed: %s", err.Error()))
			return
		}
		w.Header().Set("Connection", "upgrade")
		w.Header().Set("Upgrade", "flynn-attach/0")
		w.WriteHeader(http.StatusSwitchingProtocols)
		conn, _, err := w.(http.Hijacker).Hijack()
		if err != nil {
			panic(err)
		}
		defer conn.Close()

		done := make(chan struct{}, 2)
		cp := func(to io.Writer, from io.Reader) {
			io.Copy(to, from)
			done <- struct{}{}
		}
		go cp(conn, attachClient.Conn())
		go cp(attachClient.Conn(), conn)

		// Wait for one of the connections to be closed or interrupted. EOF is
		// framed inside the attach protocol, so a read/write error indicates
		// that we're done and should clean up.
		<-done

		return
	} else {
		httphelper.JSON(w, 200, &ct.Job{
			ID:        job.ID,
			UUID:      uuid,
			HostID:    hostID,
			ReleaseID: newJob.ReleaseID,
			Args:      newJob.Args,
		})
	}
}
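The attach bridging at the end of RunJob is a general Go idiom: copy bytes in both directions and unblock on whichever direction finishes first. The same pattern as a self-contained helper over plain net.Conn, with nothing Flynn-specific assumed:

package attach

import (
	"io"
	"net"
)

// bridge copies in both directions between a and b and returns once
// either direction finishes. The buffered channel lets the slower copy
// goroutine exit without blocking after bridge has returned; copy
// errors are deliberately ignored, since EOF handling lives in the
// framing protocol above this layer.
func bridge(a, b net.Conn) {
	done := make(chan struct{}, 2)
	cp := func(to io.Writer, from io.Reader) {
		io.Copy(to, from)
		done <- struct{}{}
	}
	go cp(a, b)
	go cp(b, a)
	<-done
}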
Example #13
func (c *controllerAPI) RunJob(ctx context.Context, w http.ResponseWriter, req *http.Request) {
	var newJob ct.NewJob
	if err := httphelper.DecodeJSON(req, &newJob); err != nil {
		respondWithError(w, err)
		return
	}

	if err := schema.Validate(newJob); err != nil {
		respondWithError(w, err)
		return
	}

	data, err := c.releaseRepo.Get(newJob.ReleaseID)
	if err != nil {
		respondWithError(w, err)
		return
	}
	release := data.(*ct.Release)
	data, err = c.artifactRepo.Get(release.ArtifactID)
	if err != nil {
		respondWithError(w, err)
		return
	}
	artifact := data.(*ct.Artifact)
	attach := strings.Contains(req.Header.Get("Upgrade"), "flynn-attach/0")

	hosts, err := c.clusterClient.Hosts()
	if err != nil {
		respondWithError(w, err)
		return
	}
	if len(hosts) == 0 {
		respondWithError(w, errors.New("no hosts found"))
		return
	}
	client := hosts[random.Math.Intn(len(hosts))]

	id := cluster.GenerateJobID(client.ID(), "")
	app := c.getApp(ctx)
	env := make(map[string]string, len(release.Env)+len(newJob.Env)+4)
	env["FLYNN_APP_ID"] = app.ID
	env["FLYNN_RELEASE_ID"] = release.ID
	env["FLYNN_PROCESS_TYPE"] = ""
	env["FLYNN_JOB_ID"] = id
	if newJob.ReleaseEnv {
		for k, v := range release.Env {
			env[k] = v
		}
	}
	for k, v := range newJob.Env {
		env[k] = v
	}
	metadata := make(map[string]string, len(newJob.Meta)+3)
	for k, v := range newJob.Meta {
		metadata[k] = v
	}
	metadata["flynn-controller.app"] = app.ID
	metadata["flynn-controller.app_name"] = app.Name
	metadata["flynn-controller.release"] = release.ID
	job := &host.Job{
		ID:       id,
		Metadata: metadata,
		Artifact: host.Artifact{
			Type: artifact.Type,
			URI:  artifact.URI,
		},
		Config: host.ContainerConfig{
			Cmd:        newJob.Cmd,
			Env:        env,
			TTY:        newJob.TTY,
			Stdin:      attach,
			DisableLog: newJob.DisableLog,
		},
		Resources: newJob.Resources,
	}
	resource.SetDefaults(&job.Resources)
	if len(newJob.Entrypoint) > 0 {
		job.Config.Entrypoint = newJob.Entrypoint
	}

	var attachClient cluster.AttachClient
	if attach {
		attachReq := &host.AttachReq{
			JobID:  job.ID,
			Flags:  host.AttachFlagStdout | host.AttachFlagStderr | host.AttachFlagStdin | host.AttachFlagStream,
			Height: uint16(newJob.Lines),
			Width:  uint16(newJob.Columns),
		}
		attachClient, err = client.Attach(attachReq, true)
		if err != nil {
			respondWithError(w, fmt.Errorf("attach failed: %s", err.Error()))
			return
		}
		defer attachClient.Close()
	}

	if err := client.AddJob(job); err != nil {
		respondWithError(w, fmt.Errorf("schedule failed: %s", err.Error()))
		return
	}

	if attach {
		if err := attachClient.Wait(); err != nil {
			respondWithError(w, fmt.Errorf("attach wait failed: %s", err.Error()))
			return
		}
		w.Header().Set("Connection", "upgrade")
		w.Header().Set("Upgrade", "flynn-attach/0")
		w.WriteHeader(http.StatusSwitchingProtocols)
		conn, _, err := w.(http.Hijacker).Hijack()
		if err != nil {
			panic(err)
		}
		defer conn.Close()

		done := make(chan struct{}, 2)
		cp := func(to io.Writer, from io.Reader) {
			io.Copy(to, from)
			done <- struct{}{}
		}
		go cp(conn, attachClient.Conn())
		go cp(attachClient.Conn(), conn)
		<-done
		<-done

		return
	} else {
		httphelper.JSON(w, 200, &ct.Job{
			ID:        job.ID,
			ReleaseID: newJob.ReleaseID,
			Cmd:       newJob.Cmd,
		})
	}
}
Example #14
func (f *ClusterFixer) FixPostgres() error {
	f.l.Info("checking postgres")
	service := discoverd.NewService("postgres")
	leader, _ := service.Leader()
	if leader == nil || leader.Addr == "" {
		f.l.Info("no running postgres leader")
		leader = nil
	} else {
		f.l.Info("found running postgres leader")
	}
	instances, _ := service.Instances()
	f.l.Info(fmt.Sprintf("found %d running postgres instances", len(instances)))

	f.l.Info("getting postgres status")
	var status *pgmanager.Status
	if leader != nil && leader.Addr != "" {
		client := pgmanager.NewClient(leader.Addr)
		var err error
		status, err = client.Status()
		if err != nil {
			f.l.Error("error getting status from postgres leader", "error", err)
		}
	}
	if status != nil && status.Postgres.ReadWrite {
		f.l.Info("postgres claims to be read-write")
		return nil
	}

	f.l.Info("getting postgres service metadata")
	meta, err := discoverd.NewService("postgres").GetMeta()
	if err != nil {
		return fmt.Errorf("error getting postgres state from discoverd: %s", err)
	}

	var state pgstate.State
	if err := json.Unmarshal(meta.Data, &state); err != nil {
		return fmt.Errorf("error decoding postgres state: %s", err)
	}
	if state.Primary == nil {
		return fmt.Errorf("no primary in postgres state")
	}

	f.l.Info("getting postgres primary job info", "job.id", state.Primary.Meta["FLYNN_JOB_ID"])
	job, host, err := f.GetJob(state.Primary.Meta["FLYNN_JOB_ID"])
	if err != nil {
		if state.Sync != nil {
			f.l.Error("unable to get primary job info", "error", err)
			f.l.Info("getting postgres sync job info", "job.id", state.Sync.Meta["FLYNN_JOB_ID"])
			job, host, err = f.GetJob(state.Sync.Meta["FLYNN_JOB_ID"])
			if err != nil {
				return fmt.Errorf("unable to get postgres primary or sync job details: %s", err)
			}
		} else {
			return fmt.Errorf("unable to get postgres primary job details: %s", err)
		}
	}

	if leader != nil && state.Singleton {
		return fmt.Errorf("postgres leader is running in singleton mode, unable to fix")
	}

	waitForInstance := func(jobID string) (func() (string, error), error) {
		watchCh := make(chan *discoverd.Event)
		upCh := make(chan string)
		stream, err := service.Watch(watchCh)
		if err != nil {
			return nil, fmt.Errorf("error watching discoverd service: %s", err)
		}
		go func() {
			var current bool
			for event := range watchCh {
				if event.Kind == discoverd.EventKindCurrent {
					current = true
					continue
				}
				if !current || event.Kind != discoverd.EventKindUp {
					continue
				}
				if event.Instance.Meta["FLYNN_JOB_ID"] == jobID {
					upCh <- event.Instance.Addr
				}
			}
		}()
		return func() (string, error) {
			f.l.Info("waiting for postgres instance to start", "job.id", jobID)
			defer stream.Close()
			select {
			case addr := <-upCh:
				return addr, nil
			case <-time.After(time.Minute):
				return "", fmt.Errorf("timed out waiting for postgres instance to come up")
			}
		}, nil
	}

	var wait func() (string, error)
	have := len(instances)
	want := 2
	if state.Singleton {
		want = 1
	}
	if have >= want {
		return fmt.Errorf("already have enough postgres instances, unable to fix")
	}
	f.l.Info("attempting to start missing postgres jobs", "want", want, "have", have)
	if leader == nil {
		// no postgres leader is running, attempt to start the primary
		job.ID = cluster.GenerateJobID(host.ID(), "")
		f.FixJobEnv(job)
		f.l.Info("starting postgres primary job", "job.id", job.ID)
		wait, err = waitForInstance(job.ID)
		if err != nil {
			return err
		}
		if err := host.AddJob(job); err != nil {
			return fmt.Errorf("error starting postgres primary job on %s: %s", host.ID(), err)
		}
		have++
	}
	if want > have {
		// if not enough postgres instances, start another
		var secondHost *cluster.Host
		for _, h := range f.hosts {
			if h.ID() != host.ID() {
				secondHost = h
				break
			}
		}
		if secondHost == nil {
			// if there are no other hosts, use the same one we put the primary on
			secondHost = host
		}
		job.ID = cluster.GenerateJobID(secondHost.ID(), "")
		f.FixJobEnv(job)
		f.l.Info("starting second postgres job", "job.id", job.ID)
		if wait == nil {
			wait, err = waitForInstance(job.ID)
			if err != nil {
				return err
			}
		}
		if err := utils.ProvisionVolume(secondHost, job); err != nil {
			return fmt.Errorf("error creating postgres volume on %s: %s", secondHost.ID(), err)
		}
		if err := secondHost.AddJob(job); err != nil {
			return fmt.Errorf("error starting additional postgres job on %s: %s", secondHost.ID(), err)
		}
	}

	if wait != nil {
		addr, err := wait()
		if err != nil {
			return err
		}
		if leader != nil {
			addr = leader.Addr
		}
		f.l.Info("waiting for postgres to come up read-write")
		return pgmanager.NewClient(addr).WaitForReadWrite(5 * time.Minute)
	}
	return nil
}
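waitForInstance is deliberately two-phase: it registers the discoverd watch first and returns a blocking wait function, so the up event cannot fire before anyone is listening. The same shape distilled into a generic sketch (generics are used here for brevity; the Flynn code predates them, and all names below are illustrative):

package waitutil

import (
	"fmt"
	"time"
)

// waitFor subscribes to ch immediately and returns a function that
// blocks until match(v) succeeds or the timeout elapses. Registering
// the subscription before returning avoids the race where the event
// arrives between starting the job and beginning to watch.
func waitFor[T any](ch <-chan T, match func(T) bool, timeout time.Duration) func() (T, error) {
	found := make(chan T, 1)
	go func() {
		for v := range ch {
			if match(v) {
				found <- v
				return
			}
		}
	}()
	return func() (T, error) {
		var zero T
		select {
		case v := <-found:
			return v, nil
		case <-time.After(timeout):
			return zero, fmt.Errorf("timed out after %s", timeout)
		}
	}
}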
Example #15
/*
	Restore prior state from the save location defined at construction time.
	If the state save file is empty, nothing is loaded, and no error is returned.
*/
func (s *State) Restore(backend Backend, buffers host.LogBuffers) (func(), error) {
	if err := s.Acquire(); err != nil {
		return nil, err
	}
	defer s.Release()

	s.backend = backend

	var resurrect []*host.Job
	if err := s.stateDB.View(func(tx *bolt.Tx) error {
		jobsBucket := tx.Bucket([]byte("jobs"))
		backendJobsBucket := tx.Bucket([]byte("backend-jobs"))
		backendGlobalBucket := tx.Bucket([]byte("backend-global"))
		persistentBucket := tx.Bucket([]byte("persistent-jobs"))

		// restore jobs
		if err := jobsBucket.ForEach(func(k, v []byte) error {
			job := &host.ActiveJob{}
			if err := json.Unmarshal(v, job); err != nil {
				return err
			}
			if job.CreatedAt.IsZero() {
				job.CreatedAt = time.Now()
			}
			s.jobs[string(k)] = job

			return nil
		}); err != nil {
			return err
		}

		// hand opaque blobs back to backend so it can do its restore
		backendJobsBlobs := make(map[string][]byte)
		if err := backendJobsBucket.ForEach(func(k, v []byte) error {
			backendJobsBlobs[string(k)] = v
			return nil
		}); err != nil {
			return err
		}
		backendGlobalBlob := backendGlobalBucket.Get([]byte("backend"))
		if err := backend.UnmarshalState(s.jobs, backendJobsBlobs, backendGlobalBlob, buffers); err != nil {
			return err
		}

		// resurrect any persistent jobs which are not running
		if err := persistentBucket.ForEach(func(k, v []byte) error {
			for _, job := range s.jobs {
				if job.Job.ID == string(v) && !backend.JobExists(job.Job.ID) {
					resurrect = append(resurrect, job.Job)
				}
			}
			return nil
		}); err != nil {
			return err
		}

		return nil
	}); err != nil && err != io.EOF {
		return nil, fmt.Errorf("could not restore from host persistence db: %s", err)
	}

	return func() {
		if len(resurrect) == 0 {
			return
		}
		var wg sync.WaitGroup
		wg.Add(len(resurrect))
		for _, job := range resurrect {
			go func(job *host.Job) {
				// generate a new job id, this is a new job
				newJob := job.Dup()
				newJob.ID = cluster.GenerateJobID(s.id, "")
				if _, ok := newJob.Config.Env["FLYNN_JOB_ID"]; ok {
					newJob.Config.Env["FLYNN_JOB_ID"] = newJob.ID
				}
				log.Printf("resurrecting %s as %s", job.ID, newJob.ID)
				s.AddJob(newJob)
				backend.Run(newJob, nil, nil)
				wg.Done()
			}(job)
		}
		wg.Wait()
	}, nil
}
Example #16
func (f *ClusterFixer) FixSirenia(svc string) error {
	log := f.l.New("fn", "FixSirenia", "service", svc)

	service := discoverd.NewService(svc)
	instances, _ := service.Instances()
	leader, _ := service.Leader()

	log.Info("getting service metadata")
	meta, err := service.GetMeta()
	if err != nil {
		return fmt.Errorf("error getting sirenia state from discoverd: %s", err)
	}

	var state state.State
	if err := json.Unmarshal(meta.Data, &state); err != nil {
		return fmt.Errorf("error decoding state: %s", err)
	}
	if state.Primary == nil {
		return fmt.Errorf("no primary in sirenia state")
	}

	log.Info("getting primary job info", "job.id", state.Primary.Meta["FLYNN_JOB_ID"])
	primaryJob, primaryHost, err := f.GetJob(state.Primary.Meta["FLYNN_JOB_ID"])
	if err != nil {
		log.Error("unable to get primary job info")
	}
	var syncJob *host.Job
	var syncHost *cluster.Host
	if state.Sync != nil {
		log.Info("getting sync job info", "job.id", state.Sync.Meta["FLYNN_JOB_ID"])
		syncJob, syncHost, err = f.GetJob(state.Sync.Meta["FLYNN_JOB_ID"])
		if err != nil {
			log.Error("unable to get sync job info")
		}
	}

	waitForInstance := func(jobID string) (func() (string, error), error) {
		watchCh := make(chan *discoverd.Event)
		upCh := make(chan string)
		stream, err := service.Watch(watchCh)
		if err != nil {
			return nil, fmt.Errorf("error watching discoverd service: %s", err)
		}
		go func() {
			var current bool
			for event := range watchCh {
				if event.Kind == discoverd.EventKindCurrent {
					current = true
					continue
				}
				if !current || event.Kind != discoverd.EventKindUp {
					continue
				}
				if event.Instance.Meta["FLYNN_JOB_ID"] == jobID {
					upCh <- event.Instance.Addr
				}
			}
		}()
		return func() (string, error) {
			log.Info("waiting for instance to start", "job.id", jobID)
			defer stream.Close()
			select {
			case addr := <-upCh:
				return addr, nil
			case <-time.After(time.Minute):
				return "", fmt.Errorf("timed out waiting for sirenia instance to come up")
			}
		}, nil
	}

	log.Info("terminating unassigned sirenia instances")
outer:
	for _, i := range instances {
		if i.Addr == state.Primary.Addr || (state.Sync != nil && i.Addr == state.Sync.Addr) {
			continue
		}
		for _, a := range state.Async {
			if i.Addr == a.Addr {
				continue outer
			}
		}
		// job not assigned in state, attempt to terminate it
		if jobID, ok := i.Meta["FLYNN_JOB_ID"]; ok {
			hostID, err := cluster.ExtractHostID(jobID)
			if err != nil {
				log.Error("error extracting host id from jobID", "jobID", jobID, "err", err)
			}
			h := f.Host(hostID)
			if h != nil {
				if err := h.StopJob(jobID); err != nil {
					log.Error("error stopping unassigned sirenia job", "jobID", jobID)
				}
			} else {
				log.Error("host not found", "hostID", hostID)
			}
		}
	}

	isRunning := func(addr string) bool {
		for _, i := range instances {
			if i.Addr == addr {
				return true
			}
		}
		return false
	}

	// if the leader isn't currently running then start it using primaryJob/primaryHost
	var wait func() (string, error)
	if !isRunning(state.Primary.Addr) {
		// if we don't have info about the primary job, attempt to promote the sync
		if primaryJob == nil {
			if syncJob != nil {
				// set primary job to sync
				primaryJob = syncJob
				primaryHost = syncHost

				// nil out sync job now so we can re-allocate it.
				syncJob = nil
				syncHost = nil
			} else {
				return fmt.Errorf("neither primary or sync job info available")
			}
		}

		primaryJob.ID = cluster.GenerateJobID(primaryHost.ID(), "")
		f.FixJobEnv(primaryJob)
		log.Info("starting primary job", "job.id", primaryJob.ID)
		wait, err = waitForInstance(primaryJob.ID)
		if err != nil {
			return err
		}
		if err := primaryHost.AddJob(primaryJob); err != nil {
			return fmt.Errorf("error starting primary job on %s: %s", primaryHost.ID(), err)
		}
	}
	if !state.Singleton && !isRunning(state.Sync.Addr) {
		if syncHost == nil {
			for _, h := range f.hosts {
				if h.ID() != primaryHost.ID() {
					syncHost = h
					break
				}
			}
			if syncHost == nil {
				// if there are no other hosts, use the same one we put the primary on
				syncHost = primaryHost
			}
		}
		// if we don't have a sync job then copy the primary job
		// and provision a new volume
		if syncJob == nil {
			syncJob = primaryJob
			vol := &ct.VolumeReq{Path: "/data"}
			if _, err := utils.ProvisionVolume(vol, syncHost, syncJob); err != nil {
				return fmt.Errorf("error creating volume on %s: %s", syncHost.ID(), err)
			}
		}
		syncJob.ID = cluster.GenerateJobID(syncHost.ID(), "")
		f.FixJobEnv(syncJob)
		log.Info("starting sync job", "job.id", syncJob.ID)
		if wait == nil {
			wait, err = waitForInstance(syncJob.ID)
			if err != nil {
				return err
			}
		}
		if err := syncHost.AddJob(syncJob); err != nil {
			return fmt.Errorf("error starting additional job on %s: %s", syncHost.ID(), err)
		}
	}

	if wait != nil {
		addr, err := wait()
		if err != nil {
			return err
		}
		if leader != nil && leader.Addr != "" {
			addr = leader.Addr
		}
		log.Info("waiting for cluster to come up read-write", "addr", addr)
		return sirenia.NewClient(addr).WaitForReadWrite(5 * time.Minute)
	}
	return nil
}
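FixSirenia recovers the host from a bare job ID via cluster.ExtractHostID, the inverse of GenerateJobID. Under the "<hostID>-<uuid>" convention sketched after Example #1, extraction is a single bounded split; again a sketch rather than the canonical source:

package cluster

import (
	"fmt"
	"strings"
)

// ExtractHostID returns the host portion of a composite job ID of the
// assumed form "<hostID>-<uuid>", rejecting IDs missing either part.
func ExtractHostID(id string) (string, error) {
	parts := strings.SplitN(id, "-", 2)
	if len(parts) != 2 || parts[0] == "" || parts[1] == "" {
		return "", fmt.Errorf("invalid job ID %q", id)
	}
	return parts[0], nil
}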
Example #17
func (TestSuite) TestMultipleHosts(c *C) {
	hosts := newTestHosts()
	fakeCluster := newTestCluster(hosts)
	s := newTestScheduler(c, fakeCluster, true)

	// use incremental job IDs so we can find them easily in s.jobs
	var jobID uint64
	s.generateJobUUID = func() string {
		return fmt.Sprintf("job%d", atomic.AddUint64(&jobID, 1))
	}
	s.maxHostChecks = 1

	go s.Run()
	defer s.Stop()

	assertJobs := func(expected map[string]*Job) {
		jobs := s.Jobs()
		c.Assert(jobs, HasLen, len(expected))
		for id, job := range expected {
			actual, ok := jobs[id]
			if !ok {
				c.Fatalf("%s does not exist in s.jobs", id)
			}
			c.Assert(actual.Type, Equals, job.Type)
			c.Assert(actual.state, Equals, job.state)
			c.Assert(actual.HostID, Equals, job.HostID)
		}
	}

	c.Log("Initialize the cluster with 1 host and wait for a job to start on it.")
	s.waitJobStart()
	assertJobs(map[string]*Job{
		"job1": {Type: "web", state: JobStateStarting, HostID: testHostID},
	})

	c.Log("Add a host to the cluster, then create a new app, artifact, release, and associated formation.")
	h2 := NewFakeHostClient("host2")
	fakeCluster.AddHost(h2)
	hosts[h2.ID()] = h2
	app := &ct.App{ID: "test-app-2", Name: "test-app-2"}
	artifact := &ct.Artifact{ID: "test-artifact-2"}
	processes := map[string]int{"omni": 1}
	release := NewReleaseOmni("test-release-2", artifact, processes, true)
	c.Log("Add the formation to the controller. Wait for formation change and job start on both hosts.")
	s.CreateApp(app)
	s.CreateArtifact(artifact)
	s.CreateRelease(release)
	s.PutFormation(&ct.Formation{AppID: app.ID, ReleaseID: release.ID, Processes: processes})
	s.waitFormationChange()
	s.waitJobStart()
	s.waitJobStart()
	assertJobs(map[string]*Job{
		"job1": {Type: "web", state: JobStateStarting, HostID: "host1"},
		"job2": {Type: "omni", state: JobStateStarting, HostID: "host1"},
		"job3": {Type: "omni", state: JobStateStarting, HostID: "host2"},
	})

	assertHostJobs := func(host *FakeHostClient, ids ...string) {
		jobs, err := host.ListJobs()
		c.Assert(err, IsNil)
		c.Assert(jobs, HasLen, len(ids))
		for _, id := range ids {
			id = cluster.GenerateJobID(host.ID(), id)
			job, ok := jobs[id]
			if !ok {
				c.Fatalf("%s missing job with ID %s", host.ID(), id)
			}
			c.Assert(job.Job.ID, Equals, id)
		}
	}
	h1 := hosts[testHostID]
	assertHostJobs(h1, "job1", "job2")
	assertHostJobs(h2, "job3")

	h3 := NewFakeHostClient("host3")
	c.Log("Add a host, wait for omni job start on that host.")
	fakeCluster.AddHost(h3)
	s.waitJobStart()
	assertJobs(map[string]*Job{
		"job1": {Type: "web", state: JobStateStarting, HostID: "host1"},
		"job2": {Type: "omni", state: JobStateStarting, HostID: "host1"},
		"job3": {Type: "omni", state: JobStateStarting, HostID: "host2"},
		"job4": {Type: "omni", state: JobStateStarting, HostID: "host3"},
	})
	assertHostJobs(h3, "job4")

	c.Log("Crash one of the omni jobs, and wait for it to restart")
	h3.CrashJob("job4")
	s.waitJobStop()
	s.waitJobStart()
	s.waitRectify()
	assertJobs(map[string]*Job{
		"job1": {Type: "web", state: JobStateStarting, HostID: "host1"},
		"job2": {Type: "omni", state: JobStateStarting, HostID: "host1"},
		"job3": {Type: "omni", state: JobStateStarting, HostID: "host2"},
		"job4": {Type: "omni", state: JobStateStopped, HostID: "host3"},
		"job5": {Type: "omni", state: JobStateStarting, HostID: "host3"},
	})
	assertHostJobs(h3, "job5")

	c.Log("Unbalance the omni jobs, wait for them to be re-balanced")

	// pause the scheduler so we can unbalance the jobs without it trying
	// to rectify the situation
	s.Pause()

	// move host3's job to host2
	id := cluster.GenerateJobID(h3.ID(), "job5")
	job, err := h3.GetJob(id)
	c.Assert(err, IsNil)
	newJob := job.Job.Dup()
	newJob.ID = cluster.GenerateJobID(h2.ID(), s.generateJobUUID())
	h2.AddJob(newJob)
	err = h3.StopJob(id)
	c.Assert(err, IsNil)

	// resume the scheduler and check it moves the job back to host3
	s.Resume()
	s.waitRectify()
	s.waitJobStart()
	assertJobs(map[string]*Job{
		"job1": {Type: "web", state: JobStateStarting, HostID: "host1"},
		"job2": {Type: "omni", state: JobStateStarting, HostID: "host1"},
		"job3": {Type: "omni", state: JobStateStarting, HostID: "host2"},
		"job4": {Type: "omni", state: JobStateStopped, HostID: "host3"},
		"job5": {Type: "omni", state: JobStateStopped, HostID: "host3"},
		"job6": {Type: "omni", state: JobStateStopped, HostID: "host2"},
		"job7": {Type: "omni", state: JobStateStarting, HostID: "host3"},
	})

	c.Logf("Remove one of the hosts. Ensure the cluster recovers correctly (hosts=%v)", hosts)
	h3.Healthy = false
	fakeCluster.SetHosts(hosts)
	s.waitFormationSync()
	s.waitRectify()
	assertJobs(map[string]*Job{
		"job1": {Type: "web", state: JobStateStarting, HostID: "host1"},
		"job2": {Type: "omni", state: JobStateStarting, HostID: "host1"},
		"job3": {Type: "omni", state: JobStateStarting, HostID: "host2"},
		"job4": {Type: "omni", state: JobStateStopped, HostID: "host3"},
		"job5": {Type: "omni", state: JobStateStopped, HostID: "host3"},
		"job6": {Type: "omni", state: JobStateStopped, HostID: "host2"},
		"job7": {Type: "omni", state: JobStateStopped, HostID: "host3"},
	})
	assertHostJobs(h1, "job1", "job2")
	assertHostJobs(h2, "job3")

	c.Logf("Remove another host. Ensure the cluster recovers correctly (hosts=%v)", hosts)
	h1.Healthy = false
	fakeCluster.RemoveHost(testHostID)
	s.waitFormationSync()
	s.waitRectify()
	s.waitJobStart()
	assertJobs(map[string]*Job{
		"job1": {Type: "web", state: JobStateStopped, HostID: "host1"},
		"job2": {Type: "omni", state: JobStateStopped, HostID: "host1"},
		"job3": {Type: "omni", state: JobStateStarting, HostID: "host2"},
		"job4": {Type: "omni", state: JobStateStopped, HostID: "host3"},
		"job5": {Type: "omni", state: JobStateStopped, HostID: "host3"},
		"job6": {Type: "omni", state: JobStateStopped, HostID: "host2"},
		"job7": {Type: "omni", state: JobStateStopped, HostID: "host3"},
		"job8": {Type: "web", state: JobStateStarting, HostID: "host2"},
	})
	assertHostJobs(h2, "job3", "job8")
}
Example #18
/*
	Restore prior state from the save location defined at construction time.
	If the state save file is empty, nothing is loaded, and no error is returned.
*/
func (s *State) Restore(backend Backend, buffers host.LogBuffers) (func(), error) {
	if err := s.Acquire(); err != nil {
		return nil, err
	}
	defer s.Release()

	s.backend = backend

	var resurrect []*host.ActiveJob
	if err := s.stateDB.View(func(tx *bolt.Tx) error {
		jobsBucket := tx.Bucket([]byte("jobs"))
		backendJobsBucket := tx.Bucket([]byte("backend-jobs"))
		backendGlobalBucket := tx.Bucket([]byte("backend-global"))
		resurrectionBucket := tx.Bucket([]byte("resurrection-jobs"))

		// restore jobs
		if err := jobsBucket.ForEach(func(k, v []byte) error {
			job := &host.ActiveJob{}
			if err := json.Unmarshal(v, job); err != nil {
				return err
			}
			if job.ContainerID != "" {
				s.containers[job.ContainerID] = job
			}
			s.jobs[string(k)] = job

			return nil
		}); err != nil {
			return err
		}

		// hand opaque blobs back to backend so it can do its restore
		backendJobsBlobs := make(map[string][]byte)
		if err := backendJobsBucket.ForEach(func(k, v []byte) error {
			backendJobsBlobs[string(k)] = v
			return nil
		}); err != nil {
			return err
		}
		backendGlobalBlob := backendGlobalBucket.Get([]byte("backend"))
		if err := backend.UnmarshalState(s.jobs, backendJobsBlobs, backendGlobalBlob, buffers); err != nil {
			return err
		}

		if resurrectionBucket == nil {
			s.mtx.Lock()
			for _, job := range s.jobs {
				// if there was an unclean shutdown, resurrect all jobs
				// marked Resurrect that were running at shutdown and are
				// no longer running
				if job.Job.Resurrect && job.Status != host.StatusRunning {
					resurrect = append(resurrect, job)
				}
			}
			s.mtx.Unlock()
		} else {
			defer tx.DeleteBucket([]byte("resurrection-jobs"))
			if err := resurrectionBucket.ForEach(func(k, v []byte) error {
				job := &host.ActiveJob{}
				if err := json.Unmarshal(v, job); err != nil {
					return err
				}
				resurrect = append(resurrect, job)
				return nil
			}); err != nil {
				return err
			}
		}
		return nil
	}); err != nil && err != io.EOF {
		return nil, fmt.Errorf("could not restore from host persistence db: %s", err)
	}

	return func() {
		var wg sync.WaitGroup
		wg.Add(len(resurrect))
		for _, job := range resurrect {
			go func(job *host.ActiveJob) {
				// generate a new job id, this is a new job
				newID := cluster.GenerateJobID(s.id, "")
				log.Printf("resurrecting %s as %s", job.Job.ID, newID)
				job.Job.ID = newID
				config := &RunConfig{
					// TODO(titanous): Use Job instead of ActiveJob in
					// resurrection bucket once InternalIP is not used.
					// TODO(titanous): Passing the IP is a hack, remove it once the
					// postgres appliance doesn't use it to calculate its ID in the
					// state machine.
					IP: net.ParseIP(job.InternalIP),
				}
				backend.Run(job.Job, config)
				wg.Done()
			}(job)
		}
		wg.Wait()
	}, nil
}
Example #19
func JobConfig(f *ct.ExpandedFormation, name, hostID string, uuid string) *host.Job {
	t := f.Release.Processes[name]

	var entrypoint ct.ImageEntrypoint
	if e := GetEntrypoint(f.Artifacts, name); e != nil {
		entrypoint = *e
	}

	env := make(map[string]string, len(entrypoint.Env)+len(f.Release.Env)+len(t.Env)+5)
	for k, v := range entrypoint.Env {
		env[k] = v
	}
	for k, v := range f.Release.Env {
		env[k] = v
	}
	for k, v := range t.Env {
		env[k] = v
	}
	id := cluster.GenerateJobID(hostID, uuid)
	env["FLYNN_APP_ID"] = f.App.ID
	env["FLYNN_APP_NAME"] = f.App.Name
	env["FLYNN_RELEASE_ID"] = f.Release.ID
	env["FLYNN_PROCESS_TYPE"] = name
	env["FLYNN_JOB_ID"] = id
	metadata := make(map[string]string, len(f.App.Meta)+5)
	for k, v := range f.App.Meta {
		metadata[k] = v
	}
	metadata["flynn-controller.app"] = f.App.ID
	metadata["flynn-controller.app_name"] = f.App.Name
	metadata["flynn-controller.release"] = f.Release.ID
	metadata["flynn-controller.formation"] = "true"
	metadata["flynn-controller.type"] = name
	job := &host.Job{
		ID:       id,
		Metadata: metadata,
		Config: host.ContainerConfig{
			Args:        entrypoint.Args,
			Env:         env,
			WorkingDir:  entrypoint.WorkingDir,
			Uid:         entrypoint.Uid,
			Gid:         entrypoint.Gid,
			HostNetwork: t.HostNetwork,
		},
		Resurrect: t.Resurrect,
		Resources: t.Resources,
	}
	if len(t.Args) > 0 {
		job.Config.Args = t.Args
	}

	// job.Config.Args may be empty if restoring from an old backup which
	// still uses the deprecated Entrypoint / Cmd fields
	if len(job.Config.Args) == 0 {
		job.Config.Args = append(t.DeprecatedEntrypoint, t.DeprecatedCmd...)
	}

	SetupMountspecs(job, f.Artifacts)
	if f.App.Meta["flynn-system-app"] == "true" {
		job.Partition = "system"
	}
	job.Config.Ports = make([]host.Port, len(t.Ports))
	for i, p := range t.Ports {
		job.Config.Ports[i].Proto = p.Proto
		job.Config.Ports[i].Port = p.Port
		job.Config.Ports[i].Service = p.Service
	}
	return job
}
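The env map in this JobConfig is built in increasing precedence: image entrypoint env, then release env, then per-process env, with the reserved FLYNN_* keys written last so they always win. A compact, runnable illustration of that layering with made-up values:

package main

import "fmt"

func main() {
	// layers in increasing precedence, mirroring JobConfig above
	layers := []map[string]string{
		{"PATH": "/usr/bin", "LANG": "C"}, // image entrypoint env
		{"LANG": "en_US.UTF-8"},           // release env overrides it
		{"GOMAXPROCS": "2"},               // per-process env
	}
	env := map[string]string{}
	for _, layer := range layers {
		for k, v := range layer {
			env[k] = v
		}
	}
	env["FLYNN_JOB_ID"] = "host1-some-uuid" // reserved keys are written last
	fmt.Println(env["LANG"])                // prints "en_US.UTF-8"
}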
Example #20
func (TestSuite) TestMultipleHosts(c *C) {
	hosts := newTestHosts()
	host1 := hosts[testHostID]
	fakeCluster := newTestCluster(hosts)
	s := newTestScheduler(c, fakeCluster, true, nil)

	// use incremental job IDs so we can find them easily in s.jobs
	var jobID uint64
	s.generateJobUUID = func() string {
		return fmt.Sprintf("job%d", atomic.AddUint64(&jobID, 1))
	}
	s.maxHostChecks = 1

	go s.Run()
	defer s.Stop()

	// assertJobs checks that hosts have expected jobs based on their type
	// and current state
	type hostJobs map[utils.HostClient][]*Job
	assertJobs := func(expected hostJobs) {
		// get a sorted list of scheduler jobs per host to compare
		// against the expected list
		actual := make(map[string]sortJobs)
		for _, job := range s.InternalState().Jobs {
			actual[job.HostID] = append(actual[job.HostID], job)
		}
		for _, jobs := range actual {
			jobs.SortReverse()
		}

		for host, jobs := range expected {
			actual := actual[host.ID()]
			if len(actual) != len(jobs) {
				c.Fatalf("expected %s to have %d jobs, got %d", host.ID(), len(jobs), len(actual))
			}
			for i, job := range jobs {
				j := actual[i]
				c.Assert(j.Type, Equals, job.Type)
				c.Assert(j.State, Equals, job.State)

				// check the host has the job if it is running (stopped
				// jobs are removed from the host)
				if job.State != JobStateStarting {
					continue
				}
				id := cluster.GenerateJobID(host.ID(), j.ID)
				hostJob, err := host.GetJob(id)
				c.Assert(err, IsNil)
				c.Assert(hostJob.Job.ID, Equals, id)
			}
		}
	}

	c.Log("Initialize the cluster with 1 host and wait for a job to start on it.")
	s.waitJobStart()
	assertJobs(hostJobs{
		host1: {
			{Type: "web", State: JobStateStarting},
		},
	})

	c.Log("Add a host to the cluster, then create a new app, artifact, release, and associated formation.")
	host2 := NewFakeHostClient("host2", true)
	fakeCluster.AddHost(host2)
	hosts[host2.ID()] = host2
	app := &ct.App{ID: "test-app-2", Name: "test-app-2"}
	artifact := &ct.Artifact{ID: "test-artifact-2"}
	processes := map[string]int{"omni": 1}
	release := NewReleaseOmni("test-release-2", artifact, processes, true)
	c.Log("Add the formation to the controller. Wait for formation change and job start on both hosts.")
	s.CreateApp(app)
	s.CreateArtifact(artifact)
	s.CreateRelease(release)
	s.PutFormation(&ct.Formation{AppID: app.ID, ReleaseID: release.ID, Processes: processes})
	s.waitJobStart()
	s.waitJobStart()
	assertJobs(hostJobs{
		host1: {
			{Type: "web", State: JobStateStarting},
			{Type: "omni", State: JobStateStarting},
		},
		host2: {
			{Type: "omni", State: JobStateStarting},
		},
	})

	host3 := NewFakeHostClient("host3", true)
	c.Log("Add a host, wait for omni job start on that host.")
	fakeCluster.AddHost(host3)
	s.waitJobStart()
	assertJobs(hostJobs{
		host1: {
			{Type: "web", State: JobStateStarting},
			{Type: "omni", State: JobStateStarting},
		},
		host2: {
			{Type: "omni", State: JobStateStarting},
		},
		host3: {
			{Type: "omni", State: JobStateStarting},
		},
	})

	c.Log("Crash one of the omni jobs, and wait for it to restart")
	host3.CrashJob("job4")
	s.waitJobStop()
	s.waitJobStart()
	assertJobs(hostJobs{
		host1: {
			{Type: "web", State: JobStateStarting},
			{Type: "omni", State: JobStateStarting},
		},
		host2: {
			{Type: "omni", State: JobStateStarting},
		},
		host3: {
			{Type: "omni", State: JobStateStopped},
			{Type: "omni", State: JobStateStarting},
		},
	})

	c.Log("Unbalance the omni jobs, wait for them to be re-balanced")

	// pause the scheduler so we can unbalance the jobs without it trying
	// to rectify the situation
	s.Pause()

	// move host2's job to host3
	var job *host.ActiveJob
	jobs, err := host2.ListJobs()
	c.Assert(err, IsNil)
	for _, j := range jobs {
		if j.Status == host.StatusStarting {
			job = &j
			break
		}
	}
	if job == nil {
		c.Fatal("could not find host2's omni job")
	}
	newJob := job.Job.Dup()
	newJob.ID = cluster.GenerateJobID(host3.ID(), s.generateJobUUID())
	host3.AddJob(newJob)
	err = host2.StopJob(job.Job.ID)
	c.Assert(err, IsNil)

	// resume the scheduler and check it moves the job back to host2
	s.Resume()
	s.waitJobStart()
	s.waitJobStart()
	assertJobs(hostJobs{
		host1: {
			{Type: "web", State: JobStateStarting},
			{Type: "omni", State: JobStateStarting},
		},
		host2: {
			{Type: "omni", State: JobStateStopped},
			{Type: "omni", State: JobStateStarting},
		},
		host3: {
			{Type: "omni", State: JobStateStopped},
			{Type: "omni", State: JobStateStarting},
			{Type: "omni", State: JobStateStopped},
		},
	})

	c.Logf("Remove one of the hosts. Ensure the cluster recovers correctly (hosts=%v)", hosts)
	host3.Healthy = false
	fakeCluster.SetHosts(hosts)
	s.waitFormationSync()
	s.waitRectify()
	assertJobs(hostJobs{
		host1: {
			{Type: "web", State: JobStateStarting},
			{Type: "omni", State: JobStateStarting},
		},
		host2: {
			{Type: "omni", State: JobStateStopped},
			{Type: "omni", State: JobStateStarting},
		},
		host3: {
			{Type: "omni", State: JobStateStopped},
			{Type: "omni", State: JobStateStopped},
			{Type: "omni", State: JobStateStopped},
		},
	})

	c.Logf("Remove another host. Ensure the cluster recovers correctly (hosts=%v)", hosts)
	host1.(*FakeHostClient).Healthy = false
	fakeCluster.RemoveHost(host1.ID())
	s.waitFormationSync()
	s.waitJobStart()
	assertJobs(hostJobs{
		host1: {
			{Type: "web", State: JobStateStopped},
			{Type: "omni", State: JobStateStopped},
		},
		host2: {
			{Type: "omni", State: JobStateStopped},
			{Type: "omni", State: JobStateStarting},
			{Type: "web", State: JobStateStarting},
		},
		host3: {
			{Type: "omni", State: JobStateStopped},
			{Type: "omni", State: JobStateStopped},
			{Type: "omni", State: JobStateStopped},
		},
	})
}
Example #21
func (c *controllerAPI) RunJob(ctx context.Context, w http.ResponseWriter, req *http.Request) {
	var newJob ct.NewJob
	if err := httphelper.DecodeJSON(req, &newJob); err != nil {
		respondWithError(w, err)
		return
	}

	if err := schema.Validate(newJob); err != nil {
		respondWithError(w, err)
		return
	}

	data, err := c.releaseRepo.Get(newJob.ReleaseID)
	if err != nil {
		respondWithError(w, err)
		return
	}
	release := data.(*ct.Release)
	if release.ImageArtifactID() == "" {
		httphelper.ValidationError(w, "release.ImageArtifact", "must be set")
		return
	}
	attach := strings.Contains(req.Header.Get("Upgrade"), "flynn-attach/0")

	hosts, err := c.clusterClient.Hosts()
	if err != nil {
		respondWithError(w, err)
		return
	}
	if len(hosts) == 0 {
		respondWithError(w, errors.New("no hosts found"))
		return
	}
	client := hosts[random.Math.Intn(len(hosts))]

	uuid := random.UUID()
	hostID := client.ID()
	id := cluster.GenerateJobID(hostID, uuid)
	app := c.getApp(ctx)
	env := make(map[string]string, len(release.Env)+len(newJob.Env)+4)
	env["FLYNN_APP_ID"] = app.ID
	env["FLYNN_RELEASE_ID"] = release.ID
	env["FLYNN_PROCESS_TYPE"] = ""
	env["FLYNN_JOB_ID"] = id
	if newJob.ReleaseEnv {
		for k, v := range release.Env {
			env[k] = v
		}
	}
	for k, v := range newJob.Env {
		env[k] = v
	}
	metadata := make(map[string]string, len(newJob.Meta)+3)
	for k, v := range newJob.Meta {
		metadata[k] = v
	}
	metadata["flynn-controller.app"] = app.ID
	metadata["flynn-controller.app_name"] = app.Name
	metadata["flynn-controller.release"] = release.ID
	job := &host.Job{
		ID:       id,
		Metadata: metadata,
		Config: host.ContainerConfig{
			Env:        env,
			TTY:        newJob.TTY,
			Stdin:      attach,
			DisableLog: newJob.DisableLog,
		},
		Resources: newJob.Resources,
	}
	resource.SetDefaults(&job.Resources)
	if len(newJob.Args) > 0 {
		job.Config.Args = newJob.Args
	}
	if len(release.ArtifactIDs) > 0 {
		artifacts, err := c.artifactRepo.ListIDs(release.ArtifactIDs...)
		if err != nil {
			respondWithError(w, err)
			return
		}
		job.ImageArtifact = artifacts[release.ImageArtifactID()].HostArtifact()
		job.FileArtifacts = make([]*host.Artifact, len(release.FileArtifactIDs()))
		for i, id := range release.FileArtifactIDs() {
			job.FileArtifacts[i] = artifacts[id].HostArtifact()
		}
	}

	// ensure slug apps use /runner/init
	if release.IsGitDeploy() && (len(job.Config.Args) == 0 || job.Config.Args[0] != "/runner/init") {
		job.Config.Args = append([]string{"/runner/init"}, job.Config.Args...)
	}

	var attachClient cluster.AttachClient
	if attach {
		attachReq := &host.AttachReq{
			JobID:  job.ID,
			Flags:  host.AttachFlagStdout | host.AttachFlagStderr | host.AttachFlagStdin | host.AttachFlagStream,
			Height: uint16(newJob.Lines),
			Width:  uint16(newJob.Columns),
		}
		attachClient, err = client.Attach(attachReq, true)
		if err != nil {
			respondWithError(w, fmt.Errorf("attach failed: %s", err.Error()))
			return
		}
		defer attachClient.Close()
	}

	if err := client.AddJob(job); err != nil {
		respondWithError(w, fmt.Errorf("schedule failed: %s", err.Error()))
		return
	}

	if attach {
		// TODO(titanous): This Wait could block indefinitely if something goes
		// wrong, a context should be threaded in that cancels if the client
		// goes away.
		if err := attachClient.Wait(); err != nil {
			respondWithError(w, fmt.Errorf("attach wait failed: %s", err.Error()))
			return
		}
		w.Header().Set("Connection", "upgrade")
		w.Header().Set("Upgrade", "flynn-attach/0")
		w.WriteHeader(http.StatusSwitchingProtocols)
		conn, _, err := w.(http.Hijacker).Hijack()
		if err != nil {
			panic(err)
		}
		defer conn.Close()

		done := make(chan struct{}, 2)
		cp := func(to io.Writer, from io.Reader) {
			io.Copy(to, from)
			done <- struct{}{}
		}
		go cp(conn, attachClient.Conn())
		go cp(attachClient.Conn(), conn)

		// Wait for one of the connections to be closed or interrupted. EOF is
		// framed inside the attach protocol, so a read/write error indicates
		// that we're done and should clean up.
		<-done

		return
	} else {
		httphelper.JSON(w, 200, &ct.Job{
			ID:        job.ID,
			UUID:      uuid,
			HostID:    hostID,
			ReleaseID: newJob.ReleaseID,
			Args:      newJob.Args,
		})
	}
}
Example #22
func (c *Cmd) Start() error {
	if c.started {
		return errors.New("exec: already started")
	}
	c.done = make(chan struct{})
	c.started = true
	if c.host == nil && c.cluster == nil {
		c.cluster = cluster.NewClient()
		c.closeCluster = true
	}

	if c.HostID == "" {
		hosts, err := c.cluster.Hosts()
		if err != nil {
			return err
		}
		if len(hosts) == 0 {
			return errors.New("exec: no hosts found")
		}
		host := schedutil.PickHost(hosts)
		c.HostID = host.ID()
		c.host = host
	}

	// Use the pre-defined host.Job configuration if provided;
	// otherwise generate one from the fields on exec.Cmd that mirror stdlib's os/exec.
	if c.Job == nil {
		c.Job = &host.Job{
			ImageArtifact: &c.ImageArtifact,
			Config: host.ContainerConfig{
				Args:  c.Args,
				TTY:   c.TTY,
				Env:   c.Env,
				Stdin: c.Stdin != nil || c.stdinPipe != nil,
			},
			Metadata: c.Meta,
		}
		// if attaching to stdout / stderr, avoid round tripping the
		// streams via on-disk log files.
		if c.Stdout != nil || c.Stderr != nil {
			c.Job.Config.DisableLog = true
		}
	} else {
		c.Job.ImageArtifact = &c.ImageArtifact
	}
	if c.Job.ID == "" {
		c.Job.ID = cluster.GenerateJobID(c.HostID, "")
	}

	if c.host == nil {
		var err error
		c.host, err = c.cluster.Host(c.HostID)
		if err != nil {
			return err
		}
	}

	if c.Stdout != nil || c.Stderr != nil || c.Stdin != nil || c.stdinPipe != nil {
		req := &host.AttachReq{
			JobID:  c.Job.ID,
			Height: c.TermHeight,
			Width:  c.TermWidth,
			Flags:  host.AttachFlagStream,
		}
		if c.Stdout != nil {
			req.Flags |= host.AttachFlagStdout
		}
		if c.Stderr != nil {
			req.Flags |= host.AttachFlagStderr
		}
		if c.Job.Config.Stdin {
			req.Flags |= host.AttachFlagStdin
		}
		var err error
		c.attachClient, err = c.host.Attach(req, true)
		if err != nil {
			c.close()
			return err
		}
	}

	if c.stdinPipe != nil {
		c.stdinPipe.set(writeCloseCloser{c.attachClient})
	} else if c.Stdin != nil {
		go func() {
			io.Copy(c.attachClient, c.Stdin)
			c.attachClient.CloseWrite()
		}()
	}

	if c.attachClient == nil {
		c.eventChan = make(chan *host.Event)
		var err error
		c.eventStream, err = c.host.StreamEvents(c.Job.ID, c.eventChan)
		if err != nil {
			return err
		}
	}

	go func() {
		defer close(c.done)
		if c.attachClient != nil {
			c.exitStatus, c.streamErr = c.attachClient.Receive(c.Stdout, c.Stderr)
		} else {
		outer:
			for e := range c.eventChan {
				switch e.Event {
				case "stop":
					c.exitStatus = *e.Job.ExitStatus
					break outer
				case "error":
					c.streamErr = errors.New(*e.Job.Error)
					break outer
				}
			}
			c.eventStream.Close()
			if c.streamErr == nil {
				c.streamErr = c.eventStream.Err()
			}
		}
	}()

	return c.host.AddJob(c.Job)
}
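Cmd.Start mirrors os/exec: populate the struct, call Start, then block on completion. Below is a hypothetical caller under that analogy; the Command constructor, the Wait method, and the artifact argument are assumptions for illustration, not confirmed API:

// runLS sketches the assumed call pattern. Command and Wait are named
// by analogy with os/exec and may not match the real package exactly.
func runLS(artifact host.Artifact) error {
	cmd := exec.Command(artifact, "ls", "-l")
	cmd.Stdout = os.Stdout
	cmd.Stderr = os.Stderr
	if err := cmd.Start(); err != nil {
		return err
	}
	return cmd.Wait()
}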
Example #23
/*
	Restore prior state from the save location defined at construction time.
	If the state save file is empty, nothing is loaded, and no error is returned.
*/
func (s *State) Restore(backend Backend, buffers host.LogBuffers) (func(), error) {
	if err := s.Acquire(); err != nil {
		return nil, err
	}
	defer s.Release()

	s.backend = backend

	var resurrect []*host.Job
	if err := s.stateDB.View(func(tx *bolt.Tx) error {
		jobsBucket := tx.Bucket([]byte("jobs"))
		backendJobsBucket := tx.Bucket([]byte("backend-jobs"))
		backendGlobalBucket := tx.Bucket([]byte("backend-global"))
		persistentBucket := tx.Bucket([]byte("persistent-jobs"))

		// restore jobs
		if err := jobsBucket.ForEach(func(k, v []byte) error {
			job := &host.ActiveJob{}
			if err := json.Unmarshal(v, job); err != nil {
				return err
			}
			if job.ContainerID != "" {
				s.containers[job.ContainerID] = job
			}
			s.jobs[string(k)] = job

			return nil
		}); err != nil {
			return err
		}

		// hand opaque blobs back to backend so it can do its restore
		backendJobsBlobs := make(map[string][]byte)
		if err := backendJobsBucket.ForEach(func(k, v []byte) error {
			backendJobsBlobs[string(k)] = v
			return nil
		}); err != nil {
			return err
		}
		backendGlobalBlob := backendGlobalBucket.Get([]byte("backend"))
		if err := backend.UnmarshalState(s.jobs, backendJobsBlobs, backendGlobalBlob, buffers); err != nil {
			return err
		}

		s.mtx.Lock()
		// defer the unlock so the mutex is released even if ForEach fails
		defer s.mtx.Unlock()
		if err := persistentBucket.ForEach(func(k, v []byte) error {
			for _, job := range s.jobs {
				if job.Job.ID == string(v) {
					resurrect = append(resurrect, job.Job)
				}
			}
			return nil
		}); err != nil {
			return err
		}
		return nil
	}); err != nil && err != io.EOF {
		return nil, fmt.Errorf("could not restore from host persistence db: %s", err)
	}

	return func() {
		var wg sync.WaitGroup
		wg.Add(len(resurrect))
		for _, job := range resurrect {
			go func(job *host.Job) {
				// generate a new job id, this is a new job
				newJob := job.Dup()
				newJob.ID = cluster.GenerateJobID(s.id, "")
				log.Printf("resurrecting %s as %s", job.ID, newJob.ID)
				s.AddJob(newJob)
				backend.Run(newJob, nil)
				wg.Done()
			}(job)
		}
		wg.Wait()
	}, nil
}