Example #1
/*
	Make an 'ish' application on the given host, returning it when
	it has registered readiness with discoverd.

	The caller will want to defer cmd.Kill() to clean up.
*/
func makeIshApp(cluster *cluster.Client, h *cluster.Host, dc *discoverd.Client, extraConfig host.ContainerConfig) (*exec.Cmd, *discoverd.Instance, error) {
	// pick a unique string to use as service name so this works with concurrent tests.
	serviceName := "ish-service-" + random.String(6)

	// run a job that accepts TCP connections and executes the commands we send it inside its container
	cmd := exec.JobUsingCluster(cluster, exec.DockerImage(imageURIs["test-apps"]), &host.Job{
		Config: host.ContainerConfig{
			Args:  []string{"/bin/ish"},
			Ports: []host.Port{{Proto: "tcp"}},
			Env: map[string]string{
				"NAME": serviceName,
			},
		}.Merge(extraConfig),
	})
	cmd.HostID = h.ID()
	if err := cmd.Start(); err != nil {
		return nil, nil, err
	}

	// wait for the job to heartbeat and return its address
	services, err := dc.Instances(serviceName, time.Second*100)
	if err != nil {
		cmd.Kill()
		return nil, nil, err
	}
	if len(services) != 1 {
		cmd.Kill()
		return nil, nil, fmt.Errorf("test setup: expected exactly one service instance, got %d", len(services))
	}

	return cmd, services[0], nil
}
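A minimal usage sketch, following the doc comment's advice to defer cmd.Kill(). The test method and the anyHost helper are hypothetical; runIshCommand is the companion helper used with makeIshApp in Example #13.

func (s *HostSuite) TestIshEcho(t *c.C) {
	// anyHost stands in for whatever host-selection helper the suite provides
	cmd, service, err := makeIshApp(s.clusterClient(t), s.anyHost(t), s.discoverdClient(t), host.ContainerConfig{})
	t.Assert(err, c.IsNil)
	defer cmd.Kill() // per the doc comment, the caller cleans up

	resp, err := runIshCommand(service, "echo hello")
	t.Assert(err, c.IsNil)
	t.Assert(resp, c.Equals, "hello\n")
}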
Example #2
File: ps.go Project: snormore/flynn
func runPs(args *docopt.Args, client cluster.Host) error {
	all, err := client.ListJobs()
	if err != nil {
		return fmt.Errorf("could not get local jobs: %s", err)
	}

	jobs := make(sortJobs, 0, len(all))
	for _, job := range all {
		if !args.Bool["-a"] && !args.Bool["--all"] && job.Status != host.StatusStarting && job.Status != host.StatusRunning {
			continue
		}
		jobs = append(jobs, job)
	}
	sort.Sort(sort.Reverse(jobs))

	if args.Bool["-q"] || args.Bool["--quiet"] {
		for _, job := range jobs {
			fmt.Println(job.Job.ID)
		}
		return nil
	}
	w := tabwriter.NewWriter(os.Stdout, 1, 2, 2, ' ', 0)
	defer w.Flush()
	fmt.Fprintln(w, "JOB ID\tSTATE\tSTARTED\tCONTROLLER APP\tCONTROLLER TYPE")
	for _, job := range jobs {
		fmt.Fprintf(w, "%s\t%s\t%s ago\t%s\t%s\n", job.Job.ID, job.Status, units.HumanDuration(time.Now().UTC().Sub(job.StartedAt)), job.Job.Metadata["flynn-controller.app_name"], job.Job.Metadata["flynn-controller.type"])
	}
	return nil
}
Example #3
func (s *HostSuite) TestUpdateTags(t *c.C) {
	events := make(chan *discoverd.Event)
	stream, err := s.discoverdClient(t).Service("flynn-host").Watch(events)
	t.Assert(err, c.IsNil)
	defer stream.Close()

	nextEvent := func() *discoverd.Event {
		select {
		case e, ok := <-events:
			if !ok {
				t.Fatal("unexpected close of discoverd stream")
			}
			return e
		case <-time.After(10 * time.Second):
			t.Fatal("timed out waiting for discoverd event")
		}
		return nil
	}

	var client *cluster.Host
	for {
		e := nextEvent()
		if e.Kind == discoverd.EventKindUp && client == nil {
			client = cluster.NewHost(e.Instance.Meta["id"], e.Instance.Addr, nil, nil)
		}
		if e.Kind == discoverd.EventKindCurrent {
			break
		}
	}
	if client == nil {
		t.Fatal("did not initialize flynn-host client")
	}

	t.Assert(client.UpdateTags(map[string]string{"foo": "bar"}), c.IsNil)

	var meta map[string]string
	for {
		e := nextEvent()
		if e.Kind == discoverd.EventKindUpdate && e.Instance.Meta["id"] == client.ID() {
			meta = e.Instance.Meta
			break
		}
	}
	t.Assert(meta["tag:foo"], c.Equals, "bar")

	// setting to empty string should delete the tag
	t.Assert(client.UpdateTags(map[string]string{"foo": ""}), c.IsNil)

	for {
		e := nextEvent()
		if e.Kind == discoverd.EventKindUpdate && e.Instance.Meta["id"] == client.ID() {
			meta = e.Instance.Meta
			break
		}
	}
	if _, ok := meta["tag:foo"]; ok {
		t.Fatal("expected tag to be deleted but is still present")
	}
}
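Condensed, the tag semantics this test exercises: tags written through the host client surface in the host's discoverd metadata under a "tag:" prefix, and an empty value deletes the tag. A small sketch, assuming the same *cluster.Host client:

func setAndClearTag(client *cluster.Host) error {
	// write a tag; it surfaces in discoverd metadata as "tag:region"
	if err := client.UpdateTags(map[string]string{"region": "us-east"}); err != nil {
		return err
	}
	// an empty value deletes the tag again
	return client.UpdateTags(map[string]string{"region": ""})
}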
Example #4
func jobLog(req *http.Request, app *ct.App, params martini.Params, hc cluster.Host, w http.ResponseWriter, r ResponseHelper) {
	attachReq := &host.AttachReq{
		JobID: params["jobs_id"],
		Flags: host.AttachFlagStdout | host.AttachFlagStderr | host.AttachFlagLogs,
	}
	tail := req.FormValue("tail") != ""
	if tail {
		attachReq.Flags |= host.AttachFlagStream
	}
	wait := req.FormValue("wait") != ""
	attachClient, err := hc.Attach(attachReq, wait)
	if err != nil {
		if err == cluster.ErrWouldWait {
			w.WriteHeader(404)
		} else {
			r.Error(err)
		}
		return
	}

	if cn, ok := w.(http.CloseNotifier); ok {
		go func() {
			<-cn.CloseNotify()
			attachClient.Close()
		}()
	} else {
		defer attachClient.Close()
	}

	sse := strings.Contains(req.Header.Get("Accept"), "text/event-stream")
	if sse {
		w.Header().Set("Content-Type", "text/event-stream; charset=utf-8")
	} else {
		w.Header().Set("Content-Type", "application/vnd.flynn.attach")
	}
	w.WriteHeader(200)
	// Send headers right away if tailing
	if wf, ok := w.(http.Flusher); ok && tail {
		wf.Flush()
	}

	fw := flushWriter{w, tail}
	if sse {
		ssew := NewSSELogWriter(w)
		exit, err := attachClient.Receive(flushWriter{ssew.Stream("stdout"), tail}, flushWriter{ssew.Stream("stderr"), tail})
		if err != nil {
			fw.Write([]byte("event: error\ndata: {}\n\n"))
			return
		}
		if tail {
			fmt.Fprintf(fw, "event: exit\ndata: {\"status\": %d}\n\n", exit)
			return
		}
		fw.Write([]byte("event: eof\ndata: {}\n\n"))
	} else {
		io.Copy(fw, attachClient.Conn())
	}
}
Example #5
func ProvisionVolume(h *cluster.Host, job *host.Job) error {
	vol, err := h.CreateVolume("default")
	if err != nil {
		return err
	}
	job.Config.Volumes = []host.VolumeBinding{{
		Target:    "/data",
		VolumeID:  vol.ID,
		Writeable: true,
	}}
	return nil
}
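ProvisionVolume mutates job.Config in place, so it must run before the job is submitted to the host; Example #15 calls it in exactly this order. A sketch of that ordering (the wrapper function is illustrative; note that Example #17 shows a newer signature taking a *ct.VolumeReq):

func startJobWithVolume(h *cluster.Host, job *host.Job) error {
	// attach a fresh volume, mounted read-write at /data, before the job starts
	if err := ProvisionVolume(h, job); err != nil {
		return err
	}
	return h.AddJob(job)
}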
Example #6
func hostRaftStatus(host *cluster.Host, peers []string, leader string) (raftStatus string) {
	raftStatus = "proxy"
	ip, _, _ := net.SplitHostPort(host.Addr())
	discAddr := ip + ":1111" // discoverd's port
	for _, addr := range peers {
		if addr == discAddr {
			raftStatus = "peer"
			if leader == discAddr {
				raftStatus += " (leader)"
			}
			break
		}
	}
	return
}
Example #7
func runStop(args *docopt.Args, client cluster.Host) error {
	success := true
	for _, id := range args.All["ID"].([]string) {
		if err := client.StopJob(id); err != nil {
			fmt.Printf("could not stop job %s: %s\n", id, err)
			success = false
			continue
		}
		fmt.Println(id, "stopped")
	}
	if !success {
		return errors.New("could not stop all jobs")
	}
	return nil
}
Example #8
func jobLog(req *http.Request, app *ct.App, params martini.Params, hc cluster.Host, w http.ResponseWriter, r ResponseHelper) {
	attachReq := &host.AttachReq{
		JobID: params["jobs_id"],
		Flags: host.AttachFlagStdout | host.AttachFlagStderr | host.AttachFlagLogs,
	}
	tail := req.FormValue("tail") != ""
	if tail {
		attachReq.Flags |= host.AttachFlagStream
	}
	wait := req.FormValue("wait") != ""
	attachClient, err := hc.Attach(attachReq, wait)
	if err != nil {
		if err == cluster.ErrWouldWait {
			w.WriteHeader(404)
		} else {
			r.Error(err)
		}
		return
	}
	defer attachClient.Close()

	sse := strings.Contains(req.Header.Get("Accept"), "text/event-stream")
	if sse {
		w.Header().Set("Content-Type", "text/event-stream; charset=utf-8")
	} else {
		w.Header().Set("Content-Type", "application/vnd.flynn.attach")
	}
	w.WriteHeader(200)
	// Send headers right away if tailing
	if wf, ok := w.(http.Flusher); ok && tail {
		wf.Flush()
	}

	// TODO: use http.CloseNotifier to clean up when client disconnects

	if sse {
		ssew := NewSSELogWriter(w)
		attachClient.Receive(flushWriter{ssew.Stream("stdout"), tail}, flushWriter{ssew.Stream("stderr"), tail})
		// TODO: include exit code here if tailing
		flushWriter{w, tail}.Write([]byte("event: eof\ndata: {}\n\n"))
	} else {
		io.Copy(flushWriter{w, tail}, attachClient.Conn())
	}
}
Example #9
func runLog(args *docopt.Args, client cluster.Host) error {
	attachReq := &host.AttachReq{
		JobID: args.String["ID"],
		Flags: host.AttachFlagStdout | host.AttachFlagStderr | host.AttachFlagLogs,
	}
	if args.Bool["-f"] || args.Bool["--follow"] {
		attachReq.Flags |= host.AttachFlagStream
	}
	attachClient, err := client.Attach(attachReq, false)
	if err != nil {
		if err == cluster.ErrWouldWait {
			return fmt.Errorf("no such job")
		}
		return err
	}
	defer attachClient.Close()
	attachClient.Receive(os.Stdout, os.Stderr)
	return nil
}
Example #10
func startJob(s *State, hc *cluster.Host, job *host.Job) (*Job, error) {
	data := &Job{HostID: hc.ID(), JobID: job.ID}

	jobStatus := make(chan error)
	events := make(chan *host.Event)
	stream, err := hc.StreamEvents(data.JobID, events)
	if err != nil {
		return nil, err
	}
	go func() {
		defer stream.Close()
		for e := range events {
			switch e.Event {
			case "start", "stop":
				jobStatus <- nil
				return
			case "error":
				job, err := hc.GetJob(data.JobID)
				if err != nil {
					jobStatus <- err
					return
				}
				if job.Error == nil {
					jobStatus <- fmt.Errorf("bootstrap: unknown error from host")
					return
				}
				jobStatus <- fmt.Errorf("bootstrap: host error while launching job: %q", *job.Error)
				return
			default:
			}
		}
		jobStatus <- fmt.Errorf("bootstrap: host job stream disconnected unexpectedly: %q", stream.Err())
	}()

	if err := hc.AddJob(job); err != nil {
		return nil, err
	}

	return data, <-jobStatus
}
Example #11
func startJob(s *State, hc *cluster.Host, job *host.Job) error {
	jobStatus := make(chan error)
	events := make(chan *host.Event)
	stream, err := hc.StreamEvents(job.ID, events)
	if err != nil {
		return err
	}
	go func() {
		defer stream.Close()
	loop:
		for {
			select {
			case e, ok := <-events:
				if !ok {
					break loop
				}
				switch e.Event {
				case "start", "stop":
					jobStatus <- nil
					return
				case "error":
					job, err := hc.GetJob(job.ID)
					if err != nil {
						jobStatus <- err
						return
					}
					if job.Error == nil {
						jobStatus <- fmt.Errorf("bootstrap: unknown error from host")
						return
					}
					jobStatus <- fmt.Errorf("bootstrap: host error while launching job: %q", *job.Error)
					return
				default:
				}
			case <-time.After(30 * time.Second):
				jobStatus <- errors.New("bootstrap: timed out waiting for job event")
				return
			}
		}
		jobStatus <- fmt.Errorf("bootstrap: host job stream disconnected unexpectedly: %q", stream.Err())
	}()

	if err := hc.AddJob(job); err != nil {
		return err
	}

	return <-jobStatus
}
Example #12
func JobUsingHost(h *cluster.Host, artifact host.Artifact, job *host.Job) *Cmd {
	command := Job(artifact, job)
	command.HostID = h.ID()
	command.host = h
	return command
}
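Compare exec.JobUsingCluster in Example #1, which lets the cluster pick a host; JobUsingHost pins the command to h. A hypothetical one-off job, assuming the package mirrors os/exec with a Run convenience:

func runTrueOnHost(h *cluster.Host) error {
	// run a short-lived job pinned to this specific host
	cmd := exec.JobUsingHost(h, exec.DockerImage(imageURIs["test-apps"]), &host.Job{
		Config: host.ContainerConfig{Args: []string{"/bin/true"}},
	})
	return cmd.Run() // assumed Start+Wait convenience, mirroring os/exec
}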
Example #13
func (s *VolumeSuite) doVolumeTransmitAPI(h0, h1 *cluster.Host, t *c.C) {
	clus := s.clusterClient(t)

	// create a volume!
	vol, err := h0.CreateVolume("default")
	t.Assert(err, c.IsNil)
	defer func() {
		t.Assert(h0.DestroyVolume(vol.ID), c.IsNil)
	}()
	// create a job and use it to add data to the volume
	cmd, service, err := makeIshApp(clus, h0, s.discoverdClient(t), host.ContainerConfig{
		Volumes: []host.VolumeBinding{{
			Target:    "/vol",
			VolumeID:  vol.ID,
			Writeable: true,
		}},
	})
	t.Assert(err, c.IsNil)
	defer cmd.Kill()
	resp, err := runIshCommand(service, "echo 'testcontent' > /vol/alpha ; echo $?")
	t.Assert(err, c.IsNil)
	t.Assert(resp, c.Equals, "0\n")

	// take a snapshot
	snapInfo, err := h0.CreateSnapshot(vol.ID)
	t.Assert(err, c.IsNil)
	defer func() {
		t.Assert(h0.DestroyVolume(snapInfo.ID), c.IsNil)
	}()
	// make a volume on another host to yank the snapshot content into
	vol2, err := h1.CreateVolume("default")
	t.Assert(err, c.IsNil)
	defer func() {
		t.Assert(h1.DestroyVolume(vol2.ID), c.IsNil)
	}()
	// transfer the snapshot to the new volume on the other host
	snapInfo2, err := h1.PullSnapshot(vol2.ID, h0.ID(), snapInfo.ID)
	t.Assert(err, c.IsNil)
	defer func() {
		t.Assert(h1.DestroyVolume(snapInfo2.ID), c.IsNil)
	}()

	// start a job on the other host that mounts and inspects the transmitted volume
	cmd, service, err = makeIshApp(clus, h1, s.discoverdClient(t), host.ContainerConfig{
		Volumes: []host.VolumeBinding{{
			Target:    "/vol",
			VolumeID:  vol2.ID,
			Writeable: false,
		}},
	})
	t.Assert(err, c.IsNil)
	defer cmd.Kill()
	// read data back from the volume
	resp, err = runIshCommand(service, "cat /vol/alpha")
	t.Assert(err, c.IsNil)
	t.Assert(resp, c.Equals, "testcontent\n")
}
Example #14
File: main.go Project: kgrz/flynn
func (c *context) watchHost(h *cluster.Host, ready chan struct{}) {
	if !c.hosts.Add(h.ID()) {
		if ready != nil {
			ready <- struct{}{}
		}
		return
	}
	defer c.hosts.Remove(h.ID())

	g := grohl.NewContext(grohl.Data{"fn": "watchHost", "host.id": h.ID()})

	c.hosts.Set(h.ID(), h)

	g.Log(grohl.Data{"at": "start"})

	ch := make(chan *host.Event)
	h.StreamEvents("all", ch)
	if ready != nil {
		ready <- struct{}{}
	}

	// Call PutJob in a goroutine so we don't block receiving job events whilst potentially
	// making multiple requests to the controller (e.g. if the controller is down).
	//
	// Use a channel (rather than spawning a goroutine per event) so that events are delivered in order.
	jobs := make(chan *ct.Job, 10)
	go func() {
		for job := range jobs {
			putJobAttempts.Run(func() error {
				if err := c.PutJob(job); err != nil {
					g.Log(grohl.Data{"at": "put_job_error", "job.id": job.ID, "state": job.State, "err": err})
					// ignore validation / not found errors
					if httphelper.IsValidationError(err) || err == controller.ErrNotFound {
						return nil
					}
					return err
				}
				g.Log(grohl.Data{"at": "put_job", "job.id": job.ID, "state": job.State})
				return nil
			})
		}
	}()

	for event := range ch {
		meta := event.Job.Job.Metadata
		appID := meta["flynn-controller.app"]
		releaseID := meta["flynn-controller.release"]
		jobType := meta["flynn-controller.type"]

		if appID == "" || releaseID == "" {
			continue
		}

		job := &ct.Job{
			ID:        event.JobID,
			AppID:     appID,
			ReleaseID: releaseID,
			Type:      jobType,
			State:     jobState(event),
			Meta:      jobMetaFromMetadata(meta),
		}
		g.Log(grohl.Data{"at": "event", "job.id": event.JobID, "event": event.Event})
		jobs <- job

		// get a read lock on the mutex to ensure we are not currently
		// syncing with the cluster
		c.mtx.RLock()
		j := c.jobs.Get(h.ID(), event.JobID)
		c.mtx.RUnlock()
		if j == nil {
			continue
		}
		j.startedAt = event.Job.StartedAt

		if event.Event != "error" && event.Event != "stop" {
			continue
		}
		g.Log(grohl.Data{"at": "remove", "job.id": event.JobID, "event": event.Event})

		c.jobs.Remove(h.ID(), event.JobID)
		go func(event *host.Event) {
			c.mtx.RLock()
			j.Formation.RestartJob(jobType, h.ID(), event.JobID)
			c.mtx.RUnlock()
		}(event)
	}
	// TODO: check error/reconnect
}
Example #15
func (f *ClusterFixer) FixPostgres() error {
	f.l.Info("checking postgres")
	service := discoverd.NewService("postgres")
	leader, _ := service.Leader()
	if leader == nil || leader.Addr == "" {
		f.l.Info("no running postgres leader")
		leader = nil
	} else {
		f.l.Info("found running postgres leader")
	}
	instances, _ := service.Instances()
	f.l.Info(fmt.Sprintf("found %d running postgres instances", len(instances)))

	f.l.Info("getting postgres status")
	var status *pgmanager.Status
	if leader != nil && leader.Addr != "" {
		client := pgmanager.NewClient(leader.Addr)
		var err error
		status, err = client.Status()
		if err != nil {
			f.l.Error("error getting status from postgres leader", "error", err)
		}
	}
	if status != nil && status.Postgres.ReadWrite {
		f.l.Info("postgres claims to be read-write")
		return nil
	}

	f.l.Info("getting postgres service metadata")
	meta, err := discoverd.NewService("postgres").GetMeta()
	if err != nil {
		return fmt.Errorf("error getting postgres state from discoverd: %s", err)
	}

	var state pgstate.State
	if err := json.Unmarshal(meta.Data, &state); err != nil {
		return fmt.Errorf("error decoding postgres state: %s", err)
	}
	if state.Primary == nil {
		return fmt.Errorf("no primary in postgres state")
	}

	f.l.Info("getting postgres primary job info", "job.id", state.Primary.Meta["FLYNN_JOB_ID"])
	job, host, err := f.GetJob(state.Primary.Meta["FLYNN_JOB_ID"])
	if err != nil {
		if state.Sync != nil {
			f.l.Error("unable to get primary job info", "error", err)
			f.l.Info("getting postgres sync job info", "job.id", state.Sync.Meta["FLYNN_JOB_ID"])
			job, host, err = f.GetJob(state.Sync.Meta["FLYNN_JOB_ID"])
			if err != nil {
				return fmt.Errorf("unable to get postgres primary or sync job details: %s", err)
			}
		} else {
			return fmt.Errorf("unable to get postgres primary job details: %s", err)
		}
	}

	if leader != nil && state.Singleton {
		return fmt.Errorf("postgres leader is running in singleton mode, unable to fix")
	}

	waitForInstance := func(jobID string) (func() (string, error), error) {
		watchCh := make(chan *discoverd.Event)
		upCh := make(chan string)
		stream, err := service.Watch(watchCh)
		if err != nil {
			return nil, fmt.Errorf("error watching discoverd service: %s", err)
		}
		go func() {
			var current bool
			for event := range watchCh {
				if event.Kind == discoverd.EventKindCurrent {
					current = true
					continue
				}
				if !current || event.Kind != discoverd.EventKindUp {
					continue
				}
				if event.Instance.Meta["FLYNN_JOB_ID"] == jobID {
					upCh <- event.Instance.Addr
				}
			}
		}()
		return func() (string, error) {
			f.l.Info("waiting for postgres instance to start", "job.id", jobID)
			defer stream.Close()
			select {
			case addr := <-upCh:
				return addr, nil
			case <-time.After(time.Minute):
				return "", fmt.Errorf("timed out waiting for postgres instance to come up")
			}
		}, nil
	}

	var wait func() (string, error)
	have := len(instances)
	want := 2
	if state.Singleton {
		want = 1
	}
	if have >= want {
		return fmt.Errorf("already have enough postgres instances, unable to fix")
	}
	f.l.Info("attempting to start missing postgres jobs", "want", want, "have", have)
	if leader == nil {
		// no postgres leader is running, so attempt to start a new primary
		job.ID = cluster.GenerateJobID(host.ID(), "")
		f.FixJobEnv(job)
		f.l.Info("starting postgres primary job", "job.id", job.ID)
		wait, err = waitForInstance(job.ID)
		if err != nil {
			return err
		}
		if err := host.AddJob(job); err != nil {
			return fmt.Errorf("error starting postgres primary job on %s: %s", host.ID(), err)
		}
		have++
	}
	if want > have {
		// if not enough postgres instances, start another
		var secondHost *cluster.Host
		for _, h := range f.hosts {
			if h.ID() != host.ID() {
				secondHost = h
				break
			}
		}
		if secondHost == nil {
			// if there are no other hosts, use the same one we put the primary on
			secondHost = host
		}
		job.ID = cluster.GenerateJobID(secondHost.ID(), "")
		f.FixJobEnv(job)
		f.l.Info("starting second postgres job", "job.id", job.ID)
		if wait == nil {
			wait, err = waitForInstance(job.ID)
			if err != nil {
				return err
			}
		}
		if err := utils.ProvisionVolume(secondHost, job); err != nil {
			return fmt.Errorf("error creating postgres volume on %s: %s", secondHost.ID(), err)
		}
		if err := secondHost.AddJob(job); err != nil {
			return fmt.Errorf("error starting additional postgres job on %s: %s", secondHost.ID(), err)
		}
	}

	if wait != nil {
		addr, err := wait()
		if err != nil {
			return err
		}
		if leader != nil {
			addr = leader.Addr
		}
		f.l.Info("waiting for postgres to come up read-write")
		return pgmanager.NewClient(addr).WaitForReadWrite(5 * time.Minute)
	}
	return nil
}
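Note the ordering both this fixer and Example #17 rely on: waitForInstance subscribes to the discoverd service before AddJob is called, so the up event cannot slip through between starting the job and starting the watch. The skeleton, condensed from the code above into an illustrative helper:

func startAndWait(host *cluster.Host, job *host.Job,
	waitForInstance func(string) (func() (string, error), error)) (string, error) {
	// subscribe before starting the job so the up event can't be missed
	wait, err := waitForInstance(job.ID)
	if err != nil {
		return "", err
	}
	if err := host.AddJob(job); err != nil {
		return "", err
	}
	// block until the instance registers with discoverd (or times out)
	return wait()
}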
Example #16
func killJob(app *ct.App, params martini.Params, client cluster.Host, r ResponseHelper) {
	if err := client.StopJob(params["jobs_id"]); err != nil {
		r.Error(err)
		return
	}
}
Example #17
func (f *ClusterFixer) FixSirenia(svc string) error {
	log := f.l.New("fn", "FixSirenia", "service", svc)

	service := discoverd.NewService(svc)
	instances, _ := service.Instances()
	leader, _ := service.Leader()

	log.Info("getting service metadata")
	meta, err := service.GetMeta()
	if err != nil {
		return fmt.Errorf("error getting sirenia state from discoverd: %s", err)
	}

	var state state.State
	if err := json.Unmarshal(meta.Data, &state); err != nil {
		return fmt.Errorf("error decoding state: %s", err)
	}
	if state.Primary == nil {
		return fmt.Errorf("no primary in sirenia state")
	}

	log.Info("getting primary job info", "job.id", state.Primary.Meta["FLYNN_JOB_ID"])
	primaryJob, primaryHost, err := f.GetJob(state.Primary.Meta["FLYNN_JOB_ID"])
	if err != nil {
		log.Error("unable to get primary job info")
	}
	var syncJob *host.Job
	var syncHost *cluster.Host
	if state.Sync != nil {
		log.Info("getting sync job info", "job.id", state.Sync.Meta["FLYNN_JOB_ID"])
		syncJob, syncHost, err = f.GetJob(state.Sync.Meta["FLYNN_JOB_ID"])
		if err != nil {
			log.Error("unable to get sync job info")
		}
	}

	waitForInstance := func(jobID string) (func() (string, error), error) {
		watchCh := make(chan *discoverd.Event)
		upCh := make(chan string)
		stream, err := service.Watch(watchCh)
		if err != nil {
			return nil, fmt.Errorf("error watching discoverd service: %s", err)
		}
		go func() {
			var current bool
			for event := range watchCh {
				if event.Kind == discoverd.EventKindCurrent {
					current = true
					continue
				}
				if !current || event.Kind != discoverd.EventKindUp {
					continue
				}
				if event.Instance.Meta["FLYNN_JOB_ID"] == jobID {
					upCh <- event.Instance.Addr
				}
			}
		}()
		return func() (string, error) {
			log.Info("waiting for instance to start", "job.id", jobID)
			defer stream.Close()
			select {
			case addr := <-upCh:
				return addr, nil
			case <-time.After(time.Minute):
				return "", fmt.Errorf("timed out waiting for sirenia instance to come up")
			}
		}, nil
	}

	log.Info("terminating unassigned sirenia instances")
outer:
	for _, i := range instances {
		if i.Addr == state.Primary.Addr || (state.Sync != nil && i.Addr == state.Sync.Addr) {
			continue
		}
		for _, a := range state.Async {
			if i.Addr == a.Addr {
				continue outer
			}
		}
		// job not assigned in state, attempt to terminate it
		if jobID, ok := i.Meta["FLYNN_JOB_ID"]; ok {
			hostID, err := cluster.ExtractHostID(jobID)
			if err != nil {
				log.Error("error extracting host id from jobID", "jobID", jobID, "err", err)
			}
			h := f.Host(hostID)
			if h != nil {
				if err := h.StopJob(jobID); err != nil {
					log.Error("error stopping unassigned sirenia job", "jobID", jobID)
				}
			} else {
				log.Error("host not found", "hostID", hostID)
			}
		}
	}

	isRunning := func(addr string) bool {
		for _, i := range instances {
			if i.Addr == addr {
				return true
			}
		}
		return false
	}

	// if the leader isn't currently running then start it using primaryJob/primaryHost
	var wait func() (string, error)
	if !isRunning(state.Primary.Addr) {
		// if we don't have info about the primary job attempt to promote the sync
		if primaryJob == nil {
			if syncJob != nil {
				// set primary job to sync
				primaryJob = syncJob
				primaryHost = syncHost

				// nil out sync job now so we can re-allocate it.
				syncJob = nil
				syncHost = nil
			} else {
				return fmt.Errorf("neither primary or sync job info available")
			}
		}

		primaryJob.ID = cluster.GenerateJobID(primaryHost.ID(), "")
		f.FixJobEnv(primaryJob)
		log.Info("starting primary job", "job.id", primaryJob.ID)
		wait, err = waitForInstance(primaryJob.ID)
		if err != nil {
			return err
		}
		if err := primaryHost.AddJob(primaryJob); err != nil {
			return fmt.Errorf("error starting primary job on %s: %s", primaryHost.ID(), err)
		}
	}
	if !state.Singleton && !isRunning(state.Sync.Addr) {
		if syncHost == nil {
			for _, h := range f.hosts {
				if h.ID() != primaryHost.ID() {
					syncHost = h
					break
				}
			}
			if syncHost == nil {
				// if there are no other hosts, use the same one we put the primary on
				syncHost = primaryHost
			}
		}
		// if we don't have a sync job then copy the primary job
		// and provision a new volume
		if syncJob == nil {
			syncJob = primaryJob
			vol := &ct.VolumeReq{Path: "/data"}
			if _, err := utils.ProvisionVolume(vol, syncHost, syncJob); err != nil {
				return fmt.Errorf("error creating volume on %s: %s", syncHost.ID(), err)
			}
		}
		syncJob.ID = cluster.GenerateJobID(syncHost.ID(), "")
		f.FixJobEnv(syncJob)
		log.Info("starting sync job", "job.id", syncJob.ID)
		if wait == nil {
			wait, err = waitForInstance(syncJob.ID)
			if err != nil {
				return err
			}
		}
		if err := syncHost.AddJob(syncJob); err != nil {
			return fmt.Errorf("error starting additional job on %s: %s", syncHost.ID(), err)
		}
	}

	if wait != nil {
		addr, err := wait()
		if err != nil {
			return err
		}
		if leader != nil && leader.Addr != "" {
			addr = leader.Addr
		}
		log.Info("waiting for cluster to come up read-write", "addr", addr)
		return sirenia.NewClient(addr).WaitForReadWrite(5 * time.Minute)
	}
	return nil
}