/*
Make an 'ish' application on the given host, returning it when it has
registered readiness with discoverd.

User will want to defer cmd.Kill() to clean up.
*/
func makeIshApp(cluster *cluster.Client, h *cluster.Host, dc *discoverd.Client, extraConfig host.ContainerConfig) (*exec.Cmd, *discoverd.Instance, error) {
	// pick a unique string to use as service name so this works with concurrent tests.
	serviceName := "ish-service-" + random.String(6)

	// run a job that accepts tcp connections and performs tasks we ask of it in its container
	cmd := exec.JobUsingCluster(cluster, exec.DockerImage(imageURIs["test-apps"]), &host.Job{
		Config: host.ContainerConfig{
			Args:  []string{"/bin/ish"},
			Ports: []host.Port{{Proto: "tcp"}},
			Env: map[string]string{
				"NAME": serviceName,
			},
		}.Merge(extraConfig),
	})
	cmd.HostID = h.ID()
	if err := cmd.Start(); err != nil {
		return nil, nil, err
	}

	// wait for the job to heartbeat and return its address
	services, err := dc.Instances(serviceName, time.Second*100)
	if err != nil {
		cmd.Kill()
		return nil, nil, err
	}
	if len(services) != 1 {
		cmd.Kill()
		return nil, nil, fmt.Errorf("test setup: expected exactly one service instance, got %d", len(services))
	}

	return cmd, services[0], nil
}
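// exampleIshAppUsage is a hedged usage sketch for makeIshApp (not from the
// source): it mirrors how the volume test below calls the helper. The host h,
// cluster client clus, and the shell command are placeholders; runIshCommand
// is the test helper used elsewhere in this suite.
func exampleIshAppUsage(s *VolumeSuite, t *c.C, clus *cluster.Client, h *cluster.Host) {
	cmd, service, err := makeIshApp(clus, h, s.discoverdClient(t), host.ContainerConfig{})
	t.Assert(err, c.IsNil)
	// as the doc comment above advises, always clean the job up
	defer cmd.Kill()

	// the instance has registered with discoverd, so we can send it work
	resp, err := runIshCommand(service, "echo hello")
	t.Assert(err, c.IsNil)
	t.Assert(resp, c.Equals, "hello\n")
}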
func runPs(args *docopt.Args, client cluster.Host) error {
	all, err := client.ListJobs()
	if err != nil {
		return fmt.Errorf("could not get local jobs: %s", err)
	}
	jobs := make(sortJobs, 0, len(all))
	for _, job := range all {
		if !args.Bool["-a"] && !args.Bool["--all"] && job.Status != host.StatusStarting && job.Status != host.StatusRunning {
			continue
		}
		jobs = append(jobs, job)
	}
	sort.Sort(sort.Reverse(jobs))

	if args.Bool["-q"] || args.Bool["--quiet"] {
		for _, job := range jobs {
			fmt.Println(job.Job.ID)
		}
		return nil
	}

	w := tabwriter.NewWriter(os.Stdout, 1, 2, 2, ' ', 0)
	defer w.Flush()
	fmt.Fprintln(w, "JOB ID\tSTATE\tSTARTED\tCONTROLLER APP\tCONTROLLER TYPE")
	for _, job := range jobs {
		fmt.Fprintf(w, "%s\t%s\t%s ago\t%s\t%s\n",
			job.Job.ID,
			job.Status,
			units.HumanDuration(time.Now().UTC().Sub(job.StartedAt)),
			job.Job.Metadata["flynn-controller.app_name"],
			job.Job.Metadata["flynn-controller.type"])
	}
	return nil
}
func (s *HostSuite) TestUpdateTags(t *c.C) {
	events := make(chan *discoverd.Event)
	stream, err := s.discoverdClient(t).Service("flynn-host").Watch(events)
	t.Assert(err, c.IsNil)
	defer stream.Close()

	nextEvent := func() *discoverd.Event {
		select {
		case e, ok := <-events:
			if !ok {
				t.Fatal("unexpected close of discoverd stream")
			}
			return e
		case <-time.After(10 * time.Second):
			t.Fatal("timed out waiting for discoverd event")
		}
		return nil
	}

	var client *cluster.Host
	for {
		e := nextEvent()
		if e.Kind == discoverd.EventKindUp && client == nil {
			client = cluster.NewHost(e.Instance.Meta["id"], e.Instance.Addr, nil, nil)
		}
		if e.Kind == discoverd.EventKindCurrent {
			break
		}
	}
	if client == nil {
		t.Fatal("did not initialize flynn-host client")
	}

	t.Assert(client.UpdateTags(map[string]string{"foo": "bar"}), c.IsNil)

	var meta map[string]string
	for {
		e := nextEvent()
		if e.Kind == discoverd.EventKindUpdate && e.Instance.Meta["id"] == client.ID() {
			meta = e.Instance.Meta
			break
		}
	}
	t.Assert(meta["tag:foo"], c.Equals, "bar")

	// setting to empty string should delete the tag
	t.Assert(client.UpdateTags(map[string]string{"foo": ""}), c.IsNil)

	for {
		e := nextEvent()
		if e.Kind == discoverd.EventKindUpdate && e.Instance.Meta["id"] == client.ID() {
			meta = e.Instance.Meta
			break
		}
	}
	if _, ok := meta["tag:foo"]; ok {
		t.Fatal("expected tag to be deleted but is still present")
	}
}
func jobLog(req *http.Request, app *ct.App, params martini.Params, hc cluster.Host, w http.ResponseWriter, r ResponseHelper) {
	attachReq := &host.AttachReq{
		JobID: params["jobs_id"],
		Flags: host.AttachFlagStdout | host.AttachFlagStderr | host.AttachFlagLogs,
	}
	tail := req.FormValue("tail") != ""
	if tail {
		attachReq.Flags |= host.AttachFlagStream
	}
	wait := req.FormValue("wait") != ""
	attachClient, err := hc.Attach(attachReq, wait)
	if err != nil {
		if err == cluster.ErrWouldWait {
			w.WriteHeader(404)
		} else {
			r.Error(err)
		}
		return
	}
	if cn, ok := w.(http.CloseNotifier); ok {
		go func() {
			<-cn.CloseNotify()
			attachClient.Close()
		}()
	} else {
		defer attachClient.Close()
	}

	sse := strings.Contains(req.Header.Get("Accept"), "text/event-stream")
	if sse {
		w.Header().Set("Content-Type", "text/event-stream; charset=utf-8")
	} else {
		w.Header().Set("Content-Type", "application/vnd.flynn.attach")
	}
	w.WriteHeader(200)
	// Send headers right away if tailing
	if wf, ok := w.(http.Flusher); ok && tail {
		wf.Flush()
	}

	fw := flushWriter{w, tail}
	if sse {
		ssew := NewSSELogWriter(w)
		exit, err := attachClient.Receive(flushWriter{ssew.Stream("stdout"), tail}, flushWriter{ssew.Stream("stderr"), tail})
		if err != nil {
			fw.Write([]byte("event: error\ndata: {}\n\n"))
			return
		}
		if tail {
			fmt.Fprintf(fw, "event: exit\ndata: {\"status\": %d}\n\n", exit)
			return
		}
		fw.Write([]byte("event: eof\ndata: {}\n\n"))
	} else {
		io.Copy(fw, attachClient.Conn())
	}
}
func ProvisionVolume(h *cluster.Host, job *host.Job) error {
	vol, err := h.CreateVolume("default")
	if err != nil {
		return err
	}
	job.Config.Volumes = []host.VolumeBinding{{
		Target:    "/data",
		VolumeID:  vol.ID,
		Writeable: true,
	}}
	return nil
}
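// startJobWithVolume is a hypothetical caller of ProvisionVolume (a sketch,
// not from the source): the volume must be provisioned before the job is
// submitted so the /data binding is already part of job.Config when the host
// starts the container. AddJob is the same call used by the bootstrap and
// fixer code in this section.
func startJobWithVolume(h *cluster.Host, job *host.Job) error {
	if err := ProvisionVolume(h, job); err != nil {
		return err // volume creation failed; the job was never submitted
	}
	// job.Config.Volumes now holds the binding for the freshly created volume
	return h.AddJob(job)
}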
// hostRaftStatus reports whether the given host is a discoverd raft peer (and
// whether it is the current leader); hosts not in the peer list act as proxies.
func hostRaftStatus(host *cluster.Host, peers []string, leader string) (raftStatus string) {
	raftStatus = "proxy"
	ip, _, _ := net.SplitHostPort(host.Addr())
	// discoverd raft addresses are the host IP on port 1111
	discIp := ip + ":1111"
	for _, addr := range peers {
		if addr == discIp {
			raftStatus = "peer"
			if leader == discIp {
				raftStatus = raftStatus + " (leader)"
			}
			break
		}
	}
	return
}
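// exampleRaftStatus is an illustrative call of hostRaftStatus (a sketch, not
// from the source): the peer and leader addresses would normally come from the
// discoverd raft status; the literal IPs here are placeholders.
func exampleRaftStatus(h *cluster.Host) string {
	peers := []string{"192.0.2.10:1111", "192.0.2.11:1111"}
	leader := "192.0.2.10:1111"
	// returns "peer (leader)", "peer", or "proxy"
	return hostRaftStatus(h, peers, leader)
}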
func runStop(args *docopt.Args, client cluster.Host) error {
	success := true
	for _, id := range args.All["ID"].([]string) {
		if err := client.StopJob(id); err != nil {
			fmt.Printf("could not stop job %s: %s\n", id, err)
			success = false
			continue
		}
		fmt.Println(id, "stopped")
	}
	if !success {
		return errors.New("could not stop all jobs")
	}
	return nil
}
func jobLog(req *http.Request, app *ct.App, params martini.Params, hc cluster.Host, w http.ResponseWriter, r ResponseHelper) {
	attachReq := &host.AttachReq{
		JobID: params["jobs_id"],
		Flags: host.AttachFlagStdout | host.AttachFlagStderr | host.AttachFlagLogs,
	}
	tail := req.FormValue("tail") != ""
	if tail {
		attachReq.Flags |= host.AttachFlagStream
	}
	wait := req.FormValue("wait") != ""
	attachClient, err := hc.Attach(attachReq, wait)
	if err != nil {
		if err == cluster.ErrWouldWait {
			w.WriteHeader(404)
		} else {
			r.Error(err)
		}
		return
	}
	defer attachClient.Close()

	sse := strings.Contains(req.Header.Get("Accept"), "text/event-stream")
	if sse {
		w.Header().Set("Content-Type", "text/event-stream; charset=utf-8")
	} else {
		w.Header().Set("Content-Type", "application/vnd.flynn.attach")
	}
	w.WriteHeader(200)
	// Send headers right away if tailing
	if wf, ok := w.(http.Flusher); ok && tail {
		wf.Flush()
	}

	// TODO: use http.CloseNotifier to clean up when client disconnects
	if sse {
		ssew := NewSSELogWriter(w)
		attachClient.Receive(flushWriter{ssew.Stream("stdout"), tail}, flushWriter{ssew.Stream("stderr"), tail})
		// TODO: include exit code here if tailing
		flushWriter{w, tail}.Write([]byte("event: eof\ndata: {}\n\n"))
	} else {
		io.Copy(flushWriter{w, tail}, attachClient.Conn())
	}
}
func runLog(args *docopt.Args, client cluster.Host) error {
	attachReq := &host.AttachReq{
		JobID: args.String["ID"],
		Flags: host.AttachFlagStdout | host.AttachFlagStderr | host.AttachFlagLogs,
	}
	if args.Bool["-f"] || args.Bool["--follow"] {
		attachReq.Flags |= host.AttachFlagStream
	}
	attachClient, err := client.Attach(attachReq, false)
	if err != nil {
		if err == cluster.ErrWouldWait {
			return fmt.Errorf("no such job")
		}
		return err
	}
	defer attachClient.Close()
	attachClient.Receive(os.Stdout, os.Stderr)
	return nil
}
func startJob(s *State, hc *cluster.Host, job *host.Job) (*Job, error) {
	data := &Job{HostID: hc.ID(), JobID: job.ID}

	jobStatus := make(chan error)
	events := make(chan *host.Event)
	stream, err := hc.StreamEvents(data.JobID, events)
	if err != nil {
		return nil, err
	}
	go func() {
		defer stream.Close()
		for e := range events {
			switch e.Event {
			case "start", "stop":
				jobStatus <- nil
				return
			case "error":
				job, err := hc.GetJob(data.JobID)
				if err != nil {
					jobStatus <- err
					return
				}
				if job.Error == nil {
					jobStatus <- fmt.Errorf("bootstrap: unknown error from host")
					return
				}
				jobStatus <- fmt.Errorf("bootstrap: host error while launching job: %q", *job.Error)
				return
			default:
			}
		}
		jobStatus <- fmt.Errorf("bootstrap: host job stream disconnected unexpectedly: %q", stream.Err())
	}()

	if err := hc.AddJob(job); err != nil {
		return nil, err
	}

	return data, <-jobStatus
}
func startJob(s *State, hc *cluster.Host, job *host.Job) error {
	jobStatus := make(chan error)
	events := make(chan *host.Event)
	stream, err := hc.StreamEvents(job.ID, events)
	if err != nil {
		return err
	}
	go func() {
		defer stream.Close()
	loop:
		for {
			select {
			case e, ok := <-events:
				if !ok {
					break loop
				}
				switch e.Event {
				case "start", "stop":
					jobStatus <- nil
					return
				case "error":
					job, err := hc.GetJob(job.ID)
					if err != nil {
						jobStatus <- err
						return
					}
					if job.Error == nil {
						jobStatus <- fmt.Errorf("bootstrap: unknown error from host")
						return
					}
					jobStatus <- fmt.Errorf("bootstrap: host error while launching job: %q", *job.Error)
					return
				default:
				}
			case <-time.After(30 * time.Second):
				jobStatus <- errors.New("bootstrap: timed out waiting for job event")
				return
			}
		}
		jobStatus <- fmt.Errorf("bootstrap: host job stream disconnected unexpectedly: %q", stream.Err())
	}()

	if err := hc.AddJob(job); err != nil {
		return err
	}

	return <-jobStatus
}
// JobUsingHost returns a Cmd for the given artifact and job, pinned to the
// given host via its host ID.
func JobUsingHost(h *cluster.Host, artifact host.Artifact, job *host.Job) *Cmd {
	command := Job(artifact, job)
	command.HostID = h.ID()
	command.host = h
	return command
}
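// startPinnedJob is a hedged usage sketch for JobUsingHost (not from the
// source), written from an external caller's perspective: pin a one-off job to
// a specific host and start it, mirroring the Start/Kill pattern used by
// makeIshApp above. The image URI and args are placeholders.
func startPinnedJob(h *cluster.Host) (*exec.Cmd, error) {
	cmd := exec.JobUsingHost(h, exec.DockerImage("example/worker"), &host.Job{
		Config: host.ContainerConfig{Args: []string{"/bin/worker"}},
	})
	if err := cmd.Start(); err != nil {
		return nil, err
	}
	// the caller is responsible for cmd.Kill() when done, as with makeIshApp
	return cmd, nil
}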
func (s *VolumeSuite) doVolumeTransmitAPI(h0, h1 *cluster.Host, t *c.C) {
	clus := s.clusterClient(t)

	// create a volume!
	vol, err := h0.CreateVolume("default")
	t.Assert(err, c.IsNil)
	defer func() {
		t.Assert(h0.DestroyVolume(vol.ID), c.IsNil)
	}()

	// create a job and use it to add data to the volume
	cmd, service, err := makeIshApp(clus, h0, s.discoverdClient(t), host.ContainerConfig{
		Volumes: []host.VolumeBinding{{
			Target:    "/vol",
			VolumeID:  vol.ID,
			Writeable: true,
		}},
	})
	t.Assert(err, c.IsNil)
	defer cmd.Kill()
	resp, err := runIshCommand(service, "echo 'testcontent' > /vol/alpha ; echo $?")
	t.Assert(err, c.IsNil)
	t.Assert(resp, c.Equals, "0\n")

	// take a snapshot
	snapInfo, err := h0.CreateSnapshot(vol.ID)
	t.Assert(err, c.IsNil)
	defer func() {
		t.Assert(h0.DestroyVolume(snapInfo.ID), c.IsNil)
	}()

	// make a volume on another host to yank the snapshot content into
	vol2, err := h1.CreateVolume("default")
	t.Assert(err, c.IsNil)
	defer func() {
		t.Assert(h1.DestroyVolume(vol2.ID), c.IsNil)
	}()

	// transfer the snapshot to the new volume on the other host
	snapInfo2, err := h1.PullSnapshot(vol2.ID, h0.ID(), snapInfo.ID)
	t.Assert(err, c.IsNil)
	defer func() {
		t.Assert(h1.DestroyVolume(snapInfo2.ID), c.IsNil)
	}()

	// start a job on the other host that mounts and inspects the transmitted volume
	cmd, service, err = makeIshApp(clus, h1, s.discoverdClient(t), host.ContainerConfig{
		Volumes: []host.VolumeBinding{{
			Target:    "/vol",
			VolumeID:  vol2.ID,
			Writeable: false,
		}},
	})
	t.Assert(err, c.IsNil)
	defer cmd.Kill()

	// read data back from the volume
	resp, err = runIshCommand(service, "cat /vol/alpha")
	t.Assert(err, c.IsNil)
	t.Assert(resp, c.Equals, "testcontent\n")
}
func (c *context) watchHost(h *cluster.Host, ready chan struct{}) {
	if !c.hosts.Add(h.ID()) {
		if ready != nil {
			ready <- struct{}{}
		}
		return
	}
	defer c.hosts.Remove(h.ID())

	g := grohl.NewContext(grohl.Data{"fn": "watchHost", "host.id": h.ID()})

	c.hosts.Set(h.ID(), h)

	g.Log(grohl.Data{"at": "start"})

	ch := make(chan *host.Event)
	h.StreamEvents("all", ch)
	if ready != nil {
		ready <- struct{}{}
	}

	// Call PutJob in a goroutine so we don't block receiving job events whilst potentially
	// making multiple requests to the controller (e.g. if the controller is down).
	//
	// Use a channel (rather than spawning a goroutine per event) so that events are delivered in order.
	jobs := make(chan *ct.Job, 10)
	go func() {
		for job := range jobs {
			putJobAttempts.Run(func() error {
				if err := c.PutJob(job); err != nil {
					g.Log(grohl.Data{"at": "put_job_error", "job.id": job.ID, "state": job.State, "err": err})
					// ignore validation / not found errors
					if httphelper.IsValidationError(err) || err == controller.ErrNotFound {
						return nil
					}
					return err
				}
				g.Log(grohl.Data{"at": "put_job", "job.id": job.ID, "state": job.State})
				return nil
			})
		}
	}()

	for event := range ch {
		meta := event.Job.Job.Metadata
		appID := meta["flynn-controller.app"]
		releaseID := meta["flynn-controller.release"]
		jobType := meta["flynn-controller.type"]

		if appID == "" || releaseID == "" {
			continue
		}

		job := &ct.Job{
			ID:        event.JobID,
			AppID:     appID,
			ReleaseID: releaseID,
			Type:      jobType,
			State:     jobState(event),
			Meta:      jobMetaFromMetadata(meta),
		}
		g.Log(grohl.Data{"at": "event", "job.id": event.JobID, "event": event.Event})
		jobs <- job

		// get a read lock on the mutex to ensure we are not currently
		// syncing with the cluster
		c.mtx.RLock()
		j := c.jobs.Get(h.ID(), event.JobID)
		c.mtx.RUnlock()
		if j == nil {
			continue
		}
		j.startedAt = event.Job.StartedAt

		if event.Event != "error" && event.Event != "stop" {
			continue
		}
		g.Log(grohl.Data{"at": "remove", "job.id": event.JobID, "event": event.Event})
		c.jobs.Remove(h.ID(), event.JobID)
		go func(event *host.Event) {
			c.mtx.RLock()
			j.Formation.RestartJob(jobType, h.ID(), event.JobID)
			c.mtx.RUnlock()
		}(event)
	}
	// TODO: check error/reconnect
}
func (f *ClusterFixer) FixPostgres() error {
	f.l.Info("checking postgres")
	service := discoverd.NewService("postgres")
	leader, _ := service.Leader()
	if leader == nil || leader.Addr == "" {
		f.l.Info("no running postgres leader")
		leader = nil
	} else {
		f.l.Info("found running postgres leader")
	}
	instances, _ := service.Instances()
	f.l.Info(fmt.Sprintf("found %d running postgres instances", len(instances)))

	f.l.Info("getting postgres status")
	var status *pgmanager.Status
	if leader != nil && leader.Addr != "" {
		client := pgmanager.NewClient(leader.Addr)
		var err error
		status, err = client.Status()
		if err != nil {
			f.l.Error("error getting status from postgres leader", "error", err)
		}
	}
	if status != nil && status.Postgres.ReadWrite {
		f.l.Info("postgres claims to be read-write")
		return nil
	}

	f.l.Info("getting postgres service metadata")
	meta, err := discoverd.NewService("postgres").GetMeta()
	if err != nil {
		return fmt.Errorf("error getting postgres state from discoverd: %s", err)
	}
	var state pgstate.State
	if err := json.Unmarshal(meta.Data, &state); err != nil {
		return fmt.Errorf("error decoding postgres state: %s", err)
	}
	if state.Primary == nil {
		return fmt.Errorf("no primary in postgres state")
	}

	f.l.Info("getting postgres primary job info", "job.id", state.Primary.Meta["FLYNN_JOB_ID"])
	job, host, err := f.GetJob(state.Primary.Meta["FLYNN_JOB_ID"])
	if err != nil {
		if state.Sync != nil {
			f.l.Error("unable to get primary job info", "error", err)
			f.l.Info("getting postgres sync job info", "job.id", state.Sync.Meta["FLYNN_JOB_ID"])
			job, host, err = f.GetJob(state.Sync.Meta["FLYNN_JOB_ID"])
			if err != nil {
				return fmt.Errorf("unable to get postgres primary or sync job details: %s", err)
			}
		} else {
			return fmt.Errorf("unable to get postgres primary job details: %s", err)
		}
	}

	if leader != nil && state.Singleton {
		return fmt.Errorf("postgres leader is running in singleton mode, unable to fix")
	}

	waitForInstance := func(jobID string) (func() (string, error), error) {
		watchCh := make(chan *discoverd.Event)
		upCh := make(chan string)
		stream, err := service.Watch(watchCh)
		if err != nil {
			return nil, fmt.Errorf("error watching discoverd service: %s", err)
		}
		go func() {
			var current bool
			for event := range watchCh {
				if event.Kind == discoverd.EventKindCurrent {
					current = true
					continue
				}
				if !current || event.Kind != discoverd.EventKindUp {
					continue
				}
				if event.Instance.Meta["FLYNN_JOB_ID"] == jobID {
					upCh <- event.Instance.Addr
				}
			}
		}()
		return func() (string, error) {
			f.l.Info("waiting for postgres instance to start", "job.id", jobID)
			defer stream.Close()
			select {
			case addr := <-upCh:
				return addr, nil
			case <-time.After(time.Minute):
				return "", fmt.Errorf("timed out waiting for postgres instance to come up")
			}
		}, nil
	}

	var wait func() (string, error)
	have := len(instances)
	want := 2
	if state.Singleton {
		want = 1
	}
	if have >= want {
		return fmt.Errorf("already have enough postgres instances, unable to fix")
	}
	f.l.Info("attempting to start missing postgres jobs", "want", want, "have", have)
	if leader == nil {
		// if no postgres, attempt to start
		job.ID = cluster.GenerateJobID(host.ID(), "")
		f.FixJobEnv(job)
		f.l.Info("starting postgres primary job", "job.id", job.ID)
		wait, err = waitForInstance(job.ID)
		if err != nil {
			return err
		}
		if err := host.AddJob(job); err != nil {
			return fmt.Errorf("error starting postgres primary job on %s: %s", host.ID(), err)
		}
		have++
	}
	if want > have {
		// if not enough postgres instances, start another
		var secondHost *cluster.Host
		for _, h := range f.hosts {
			if h.ID() != host.ID() {
				secondHost = h
				break
			}
		}
		if secondHost == nil {
			// if there are no other hosts, use the same one we put the primary on
			secondHost = host
		}
		job.ID = cluster.GenerateJobID(secondHost.ID(), "")
		f.FixJobEnv(job)
		f.l.Info("starting second postgres job", "job.id", job.ID)
		if wait == nil {
			wait, err = waitForInstance(job.ID)
			if err != nil {
				return err
			}
		}
		if err := utils.ProvisionVolume(secondHost, job); err != nil {
			return fmt.Errorf("error creating postgres volume on %s: %s", secondHost.ID(), err)
		}
		if err := secondHost.AddJob(job); err != nil {
			return fmt.Errorf("error starting additional postgres job on %s: %s", secondHost.ID(), err)
		}
	}

	if wait != nil {
		addr, err := wait()
		if err != nil {
			return err
		}
		if leader != nil {
			addr = leader.Addr
		}
		f.l.Info("waiting for postgres to come up read-write")
		return pgmanager.NewClient(addr).WaitForReadWrite(5 * time.Minute)
	}
	return nil
}
func killJob(app *ct.App, params martini.Params, client cluster.Host, r ResponseHelper) {
	if err := client.StopJob(params["jobs_id"]); err != nil {
		r.Error(err)
		return
	}
}
func (f *ClusterFixer) FixSirenia(svc string) error {
	log := f.l.New("fn", "FixSirenia", "service", svc)

	service := discoverd.NewService(svc)
	instances, _ := service.Instances()
	leader, _ := service.Leader()

	log.Info("getting service metadata")
	meta, err := service.GetMeta()
	if err != nil {
		return fmt.Errorf("error getting sirenia state from discoverd: %s", err)
	}

	var state state.State
	if err := json.Unmarshal(meta.Data, &state); err != nil {
		return fmt.Errorf("error decoding state: %s", err)
	}
	if state.Primary == nil {
		return fmt.Errorf("no primary in sirenia state")
	}

	log.Info("getting primary job info", "job.id", state.Primary.Meta["FLYNN_JOB_ID"])
	primaryJob, primaryHost, err := f.GetJob(state.Primary.Meta["FLYNN_JOB_ID"])
	if err != nil {
		log.Error("unable to get primary job info")
	}
	var syncJob *host.Job
	var syncHost *cluster.Host
	if state.Sync != nil {
		log.Info("getting sync job info", "job.id", state.Sync.Meta["FLYNN_JOB_ID"])
		syncJob, syncHost, err = f.GetJob(state.Sync.Meta["FLYNN_JOB_ID"])
		if err != nil {
			log.Error("unable to get sync job info")
		}
	}

	waitForInstance := func(jobID string) (func() (string, error), error) {
		watchCh := make(chan *discoverd.Event)
		upCh := make(chan string)
		stream, err := service.Watch(watchCh)
		if err != nil {
			return nil, fmt.Errorf("error watching discoverd service: %s", err)
		}
		go func() {
			var current bool
			for event := range watchCh {
				if event.Kind == discoverd.EventKindCurrent {
					current = true
					continue
				}
				if !current || event.Kind != discoverd.EventKindUp {
					continue
				}
				if event.Instance.Meta["FLYNN_JOB_ID"] == jobID {
					upCh <- event.Instance.Addr
				}
			}
		}()
		return func() (string, error) {
			log.Info("waiting for instance to start", "job.id", jobID)
			defer stream.Close()
			select {
			case addr := <-upCh:
				return addr, nil
			case <-time.After(time.Minute):
				return "", fmt.Errorf("timed out waiting for sirenia instance to come up")
			}
		}, nil
	}

	log.Info("terminating unassigned sirenia instances")
outer:
	for _, i := range instances {
		if i.Addr == state.Primary.Addr || (state.Sync != nil && i.Addr == state.Sync.Addr) {
			continue
		}
		for _, a := range state.Async {
			if i.Addr == a.Addr {
				continue outer
			}
		}
		// job not assigned in state, attempt to terminate it
		if jobID, ok := i.Meta["FLYNN_JOB_ID"]; ok {
			hostID, err := cluster.ExtractHostID(jobID)
			if err != nil {
				log.Error("error extracting host id from jobID", "jobID", jobID, "err", err)
			}
			h := f.Host(hostID)
			if h != nil {
				if err := h.StopJob(jobID); err != nil {
					log.Error("error stopping unassigned sirenia job", "jobID", jobID)
				}
			} else {
				log.Error("host not found", "hostID", hostID)
			}
		}
	}

	isRunning := func(addr string) bool {
		for _, i := range instances {
			if i.Addr == addr {
				return true
			}
		}
		return false
	}

	// if the leader isn't currently running then start it using primaryJob/primaryHost
	var wait func() (string, error)
	if !isRunning(state.Primary.Addr) {
		// if we don't have info about the primary job attempt to promote the sync
		if primaryJob == nil {
			if syncJob != nil {
				// set primary job to sync
				primaryJob = syncJob
				primaryHost = syncHost

				// nil out sync job now so we can re-allocate it.
				syncJob = nil
				syncHost = nil
			} else {
				return fmt.Errorf("neither primary or sync job info available")
			}
		}

		primaryJob.ID = cluster.GenerateJobID(primaryHost.ID(), "")
		f.FixJobEnv(primaryJob)
		log.Info("starting primary job", "job.id", primaryJob.ID)
		wait, err = waitForInstance(primaryJob.ID)
		if err != nil {
			return err
		}
		if err := primaryHost.AddJob(primaryJob); err != nil {
			return fmt.Errorf("error starting primary job on %s: %s", primaryHost.ID(), err)
		}
	}

	if !state.Singleton && !isRunning(state.Sync.Addr) {
		if syncHost == nil {
			for _, h := range f.hosts {
				if h.ID() != primaryHost.ID() {
					syncHost = h
					break
				}
			}
			if syncHost == nil {
				// if there are no other hosts, use the same one we put the primary on
				syncHost = primaryHost
			}
		}

		// if we don't have a sync job then copy the primary job
		// and provision a new volume
		if syncJob == nil {
			syncJob = primaryJob
			vol := &ct.VolumeReq{Path: "/data"}
			if _, err := utils.ProvisionVolume(vol, syncHost, syncJob); err != nil {
				return fmt.Errorf("error creating volume on %s: %s", syncHost.ID(), err)
			}
		}
		syncJob.ID = cluster.GenerateJobID(syncHost.ID(), "")
		f.FixJobEnv(syncJob)
		log.Info("starting sync job", "job.id", syncJob.ID)
		if wait == nil {
			wait, err = waitForInstance(syncJob.ID)
			if err != nil {
				return err
			}
		}
		if err := syncHost.AddJob(syncJob); err != nil {
			return fmt.Errorf("error starting additional job on %s: %s", syncHost.ID(), err)
		}
	}

	if wait != nil {
		addr, err := wait()
		if err != nil {
			return err
		}
		if leader != nil && leader.Addr != "" {
			addr = leader.Addr
		}
		log.Info("waiting for cluster to come up read-write", "addr", addr)
		return sirenia.NewClient(addr).WaitForReadWrite(5 * time.Minute)
	}
	return nil
}