func (r *Runner) getBuildLog(w http.ResponseWriter, req *http.Request, ps httprouter.Params) {
	id := ps.ByName("build")
	b := &Build{}
	if err := r.db.View(func(tx *bolt.Tx) error {
		v := tx.Bucket(dbBucket).Get([]byte(id))
		if err := json.Unmarshal(v, b); err != nil {
			return fmt.Errorf("could not decode build %s: %s", v, err)
		}
		return nil
	}); err != nil {
		http.Error(w, err.Error(), 500)
		return
	}

	// if it's a V1 build, redirect to the log in S3
	if b.Version == BuildVersion1 {
		http.Redirect(w, req, b.LogURL, http.StatusMovedPermanently)
		return
	}

	// if it's a browser, serve the build-log.html template
	if strings.Contains(req.Header.Get("Accept"), "text/html") {
		tpl, err := template.ParseFiles(path.Join(args.AssetsDir, "build-log.html"))
		if err != nil {
			http.Error(w, err.Error(), 500)
			return
		}
		w.Header().Set("Content-Type", "text/html; charset=utf-8")
		if err := tpl.Execute(w, b); err != nil {
			log.Printf("error executing build-log template: %s", err)
		}
		return
	}

	// serve the build log as either an SSE or plain text stream
	ch := make(chan string)
	stream, err := getBuildLogStream(b, ch)
	if err != nil {
		http.Error(w, err.Error(), 500)
		return
	}
	if cn, ok := w.(http.CloseNotifier); ok {
		go func() {
			<-cn.CloseNotify()
			stream.Close()
		}()
	} else {
		defer stream.Close()
	}
	if strings.Contains(req.Header.Get("Accept"), "text/event-stream") {
		sse.ServeStream(w, ch, nil)
	} else {
		servePlainStream(w, ch)
	}
	if err := stream.Err(); err != nil {
		log.Println("error serving build log stream:", err)
	}
}
// DeleteApp deletes an app.
func (c *Client) DeleteApp(appID string) (*ct.AppDeletion, error) {
	events := make(chan *ct.AppEvent)
	stream, err := c.ResumingStream("GET", fmt.Sprintf("/apps/%s/events?object_type=%s", appID, ct.EventTypeAppDeletion), events)
	if err != nil {
		return nil, err
	}
	defer stream.Close()

	if err := c.Delete(fmt.Sprintf("/apps/%s", appID)); err != nil {
		return nil, err
	}

	select {
	case event, ok := <-events:
		if !ok {
			return nil, stream.Err()
		}
		var e ct.AppDeletionEvent
		if err := json.Unmarshal(event.Data, &e); err != nil {
			return nil, err
		}
		if e.Error != "" {
			return nil, errors.New(e.Error)
		}
		return e.AppDeletion, nil
	case <-time.After(60 * time.Second):
		return nil, errors.New("timed out waiting for app deletion")
	}
}
// DeleteRelease deletes a release and any associated file artifacts.
func (c *Client) DeleteRelease(appID, releaseID string) (*ct.ReleaseDeletion, error) {
	events := make(chan *ct.Event)
	stream, err := c.StreamEvents(ct.StreamEventsOptions{
		AppID:       appID,
		ObjectID:    releaseID,
		ObjectTypes: []ct.EventType{ct.EventTypeReleaseDeletion},
	}, events)
	if err != nil {
		return nil, err
	}
	defer stream.Close()

	if err := c.Delete(fmt.Sprintf("/apps/%s/releases/%s", appID, releaseID), nil); err != nil {
		return nil, err
	}

	select {
	case event, ok := <-events:
		if !ok {
			return nil, stream.Err()
		}
		var e ct.ReleaseDeletionEvent
		if err := json.Unmarshal(event.Data, &e); err != nil {
			return nil, err
		}
		if e.Error != "" {
			return nil, errors.New(e.Error)
		}
		return e.ReleaseDeletion, nil
	case <-time.After(60 * time.Second):
		return nil, errors.New("timed out waiting for release deletion")
	}
}
func (s *SchedulerSuite) TestJobMeta(t *c.C) {
	app, release := s.createApp(t)

	events := make(chan *ct.JobEvent)
	stream, err := s.controllerClient(t).StreamJobEvents(app.ID, 0, events)
	t.Assert(err, c.IsNil)
	defer stream.Close()

	// start 1 one-off job
	_, err = s.controllerClient(t).RunJobDetached(app.ID, &ct.NewJob{
		ReleaseID: release.ID,
		Cmd:       []string{"sh", "-c", "while true; do echo one-off-job; sleep 1; done"},
		Meta: map[string]string{
			"foo": "baz",
		},
	})
	t.Assert(err, c.IsNil)
	waitForJobEvents(t, stream, events, jobEvents{"": {"up": 1}})

	list, err := s.controllerClient(t).JobList(app.ID)
	t.Assert(err, c.IsNil)
	t.Assert(list, c.HasLen, 1)
	t.Assert(list[0].Meta, c.DeepEquals, map[string]string{
		"foo": "baz",
	})
}
func (s *HostSuite) TestUpdateTags(t *c.C) {
	events := make(chan *discoverd.Event)
	stream, err := s.discoverdClient(t).Service("flynn-host").Watch(events)
	t.Assert(err, c.IsNil)
	defer stream.Close()

	nextEvent := func() *discoverd.Event {
		select {
		case e, ok := <-events:
			if !ok {
				t.Fatal("unexpected close of discoverd stream")
			}
			return e
		case <-time.After(10 * time.Second):
			t.Fatal("timed out waiting for discoverd event")
		}
		return nil
	}

	var client *cluster.Host
	for {
		e := nextEvent()
		if e.Kind == discoverd.EventKindUp && client == nil {
			client = cluster.NewHost(e.Instance.Meta["id"], e.Instance.Addr, nil, nil)
		}
		if e.Kind == discoverd.EventKindCurrent {
			break
		}
	}
	if client == nil {
		t.Fatal("did not initialize flynn-host client")
	}

	t.Assert(client.UpdateTags(map[string]string{"foo": "bar"}), c.IsNil)

	var meta map[string]string
	for {
		e := nextEvent()
		if e.Kind == discoverd.EventKindUpdate && e.Instance.Meta["id"] == client.ID() {
			meta = e.Instance.Meta
			break
		}
	}
	t.Assert(meta["tag:foo"], c.Equals, "bar")

	// setting to empty string should delete the tag
	t.Assert(client.UpdateTags(map[string]string{"foo": ""}), c.IsNil)

	for {
		e := nextEvent()
		if e.Kind == discoverd.EventKindUpdate && e.Instance.Meta["id"] == client.ID() {
			meta = e.Instance.Meta
			break
		}
	}
	if _, ok := meta["tag:foo"]; ok {
		t.Fatal("expected tag to be deleted but is still present")
	}
}
func (s *HostSuite) TestVolumeDeleteOnStop(t *c.C) {
	hosts, err := s.clusterClient(t).Hosts()
	t.Assert(err, c.IsNil)
	t.Assert(hosts, c.Not(c.HasLen), 0)
	h := hosts[0]

	// stream job events so we can wait for cleanup events
	events := make(chan *host.Event)
	stream, err := h.StreamEvents("all", events)
	t.Assert(err, c.IsNil)
	defer stream.Close()
	waitCleanup := func(jobID string) {
		timeout := time.After(30 * time.Second)
		for {
			select {
			case event := <-events:
				if event.JobID == jobID && event.Event == host.JobEventCleanup {
					return
				}
			case <-timeout:
				t.Fatal("timed out waiting for cleanup event")
			}
		}
	}

	for _, deleteOnStop := range []bool{true, false} {
		job := &host.Job{
			Config: host.ContainerConfig{
				Args:       []string{"sh", "-c", "ls -d /foo"},
				DisableLog: true,
			},
		}

		// provision a volume
		req := &ct.VolumeReq{Path: "/foo", DeleteOnStop: deleteOnStop}
		vol, err := utils.ProvisionVolume(req, h, job)
		t.Assert(err, c.IsNil)
		defer h.DestroyVolume(vol.ID)

		// run the job
		cmd := exec.JobUsingCluster(s.clusterClient(t), s.createArtifact(t, "test-apps"), job)
		cmd.HostID = h.ID()
		out, err := cmd.CombinedOutput()
		t.Assert(err, c.IsNil)
		t.Assert(string(out), c.Equals, "/foo\n")

		// wait for a cleanup event
		waitCleanup(job.ID)

		// check if the volume was deleted or not
		vol, err = h.GetVolume(vol.ID)
		if deleteOnStop {
			t.Assert(hh.IsObjectNotFoundError(err), c.Equals, true)
		} else {
			t.Assert(err, c.IsNil)
		}
	}
}
func main() {
	defer shutdown.Exit()

	grohl.AddContext("app", "controller-scheduler")
	grohl.Log(grohl.Data{"at": "start"})
	go startHTTPServer()

	if period := os.Getenv("BACKOFF_PERIOD"); period != "" {
		var err error
		backoffPeriod, err = time.ParseDuration(period)
		if err != nil {
			shutdown.Fatal(err)
		}
		grohl.Log(grohl.Data{"at": "backoff_period", "period": backoffPeriod.String()})
	}

	cc, err := controller.NewClient("", os.Getenv("AUTH_KEY"))
	if err != nil {
		shutdown.Fatal(err)
	}
	c := newContext(cc, cluster.NewClient())

	c.watchHosts()

	grohl.Log(grohl.Data{"at": "leaderwait"})
	hb, err := discoverd.AddServiceAndRegister("controller-scheduler", ":"+os.Getenv("PORT"))
	if err != nil {
		shutdown.Fatal(err)
	}
	shutdown.BeforeExit(func() { hb.Close() })

	leaders := make(chan *discoverd.Instance)
	stream, err := discoverd.NewService("controller-scheduler").Leaders(leaders)
	if err != nil {
		shutdown.Fatal(err)
	}
	for leader := range leaders {
		if leader.Addr == hb.Addr() {
			break
		}
	}
	if err := stream.Err(); err != nil {
		// TODO: handle discoverd errors
		shutdown.Fatal(err)
	}
	stream.Close()
	// TODO: handle demotion

	grohl.Log(grohl.Data{"at": "leader"})

	// TODO: periodic full cluster sync for anti-entropy
	c.watchFormations()
}
func (h *httpAPI) handleStream(w http.ResponseWriter, params httprouter.Params, kind discoverd.EventKind) {
	ch := make(chan *discoverd.Event, 64) // TODO: figure out how big this buffer should be
	stream := h.Store.Subscribe(params.ByName("service"), true, kind, ch)
	s := sse.NewStream(w, ch, nil)
	s.Serve()
	s.Wait()
	stream.Close()

	if err := stream.Err(); err != nil {
		s.CloseWithError(err)
	}
}
func (s *HostSuite) TestAddFailingJob(t *c.C) {
	// get a host and watch events
	hosts, err := s.clusterClient(t).Hosts()
	t.Assert(err, c.IsNil)
	t.Assert(hosts, c.Not(c.HasLen), 0)
	h := hosts[0]
	jobID := random.UUID()
	events := make(chan *host.Event)
	stream, err := h.StreamEvents(jobID, events)
	t.Assert(err, c.IsNil)
	defer stream.Close()

	// add a job with a non-existent partition
	job := &host.Job{
		ID:         jobID,
		Mountspecs: []*host.Mountspec{{}},
		Partition:  "nonexistent",
	}
	t.Assert(h.AddJob(job), c.IsNil)

	// check we get a create then error event
	actual := make(map[host.JobEventType]*host.Event, 2)
loop:
	for {
		select {
		case e, ok := <-events:
			if !ok {
				t.Fatalf("job event stream closed unexpectedly: %s", stream.Err())
			}
			if _, ok := actual[e.Event]; ok {
				t.Fatalf("unexpected event: %v", e)
			}
			actual[e.Event] = e
			if len(actual) >= 2 {
				break loop
			}
		case <-time.After(30 * time.Second):
			t.Fatal("timed out waiting for job event")
		}
	}

	t.Assert(actual[host.JobEventCreate], c.NotNil)
	e := actual[host.JobEventError]
	t.Assert(e, c.NotNil)
	t.Assert(e.Job, c.NotNil)
	t.Assert(e.Job.Error, c.NotNil)
	t.Assert(*e.Job.Error, c.Equals, `host: invalid job partition "nonexistent"`)
}
func (c *Client) DeployAppRelease(appID, releaseID string) error {
	d, err := c.CreateDeployment(appID, releaseID)
	if err != nil {
		return err
	}

	// if initial deploy, just stop here
	if d.FinishedAt != nil {
		return nil
	}

	events := make(chan *ct.DeploymentEvent)
	stream, err := c.StreamDeployment(d, events)
	if err != nil {
		return err
	}
	defer stream.Close()

	timeout := d.DeployTimeout
	if timeout == 0 {
		// although a non-zero timeout is set for all new apps, it
		// could still be zero in the case of updating a cluster which
		// doesn't have deploy timeouts set (as the controller
		// migration may not have run yet) so use the default
		timeout = ct.DefaultDeployTimeout
	}

outer:
	for {
		select {
		case e, ok := <-events:
			if !ok {
				return errors.New("unexpected close of deployment event stream")
			}
			switch e.Status {
			case "complete":
				break outer
			case "failed":
				return e.Err()
			}
		case <-time.After(time.Duration(timeout) * time.Second):
			return errors.New("timed out waiting for deployment completion")
		}
	}

	return nil
}
func (s *SchedulerSuite) TestScale(t *c.C) {
	app, release := s.createApp(t)

	events := make(chan *ct.JobEvent)
	stream, err := s.controllerClient(t).StreamJobEvents(app.ID, 0, events)
	t.Assert(err, c.IsNil)
	defer stream.Close()

	formation := &ct.Formation{
		AppID:     app.ID,
		ReleaseID: release.ID,
		Processes: make(map[string]int),
	}

	current := make(map[string]int)
	updates := []map[string]int{
		{"printer": 2},
		{"printer": 3, "crasher": 1},
		{"printer": 1},
	}
	for _, procs := range updates {
		debugf(t, "scaling formation to %v", procs)

		formation.Processes = procs
		t.Assert(s.controllerClient(t).PutFormation(formation), c.IsNil)

		expected := make(jobEvents)
		for typ, count := range procs {
			diff := count - current[typ]
			if diff > 0 {
				expected[typ] = map[string]int{"up": diff}
			} else {
				expected[typ] = map[string]int{"down": -diff}
			}
		}
		for typ, count := range current {
			if _, ok := procs[typ]; !ok {
				expected[typ] = map[string]int{"down": count}
			}
		}
		waitForJobEvents(t, stream, events, expected)

		current = procs
	}
}
func (s *SchedulerSuite) TestJobRestartBackoffPolicy(t *c.C) {
	if testCluster == nil {
		t.Skip("cannot determine scheduler backoff period")
	}
	backoffPeriod := testCluster.BackoffPeriod()
	startTimeout := 20 * time.Second
	debugf(t, "job restart backoff period: %s", backoffPeriod)

	app, release := s.createApp(t)

	events := make(chan *ct.JobEvent)
	stream, err := s.controllerClient(t).StreamJobEvents(app.ID, 0, events)
	t.Assert(err, c.IsNil)
	defer stream.Close()

	t.Assert(s.controllerClient(t).PutFormation(&ct.Formation{
		AppID:     app.ID,
		ReleaseID: release.ID,
		Processes: map[string]int{"printer": 1},
	}), c.IsNil)
	_, id := waitForJobEvents(t, stream, events, jobEvents{"printer": {"up": 1}})

	// First restart: scheduled immediately
	s.stopJob(t, id)
	id = waitForJobRestart(t, stream, events, "printer", startTimeout)

	// Second restart after 1 * backoffPeriod
	start := time.Now()
	s.stopJob(t, id)
	id = waitForJobRestart(t, stream, events, "printer", backoffPeriod+startTimeout)
	t.Assert(time.Now().Sub(start) > backoffPeriod, c.Equals, true)

	// Third restart after 2 * backoffPeriod
	start = time.Now()
	s.stopJob(t, id)
	id = waitForJobRestart(t, stream, events, "printer", 2*backoffPeriod+startTimeout)
	t.Assert(time.Now().Sub(start) > 2*backoffPeriod, c.Equals, true)

	// After backoffPeriod has elapsed: scheduled immediately
	time.Sleep(backoffPeriod)
	s.stopJob(t, id)
	waitForJobRestart(t, stream, events, "printer", startTimeout)
}
func (s *DeployerSuite) TestRollbackNoService(t *c.C) {
	// create a running release
	app, release := s.createRelease(t, "printer", "all-at-once")

	// deploy a release which will not register the service
	client := s.controllerClient(t)
	release.ID = ""
	printer := release.Processes["printer"]
	printer.Service = "printer"
	printer.Ports = []ct.Port{{
		Port:  12345,
		Proto: "tcp",
		Service: &host.Service{
			Name:   "printer",
			Create: true,
			Check: &host.HealthCheck{
				Type:         "tcp",
				Interval:     100 * time.Millisecond,
				Threshold:    1,
				KillDown:     true,
				StartTimeout: 100 * time.Millisecond,
			},
		},
	}}
	release.Processes["printer"] = printer
	t.Assert(client.CreateRelease(release), c.IsNil)
	deployment, err := client.CreateDeployment(app.ID, release.ID)
	t.Assert(err, c.IsNil)

	// check the deployment fails
	events := make(chan *ct.DeploymentEvent)
	stream, err := client.StreamDeployment(deployment, events)
	t.Assert(err, c.IsNil)
	defer stream.Close()
	event := s.waitForDeploymentStatus(t, events, "failed")
	t.Assert(event.Error, c.Equals, "printer process type failed to start, got down job event")
	s.assertRolledBack(t, deployment, map[string]int{"printer": 2})

	// check a new deployment can be created
	_, err = client.CreateDeployment(app.ID, release.ID)
	t.Assert(err, c.IsNil)
}
// serveStream creates a subscription and streams out events in SSE format.
func (h *Handler) serveStream(w http.ResponseWriter, params httprouter.Params, kind discoverd.EventKind) {
	// Create a buffered channel to receive events.
	ch := make(chan *discoverd.Event, StreamBufferSize)

	// Subscribe to events on the store.
	service := params.ByName("service")
	stream := h.Store.Subscribe(service, true, kind, ch)

	// Create and serve an SSE stream.
	s := sse.NewStream(w, ch, nil)
	s.Serve()
	s.Wait()
	stream.Close()

	// Check if there was an error while closing.
	if err := stream.Err(); err != nil {
		s.CloseWithError(err)
	}
}
func (c *Client) Instances(service string, timeout time.Duration) ([]*Instance, error) {
	s := c.Service(service)
	instances, err := s.Instances()
	if len(instances) > 0 || err != nil && !IsNotFound(err) {
		return instances, err
	}

	events := make(chan *Event)
	stream, err := s.Watch(events)
	if err != nil {
		return nil, err
	}
	defer stream.Close()

	// get any current instances
outer:
	for event := range events {
		switch event.Kind {
		case EventKindCurrent:
			break outer
		case EventKindUp:
			instances = append(instances, event.Instance)
		}
	}
	if len(instances) > 0 {
		return instances, nil
	}

	// wait for an instance to come up
	for {
		select {
		case event, ok := <-events:
			if !ok {
				return nil, stream.Err()
			}
			if event.Kind != EventKindUp {
				continue
			}
			return []*Instance{event.Instance}, nil
		case <-time.After(timeout):
			return nil, ErrTimedOut
		}
	}
}
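// Example (illustrative sketch, not part of the original source): a caller of
// Client.Instances above that blocks until the "controller" service has at
// least one registered instance, then returns its address. The service name
// and the helper name are assumptions made for the sake of the example.
func exampleControllerAddr(c *Client) (string, error) {
	// Instances returns ErrTimedOut if nothing registers within the timeout.
	instances, err := c.Instances("controller", 30*time.Second)
	if err != nil {
		return "", err
	}
	return instances[0].Addr, nil
}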
func (c *Client) DeployAppRelease(appID, releaseID string, stopWait <-chan struct{}) error {
	d, err := c.CreateDeployment(appID, releaseID)
	if err != nil {
		return err
	}

	// if initial deploy, just stop here
	if d.FinishedAt != nil {
		return nil
	}

	events := make(chan *ct.DeploymentEvent)
	stream, err := c.StreamDeployment(d, events)
	if err != nil {
		return err
	}
	defer stream.Close()

outer:
	for {
		select {
		case e, ok := <-events:
			if !ok {
				return fmt.Errorf("unexpected close of deployment event stream: %s", stream.Err())
			}
			switch e.Status {
			case "complete":
				break outer
			case "failed":
				return e.Err()
			}
		case <-stopWait:
			return errors.New("deploy wait cancelled")
		}
	}

	return nil
}
func (s *Scheduler) unfollowHost(id string) {
	log := logger.New("fn", "unfollowHost", "host.id", id)

	stream, ok := s.hostStreams[id]
	if !ok {
		log.Warn("ignoring host unfollow due to lack of existing stream")
		return
	}

	log.Info("unfollowing host")
	for jobID, job := range s.jobs {
		if job.HostID == id {
			log.Info("removing job", "job.id", jobID)
			s.jobs.SetState(job.JobID, JobStateStopped)
			s.triggerRectify(job.Formation.key())
		}
	}

	log.Info("closing job event stream")
	stream.Close()
	delete(s.hostStreams, id)
	s.triggerSyncFormations()
}
func (c *Client) DeployAppReleaseWithTimeout(appID, releaseID string, timeout time.Duration) error {
	d, err := c.CreateDeployment(appID, releaseID)
	if err != nil {
		return err
	}

	// if initial deploy, just stop here
	if d.FinishedAt != nil {
		return nil
	}

	events := make(chan *ct.DeploymentEvent)
	stream, err := c.StreamDeployment(d, events)
	if err != nil {
		return err
	}
	defer stream.Close()

outer:
	for {
		select {
		case e, ok := <-events:
			if !ok {
				return errors.New("unexpected close of deployment event stream")
			}
			switch e.Status {
			case "complete":
				break outer
			case "failed":
				return e.Err()
			}
		case <-time.After(timeout):
			return errors.New("timed out waiting for deployment completion")
		}
	}

	return nil
}
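// Example (illustrative sketch, not part of the original source): deploying a
// release with an explicit timeout using DeployAppReleaseWithTimeout above.
// The appID and releaseID values are assumed to come from earlier controller
// calls such as CreateRelease; the helper name is hypothetical.
func exampleDeploy(client *Client, appID, releaseID string) error {
	// Allow up to two minutes for the deployment to complete before giving up.
	return client.DeployAppReleaseWithTimeout(appID, releaseID, 2*time.Minute)
}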
func (s *DeployerSuite) TestRollbackFailedJob(t *c.C) {
	// create a running release
	app, release := s.createRelease(t, "printer", "all-at-once")

	// deploy a release which will fail to start
	client := s.controllerClient(t)
	release.ID = ""
	printer := release.Processes["printer"]
	printer.Args = []string{"this-is-gonna-fail"}
	release.Processes["printer"] = printer
	t.Assert(client.CreateRelease(release), c.IsNil)
	deployment, err := client.CreateDeployment(app.ID, release.ID)
	t.Assert(err, c.IsNil)

	// check the deployment fails
	events := make(chan *ct.DeploymentEvent)
	stream, err := client.StreamDeployment(deployment, events)
	t.Assert(err, c.IsNil)
	defer stream.Close()
	event := s.waitForDeploymentStatus(t, events, "failed")
	t.Assert(event.Error, c.Equals, `deployer: printer job failed to start: exec: "this-is-gonna-fail": executable file not found in $PATH`)
	s.assertRolledBack(t, deployment, map[string]int{"printer": 2})
}
func (s *HostSuite) TestNotifyOOM(t *c.C) {
	appID := random.UUID()

	// subscribe to init log messages from the logaggregator
	client, err := logaggc.New("")
	t.Assert(err, c.IsNil)
	opts := logagg.LogOpts{
		Follow:      true,
		StreamTypes: []logagg.StreamType{logagg.StreamTypeInit},
	}
	rc, err := client.GetLog(appID, &opts)
	t.Assert(err, c.IsNil)
	defer rc.Close()
	msgs := make(chan *logaggc.Message)
	stream := stream.New()
	defer stream.Close()
	go func() {
		defer close(msgs)
		dec := json.NewDecoder(rc)
		for {
			var msg logaggc.Message
			if err := dec.Decode(&msg); err != nil {
				stream.Error = err
				return
			}
			select {
			case msgs <- &msg:
			case <-stream.StopCh:
				return
			}
		}
	}()

	// run the OOM job
	cmd := exec.CommandUsingCluster(
		s.clusterClient(t),
		s.createArtifact(t, "test-apps"),
		"/bin/oom",
	)
	cmd.Meta = map[string]string{"flynn-controller.app": appID}
	runErr := make(chan error)
	go func() {
		runErr <- cmd.Run()
	}()

	// wait for the OOM notification
	for {
		select {
		case err := <-runErr:
			t.Assert(err, c.IsNil)
		case msg, ok := <-msgs:
			if !ok {
				t.Fatalf("message stream closed unexpectedly: %s", stream.Err())
			}
			t.Log(msg.Msg)
			if strings.Contains(msg.Msg, "FATAL: a container process was killed due to lack of available memory") {
				return
			}
		case <-time.After(30 * time.Second):
			t.Fatal("timed out waiting for OOM notification")
		}
	}
}
func (s *SchedulerSuite) TestJobStatus(t *c.C) {
	app, release := s.createApp(t)

	events := make(chan *ct.JobEvent)
	stream, err := s.controllerClient(t).StreamJobEvents(app.ID, 0, events)
	t.Assert(err, c.IsNil)
	defer stream.Close()

	// start 2 formation processes and 1 one-off job
	t.Assert(s.controllerClient(t).PutFormation(&ct.Formation{
		AppID:     app.ID,
		ReleaseID: release.ID,
		Processes: map[string]int{"printer": 1, "crasher": 1},
	}), c.IsNil)
	_, err = s.controllerClient(t).RunJobDetached(app.ID, &ct.NewJob{
		ReleaseID: release.ID,
		Cmd:       []string{"sh", "-c", "while true; do echo one-off-job; sleep 1; done"},
	})
	t.Assert(err, c.IsNil)
	waitForJobEvents(t, stream, events, jobEvents{"printer": {"up": 1}, "crasher": {"up": 1}, "": {"up": 1}})

	list, err := s.controllerClient(t).JobList(app.ID)
	t.Assert(err, c.IsNil)
	t.Assert(list, c.HasLen, 3)
	jobs := make(map[string]*ct.Job, len(list))
	for _, job := range list {
		debug(t, job.Type, "job started with ID ", job.ID)
		jobs[job.Type] = job
	}

	// Check jobs are marked as up once started
	t.Assert(jobs["printer"].State, c.Equals, "up")
	t.Assert(jobs["crasher"].State, c.Equals, "up")
	t.Assert(jobs[""].State, c.Equals, "up")

	// Check that when a formation's job is removed, it is marked as down and a new one is scheduled
	job := jobs["printer"]
	s.stopJob(t, job.ID)
	waitForJobEvents(t, stream, events, jobEvents{"printer": {"down": 1, "up": 1}})
	s.checkJobState(t, app.ID, job.ID, "down")
	list, err = s.controllerClient(t).JobList(app.ID)
	t.Assert(err, c.IsNil)
	t.Assert(list, c.HasLen, 4)

	// Check that when a one-off job is removed, it is marked as down but a new one is not scheduled
	job = jobs[""]
	s.stopJob(t, job.ID)
	waitForJobEvents(t, stream, events, jobEvents{"": {"down": 1}})
	s.checkJobState(t, app.ID, job.ID, "down")
	list, err = s.controllerClient(t).JobList(app.ID)
	t.Assert(err, c.IsNil)
	t.Assert(list, c.HasLen, 4)

	// Check that when a job errors, it is marked as crashed and a new one is started
	job = jobs["crasher"]
	s.stopJob(t, job.ID)
	waitForJobEvents(t, stream, events, jobEvents{"crasher": {"down": 1, "up": 1}})
	s.checkJobState(t, app.ID, job.ID, "crashed")
	list, err = s.controllerClient(t).JobList(app.ID)
	t.Assert(err, c.IsNil)
	t.Assert(list, c.HasLen, 5)
}
func (s *SchedulerSuite) TestOmniJobs(t *c.C) {
	if testCluster == nil {
		t.Skip("cannot boot new hosts")
	}

	app, release := s.createApp(t)

	events := make(chan *ct.JobEvent)
	stream, err := s.controllerClient(t).StreamJobEvents(app.ID, 0, events)
	t.Assert(err, c.IsNil)
	defer stream.Close()

	formation := &ct.Formation{
		AppID:     app.ID,
		ReleaseID: release.ID,
		Processes: make(map[string]int),
	}

	current := make(map[string]int)
	updates := []map[string]int{
		{"printer": 2},
		{"printer": 3, "omni": 2},
		{"printer": 1, "omni": 1},
	}
	for _, procs := range updates {
		debugf(t, "scaling formation to %v", procs)

		formation.Processes = procs
		t.Assert(s.controllerClient(t).PutFormation(formation), c.IsNil)

		expected := make(jobEvents)
		for typ, count := range procs {
			diff := count - current[typ]
			if typ == "omni" {
				diff *= testCluster.Size()
			}
			if diff > 0 {
				expected[typ] = map[string]int{"up": diff}
			} else {
				expected[typ] = map[string]int{"down": -diff}
			}
		}
		for typ, count := range current {
			if _, ok := procs[typ]; !ok {
				diff := count
				if typ == "omni" {
					diff *= testCluster.Size()
				}
				expected[typ] = map[string]int{"down": diff}
			}
		}
		waitForJobEvents(t, stream, events, expected)

		current = procs
	}

	// Check that new hosts get omni jobs
	newHosts := s.addHosts(t, 2, false)
	defer s.removeHosts(t, newHosts)
	waitForJobEvents(t, stream, events, jobEvents{"omni": {"up": 2}})
}
func (s *DeployerSuite) TestOmniProcess(t *c.C) {
	if testCluster == nil {
		t.Skip("cannot determine test cluster size")
	}

	// create and scale an omni release
	omniScale := 2
	totalJobs := omniScale * testCluster.Size()
	client := s.controllerClient(t)
	app, release := s.createApp(t)

	watcher, err := client.WatchJobEvents(app.Name, release.ID)
	t.Assert(err, c.IsNil)
	defer watcher.Close()

	t.Assert(client.PutFormation(&ct.Formation{
		AppID:     app.ID,
		ReleaseID: release.ID,
		Processes: map[string]int{"omni": omniScale},
	}), c.IsNil)
	err = watcher.WaitFor(ct.JobEvents{"omni": {ct.JobStateUp: totalJobs}}, scaleTimeout, nil)
	t.Assert(err, c.IsNil)

	// deploy using all-at-once and check we get the correct events
	app.Strategy = "all-at-once"
	t.Assert(client.UpdateApp(app), c.IsNil)
	release.ID = ""
	t.Assert(client.CreateRelease(release), c.IsNil)
	deployment, err := client.CreateDeployment(app.ID, release.ID)
	t.Assert(err, c.IsNil)
	events := make(chan *ct.DeploymentEvent)
	stream, err := client.StreamDeployment(deployment, events)
	t.Assert(err, c.IsNil)
	defer stream.Close()

	expected := make([]*ct.Job, 0, 3*totalJobs+1)
	appendEvents := func(releaseID string, state ct.JobState, count int) {
		for i := 0; i < count; i++ {
			expected = append(expected, &ct.Job{
				ReleaseID: releaseID,
				Type:      "omni",
				State:     state,
			})
		}
	}
	appendEvents(deployment.NewReleaseID, ct.JobStateUp, totalJobs)
	appendEvents(deployment.OldReleaseID, ct.JobStateDown, totalJobs)
	s.waitForDeploymentStatus(t, events, "complete")

	// deploy using one-by-one and check we get the correct events
	app.Strategy = "one-by-one"
	t.Assert(client.UpdateApp(app), c.IsNil)
	release.ID = ""
	t.Assert(client.CreateRelease(release), c.IsNil)
	// try creating the deployment multiple times to avoid getting a
	// "Cannot create deploy, one is already in progress" error (there
	// is no guarantee the previous deploy has finished yet)
	attempts := attempt.Strategy{Total: 10 * time.Second, Delay: 100 * time.Millisecond}
	err = attempts.Run(func() (err error) {
		deployment, err = client.CreateDeployment(app.ID, release.ID)
		return
	})
	t.Assert(err, c.IsNil)
	events = make(chan *ct.DeploymentEvent)
	stream, err = client.StreamDeployment(deployment, events)
	t.Assert(err, c.IsNil)
	expected = make([]*ct.Job, 0, 4*totalJobs+1)
	appendEvents(deployment.NewReleaseID, ct.JobStateUp, testCluster.Size())
	appendEvents(deployment.OldReleaseID, ct.JobStateDown, testCluster.Size())
	appendEvents(deployment.NewReleaseID, ct.JobStateUp, testCluster.Size())
	appendEvents(deployment.OldReleaseID, ct.JobStateDown, testCluster.Size())
	s.waitForDeploymentStatus(t, events, "complete")
}
func (s *SchedulerSuite) TestControllerRestart(t *c.C) {
	// get the current controller details
	app, err := s.controllerClient(t).GetApp("controller")
	t.Assert(err, c.IsNil)
	release, err := s.controllerClient(t).GetAppRelease("controller")
	t.Assert(err, c.IsNil)
	formation, err := s.controllerClient(t).GetFormation(app.ID, release.ID)
	t.Assert(err, c.IsNil)
	list, err := s.controllerClient(t).JobList("controller")
	t.Assert(err, c.IsNil)
	var jobs []*ct.Job
	for _, job := range list {
		if job.Type == "web" && job.State == "up" {
			jobs = append(jobs, job)
		}
	}
	t.Assert(jobs, c.HasLen, 2)
	hostID, jobID, _ := cluster.ParseJobID(jobs[0].ID)
	t.Assert(hostID, c.Not(c.Equals), "")
	t.Assert(jobID, c.Not(c.Equals), "")
	debugf(t, "current controller app[%s] host[%s] job[%s]", app.ID, hostID, jobID)

	// start a second controller and wait for it to come up
	events := make(chan *ct.JobEvent)
	stream, err := s.controllerClient(t).StreamJobEvents("controller", 0, events)
	t.Assert(err, c.IsNil)
	debug(t, "scaling the controller up")
	formation.Processes["web"]++
	t.Assert(s.controllerClient(t).PutFormation(formation), c.IsNil)
	lastID, _ := waitForJobEvents(t, stream, events, jobEvents{"web": {"up": 1}})
	stream.Close()

	// get direct client for new controller
	var client *controller.Client
	attempts := attempt.Strategy{
		Total: 10 * time.Second,
		Delay: 500 * time.Millisecond,
	}
	t.Assert(attempts.Run(func() (err error) {
		addrs, err := s.discoverdClient(t).Service("flynn-controller").Addrs()
		if err != nil {
			return err
		}
		if len(addrs) != 3 {
			return fmt.Errorf("expected 3 controller processes, got %d", len(addrs))
		}
		addr := addrs[2]
		debug(t, "new controller address: ", addr)
		client, err = controller.NewClient("http://"+addr, s.clusterConf(t).Key)
		if err != nil {
			return err
		}
		events = make(chan *ct.JobEvent)
		stream, err = client.StreamJobEvents("controller", lastID, events)
		return
	}), c.IsNil)
	defer stream.Close()

	// kill the first controller and check the scheduler brings it back online
	cc, err := cluster.NewClientWithServices(s.discoverdClient(t).Service)
	t.Assert(err, c.IsNil)
	hc, err := cc.DialHost(hostID)
	t.Assert(err, c.IsNil)
	debug(t, "stopping job ", jobID)
	t.Assert(hc.StopJob(jobID), c.IsNil)
	waitForJobEvents(t, stream, events, jobEvents{"web": {"down": 1, "up": 1}})

	// scale back down
	debug(t, "scaling the controller down")
	formation.Processes["web"]--
	t.Assert(s.controllerClient(t).PutFormation(formation), c.IsNil)
	waitForJobEvents(t, stream, events, jobEvents{"web": {"down": 1}})

	// unset the suite's client so other tests use a new client
	s.controller = nil
}
func (s *DeployerSuite) createDeployment(t *c.C, process, strategy, service string) *testDeploy {
	app, release := s.createRelease(t, process, strategy)

	if service != "" {
		debugf(t, "waiting for 2 %s services", service)
		events := make(chan *discoverd.Event)
		stream, err := s.discoverdClient(t).Service(service).Watch(events)
		t.Assert(err, c.IsNil)
		defer stream.Close()

		count := 0
	loop:
		for {
			select {
			case event, ok := <-events:
				if !ok {
					t.Fatalf("service discovery stream closed unexpectedly")
				}
				if event.Kind == discoverd.EventKindUp {
					if id, ok := event.Instance.Meta["FLYNN_RELEASE_ID"]; !ok || id != release.ID {
						continue
					}
					debugf(t, "got %s service up event", service)
					count++
				}
				if count == 2 {
					// although the services are up, give them a few more seconds
					// to make sure the deployer will also see them as up.
					time.Sleep(5 * time.Second)
					break loop
				}
			case <-time.After(10 * time.Second):
				t.Fatalf("timed out waiting for %s service to come up", service)
			}
		}
	}

	client := s.controllerClient(t)
	jobEvents := make(chan *ct.Job)
	jobStream, err := client.StreamJobEvents(app.ID, jobEvents)
	t.Assert(err, c.IsNil)

	// create a new release for the deployment
	release.ID = ""
	t.Assert(client.CreateRelease(release), c.IsNil)

	deployment, err := client.CreateDeployment(app.ID, release.ID)
	t.Assert(err, c.IsNil)
	debugf(t, "created deployment %s", deployment.ID)
	debugf(t, "deploying from release %s to %s", deployment.OldReleaseID, deployment.NewReleaseID)

	deployEvents := make(chan *ct.DeploymentEvent)
	deployStream, err := client.StreamDeployment(deployment, deployEvents)
	t.Assert(err, c.IsNil)

	return &testDeploy{
		s:            s,
		t:            t,
		deployment:   deployment,
		deployEvents: deployEvents,
		deployStream: deployStream,
		jobEvents:    jobEvents,
		jobStream:    jobStream,
	}
}
func (c *context) watchHost(h *cluster.Host, ready chan struct{}, stop chan struct{}) {
	if !c.hosts.Add(h.ID()) {
		if ready != nil {
			ready <- struct{}{}
		}
		return
	}
	defer c.hosts.Remove(h.ID())

	g := grohl.NewContext(grohl.Data{"fn": "watchHost", "host.id": h.ID()})

	c.hosts.Set(h.ID(), h)

	g.Log(grohl.Data{"at": "start"})

	ch := make(chan *host.Event)
	stream, err := h.StreamEvents("all", ch)
	if err != nil {
		panic(err)
	}
	go func() {
		<-stop
		stream.Close()
	}()

	if ready != nil {
		ready <- struct{}{}
	}

	// Call PutJob in a goroutine so we don't block receiving job events whilst potentially
	// making multiple requests to the controller (e.g. if the controller is down).
	//
	// Use a channel (rather than spawning a goroutine per event) so that events are delivered in order.
	jobs := make(chan *ct.Job, 10)
	go func() {
		for job := range jobs {
			putJobAttempts.Run(func() error {
				if err := c.PutJob(job); err != nil {
					g.Log(grohl.Data{"at": "put_job_error", "job.id": job.ID, "state": job.State, "err": err})
					// ignore validation / not found errors
					if httphelper.IsValidationError(err) || err == controller.ErrNotFound {
						return nil
					}
					return err
				}
				g.Log(grohl.Data{"at": "put_job", "job.id": job.ID, "state": job.State})
				return nil
			})
		}
	}()

	for event := range ch {
		meta := event.Job.Job.Metadata
		appID := meta["flynn-controller.app"]
		releaseID := meta["flynn-controller.release"]
		jobType := meta["flynn-controller.type"]
		if appID == "" || releaseID == "" {
			continue
		}
		job := &ct.Job{
			ID:        event.JobID,
			AppID:     appID,
			ReleaseID: releaseID,
			Type:      jobType,
			State:     jobState(event),
			Meta:      jobMetaFromMetadata(meta),
		}
		g.Log(grohl.Data{"at": "event", "job.id": event.JobID, "event": event.Event})
		jobs <- job

		// get a read lock on the mutex to ensure we are not currently
		// syncing with the cluster
		c.mtx.RLock()
		j := c.jobs.Get(h.ID(), event.JobID)
		c.mtx.RUnlock()
		if j == nil {
			continue
		}
		j.startedAt = event.Job.StartedAt

		if event.Event != "error" && event.Event != "stop" {
			continue
		}
		g.Log(grohl.Data{"at": "remove", "job.id": event.JobID, "event": event.Event})
		c.jobs.Remove(h.ID(), event.JobID)
		go func(event *host.Event) {
			c.mtx.RLock()
			j.Formation.RestartJob(jobType, h.ID(), event.JobID)
			c.mtx.RUnlock()
		}(event)
	}
	// TODO: check error/reconnect
}