func waitForJobEvents(t *c.C, events chan *ct.JobEvent, expected jobEvents) (lastID int64, jobID string) {
	debugf(t, "waiting for job events: %v", expected)
	actual := make(jobEvents)
	for {
	inner:
		select {
		case event := <-events:
			debug(t, "got job event: ", event.Type, event.JobID, event.State)
			lastID = event.ID
			jobID = event.JobID
			if _, ok := actual[event.Type]; !ok {
				actual[event.Type] = make(map[string]int)
			}
			switch event.State {
			case "up":
				actual[event.Type]["up"] += 1
			case "down", "crashed":
				actual[event.Type]["down"] += 1
			default:
				break inner
			}
			if jobEventsEqual(expected, actual) {
				return
			}
		case <-time.After(60 * time.Second):
			t.Fatal("timed out waiting for job events: ", expected)
		}
	}
}
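// waitForDeploymentEvents reads events from the stream until a "complete" or
// "failed" event arrives, then asserts that the events received match the
// expected ones in order.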
func waitForDeploymentEvents(t *c.C, stream chan *ct.DeploymentEvent, expected []*ct.DeploymentEvent) {
	debugf(t, "waiting for %d deployment events", len(expected))
	actual := make([]*ct.DeploymentEvent, 0, len(expected))
loop:
	for {
		select {
		case e, ok := <-stream:
			if !ok {
				t.Fatal("unexpected close of deployment event stream")
			}
			actual = append(actual, e)
			if e.Status == "complete" || e.Status == "failed" {
				debugf(t, "got deployment event: %s", e.Status)
				break loop
			}
			debugf(t, "got deployment event: %s %s", e.JobType, e.JobState)
		case <-time.After(60 * time.Second):
			t.Fatal("timed out waiting for deployment event")
		}
	}
	compare := func(t *c.C, i *ct.DeploymentEvent, j *ct.DeploymentEvent) {
		t.Assert(i.ReleaseID, c.Equals, j.ReleaseID)
		t.Assert(i.JobType, c.Equals, j.JobType)
		t.Assert(i.JobState, c.Equals, j.JobState)
		t.Assert(i.Status, c.Equals, j.Status)
		t.Assert(i.Error, c.Equals, j.Error)
	}
	for i, e := range expected {
		compare(t, actual[i], e)
	}
}
func (s *HostSuite) TestResourceLimits(t *c.C) {
	cmd := exec.JobUsingCluster(
		s.clusterClient(t),
		exec.DockerImage(imageURIs["test-apps"]),
		&host.Job{
			Config:    host.ContainerConfig{Cmd: []string{"sh", "-c", resourceCmd}},
			Resources: testResources(),
		},
	)
	var out bytes.Buffer
	cmd.Stdout = &out

	runErr := make(chan error)
	go func() {
		runErr <- cmd.Run()
	}()
	select {
	case err := <-runErr:
		t.Assert(err, c.IsNil)
	case <-time.After(30 * time.Second):
		t.Fatal("timed out waiting for resource limits job")
	}
	assertResourceLimits(t, out.String())
}
func (s *HostSuite) TestAttachFinishedInteractiveJob(t *c.C) {
	cluster := s.clusterClient(t)

	// run a quick interactive job
	cmd := exec.CommandUsingCluster(cluster, exec.DockerImage(imageURIs["test-apps"]), "/bin/true")
	cmd.TTY = true
	runErr := make(chan error)
	go func() {
		runErr <- cmd.Run()
	}()
	select {
	case err := <-runErr:
		t.Assert(err, c.IsNil)
	case <-time.After(30 * time.Second):
		t.Fatal("timed out waiting for interactive job")
	}

	h, err := cluster.Host(cmd.HostID)
	t.Assert(err, c.IsNil)

	// Getting the logs for the job should fail, as it has none because it was
	// interactive
	attachErr := make(chan error)
	go func() {
		_, err = h.Attach(&host.AttachReq{JobID: cmd.Job.ID, Flags: host.AttachFlagLogs}, false)
		attachErr <- err
	}()
	select {
	case err := <-attachErr:
		t.Assert(err, c.NotNil)
	case <-time.After(time.Second):
		t.Error("timed out waiting for attach")
	}
}
func (s *SchedulerSuite) TestTCPApp(t *c.C) {
	app, _ := s.createApp(t)

	t.Assert(flynn(t, "/", "-a", app.Name, "scale", "echoer=1"), Succeeds)

	newRoute := flynn(t, "/", "-a", app.Name, "route", "add", "tcp", "-s", "echo-service")
	t.Assert(newRoute, Succeeds)
	t.Assert(newRoute.Output, Matches, `.+ on port \d+`)
	str := strings.Split(strings.TrimSpace(string(newRoute.Output)), " ")
	port := str[len(str)-1]

	// use Attempts to give the processes time to start
	if err := Attempts.Run(func() error {
		servAddr := routerIP + ":" + port
		conn, err := net.Dial("tcp", servAddr)
		if err != nil {
			return err
		}
		defer conn.Close()
		msg := []byte("hello there!\n")
		_, err = conn.Write(msg)
		if err != nil {
			return err
		}
		reply := make([]byte, len(msg))
		_, err = conn.Read(reply)
		if err != nil {
			return err
		}
		t.Assert(reply, c.DeepEquals, msg)
		return nil
	}); err != nil {
		t.Fatal(err)
	}
}
// TestAppEvents checks that streaming events for an app only receives events
// for that particular app.
func (s *ControllerSuite) TestAppEvents(t *c.C) {
	client := s.controllerClient(t)
	app1, release1 := s.createApp(t)
	app2, release2 := s.createApp(t)

	// stream events for app1
	events := make(chan *ct.Job)
	stream, err := client.StreamJobEvents(app1.ID, events)
	t.Assert(err, c.IsNil)
	defer stream.Close()

	runJob := func(appID, releaseID string) {
		rwc, err := client.RunJobAttached(appID, &ct.NewJob{
			ReleaseID:  releaseID,
			Cmd:        []string{"/bin/true"},
			DisableLog: true,
		})
		t.Assert(err, c.IsNil)
		rwc.Close()
	}

	// generate events for app2 and wait for them
	watcher, err := client.WatchJobEvents(app2.ID, release2.ID)
	t.Assert(err, c.IsNil)
	defer watcher.Close()
	runJob(app2.ID, release2.ID)
	t.Assert(watcher.WaitFor(
		ct.JobEvents{"": {ct.JobStateUp: 1, ct.JobStateDown: 1}},
		10*time.Second,
		func(e *ct.Job) error {
			debugf(t, "got %s job event for app2", e.State)
			return nil
		},
	), c.IsNil)

	// generate events for app1
	runJob(app1.ID, release1.ID)

	// check the stream only gets events for app1
	for {
		select {
		case e, ok := <-events:
			if !ok {
				t.Fatal("unexpected close of job event stream")
			}
			t.Assert(e.AppID, c.Equals, app1.ID)
			debugf(t, "got %s job event for app1", e.State)
			if e.State == ct.JobStateDown {
				return
			}
		case <-time.After(10 * time.Second):
			t.Fatal("timed out waiting for job events for app1")
		}
	}
}
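// waitForJobRestart waits for an "up" event for a job of the given type and
// returns the restarted job's ID, failing the test if the timeout is reached.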
func waitForJobRestart(t *c.C, events chan *ct.JobEvent, typ string, timeout time.Duration) string {
	debug(t, "waiting for job restart")
	for {
		select {
		case event := <-events:
			debug(t, "got job event: ", event.Type, event.JobID, event.State)
			if event.Type == typ && event.State == "up" {
				return event.JobID
			}
		case <-time.After(timeout):
			t.Fatal("timed out waiting for job restart")
		}
	}
}
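// TestUpdateTags checks that tags set via the host API show up in the host's
// discoverd instance metadata, and that setting a tag to an empty string
// removes it.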
func (s *HostSuite) TestUpdateTags(t *c.C) {
	events := make(chan *discoverd.Event)
	stream, err := s.discoverdClient(t).Service("flynn-host").Watch(events)
	t.Assert(err, c.IsNil)
	defer stream.Close()

	nextEvent := func() *discoverd.Event {
		select {
		case e, ok := <-events:
			if !ok {
				t.Fatal("unexpected close of discoverd stream")
			}
			return e
		case <-time.After(10 * time.Second):
			t.Fatal("timed out waiting for discoverd event")
		}
		return nil
	}

	var client *cluster.Host
	for {
		e := nextEvent()
		if e.Kind == discoverd.EventKindUp && client == nil {
			client = cluster.NewHost(e.Instance.Meta["id"], e.Instance.Addr, nil)
		}
		if e.Kind == discoverd.EventKindCurrent {
			break
		}
	}
	if client == nil {
		t.Fatal("did not initialize flynn-host client")
	}

	t.Assert(client.UpdateTags(map[string]string{"foo": "bar"}), c.IsNil)

	var meta map[string]string
	for {
		e := nextEvent()
		if e.Kind == discoverd.EventKindUpdate && e.Instance.Meta["id"] == client.ID() {
			meta = e.Instance.Meta
			break
		}
	}
	t.Assert(meta["tag:foo"], c.Equals, "bar")

	// setting to empty string should delete the tag
	t.Assert(client.UpdateTags(map[string]string{"foo": ""}), c.IsNil)

	for {
		e := nextEvent()
		if e.Kind == discoverd.EventKindUpdate && e.Instance.Meta["id"] == client.ID() {
			meta = e.Instance.Meta
			break
		}
	}
	if _, ok := meta["tag:foo"]; ok {
		t.Fatal("expected tag to be deleted but is still present")
	}
}
func (s *CLISuite) TestRun(t *c.C) {
	app := s.newCliTestApp(t)
	defer app.cleanup()

	// this shouldn't be logged
	t.Assert(app.sh("echo foo"), Outputs, "foo\n")
	// drain the events
	app.waitFor(ct.JobEvents{"": {ct.JobStateUp: 1, ct.JobStateDown: 1}})

	// this should be logged due to the --enable-log flag
	t.Assert(app.flynn("run", "--enable-log", "echo", "hello"), Outputs, "hello\n")
	app.waitFor(ct.JobEvents{"": {ct.JobStateUp: 1, ct.JobStateDown: 1}})

	detached := app.flynn("run", "-d", "echo", "world")
	t.Assert(detached, Succeeds)
	t.Assert(detached, c.Not(Outputs), "world\n")

	id := strings.TrimSpace(detached.Output)
	jobID := app.waitFor(ct.JobEvents{"": {ct.JobStateUp: 1, ct.JobStateDown: 1}})
	t.Assert(jobID, c.Equals, id)
	t.Assert(app.flynn("log", "--raw-output"), Outputs, "hello\nworld\n")

	// test stdin and stderr
	streams := app.flynnCmd("run", "sh", "-c", "cat 1>&2")
	stdin, err := streams.StdinPipe()
	t.Assert(err, c.IsNil)
	go func() {
		stdin.Write([]byte("goto stderr"))
		stdin.Close()
	}()
	var stderr bytes.Buffer
	var stdout bytes.Buffer
	streams.Stderr = &stderr
	streams.Stdout = &stdout
	t.Assert(streams.Run(), c.IsNil)
	t.Assert(stderr.String(), c.Equals, "goto stderr")
	t.Assert(stdout.String(), c.Equals, "")

	// test exit code
	exit := app.sh("exit 42")
	t.Assert(exit, c.Not(Succeeds))
	if msg, ok := exit.Err.(*exec.ExitError); ok {
		// the command exited with an error code
		code := msg.Sys().(syscall.WaitStatus).ExitStatus()
		t.Assert(code, c.Equals, 42)
	} else {
		t.Fatal("There was no error code!")
	}
}
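// TestAddFailingJob adds a job with a non-existent partition and checks that
// a create event is followed by an error event describing the invalid
// partition.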
func (s *HostSuite) TestAddFailingJob(t *c.C) {
	// get a host and watch events
	hosts, err := s.clusterClient(t).Hosts()
	t.Assert(err, c.IsNil)
	t.Assert(hosts, c.Not(c.HasLen), 0)
	h := hosts[0]
	jobID := random.UUID()
	events := make(chan *host.Event)
	stream, err := h.StreamEvents(jobID, events)
	t.Assert(err, c.IsNil)
	defer stream.Close()

	// add a job with a non-existent partition
	job := &host.Job{
		ID: jobID,
		ImageArtifact: &host.Artifact{
			Type: host.ArtifactTypeDocker,
			URI:  "http://example.com?name=foo&id=bar",
		},
		Partition: "nonexistent",
	}
	t.Assert(h.AddJob(job), c.IsNil)

	// check we get a create then error event
	actual := make([]*host.Event, 0, 2)
loop:
	for {
		select {
		case e, ok := <-events:
			if !ok {
				t.Fatalf("job event stream closed unexpectedly: %s", stream.Err())
			}
			actual = append(actual, e)
			if len(actual) >= 2 {
				break loop
			}
		case <-time.After(30 * time.Second):
			t.Fatal("timed out waiting for job event")
		}
	}

	t.Assert(actual, c.HasLen, 2)
	t.Assert(actual[0].Event, c.Equals, host.JobEventCreate)
	t.Assert(actual[1].Event, c.Equals, host.JobEventError)
	jobErr := actual[1].Job.Error
	t.Assert(jobErr, c.NotNil)
	t.Assert(*jobErr, c.Equals, `host: invalid job partition "nonexistent"`)
}
func waitForJobRestart(t *c.C, stream stream.Stream, events chan *ct.JobEvent, typ string, timeout time.Duration) string {
	debug(t, "waiting for job restart")
	for {
		select {
		case event, ok := <-events:
			if !ok {
				t.Fatalf("job event stream closed: %s", stream.Err())
			}
			debug(t, "got job event: ", event.Type, event.JobID, event.State)
			if event.Type == typ && event.State == "up" {
				return event.JobID
			}
		case <-time.After(timeout):
			t.Fatal("timed out waiting for job restart")
		}
	}
}
func (s *CLISuite) TestLimits(t *c.C) {
	app := s.newCliTestApp(t)
	t.Assert(app.flynn("limit", "set", "resources", "memory=512MB", "max_fd=12k"), Succeeds)

	release, err := s.controller.GetAppRelease(app.name)
	t.Assert(err, c.IsNil)
	proc, ok := release.Processes["resources"]
	if !ok {
		t.Fatal("missing resources process type")
	}
	r := proc.Resources
	t.Assert(*r[resource.TypeMemory].Limit, c.Equals, int64(536870912))
	t.Assert(*r[resource.TypeMaxFD].Limit, c.Equals, int64(12000))

	cmd := app.flynn("limit", "-t", "resources")
	t.Assert(cmd, Succeeds)
	t.Assert(cmd, OutputContains, "memory=512MB")
	t.Assert(cmd, OutputContains, "max_fd=12000")
}
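// TestSignalJob starts a job, sends it SIGTERM via the host API and checks
// that the job reports the signal in its output before exiting.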
func (s *HostSuite) TestSignalJob(t *c.C) {
	cluster := s.clusterClient(t)

	// pick a host to run the job on
	hosts, err := cluster.Hosts()
	t.Assert(err, c.IsNil)
	client := schedutil.PickHost(hosts)

	// start a signal-service job
	cmd := exec.JobUsingCluster(cluster, exec.DockerImage(imageURIs["test-apps"]), &host.Job{
		Config: host.ContainerConfig{
			Cmd:        []string{"/bin/signal"},
			DisableLog: true,
		},
	})
	cmd.HostID = client.ID()
	var out bytes.Buffer
	cmd.Stdout = &out
	t.Assert(cmd.Start(), c.IsNil)
	_, err = s.discoverdClient(t).Instances("signal-service", 10*time.Second)
	t.Assert(err, c.IsNil)

	// send the job a signal
	t.Assert(client.SignalJob(cmd.Job.ID, int(syscall.SIGTERM)), c.IsNil)

	// wait for the job to exit
	done := make(chan error)
	go func() {
		done <- cmd.Wait()
	}()
	select {
	case err := <-done:
		t.Assert(err, c.IsNil)
	case <-time.After(12 * time.Second):
		t.Fatal("timed out waiting for job to stop")
	}

	// check the output
	t.Assert(out.String(), c.Equals, "got signal: terminated")
}
func (s *SchedulerSuite) addHosts(t *c.C, count int) []string {
	debugf(t, "adding %d hosts", count)
	ch := make(chan *host.HostEvent)
	stream := s.clusterClient(t).StreamHostEvents(ch)
	defer stream.Close()

	hosts := make([]string, 0, count)
	for i := 0; i < count; i++ {
		res, err := httpClient.PostForm(args.ClusterAPI, url.Values{})
		if err != nil {
			t.Fatal("error in POST request to cluster api:", err)
		}
		res.Body.Close()
		if res.StatusCode != http.StatusOK {
			t.Fatal("expected 200 status, got", res.Status)
		}
		select {
		case event := <-ch:
			debug(t, "host added ", event.HostID)
			hosts = append(hosts, event.HostID)
		case <-time.After(20 * time.Second):
			t.Fatal("timed out waiting for new host")
		}
	}
	return hosts
}
func (s *SchedulerSuite) TestTCPApp(t *c.C) {
	app, _ := s.createApp(t)

	stream, err := s.controllerClient(t).StreamJobEvents(app.ID, 0)
	t.Assert(err, c.IsNil)
	defer stream.Close()

	t.Assert(flynn(t, "/", "-a", app.Name, "scale", "echoer=1"), Succeeds)

	newRoute := flynn(t, "/", "-a", app.Name, "route", "add", "tcp", "-s", "echo-service")
	t.Assert(newRoute, Succeeds)
	t.Assert(newRoute.Output, Matches, `.+ on port \d+`)
	str := strings.Split(strings.TrimSpace(string(newRoute.Output)), " ")
	port := str[len(str)-1]

	waitForJobEvents(t, stream.Events, jobEvents{"echoer": {"up": 1}})

	// use Attempts to give the processes time to start
	if err := Attempts.Run(func() error {
		servAddr := routerIP + ":" + port
		conn, err := net.Dial("tcp", servAddr)
		if err != nil {
			return err
		}
		defer conn.Close()
		echo := random.Bytes(16)
		_, err = conn.Write(echo)
		if err != nil {
			return err
		}
		reply := make([]byte, 16)
		_, err = conn.Read(reply)
		if err != nil {
			return err
		}
		t.Assert(reply, c.DeepEquals, echo)
		return nil
	}); err != nil {
		t.Fatal(err)
	}
}
func (s *HostSuite) TestDevSHM(t *c.C) {
	cmd := exec.CommandUsingCluster(
		s.clusterClient(t),
		exec.DockerImage(imageURIs["test-apps"]),
		"sh", "-c", "df -h /dev/shm && echo foo > /dev/shm/asdf",
	)
	var out bytes.Buffer
	cmd.Stdout = &out
	cmd.Stderr = &out
	runErr := make(chan error)
	go func() {
		runErr <- cmd.Run()
	}()
	select {
	case err := <-runErr:
		t.Assert(err, c.IsNil)
	case <-time.After(30 * time.Second):
		t.Fatal("timed out waiting for /dev/shm job")
	}
	t.Assert(out.String(), c.Equals, "Filesystem Size Used Available Use% Mounted on\ntmpfs 64.0M 0 64.0M 0% /dev/shm\n")
}
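// addHosts boots count new hosts via the test cluster, first waiting for the
// current state of the router-api service watch that AddHost uses to detect
// each new host coming up.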
func (h *Helper) addHosts(t *c.C, count int, vanilla bool) []*tc.Instance {
	debugf(t, "adding %d hosts", count)

	// wait for the router-api to start on the host (rather than using
	// StreamHostEvents) as we wait for router-api when removing the
	// host (so that could fail if the router-api never starts).
	events := make(chan *discoverd.Event)
	stream, err := h.discoverdClient(t).Service("router-api").Watch(events)
	t.Assert(err, c.IsNil)
	defer stream.Close()

	// wait for the current state
loop:
	for {
		select {
		case e, ok := <-events:
			if !ok {
				t.Fatal("event stream closed unexpectedly")
			}
			if e.Kind == discoverd.EventKindCurrent {
				break loop
			}
		case <-time.After(10 * time.Second):
			t.Fatal("timed out waiting for current service state")
		}
	}

	hosts := make([]*tc.Instance, count)
	for i := 0; i < count; i++ {
		host, err := testCluster.AddHost(events, vanilla)
		t.Assert(err, c.IsNil)
		debugf(t, "host added: %s", host.ID)
		hosts[i] = host
	}
	return hosts
}
func (s *SchedulerSuite) removeHosts(t *c.C, ids []string) {
	debugf(t, "removing %d hosts", len(ids))

	// Wait for router-api services to disappear to indicate host
	// removal (rather than using StreamHostEvents), so that other
	// tests won't try and connect to this host via service discovery.
	set, err := s.discoverdClient(t).NewServiceSet("router-api")
	t.Assert(err, c.IsNil)
	defer set.Close()
	updates := set.Watch(false)
	defer set.Unwatch(updates)

	for _, id := range ids {
		req, err := http.NewRequest("DELETE", args.ClusterAPI+"?host="+id, nil)
		if err != nil {
			t.Fatal("error in DELETE request to cluster api:", err)
		}
		res, err := httpClient.Do(req)
		if err != nil {
			t.Fatal("error in DELETE request to cluster api:", err)
		}
		res.Body.Close()
		if res.StatusCode != http.StatusOK {
			t.Fatal("expected 200 status, got", res.Status)
		}
	loop:
		for {
			select {
			case update := <-updates:
				if !update.Online {
					debug(t, "host removed ", update.Addr)
					break loop
				}
			case <-time.After(20 * time.Second):
				t.Fatal("timed out waiting for host removal")
			}
		}
	}
}
func (s *ZDiscoverdSuite) TestDeploy(t *c.C) {
	// ensure we have enough hosts in the cluster
	hosts, err := s.clusterClient(t).Hosts()
	t.Assert(err, c.IsNil)
	if len(hosts) <= 1 {
		t.Skip("cannot deploy discoverd in a single node cluster")
	}

	client := s.controllerClient(t)
	app, err := client.GetApp("discoverd")
	t.Assert(err, c.IsNil)
	release, err := client.GetAppRelease(app.ID)
	t.Assert(err, c.IsNil)
	release.ID = ""
	t.Assert(client.CreateRelease(release), c.IsNil)
	deployment, err := client.CreateDeployment(app.ID, release.ID)
	t.Assert(err, c.IsNil)

	events := make(chan *ct.DeploymentEvent)
	stream, err := client.StreamDeployment(deployment, events)
	t.Assert(err, c.IsNil)
	defer stream.Close()

loop:
	for {
		select {
		case event, ok := <-events:
			if !ok {
				t.Fatal("unexpected close of deployment event stream")
			}
			if event.Status == "complete" {
				debugf(t, "got deployment event: %s", event.Status)
				break loop
			}
			if event.Status == "failed" {
				t.Fatal("the deployment failed")
			}
			debugf(t, "got deployment event: %s %s", event.JobType, event.JobState)
		case <-time.After(time.Duration(app.DeployTimeout) * time.Second):
			t.Fatal("timed out waiting for deployment event")
		}
	}
}
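// TestUpdate starts a flynn-host daemon with the mock backend and exercises
// its Update API: updating to a program that exits early should fail and
// leave the original daemon running, while updating to a new flynn-host
// binary should hand over to a new daemon with a new PID.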
func (s *HostSuite) TestUpdate(t *c.C) {
	dir := t.MkDir()
	flynnHost := filepath.Join(dir, "flynn-host")
	run(t, osexec.Command("cp", args.FlynnHost, flynnHost))

	// start flynn-host
	id := random.String(8)
	var out bytes.Buffer
	cmd := osexec.Command(
		flynnHost,
		"daemon",
		"--http-port", "11113",
		"--state", filepath.Join(dir, "host-state.bolt"),
		"--id", id,
		"--backend", "mock",
		"--vol-provider", "mock",
		"--volpath", filepath.Join(dir, "volumes"),
	)
	cmd.Stdout = &out
	cmd.Stderr = &out
	defer func() {
		debug(t, "*** flynn-host output ***")
		debug(t, out.String())
		debug(t, "*************************")
	}()
	t.Assert(cmd.Start(), c.IsNil)
	defer cmd.Process.Kill()

	httpClient := &http.Client{Transport: &http.Transport{Dial: dialer.Retry.Dial}}
	client := cluster.NewHost(id, "http://127.0.0.1:11113", httpClient)

	// exec a program which exits straight away
	_, err := client.Update("/bin/true")
	t.Assert(err, c.NotNil)
	status, err := client.GetStatus()
	t.Assert(err, c.IsNil)
	t.Assert(status.ID, c.Equals, id)
	t.Assert(status.PID, c.Equals, cmd.Process.Pid)

	// exec a program which reads the control socket but then exits
	_, err = client.Update("/bin/bash", "-c", "<&4; exit")
	t.Assert(err, c.NotNil)
	status, err = client.GetStatus()
	t.Assert(err, c.IsNil)
	t.Assert(status.ID, c.Equals, id)
	t.Assert(status.PID, c.Equals, cmd.Process.Pid)

	// exec flynn-host and check we get the status from the new daemon
	pid, err := client.Update(
		flynnHost,
		"daemon",
		"--http-port", "11113",
		"--state", filepath.Join(dir, "host-state.bolt"),
		"--id", id,
		"--backend", "mock",
		"--vol-provider", "mock",
		"--volpath", filepath.Join(dir, "volumes"),
	)
	t.Assert(err, c.IsNil)
	defer syscall.Kill(pid, syscall.SIGKILL)

	done := make(chan struct{})
	go func() {
		cmd.Process.Signal(syscall.SIGTERM)
		syscall.Wait4(cmd.Process.Pid, nil, 0, nil)
		close(done)
	}()
	select {
	case <-done:
	case <-time.After(15 * time.Second):
		t.Fatal("timed out waiting for flynn-host daemon to exit")
	}

	// client.GetStatus intermittently returns io.EOF right after the update. We
	// don't currently understand why (likely due to the way the listener is
	// passed around), so for now just retry the request.
	//
	// TODO(lmars): figure out why and remove this loop.
	delay := 100 * time.Millisecond
	for start := time.Now(); time.Since(start) < 10*time.Second; time.Sleep(delay) {
		status, err = client.GetStatus()
		if e, ok := err.(*url.Error); ok && strings.Contains(e.Err.Error(), "EOF") {
			debugf(t, "got io.EOF from flynn-host, trying again in %s", delay)
			continue
		}
		break
	}
	t.Assert(err, c.IsNil)
	t.Assert(status.ID, c.Equals, id)
	t.Assert(status.PID, c.Equals, pid)
}
func testSireniaDeploy(client controller.Client, disc *discoverd.Client, t *c.C, d *sireniaDeploy) {
	// create app
	app := &ct.App{Name: d.name, Strategy: "sirenia"}
	t.Assert(client.CreateApp(app), c.IsNil)

	// copy release from default app
	release, err := client.GetAppRelease(d.db.appName)
	t.Assert(err, c.IsNil)
	release.ID = ""
	release.Env[d.db.hostKey] = fmt.Sprintf("leader.%s.discoverd", d.name)
	release.Env[d.db.serviceKey] = d.name
	procName := release.Env["SIRENIA_PROCESS"]
	proc := release.Processes[procName]
	delete(proc.Env, "SINGLETON")
	proc.Service = d.name
	release.Processes[procName] = proc
	t.Assert(client.CreateRelease(release), c.IsNil)
	t.Assert(client.SetAppRelease(app.ID, release.ID), c.IsNil)
	oldRelease := release.ID

	// create formation
	discEvents := make(chan *discoverd.Event)
	discService := disc.Service(d.name)
	discStream, err := discService.Watch(discEvents)
	t.Assert(err, c.IsNil)
	defer discStream.Close()
	jobEvents := make(chan *ct.Job)
	jobStream, err := client.StreamJobEvents(d.name, jobEvents)
	t.Assert(err, c.IsNil)
	defer jobStream.Close()
	t.Assert(client.PutFormation(&ct.Formation{
		AppID:     app.ID,
		ReleaseID: release.ID,
		Processes: map[string]int{procName: d.sireniaJobs, "web": d.webJobs},
	}), c.IsNil)

	// watch cluster state changes
	type stateChange struct {
		state *state.State
		err   error
	}
	stateCh := make(chan stateChange)
	go func() {
		for event := range discEvents {
			if event.Kind != discoverd.EventKindServiceMeta {
				continue
			}
			var state state.State
			if err := json.Unmarshal(event.ServiceMeta.Data, &state); err != nil {
				stateCh <- stateChange{err: err}
				return
			}
			primary := ""
			if state.Primary != nil {
				primary = state.Primary.Addr
			}
			sync := ""
			if state.Sync != nil {
				sync = state.Sync.Addr
			}
			var async []string
			for _, a := range state.Async {
				async = append(async, a.Addr)
			}
			debugf(t, "got cluster state: index=%d primary=%s sync=%s async=%s",
				event.ServiceMeta.Index, primary, sync, strings.Join(async, ","))
			stateCh <- stateChange{state: &state}
		}
	}()

	// wait for correct cluster state and number of web processes
	var sireniaState state.State
	var webJobs int
	ready := func() bool {
		if webJobs != d.webJobs {
			return false
		}
		if sireniaState.Primary == nil {
			return false
		}
		if d.sireniaJobs > 1 && sireniaState.Sync == nil {
			return false
		}
		if d.sireniaJobs > 2 && len(sireniaState.Async) != d.sireniaJobs-2 {
			return false
		}
		return true
	}
	for {
		if ready() {
			break
		}
		select {
		case s := <-stateCh:
			t.Assert(s.err, c.IsNil)
			sireniaState = *s.state
		case e, ok := <-jobEvents:
			if !ok {
				t.Fatalf("job event stream closed: %s", jobStream.Err())
			}
			debugf(t, "got job event: %s %s %s", e.Type, e.ID, e.State)
			if e.Type == "web" && e.State == ct.JobStateUp {
				webJobs++
			}
		case <-time.After(30 * time.Second):
			t.Fatal("timed out waiting for formation")
		}
	}

	// wait for the primary to indicate downstream replication sync
	debug(t, "waiting for primary to indicate downstream replication sync")
	sireniaClient := sc.NewClient(sireniaState.Primary.Addr)
	t.Assert(sireniaClient.WaitForReplSync(sireniaState.Sync, 1*time.Minute), c.IsNil)

	// connect to the db so we can test writes
	d.db.initDb(t, release, d)

	// check currently writeable
	d.db.assertWriteable(t, release, d)

	// check a deploy completes with expected cluster state changes
	release.ID = ""
	t.Assert(client.CreateRelease(release), c.IsNil)
	newRelease := release.ID
	deployment, err := client.CreateDeployment(app.ID, newRelease)
	t.Assert(err, c.IsNil)
	deployEvents := make(chan *ct.DeploymentEvent)
	deployStream, err := client.StreamDeployment(deployment, deployEvents)
	t.Assert(err, c.IsNil)
	defer deployStream.Close()

	// assertNextState checks that the next state received is in the remaining states
	// that were expected, so handles the fact that some states don't happen, but the
	// states that do happen are expected and in-order.
	assertNextState := func(remaining []expectedSireniaState) int {
		var state state.State
	loop:
		for {
			select {
			case s := <-stateCh:
				t.Assert(s.err, c.IsNil)
				if len(s.state.Async) < d.expectedAsyncs() {
					// we shouldn't usually receive states with less asyncs than
					// expected, but they can occur as an intermediate state between
					// two expected states (e.g. when a sync does a takeover at the
					// same time as a new async is started) so just ignore them.
					debug(t, "ignoring state with too few asyncs")
					continue
				}
				state = *s.state
				break loop
			case <-time.After(60 * time.Second):
				t.Fatal("timed out waiting for cluster state")
			}
		}
		if state.Primary == nil {
			t.Fatal("no primary configured")
		}
		log := func(format string, v ...interface{}) {
			debugf(t, "skipping expected state: %s", fmt.Sprintf(format, v...))
		}
	outer:
		for i, expected := range remaining {
			if state.Primary.Meta["FLYNN_RELEASE_ID"] != expected.Primary {
				log("primary has incorrect release")
				continue
			}
			if state.Sync == nil {
				if expected.Sync == "" {
					return i
				}
				log("state has no sync node")
				continue
			}
			if state.Sync.Meta["FLYNN_RELEASE_ID"] != expected.Sync {
				log("sync has incorrect release")
				continue
			}
			if state.Async == nil {
				if expected.Async == nil {
					return i
				}
				log("state has no async nodes")
				continue
			}
			if len(state.Async) != len(expected.Async) {
				log("expected %d asyncs, got %d", len(expected.Async), len(state.Async))
				continue
			}
			for i, release := range expected.Async {
				if state.Async[i].Meta["FLYNN_RELEASE_ID"] != release {
					log("async[%d] has incorrect release", i)
					continue outer
				}
			}
			return i
		}
		t.Fatal("unexpected state")
		return -1
	}

	expected := d.expected(oldRelease, newRelease)
	var expectedIndex, newWebJobs int
loop:
	for {
		select {
		case e, ok := <-deployEvents:
			if !ok {
				t.Fatal("unexpected close of deployment event stream")
			}
			switch e.Status {
			case "complete":
				break loop
			case "failed":
				t.Fatalf("deployment failed: %s", e.Error)
			}
			debugf(t, "got deployment event: %s %s", e.JobType, e.JobState)
			if e.JobState != ct.JobStateUp && e.JobState != ct.JobStateDown {
				continue
			}
			switch e.JobType {
			case procName:
				// move on if we have seen all the expected events
				if expectedIndex >= len(expected) {
					continue
				}
				skipped := assertNextState(expected[expectedIndex:])
				expectedIndex += 1 + skipped
			case "web":
				if e.JobState == ct.JobStateUp && e.ReleaseID == newRelease {
					newWebJobs++
				}
			}
		case <-time.After(2 * time.Minute):
			t.Fatal("timed out waiting for deployment")
		}
	}

	// check we have the correct number of new web jobs
	t.Assert(newWebJobs, c.Equals, d.webJobs)

	// check writeable now deploy is complete
	d.db.assertWriteable(t, release, d)
}
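// TestUpdateLogs checks that a log line partially written before flynn-host
// is updated is still delivered as a single complete line once logging
// finishes after the update.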
func (s *HostUpdateSuite) TestUpdateLogs(t *c.C) {
	if testCluster == nil {
		t.Skip("cannot boot new hosts")
	}
	instance := s.addHost(t)
	defer s.removeHost(t, instance)
	httpClient := &http.Client{Transport: &http.Transport{Dial: dialer.Retry.Dial}}
	client := cluster.NewHost(instance.ID, fmt.Sprintf("http://%s:1113", instance.IP), httpClient)

	// start partial logger job
	cmd := exec.JobUsingHost(
		client,
		exec.DockerImage(imageURIs["test-apps"]),
		&host.Job{
			Config: host.ContainerConfig{Cmd: []string{"/bin/partial-logger"}},
			Metadata: map[string]string{
				"flynn-controller.app": "partial-logger",
			},
		},
	)
	t.Assert(cmd.Start(), c.IsNil)
	defer cmd.Kill()

	// wait for partial line
	_, err := s.discoverdClient(t).Instances("partial-logger", 10*time.Second)
	t.Assert(err, c.IsNil)

	// update flynn-host
	pid, err := client.Update("/usr/local/bin/flynn-host", "daemon", "--id", cmd.HostID)
	t.Assert(err, c.IsNil)
	// update the pid file so removeHost works
	t.Assert(instance.Run(fmt.Sprintf("echo -n %d | sudo tee /var/run/flynn-host.pid", pid), nil), c.IsNil)

	// finish logging
	t.Assert(client.SignalJob(cmd.Job.ID, int(syscall.SIGUSR1)), c.IsNil)

	// check we get a single log line
	logc, err := logaggc.New("")
	t.Assert(err, c.IsNil)
	log, err := logc.GetLog("partial-logger", &logaggc.LogOpts{Follow: true})
	t.Assert(err, c.IsNil)
	defer log.Close()
	msgs := make(chan *logaggc.Message)
	go func() {
		defer close(msgs)
		dec := json.NewDecoder(log)
		for {
			var msg logaggc.Message
			if err := dec.Decode(&msg); err != nil {
				debugf(t, "error decoding message: %s", err)
				return
			}
			msgs <- &msg
		}
	}()
	for {
		select {
		case msg, ok := <-msgs:
			if !ok {
				t.Fatal("error getting log")
			}
			if msg.Stream == "stdout" {
				t.Assert(msg.Msg, c.Equals, "hello world")
				return
			}
		case <-time.After(10 * time.Second):
			t.Fatal("timed out waiting for log")
		}
	}
}
func (s *DomainMigrationSuite) migrateDomain(t *c.C, dm *ct.DomainMigration) {
	debugf(t, "migrating domain from %s to %s", dm.OldDomain, dm.Domain)
	client := s.controllerClient(t)

	events := make(chan *ct.Event)
	stream, err := client.StreamEvents(controller.StreamEventsOptions{
		ObjectTypes: []ct.EventType{ct.EventTypeDomainMigration},
	}, events)
	t.Assert(err, c.IsNil)
	defer stream.Close()

	prevRouterRelease, err := client.GetAppRelease("router")
	t.Assert(err, c.IsNil)

	err = client.PutDomain(dm)
	t.Assert(err, c.IsNil)

	waitEvent := func(typ string, timeout time.Duration) (event ct.DomainMigrationEvent) {
		debugf(t, "waiting for %s domain migration event", typ)
		var e *ct.Event
		var ok bool
		select {
		case e, ok = <-events:
			if !ok {
				t.Fatal("event stream closed unexpectedly")
			}
			debugf(t, "got %s domain migration event", typ)
		case <-time.After(timeout):
			t.Fatalf("timed out waiting for %s domain migration event", typ)
		}
		t.Assert(e.Data, c.NotNil)
		t.Assert(json.Unmarshal(e.Data, &event), c.IsNil)
		return
	}

	// created
	event := waitEvent("initial", 2*time.Minute)
	t.Assert(event.Error, c.Equals, "")
	t.Assert(event.DomainMigration, c.NotNil)
	t.Assert(event.DomainMigration.ID, c.Equals, dm.ID)
	t.Assert(event.DomainMigration.OldDomain, c.Equals, dm.OldDomain)
	t.Assert(event.DomainMigration.Domain, c.Equals, dm.Domain)
	t.Assert(event.DomainMigration.TLSCert, c.IsNil)
	t.Assert(event.DomainMigration.OldTLSCert, c.NotNil)
	t.Assert(event.DomainMigration.CreatedAt, c.NotNil)
	t.Assert(event.DomainMigration.CreatedAt.Equal(*dm.CreatedAt), c.Equals, true)
	t.Assert(event.DomainMigration.FinishedAt, c.IsNil)

	// complete
	event = waitEvent("final", 3*time.Minute)
	t.Assert(event.Error, c.Equals, "")
	t.Assert(event.DomainMigration, c.NotNil)
	t.Assert(event.DomainMigration.ID, c.Equals, dm.ID)
	t.Assert(event.DomainMigration.OldDomain, c.Equals, dm.OldDomain)
	t.Assert(event.DomainMigration.Domain, c.Equals, dm.Domain)
	t.Assert(event.DomainMigration.TLSCert, c.NotNil)
	t.Assert(event.DomainMigration.OldTLSCert, c.NotNil)
	t.Assert(event.DomainMigration.CreatedAt, c.NotNil)
	t.Assert(event.DomainMigration.CreatedAt.Equal(*dm.CreatedAt), c.Equals, true)
	t.Assert(event.DomainMigration.FinishedAt, c.NotNil)

	cert := event.DomainMigration.TLSCert

	controllerRelease, err := client.GetAppRelease("controller")
	t.Assert(err, c.IsNil)
	t.Assert(controllerRelease.Env["DEFAULT_ROUTE_DOMAIN"], c.Equals, dm.Domain)
	t.Assert(controllerRelease.Env["CA_CERT"], c.Equals, cert.CACert)

	routerRelease, err := client.GetAppRelease("router")
	t.Assert(err, c.IsNil)
	t.Assert(routerRelease.Env["TLSCERT"], c.Equals, cert.Cert)
	t.Assert(routerRelease.Env["TLSKEY"], c.Not(c.Equals), "")
	t.Assert(routerRelease.Env["TLSKEY"], c.Not(c.Equals), prevRouterRelease.Env["TLSKEY"])

	dashboardRelease, err := client.GetAppRelease("dashboard")
	t.Assert(err, c.IsNil)
	t.Assert(dashboardRelease.Env["DEFAULT_ROUTE_DOMAIN"], c.Equals, dm.Domain)
	t.Assert(dashboardRelease.Env["CONTROLLER_DOMAIN"], c.Equals, fmt.Sprintf("controller.%s", dm.Domain))
	t.Assert(dashboardRelease.Env["URL"], c.Equals, fmt.Sprintf("dashboard.%s", dm.Domain))
	t.Assert(dashboardRelease.Env["CA_CERT"], c.Equals, cert.CACert)

	var doPing func(string, int)
	doPing = func(component string, retriesRemaining int) {
		url := fmt.Sprintf("http://%s.%s/ping", component, dm.Domain)
		res, err := (&http.Client{}).Get(url)
		if (err != nil || res.StatusCode != 200) && retriesRemaining > 0 {
			time.Sleep(100 * time.Millisecond)
			doPing(component, retriesRemaining-1)
			return
		}
		t.Assert(err, c.IsNil)
		t.Assert(res.StatusCode, c.Equals, 200, c.Commentf("failed to ping %s", component))
	}
	doPing("controller", 3)
	doPing("dashboard", 3)
}
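// TestDeployController deploys a new release of the controller app and then
// verifies that the expected web, worker and scheduler jobs for that release
// are running across the cluster's hosts.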
func (s *SchedulerSuite) TestDeployController(t *c.C) {
	// get the current controller release
	client := s.controllerClient(t)
	app, err := client.GetApp("controller")
	t.Assert(err, c.IsNil)
	release, err := client.GetAppRelease(app.ID)
	t.Assert(err, c.IsNil)

	// get the current controller formation
	formation, err := client.GetFormation(app.ID, release.ID)
	t.Assert(err, c.IsNil)

	// create a controller deployment
	release.ID = ""
	t.Assert(client.CreateRelease(release), c.IsNil)
	deployment, err := client.CreateDeployment(app.ID, release.ID)
	t.Assert(err, c.IsNil)

	events := make(chan *ct.DeploymentEvent)
	eventStream, err := client.StreamDeployment(deployment, events)
	t.Assert(err, c.IsNil)
	defer eventStream.Close()

	// wait for the deploy to complete (this doesn't wait for specific events
	// because when the deployer deploys itself, some events will not get sent)
loop:
	for {
		select {
		case e, ok := <-events:
			if !ok {
				t.Fatal("unexpected close of deployment event stream")
			}
			debugf(t, "got deployment event: %s %s", e.JobType, e.JobState)
			switch e.Status {
			case "complete":
				break loop
			case "failed":
				t.Fatal("the deployment failed")
			}
		case <-time.After(time.Duration(app.DeployTimeout) * time.Second):
			t.Fatal("timed out waiting for the deploy to complete")
		}
	}

	// check the correct controller jobs are running
	hosts, err := s.clusterClient(t).Hosts()
	t.Assert(err, c.IsNil)
	t.Assert(hosts, c.Not(c.HasLen), 0)
	actual := make(map[string]map[string]int)
	for _, h := range hosts {
		jobs, err := h.ListJobs()
		t.Assert(err, c.IsNil)
		for _, job := range jobs {
			if job.Status != host.StatusRunning {
				continue
			}
			appID := job.Job.Metadata["flynn-controller.app"]
			if appID != app.ID {
				continue
			}
			releaseID := job.Job.Metadata["flynn-controller.release"]
			if _, ok := actual[releaseID]; !ok {
				actual[releaseID] = make(map[string]int)
			}
			typ := job.Job.Metadata["flynn-controller.type"]
			actual[releaseID][typ]++
		}
	}
	expected := map[string]map[string]int{release.ID: {
		"web":       formation.Processes["web"],
		"worker":    formation.Processes["worker"],
		"scheduler": len(hosts),
	}}
	t.Assert(actual, c.DeepEquals, expected)
}
func (s *SchedulerSuite) TestRollbackController(t *c.C) {
	// get the current controller release
	client := s.controllerClient(t)
	app, err := client.GetApp("controller")
	t.Assert(err, c.IsNil)
	release, err := client.GetAppRelease(app.ID)
	t.Assert(err, c.IsNil)

	watcher, err := s.controllerClient(t).WatchJobEvents(app.ID, release.ID)
	t.Assert(err, c.IsNil)
	defer watcher.Close()

	// get the current controller formation
	formation, err := client.GetFormation(app.ID, release.ID)
	t.Assert(err, c.IsNil)

	currentReleaseID := release.ID

	// create a controller deployment that will fail
	release.ID = ""
	worker := release.Processes["worker"]
	worker.Entrypoint = []string{"/i/dont/exist"}
	release.Processes["worker"] = worker
	t.Assert(client.CreateRelease(release), c.IsNil)
	deployment, err := client.CreateDeployment(app.ID, release.ID)
	t.Assert(err, c.IsNil)

	events := make(chan *ct.DeploymentEvent)
	eventStream, err := client.StreamDeployment(deployment, events)
	t.Assert(err, c.IsNil)
	defer eventStream.Close()

	// wait for the deploy to fail
loop:
	for {
		select {
		case e, ok := <-events:
			if !ok {
				t.Fatal("unexpected close of deployment event stream")
			}
			debugf(t, "got deployment event: %s %s", e.JobType, e.JobState)
			switch e.Status {
			case "complete":
				t.Fatal("the deployment succeeded when it should have failed")
			case "failed":
				break loop
			}
		case <-time.After(2 * time.Minute):
			t.Fatal("timed out waiting for the deploy to fail")
		}
	}

	// wait for jobs to come back up
	hosts, err := s.clusterClient(t).Hosts()
	expected := map[string]map[ct.JobState]int{
		"web":       {ct.JobStateUp: formation.Processes["web"]},
		"scheduler": {ct.JobStateUp: len(hosts)},
	}
	t.Assert(watcher.WaitFor(expected, scaleTimeout, nil), c.IsNil)

	// check the correct controller jobs are running
	t.Assert(err, c.IsNil)
	t.Assert(hosts, c.Not(c.HasLen), 0)
	actual := make(map[string]map[string]int)
	for _, h := range hosts {
		jobs, err := h.ListJobs()
		t.Assert(err, c.IsNil)
		for _, job := range jobs {
			if job.Status != host.StatusRunning {
				continue
			}
			appID := job.Job.Metadata["flynn-controller.app"]
			if appID != app.ID {
				continue
			}
			releaseID := job.Job.Metadata["flynn-controller.release"]
			if releaseID != currentReleaseID {
				continue
			}
			if _, ok := actual[releaseID]; !ok {
				actual[releaseID] = make(map[string]int)
			}
			typ := job.Job.Metadata["flynn-controller.type"]
			actual[releaseID][typ]++
		}
	}
	t.Assert(actual, c.DeepEquals, map[string]map[string]int{
		currentReleaseID: {
			"web":       formation.Processes["web"],
			"scheduler": formation.Processes["scheduler"] * len(hosts),
			"worker":    formation.Processes["worker"],
		},
	})
}
func (s *SchedulerSuite) TestDeployController(t *c.C) {
	if testCluster == nil {
		t.Skip("cannot determine test cluster size")
	}

	// get the current controller release
	client := s.controllerClient(t)
	app, err := client.GetApp("controller")
	t.Assert(err, c.IsNil)
	release, err := client.GetAppRelease(app.ID)
	t.Assert(err, c.IsNil)

	// create a controller deployment
	release.ID = ""
	t.Assert(client.CreateRelease(release), c.IsNil)
	deployment, err := client.CreateDeployment(app.ID, release.ID)
	t.Assert(err, c.IsNil)

	// use a function to create the event stream as a new stream will be needed
	// after deploying the controller
	var events chan *ct.DeploymentEvent
	var eventStream stream.Stream
	connectStream := func() {
		events = make(chan *ct.DeploymentEvent)
		err := attempt.Strategy{
			Total: 10 * time.Second,
			Delay: 500 * time.Millisecond,
		}.Run(func() (err error) {
			eventStream, err = client.StreamDeployment(deployment.ID, events)
			return
		})
		t.Assert(err, c.IsNil)
	}
	connectStream()
	defer eventStream.Close()

	// wait for the deploy to complete (this doesn't wait for specific events
	// because when the deployer deploys itself, some events will not get sent)
loop:
	for {
		select {
		case e, ok := <-events:
			if !ok {
				// reconnect the stream as it may have been closed
				// due to the controller being deployed
				debug(t, "reconnecting deployment event stream")
				connectStream()
				continue
			}
			debugf(t, "got deployment event: %s %s", e.JobType, e.JobState)
			switch e.Status {
			case "complete":
				break loop
			case "failed":
				t.Fatal("the deployment failed")
			}
		case <-time.After(60 * time.Second):
			t.Fatal("timed out waiting for the deploy to complete")
		}
	}

	// check the correct controller jobs are running
	hosts, err := s.clusterClient(t).ListHosts()
	t.Assert(err, c.IsNil)
	actual := make(map[string]map[string]int)
	for _, host := range hosts {
		for _, job := range host.Jobs {
			appID := job.Metadata["flynn-controller.app"]
			if appID != app.ID {
				continue
			}
			releaseID := job.Metadata["flynn-controller.release"]
			if _, ok := actual[releaseID]; !ok {
				actual[releaseID] = make(map[string]int)
			}
			typ := job.Metadata["flynn-controller.type"]
			actual[releaseID][typ]++
		}
	}
	expected := map[string]map[string]int{release.ID: {
		"web":       2,
		"deployer":  2,
		"scheduler": testCluster.Size(),
	}}
	t.Assert(actual, c.DeepEquals, expected)
}
func (s *SchedulerSuite) TestControllerRestart(t *c.C) {
	// get the current controller details
	app, err := s.controllerClient(t).GetApp("controller")
	t.Assert(err, c.IsNil)
	release, err := s.controllerClient(t).GetAppRelease("controller")
	t.Assert(err, c.IsNil)
	formation, err := s.controllerClient(t).GetFormation(app.ID, release.ID)
	t.Assert(err, c.IsNil)
	list, err := s.controllerClient(t).JobList("controller")
	t.Assert(err, c.IsNil)
	var jobs []*ct.Job
	for _, job := range list {
		if job.Type == "web" && job.State == ct.JobStateUp {
			jobs = append(jobs, job)
		}
	}
	t.Assert(jobs, c.HasLen, formation.Processes["web"])
	jobID := jobs[0].ID
	hostID, _ := cluster.ExtractHostID(jobID)
	t.Assert(hostID, c.Not(c.Equals), "")
	debugf(t, "current controller app[%s] host[%s] job[%s]", app.ID, hostID, jobID)

	// subscribe to service events, wait for current event
	events := make(chan *discoverd.Event)
	stream, err := s.discoverdClient(t).Service("controller").Watch(events)
	t.Assert(err, c.IsNil)
	defer stream.Close()
	type serviceEvents map[discoverd.EventKind]int
	wait := func(expected serviceEvents) {
		actual := make(serviceEvents)
	outer:
		for {
			select {
			case event := <-events:
				actual[event.Kind]++
				for kind, count := range expected {
					if actual[kind] != count {
						continue outer
					}
				}
				return
			case <-time.After(scaleTimeout):
				t.Fatal("timed out waiting for controller service event")
			}
		}
	}
	wait(serviceEvents{discoverd.EventKindCurrent: 1})

	// start another controller and wait for it to come up
	debug(t, "scaling the controller up")
	formation.Processes["web"]++
	t.Assert(s.controllerClient(t).PutFormation(formation), c.IsNil)
	wait(serviceEvents{discoverd.EventKindUp: 1})

	// kill the first controller and check the scheduler brings it back online
	cc := cluster.NewClientWithServices(s.discoverdClient(t).Service)
	hc, err := cc.Host(hostID)
	t.Assert(err, c.IsNil)
	debug(t, "stopping job ", jobID)
	t.Assert(hc.StopJob(jobID), c.IsNil)
	wait(serviceEvents{discoverd.EventKindUp: 1, discoverd.EventKindDown: 1})

	// scale back down
	debug(t, "scaling the controller down")
	formation.Processes["web"]--
	t.Assert(s.controllerClient(t).PutFormation(formation), c.IsNil)
	wait(serviceEvents{discoverd.EventKindDown: 1})

	// unset the suite's client so other tests use a new client
	s.controller = nil
}
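// TestSlugReleaseGarbageCollection checks that deploying an app with
// gc.max_inactive_slug_releases set triggers garbage collection which deletes
// the oldest inactive slug releases and their blobstore slugs while keeping
// active releases and shared slugs.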
func (s *CLISuite) TestSlugReleaseGarbageCollection(t *c.C) {
	client := s.controllerClient(t)

	// create app with gc.max_inactive_slug_releases=3
	maxInactiveSlugReleases := 3
	app := &ct.App{Meta: map[string]string{"gc.max_inactive_slug_releases": strconv.Itoa(maxInactiveSlugReleases)}}
	t.Assert(client.CreateApp(app), c.IsNil)

	// create an image artifact
	imageArtifact := &ct.Artifact{Type: host.ArtifactTypeDocker, URI: imageURIs["test-apps"]}
	t.Assert(client.CreateArtifact(imageArtifact), c.IsNil)

	// create 5 slug artifacts
	var slug bytes.Buffer
	gz := gzip.NewWriter(&slug)
	t.Assert(tar.NewWriter(gz).Close(), c.IsNil)
	t.Assert(gz.Close(), c.IsNil)
	slugs := []string{
		"http://blobstore.discoverd/1/slug.tgz",
		"http://blobstore.discoverd/2/slug.tgz",
		"http://blobstore.discoverd/3/slug.tgz",
		"http://blobstore.discoverd/4/slug.tgz",
		"http://blobstore.discoverd/5/slug.tgz",
	}
	slugArtifacts := make([]*ct.Artifact, len(slugs))
	for i, uri := range slugs {
		req, err := http.NewRequest("PUT", uri, bytes.NewReader(slug.Bytes()))
		t.Assert(err, c.IsNil)
		res, err := http.DefaultClient.Do(req)
		t.Assert(err, c.IsNil)
		res.Body.Close()
		t.Assert(res.StatusCode, c.Equals, http.StatusOK)
		artifact := &ct.Artifact{
			Type: host.ArtifactTypeFile,
			URI:  uri,
			Meta: map[string]string{"blobstore": "true"},
		}
		t.Assert(client.CreateArtifact(artifact), c.IsNil)
		slugArtifacts[i] = artifact
	}

	// create 6 releases, the second being scaled up and having the
	// same slug as the third (so prevents the slug being deleted)
	releases := make([]*ct.Release, 6)
	for i, r := range []struct {
		slug   *ct.Artifact
		active bool
	}{
		{slugArtifacts[0], false},
		{slugArtifacts[1], true},
		{slugArtifacts[1], false},
		{slugArtifacts[2], false},
		{slugArtifacts[3], false},
		{slugArtifacts[4], false},
	} {
		release := &ct.Release{
			ArtifactIDs: []string{imageArtifact.ID, r.slug.ID},
			Processes: map[string]ct.ProcessType{
				"app": {Cmd: []string{"/bin/pingserv"}, Ports: []ct.Port{{Proto: "tcp"}}},
			},
		}
		t.Assert(client.CreateRelease(release), c.IsNil)
		procs := map[string]int{"app": 0}
		if r.active {
			procs["app"] = 1
		}
		t.Assert(client.PutFormation(&ct.Formation{
			AppID:     app.ID,
			ReleaseID: release.ID,
			Processes: procs,
		}), c.IsNil)
		releases[i] = release
	}

	// scale the last release so we can deploy it
	lastRelease := releases[len(releases)-1]
	watcher, err := client.WatchJobEvents(app.ID, lastRelease.ID)
	t.Assert(err, c.IsNil)
	defer watcher.Close()
	t.Assert(client.PutFormation(&ct.Formation{
		AppID:     app.ID,
		ReleaseID: lastRelease.ID,
		Processes: map[string]int{"app": 1},
	}), c.IsNil)
	t.Assert(watcher.WaitFor(ct.JobEvents{"app": ct.JobUpEvents(1)}, scaleTimeout, nil), c.IsNil)
	t.Assert(client.SetAppRelease(app.ID, lastRelease.ID), c.IsNil)

	// subscribe to garbage collection events
	gcEvents := make(chan *ct.Event)
	stream, err := client.StreamEvents(ct.StreamEventsOptions{
		AppID:       app.ID,
		ObjectTypes: []ct.EventType{ct.EventTypeAppGarbageCollection},
	}, gcEvents)
	t.Assert(err, c.IsNil)
	defer stream.Close()

	// deploy a new release with the same slug as the last release
	newRelease := *lastRelease
	newRelease.ID = ""
	t.Assert(client.CreateRelease(&newRelease), c.IsNil)
	t.Assert(client.DeployAppRelease(app.ID, newRelease.ID), c.IsNil)

	// wait for garbage collection
	select {
	case event, ok := <-gcEvents:
		if !ok {
			t.Fatalf("event stream closed unexpectedly: %s", stream.Err())
		}
		var e ct.AppGarbageCollectionEvent
		t.Assert(json.Unmarshal(event.Data, &e), c.IsNil)
		if e.Error != "" {
			t.Fatalf("garbage collection failed: %s", e.Error)
		}
	case <-time.After(60 * time.Second):
		t.Fatal("timed out waiting for garbage collection")
	}

	// check we have 4 distinct slug releases (so 5 in total, only 3 are
	// inactive)
	list, err := client.AppReleaseList(app.ID)
	t.Assert(err, c.IsNil)
	t.Assert(list, c.HasLen, maxInactiveSlugReleases+2)
	distinctSlugs := make(map[string]struct{}, len(list))
	for _, release := range list {
		files := release.FileArtifactIDs()
		t.Assert(files, c.HasLen, 1)
		distinctSlugs[files[0]] = struct{}{}
	}
	t.Assert(distinctSlugs, c.HasLen, maxInactiveSlugReleases+1)

	// check the first and third releases got deleted, but the rest remain
	assertDeleted := func(release *ct.Release, deleted bool) {
		_, err := client.GetRelease(release.ID)
		if deleted {
			t.Assert(err, c.Equals, controller.ErrNotFound)
		} else {
			t.Assert(err, c.IsNil)
		}
	}
	assertDeleted(releases[0], true)
	assertDeleted(releases[1], false)
	assertDeleted(releases[2], true)
	assertDeleted(releases[3], false)
	assertDeleted(releases[4], false)
	assertDeleted(releases[5], false)
	assertDeleted(&newRelease, false)

	// check the first slug got deleted, but the rest remain
	s.assertURI(t, slugs[0], http.StatusNotFound)
	for i := 1; i < len(slugs); i++ {
		s.assertURI(t, slugs[i], http.StatusOK)
	}
}
func (s *PostgresSuite) testDeploy(t *c.C, d *pgDeploy) {
	// create postgres app
	client := s.controllerClient(t)
	app := &ct.App{Name: d.name, Strategy: "postgres"}
	t.Assert(client.CreateApp(app), c.IsNil)

	// copy release from default postgres app
	release, err := client.GetAppRelease("postgres")
	t.Assert(err, c.IsNil)
	release.ID = ""
	proc := release.Processes["postgres"]
	delete(proc.Env, "SINGLETON")
	proc.Env["FLYNN_POSTGRES"] = d.name
	proc.Service = d.name
	release.Processes["postgres"] = proc
	t.Assert(client.CreateRelease(release), c.IsNil)
	t.Assert(client.SetAppRelease(app.ID, release.ID), c.IsNil)
	oldRelease := release.ID

	// create formation
	discEvents := make(chan *discoverd.Event)
	discStream, err := s.discoverdClient(t).Service(d.name).Watch(discEvents)
	t.Assert(err, c.IsNil)
	defer discStream.Close()
	jobEvents := make(chan *ct.Job)
	jobStream, err := client.StreamJobEvents(d.name, jobEvents)
	t.Assert(err, c.IsNil)
	defer jobStream.Close()
	t.Assert(client.PutFormation(&ct.Formation{
		AppID:     app.ID,
		ReleaseID: release.ID,
		Processes: map[string]int{"postgres": d.pgJobs, "web": d.webJobs},
	}), c.IsNil)

	// watch cluster state changes
	type stateChange struct {
		state *state.State
		err   error
	}
	stateCh := make(chan stateChange)
	go func() {
		for event := range discEvents {
			if event.Kind != discoverd.EventKindServiceMeta {
				continue
			}
			var state state.State
			if err := json.Unmarshal(event.ServiceMeta.Data, &state); err != nil {
				stateCh <- stateChange{err: err}
				return
			}
			primary := ""
			if state.Primary != nil {
				primary = state.Primary.Addr
			}
			sync := ""
			if state.Sync != nil {
				sync = state.Sync.Addr
			}
			var async []string
			for _, a := range state.Async {
				async = append(async, a.Addr)
			}
			debugf(t, "got pg cluster state: index=%d primary=%s sync=%s async=%s",
				event.ServiceMeta.Index, primary, sync, strings.Join(async, ","))
			stateCh <- stateChange{state: &state}
		}
	}()

	// wait for correct cluster state and number of web processes
	var pgState state.State
	var webJobs int
	ready := func() bool {
		if webJobs != d.webJobs {
			return false
		}
		if pgState.Primary == nil {
			return false
		}
		if d.pgJobs > 1 && pgState.Sync == nil {
			return false
		}
		if d.pgJobs > 2 && len(pgState.Async) != d.pgJobs-2 {
			return false
		}
		return true
	}
	for {
		if ready() {
			break
		}
		select {
		case s := <-stateCh:
			t.Assert(s.err, c.IsNil)
			pgState = *s.state
		case e, ok := <-jobEvents:
			if !ok {
				t.Fatalf("job event stream closed: %s", jobStream.Err())
			}
			debugf(t, "got job event: %s %s %s", e.Type, e.ID, e.State)
			if e.Type == "web" && e.State == "up" {
				webJobs++
			}
		case <-time.After(30 * time.Second):
			t.Fatal("timed out waiting for postgres formation")
		}
	}

	// connect to the db so we can test writes
	db := postgres.Wait(d.name, fmt.Sprintf("dbname=postgres user=flynn password=%s", release.Env["PGPASSWORD"]))
	dbname := "deploy-test"
	t.Assert(db.Exec(fmt.Sprintf(`CREATE DATABASE "%s" WITH OWNER = "flynn"`, dbname)), c.IsNil)
	db.Close()
	db, err = postgres.Open(d.name, fmt.Sprintf("dbname=%s user=flynn password=%s", dbname, release.Env["PGPASSWORD"]))
	t.Assert(err, c.IsNil)
	defer db.Close()
	t.Assert(db.Exec(`CREATE TABLE deploy_test ( data text)`), c.IsNil)
	assertWriteable := func() {
		debug(t, "writing to postgres database")
		t.Assert(db.Exec(`INSERT INTO deploy_test (data) VALUES ('data')`), c.IsNil)
	}

	// check currently writeable
	assertWriteable()

	// check a deploy completes with expected cluster state changes
	release.ID = ""
	t.Assert(client.CreateRelease(release), c.IsNil)
	newRelease := release.ID
	deployment, err := client.CreateDeployment(app.ID, newRelease)
	t.Assert(err, c.IsNil)
	deployEvents := make(chan *ct.DeploymentEvent)
	deployStream, err := client.StreamDeployment(deployment, deployEvents)
	t.Assert(err, c.IsNil)
	defer deployStream.Close()

	// assertNextState checks that the next state received is in the remaining states
	// that were expected, so handles the fact that some states don't happen, but the
	// states that do happen are expected and in-order.
	assertNextState := func(remaining []expectedPgState) int {
		var state state.State
	loop:
		for {
			select {
			case s := <-stateCh:
				t.Assert(s.err, c.IsNil)
				if len(s.state.Async) < d.expectedAsyncs() {
					// we shouldn't usually receive states with less asyncs than
					// expected, but they can occur as an intermediate state between
					// two expected states (e.g. when a sync does a takeover at the
					// same time as a new async is started) so just ignore them.
					debug(t, "ignoring state with too few asyncs")
					continue
				}
				state = *s.state
				break loop
			case <-time.After(60 * time.Second):
				t.Fatal("timed out waiting for postgres cluster state")
			}
		}
		if state.Primary == nil {
			t.Fatal("no primary configured")
		}
		log := func(format string, v ...interface{}) {
			debugf(t, "skipping expected state: %s", fmt.Sprintf(format, v...))
		}
	outer:
		for i, expected := range remaining {
			if state.Primary.Meta["FLYNN_RELEASE_ID"] != expected.Primary {
				log("primary has incorrect release")
				continue
			}
			if state.Sync == nil {
				if expected.Sync == "" {
					return i
				}
				log("state has no sync node")
				continue
			}
			if state.Sync.Meta["FLYNN_RELEASE_ID"] != expected.Sync {
				log("sync has incorrect release")
				continue
			}
			if state.Async == nil {
				if expected.Async == nil {
					return i
				}
				log("state has no async nodes")
				continue
			}
			if len(state.Async) != len(expected.Async) {
				log("expected %d asyncs, got %d", len(expected.Async), len(state.Async))
				continue
			}
			for i, release := range expected.Async {
				if state.Async[i].Meta["FLYNN_RELEASE_ID"] != release {
					log("async[%d] has incorrect release", i)
					continue outer
				}
			}
			return i
		}
		t.Fatal("unexpected pg state")
		return -1
	}

	expected := d.expected(oldRelease, newRelease)
	var expectedIndex, newWebJobs int
loop:
	for {
		select {
		case e, ok := <-deployEvents:
			if !ok {
				t.Fatal("unexpected close of deployment event stream")
			}
			switch e.Status {
			case "complete":
				break loop
			case "failed":
				t.Fatalf("deployment failed: %s", e.Error)
			}
			debugf(t, "got deployment event: %s %s", e.JobType, e.JobState)
			if e.JobState != "up" && e.JobState != "down" {
				continue
			}
			switch e.JobType {
			case "postgres":
				// move on if we have seen all the expected events
				if expectedIndex >= len(expected) {
					continue
				}
				skipped := assertNextState(expected[expectedIndex:])
				expectedIndex += 1 + skipped
			case "web":
				if e.JobState == "up" && e.ReleaseID == newRelease {
					newWebJobs++
				}
			}
		case <-time.After(2 * time.Minute):
			t.Fatal("timed out waiting for deployment")
		}
	}

	// check we have the correct number of new web jobs
	t.Assert(newWebJobs, c.Equals, d.webJobs)

	// check writeable now deploy is complete
	assertWriteable()
}