func (s *DeployerSuite) TestRollbackNoService(t *c.C) { // create a running release app, release := s.createRelease(t, "printer", "all-at-once") // deploy a release which will not register the service client := s.controllerClient(t) release.ID = "" printer := release.Processes["printer"] printer.Service = "printer" printer.Ports = []ct.Port{{ Port: 12345, Proto: "tcp", Service: &host.Service{ Name: "printer", Create: true, Check: &host.HealthCheck{ Type: "tcp", Interval: 100 * time.Millisecond, Threshold: 1, KillDown: true, StartTimeout: 100 * time.Millisecond, }, }, }} release.Processes["printer"] = printer t.Assert(client.CreateRelease(release), c.IsNil) deployment, err := client.CreateDeployment(app.ID, release.ID) t.Assert(err, c.IsNil) // check the deployment fails events := make(chan *ct.DeploymentEvent) stream, err := client.StreamDeployment(deployment, events) t.Assert(err, c.IsNil) defer stream.Close() expected := []*ct.DeploymentEvent{ {ReleaseID: release.ID, JobType: "", JobState: "", Status: "pending"}, {ReleaseID: release.ID, JobType: "printer", JobState: ct.JobStateStarting, Status: "running"}, {ReleaseID: release.ID, JobType: "printer", JobState: ct.JobStateStarting, Status: "running"}, {ReleaseID: release.ID, JobType: "printer", JobState: ct.JobStateDown, Status: "running"}, {ReleaseID: release.ID, JobType: "", JobState: "", Status: "failed", Error: "printer process type failed to start, got down job event"}, } waitForDeploymentEvents(t, events, expected) s.assertRolledBack(t, deployment, map[string]int{"printer": 2}) // check a new deployment can be created _, err = client.CreateDeployment(app.ID, release.ID) t.Assert(err, c.IsNil) }
func (s *DeployerSuite) TestRollbackFailedJob(t *c.C) { // create a running release app, release := s.createRelease(t, "printer", "all-at-once") // deploy a release which will fail to start client := s.controllerClient(t) release.ID = "" printer := release.Processes["printer"] printer.Args = []string{"this-is-gonna-fail"} release.Processes["printer"] = printer t.Assert(client.CreateRelease(release), c.IsNil) deployment, err := client.CreateDeployment(app.ID, release.ID) t.Assert(err, c.IsNil) // check the deployment fails events := make(chan *ct.DeploymentEvent) stream, err := client.StreamDeployment(deployment, events) t.Assert(err, c.IsNil) defer stream.Close() expected := []*ct.DeploymentEvent{ {ReleaseID: release.ID, JobType: "", JobState: "", Status: "pending"}, {ReleaseID: release.ID, JobType: "printer", JobState: ct.JobStateStarting, Status: "running"}, {ReleaseID: release.ID, JobType: "printer", JobState: ct.JobStateStarting, Status: "running"}, {ReleaseID: release.ID, JobType: "printer", JobState: ct.JobStateDown, Status: "running"}, {ReleaseID: release.ID, JobType: "", JobState: "", Status: "failed", Error: `deployer: printer job failed to start: exec: "this-is-gonna-fail": executable file not found in $PATH`}, } waitForDeploymentEvents(t, events, expected) s.assertRolledBack(t, deployment, map[string]int{"printer": 2}) }
func (s *ZDiscoverdSuite) TestDeploy(t *c.C) { // ensure we have enough hosts in the cluster hosts, err := s.clusterClient(t).Hosts() t.Assert(err, c.IsNil) if len(hosts) <= 1 { t.Skip("cannot deploy discoverd in a single node cluster") } client := s.controllerClient(t) app, err := client.GetApp("discoverd") t.Assert(err, c.IsNil) release, err := client.GetAppRelease(app.ID) t.Assert(err, c.IsNil) release.ID = "" t.Assert(client.CreateRelease(release), c.IsNil) deployment, err := client.CreateDeployment(app.ID, release.ID) t.Assert(err, c.IsNil) events := make(chan *ct.DeploymentEvent) stream, err := client.StreamDeployment(deployment, events) t.Assert(err, c.IsNil) defer stream.Close() loop: for { select { case event, ok := <-events: if !ok { t.Fatal("unexpected close of deployment event stream") } if event.Status == "complete" { debugf(t, "got deployment event: %s", event.Status) break loop } if event.Status == "failed" { t.Fatal("the deployment failed") } debugf(t, "got deployment event: %s %s", event.JobType, event.JobState) case <-time.After(time.Duration(app.DeployTimeout) * time.Second): t.Fatal("timed out waiting for deployment event") } } }
func (s *DeployerSuite) createDeployment(t *c.C, process, strategy, service string) *testDeploy { app, release := s.createRelease(t, process, strategy) if service != "" { debugf(t, "waiting for 2 %s services", service) events := make(chan *discoverd.Event) stream, err := s.discoverdClient(t).Service(service).Watch(events) t.Assert(err, c.IsNil) defer stream.Close() count := 0 loop: for { select { case event, ok := <-events: if !ok { t.Fatalf("service discovery stream closed unexpectedly") } if event.Kind == discoverd.EventKindUp { if id, ok := event.Instance.Meta["FLYNN_RELEASE_ID"]; !ok || id != release.ID { continue } debugf(t, "got %s service up event", service) count++ } if count == 2 { // although the services are up, give them a few more seconds // to make sure the deployer will also see them as up. time.Sleep(5 * time.Second) break loop } case <-time.After(10 * time.Second): t.Fatalf("timed out waiting for %s service to come up", service) } } } client := s.controllerClient(t) jobEvents := make(chan *ct.Job) jobStream, err := client.StreamJobEvents(app.ID, jobEvents) t.Assert(err, c.IsNil) // create a new release for the deployment release.ID = "" t.Assert(client.CreateRelease(release), c.IsNil) deployment, err := client.CreateDeployment(app.ID, release.ID) t.Assert(err, c.IsNil) debugf(t, "created deployment %s", deployment.ID) debugf(t, "deploying from release %s to %s", deployment.OldReleaseID, deployment.NewReleaseID) deployEvents := make(chan *ct.DeploymentEvent) deployStream, err := client.StreamDeployment(deployment, deployEvents) t.Assert(err, c.IsNil) return &testDeploy{ s: s, t: t, deployment: deployment, deployEvents: deployEvents, deployStream: deployStream, jobEvents: jobEvents, jobStream: jobStream, } }
func (s *DeployerSuite) TestOmniProcess(t *c.C) { if testCluster == nil { t.Skip("cannot determine test cluster size") } // create and scale an omni release omniScale := 2 totalJobs := omniScale * testCluster.Size() client := s.controllerClient(t) app, release := s.createApp(t) watcher, err := client.WatchJobEvents(app.Name, release.ID) t.Assert(err, c.IsNil) defer watcher.Close() t.Assert(client.PutFormation(&ct.Formation{ AppID: app.ID, ReleaseID: release.ID, Processes: map[string]int{"omni": omniScale}, }), c.IsNil) err = watcher.WaitFor(ct.JobEvents{"omni": {ct.JobStateUp: totalJobs}}, scaleTimeout, nil) t.Assert(err, c.IsNil) // deploy using all-at-once and check we get the correct events app.Strategy = "all-at-once" t.Assert(client.UpdateApp(app), c.IsNil) release.ID = "" t.Assert(client.CreateRelease(release), c.IsNil) deployment, err := client.CreateDeployment(app.ID, release.ID) t.Assert(err, c.IsNil) events := make(chan *ct.DeploymentEvent) stream, err := client.StreamDeployment(deployment, events) t.Assert(err, c.IsNil) defer stream.Close() expected := make([]*ct.Job, 0, 3*totalJobs+1) appendEvents := func(releaseID string, state ct.JobState, count int) { for i := 0; i < count; i++ { expected = append(expected, &ct.Job{ ReleaseID: releaseID, Type: "omni", State: state, }) } } appendEvents(deployment.NewReleaseID, ct.JobStateUp, totalJobs) appendEvents(deployment.OldReleaseID, ct.JobStateDown, totalJobs) s.waitForDeploymentStatus(t, events, "complete") // deploy using one-by-one and check we get the correct events app.Strategy = "one-by-one" t.Assert(client.UpdateApp(app), c.IsNil) release.ID = "" t.Assert(client.CreateRelease(release), c.IsNil) // try creating the deployment multiple times to avoid getting a // "Cannot create deploy, one is already in progress" error (there // is no guarantee the previous deploy has finished yet) attempts := attempt.Strategy{Total: 10 * time.Second, Delay: 100 * time.Millisecond} err = attempts.Run(func() (err error) { deployment, err = client.CreateDeployment(app.ID, release.ID) return }) t.Assert(err, c.IsNil) events = make(chan *ct.DeploymentEvent) stream, err = client.StreamDeployment(deployment, events) t.Assert(err, c.IsNil) expected = make([]*ct.Job, 0, 4*totalJobs+1) appendEvents(deployment.NewReleaseID, ct.JobStateUp, testCluster.Size()) appendEvents(deployment.OldReleaseID, ct.JobStateDown, testCluster.Size()) appendEvents(deployment.NewReleaseID, ct.JobStateUp, testCluster.Size()) appendEvents(deployment.OldReleaseID, ct.JobStateDown, testCluster.Size()) s.waitForDeploymentStatus(t, events, "complete") }
func (s *SchedulerSuite) TestDeployController(t *c.C) { // get the current controller release client := s.controllerClient(t) app, err := client.GetApp("controller") t.Assert(err, c.IsNil) release, err := client.GetAppRelease(app.ID) t.Assert(err, c.IsNil) // get the current controller formation formation, err := client.GetFormation(app.ID, release.ID) t.Assert(err, c.IsNil) // create a controller deployment release.ID = "" t.Assert(client.CreateRelease(release), c.IsNil) deployment, err := client.CreateDeployment(app.ID, release.ID) t.Assert(err, c.IsNil) events := make(chan *ct.DeploymentEvent) eventStream, err := client.StreamDeployment(deployment, events) t.Assert(err, c.IsNil) defer eventStream.Close() // wait for the deploy to complete (this doesn't wait for specific events // due to the fact that when the deployer deploys itself, some events will // not get sent) loop: for { select { case e, ok := <-events: if !ok { t.Fatal("unexpected close of deployment event stream") } debugf(t, "got deployment event: %s %s", e.JobType, e.JobState) switch e.Status { case "complete": break loop case "failed": t.Fatal("the deployment failed") } case <-time.After(time.Duration(app.DeployTimeout) * time.Second): t.Fatal("timed out waiting for the deploy to complete") } } // check the correct controller jobs are running hosts, err := s.clusterClient(t).Hosts() t.Assert(err, c.IsNil) t.Assert(hosts, c.Not(c.HasLen), 0) actual := make(map[string]map[string]int) for _, h := range hosts { jobs, err := h.ListJobs() t.Assert(err, c.IsNil) for _, job := range jobs { if job.Status != host.StatusRunning { continue } appID := job.Job.Metadata["flynn-controller.app"] if appID != app.ID { continue } releaseID := job.Job.Metadata["flynn-controller.release"] if _, ok := actual[releaseID]; !ok { actual[releaseID] = make(map[string]int) } typ := job.Job.Metadata["flynn-controller.type"] actual[releaseID][typ]++ } } expected := map[string]map[string]int{release.ID: { "web": formation.Processes["web"], "worker": formation.Processes["worker"], "scheduler": len(hosts), }} t.Assert(actual, c.DeepEquals, expected) }
func (s *SchedulerSuite) TestRollbackController(t *c.C) { // get the current controller release client := s.controllerClient(t) app, err := client.GetApp("controller") t.Assert(err, c.IsNil) release, err := client.GetAppRelease(app.ID) t.Assert(err, c.IsNil) watcher, err := s.controllerClient(t).WatchJobEvents(app.ID, release.ID) t.Assert(err, c.IsNil) defer watcher.Close() // get the current controller formation formation, err := client.GetFormation(app.ID, release.ID) t.Assert(err, c.IsNil) currentReleaseID := release.ID // create a controller deployment that will fail release.ID = "" worker := release.Processes["worker"] worker.Entrypoint = []string{"/i/dont/exist"} release.Processes["worker"] = worker t.Assert(client.CreateRelease(release), c.IsNil) deployment, err := client.CreateDeployment(app.ID, release.ID) t.Assert(err, c.IsNil) events := make(chan *ct.DeploymentEvent) eventStream, err := client.StreamDeployment(deployment, events) t.Assert(err, c.IsNil) defer eventStream.Close() // wait for the deploy to fail loop: for { select { case e, ok := <-events: if !ok { t.Fatal("unexpected close of deployment event stream") } debugf(t, "got deployment event: %s %s", e.JobType, e.JobState) switch e.Status { case "complete": t.Fatal("the deployment succeeded when it should have failed") case "failed": break loop } case <-time.After(2 * time.Minute): t.Fatal("timed out waiting for the deploy to fail") } } // wait for jobs to come back up hosts, err := s.clusterClient(t).Hosts() expected := map[string]map[ct.JobState]int{ "web": {ct.JobStateUp: formation.Processes["web"]}, "scheduler": {ct.JobStateUp: len(hosts)}, } t.Assert(watcher.WaitFor(expected, scaleTimeout, nil), c.IsNil) // check the correct controller jobs are running t.Assert(err, c.IsNil) t.Assert(hosts, c.Not(c.HasLen), 0) actual := make(map[string]map[string]int) for _, h := range hosts { jobs, err := h.ListJobs() t.Assert(err, c.IsNil) for _, job := range jobs { if job.Status != host.StatusRunning { continue } appID := job.Job.Metadata["flynn-controller.app"] if appID != app.ID { continue } releaseID := job.Job.Metadata["flynn-controller.release"] if releaseID != currentReleaseID { continue } if _, ok := actual[releaseID]; !ok { actual[releaseID] = make(map[string]int) } typ := job.Job.Metadata["flynn-controller.type"] actual[releaseID][typ]++ } } t.Assert(actual, c.DeepEquals, map[string]map[string]int{ currentReleaseID: { "web": formation.Processes["web"], "scheduler": formation.Processes["scheduler"] * len(hosts), "worker": formation.Processes["worker"], }, }) }
func (s *DeployerSuite) TestOmniProcess(t *c.C) { if testCluster == nil { t.Skip("cannot determine test cluster size") } // create and scale an omni release omniScale := 2 totalJobs := omniScale * testCluster.Size() client := s.controllerClient(t) app, release := s.createApp(t) watcher, err := client.WatchJobEvents(app.Name, release.ID) t.Assert(err, c.IsNil) defer watcher.Close() t.Assert(client.PutFormation(&ct.Formation{ AppID: app.ID, ReleaseID: release.ID, Processes: map[string]int{"omni": omniScale}, }), c.IsNil) err = watcher.WaitFor(ct.JobEvents{"omni": {ct.JobStateUp: totalJobs}}, scaleTimeout, nil) t.Assert(err, c.IsNil) // deploy using all-at-once and check we get the correct events app.Strategy = "all-at-once" t.Assert(client.UpdateApp(app), c.IsNil) release.ID = "" t.Assert(client.CreateRelease(release), c.IsNil) deployment, err := client.CreateDeployment(app.ID, release.ID) t.Assert(err, c.IsNil) events := make(chan *ct.DeploymentEvent) stream, err := client.StreamDeployment(deployment, events) t.Assert(err, c.IsNil) defer stream.Close() expected := make([]*ct.DeploymentEvent, 0, 4*totalJobs+1) appendEvents := func(releaseID string, state ct.JobState, count int) { for i := 0; i < count; i++ { event := &ct.DeploymentEvent{ ReleaseID: releaseID, JobType: "omni", JobState: state, Status: "running", } expected = append(expected, event) } } expected = append(expected, &ct.DeploymentEvent{ReleaseID: deployment.NewReleaseID, Status: "pending"}) appendEvents(deployment.NewReleaseID, ct.JobStateStarting, totalJobs) appendEvents(deployment.NewReleaseID, ct.JobStateUp, totalJobs) appendEvents(deployment.OldReleaseID, ct.JobStateStopping, totalJobs) appendEvents(deployment.OldReleaseID, ct.JobStateDown, totalJobs) expected = append(expected, &ct.DeploymentEvent{ReleaseID: deployment.NewReleaseID, Status: "complete"}) waitForDeploymentEvents(t, events, expected) // deploy using one-by-one and check we get the correct events app.Strategy = "one-by-one" t.Assert(client.UpdateApp(app), c.IsNil) release.ID = "" t.Assert(client.CreateRelease(release), c.IsNil) deployment, err = client.CreateDeployment(app.ID, release.ID) t.Assert(err, c.IsNil) events = make(chan *ct.DeploymentEvent) stream, err = client.StreamDeployment(deployment, events) t.Assert(err, c.IsNil) expected = make([]*ct.DeploymentEvent, 0, 4*totalJobs+1) expected = append(expected, &ct.DeploymentEvent{ReleaseID: deployment.NewReleaseID, Status: "pending"}) appendEvents(deployment.NewReleaseID, ct.JobStateStarting, testCluster.Size()) appendEvents(deployment.NewReleaseID, ct.JobStateUp, testCluster.Size()) appendEvents(deployment.OldReleaseID, ct.JobStateStopping, testCluster.Size()) appendEvents(deployment.OldReleaseID, ct.JobStateDown, testCluster.Size()) appendEvents(deployment.NewReleaseID, ct.JobStateStarting, testCluster.Size()) appendEvents(deployment.NewReleaseID, ct.JobStateUp, testCluster.Size()) appendEvents(deployment.OldReleaseID, ct.JobStateStopping, testCluster.Size()) appendEvents(deployment.OldReleaseID, ct.JobStateDown, testCluster.Size()) expected = append(expected, &ct.DeploymentEvent{ReleaseID: deployment.NewReleaseID, Status: "complete"}) waitForDeploymentEvents(t, events, expected) }
func (s *PostgresSuite) testDeploy(t *c.C, d *pgDeploy) { // create postgres app client := s.controllerClient(t) app := &ct.App{Name: d.name, Strategy: "postgres"} t.Assert(client.CreateApp(app), c.IsNil) // copy release from default postgres app release, err := client.GetAppRelease("postgres") t.Assert(err, c.IsNil) release.ID = "" proc := release.Processes["postgres"] delete(proc.Env, "SINGLETON") proc.Env["FLYNN_POSTGRES"] = d.name proc.Service = d.name release.Processes["postgres"] = proc t.Assert(client.CreateRelease(release), c.IsNil) t.Assert(client.SetAppRelease(app.ID, release.ID), c.IsNil) oldRelease := release.ID // create formation discEvents := make(chan *discoverd.Event) discStream, err := s.discoverdClient(t).Service(d.name).Watch(discEvents) t.Assert(err, c.IsNil) defer discStream.Close() jobEvents := make(chan *ct.Job) jobStream, err := client.StreamJobEvents(d.name, jobEvents) t.Assert(err, c.IsNil) defer jobStream.Close() t.Assert(client.PutFormation(&ct.Formation{ AppID: app.ID, ReleaseID: release.ID, Processes: map[string]int{"postgres": d.pgJobs, "web": d.webJobs}, }), c.IsNil) // watch cluster state changes type stateChange struct { state *state.State err error } stateCh := make(chan stateChange) go func() { for event := range discEvents { if event.Kind != discoverd.EventKindServiceMeta { continue } var state state.State if err := json.Unmarshal(event.ServiceMeta.Data, &state); err != nil { stateCh <- stateChange{err: err} return } primary := "" if state.Primary != nil { primary = state.Primary.Addr } sync := "" if state.Sync != nil { sync = state.Sync.Addr } var async []string for _, a := range state.Async { async = append(async, a.Addr) } debugf(t, "got pg cluster state: index=%d primary=%s sync=%s async=%s", event.ServiceMeta.Index, primary, sync, strings.Join(async, ",")) stateCh <- stateChange{state: &state} } }() // wait for correct cluster state and number of web processes var pgState state.State var webJobs int ready := func() bool { if webJobs != d.webJobs { return false } if pgState.Primary == nil { return false } if d.pgJobs > 1 && pgState.Sync == nil { return false } if d.pgJobs > 2 && len(pgState.Async) != d.pgJobs-2 { return false } return true } for { if ready() { break } select { case s := <-stateCh: t.Assert(s.err, c.IsNil) pgState = *s.state case e, ok := <-jobEvents: if !ok { t.Fatalf("job event stream closed: %s", jobStream.Err()) } debugf(t, "got job event: %s %s %s", e.Type, e.ID, e.State) if e.Type == "web" && e.State == "up" { webJobs++ } case <-time.After(30 * time.Second): t.Fatal("timed out waiting for postgres formation") } } // connect to the db so we can test writes db := postgres.Wait(d.name, fmt.Sprintf("dbname=postgres user=flynn password=%s", release.Env["PGPASSWORD"])) dbname := "deploy-test" t.Assert(db.Exec(fmt.Sprintf(`CREATE DATABASE "%s" WITH OWNER = "flynn"`, dbname)), c.IsNil) db.Close() db, err = postgres.Open(d.name, fmt.Sprintf("dbname=%s user=flynn password=%s", dbname, release.Env["PGPASSWORD"])) t.Assert(err, c.IsNil) defer db.Close() t.Assert(db.Exec(`CREATE TABLE deploy_test ( data text)`), c.IsNil) assertWriteable := func() { debug(t, "writing to postgres database") t.Assert(db.Exec(`INSERT INTO deploy_test (data) VALUES ('data')`), c.IsNil) } // check currently writeable assertWriteable() // check a deploy completes with expected cluster state changes release.ID = "" t.Assert(client.CreateRelease(release), c.IsNil) newRelease := release.ID deployment, err := client.CreateDeployment(app.ID, newRelease) t.Assert(err, c.IsNil) deployEvents := make(chan *ct.DeploymentEvent) deployStream, err := client.StreamDeployment(deployment, deployEvents) t.Assert(err, c.IsNil) defer deployStream.Close() // assertNextState checks that the next state received is in the remaining states // that were expected, so handles the fact that some states don't happen, but the // states that do happen are expected and in-order. assertNextState := func(remaining []expectedPgState) int { var state state.State loop: for { select { case s := <-stateCh: t.Assert(s.err, c.IsNil) if len(s.state.Async) < d.expectedAsyncs() { // we shouldn't usually receive states with less asyncs than // expected, but they can occur as an intermediate state between // two expected states (e.g. when a sync does a takeover at the // same time as a new async is started) so just ignore them. debug(t, "ignoring state with too few asyncs") continue } state = *s.state break loop case <-time.After(60 * time.Second): t.Fatal("timed out waiting for postgres cluster state") } } if state.Primary == nil { t.Fatal("no primary configured") } log := func(format string, v ...interface{}) { debugf(t, "skipping expected state: %s", fmt.Sprintf(format, v...)) } outer: for i, expected := range remaining { if state.Primary.Meta["FLYNN_RELEASE_ID"] != expected.Primary { log("primary has incorrect release") continue } if state.Sync == nil { if expected.Sync == "" { return i } log("state has no sync node") continue } if state.Sync.Meta["FLYNN_RELEASE_ID"] != expected.Sync { log("sync has incorrect release") continue } if state.Async == nil { if expected.Async == nil { return i } log("state has no async nodes") continue } if len(state.Async) != len(expected.Async) { log("expected %d asyncs, got %d", len(expected.Async), len(state.Async)) continue } for i, release := range expected.Async { if state.Async[i].Meta["FLYNN_RELEASE_ID"] != release { log("async[%d] has incorrect release", i) continue outer } } return i } t.Fatal("unexpected pg state") return -1 } expected := d.expected(oldRelease, newRelease) var expectedIndex, newWebJobs int loop: for { select { case e, ok := <-deployEvents: if !ok { t.Fatal("unexpected close of deployment event stream") } switch e.Status { case "complete": break loop case "failed": t.Fatalf("deployment failed: %s", e.Error) } debugf(t, "got deployment event: %s %s", e.JobType, e.JobState) if e.JobState != "up" && e.JobState != "down" { continue } switch e.JobType { case "postgres": // move on if we have seen all the expected events if expectedIndex >= len(expected) { continue } skipped := assertNextState(expected[expectedIndex:]) expectedIndex += 1 + skipped case "web": if e.JobState == "up" && e.ReleaseID == newRelease { newWebJobs++ } } case <-time.After(2 * time.Minute): t.Fatal("timed out waiting for deployment") } } // check we have the correct number of new web jobs t.Assert(newWebJobs, c.Equals, d.webJobs) // check writeable now deploy is complete assertWriteable() }