func (c *context) rollback(l log15.Logger, deployment *ct.Deployment, original *ct.Formation) error {
	log := l.New("fn", "rollback")

	log.Info("creating job watcher")
	jobWatcher, err := c.client.WatchJobEvents(deployment.AppID, deployment.OldReleaseID)
	if err != nil {
		log.Error("error opening job event stream", "err", err)
		return err
	}

	appJobs, err := c.client.JobList(deployment.AppID)
	if err != nil {
		log.Error("error listing app jobs", "err", err)
		return err
	}
	runningJobs := make(map[string]int)
	for _, j := range appJobs {
		if j.ReleaseID != deployment.OldReleaseID {
			continue
		}
		if j.State == ct.JobStateUp {
			runningJobs[j.Type]++
		}
	}

	expectedJobEvents := make(ct.JobEvents, len(original.Processes))
	for name, count := range original.Processes {
		count -= runningJobs[name]
		if count > 0 {
			expectedJobEvents[name] = ct.JobUpEvents(count)
		}
	}

	log.Info("restoring the original formation", "release.id", original.ReleaseID)
	if err := c.client.PutFormation(original); err != nil {
		log.Error("error restoring the original formation", "err", err)
		return err
	}

	if len(expectedJobEvents) > 0 {
		log.Info("waiting for job events", "events", expectedJobEvents)
		callback := func(job *ct.Job) error {
			log.Info("got job event", "job.id", job.ID, "job.type", job.Type, "job.state", job.State)
			return nil
		}
		if err := jobWatcher.WaitFor(expectedJobEvents, 10*time.Second, callback); err != nil {
			log.Error("error waiting for job events", "err", err)
		}
	}

	log.Info("deleting the new formation")
	if err := c.client.DeleteFormation(deployment.AppID, deployment.NewReleaseID); err != nil {
		log.Error("error deleting the new formation", "err", err)
		return err
	}

	log.Info("rollback complete")
	return nil
}
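// The rollback above (and the deploy and test code below) leans on a small set
// of event-count helpers from the controller types package. As a reference,
// here is a minimal sketch of what those definitions presumably look like,
// inferred from usage such as
// ct.JobEvents{"printer": map[ct.JobState]int{ct.JobStateDown: 2, ct.JobStateUp: 2}}
// in the scheduler test below; the real definitions may differ in detail.
type JobEvents map[string]map[JobState]int

// JobUpEvents builds the per-state count map for n expected "up" events.
func JobUpEvents(count int) map[JobState]int {
	return map[JobState]int{JobStateUp: count}
}

// JobDownEvents builds the per-state count map for n expected "down" events.
func JobDownEvents(count int) map[JobState]int {
	return map[JobState]int{JobStateDown: count}
}

// Count sums the expected event counts across all process types and states,
// matching the expected.Count() calls in the deploy functions below.
func (e JobEvents) Count() int {
	total := 0
	for _, states := range e {
		for _, count := range states {
			total += count
		}
	}
	return total
}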
func (c *Client) ExpectedScalingEvents(actual, expected map[string]int, releaseProcesses map[string]ct.ProcessType, clusterSize int) ct.JobEvents {
	events := make(ct.JobEvents, len(expected))
	for typ, count := range expected {
		diff := count
		if val, ok := actual[typ]; ok {
			diff = count - val
		}
		if proc, ok := releaseProcesses[typ]; ok && proc.Omni {
			diff *= clusterSize
		}
		if diff > 0 {
			events[typ] = ct.JobUpEvents(diff)
		} else if diff < 0 {
			events[typ] = ct.JobDownEvents(-diff)
		}
	}
	return events
}
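// A hypothetical call to ExpectedScalingEvents, illustrating how omni
// processes multiply the scale diff by the cluster size. The process names
// here are illustrative, not from the codebase: "web" scales 2 -> 4
// regardless of host count, while the omni "monitor" process scales 0 -> 1
// on each of 3 hosts.
func exampleExpectedScalingEvents(client *Client) ct.JobEvents {
	actual := map[string]int{"web": 2}
	expected := map[string]int{"web": 4, "monitor": 1}
	procs := map[string]ct.ProcessType{
		"web":     {},
		"monitor": {Omni: true},
	}
	// result: {"web": JobUpEvents(2), "monitor": JobUpEvents(3)}
	return client.ExpectedScalingEvents(actual, expected, procs, 3)
}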
func (s *CLISuite) TestSlugReleaseGarbageCollection(t *c.C) {
	client := s.controllerClient(t)

	// create app with gc.max_inactive_slug_releases=3
	maxInactiveSlugReleases := 3
	app := &ct.App{Meta: map[string]string{"gc.max_inactive_slug_releases": strconv.Itoa(maxInactiveSlugReleases)}}
	t.Assert(client.CreateApp(app), c.IsNil)

	// create an image artifact
	imageArtifact := s.createArtifact(t, "test-apps")

	// create 5 slug artifacts
	tmp, err := ioutil.TempFile("", "squashfs-")
	t.Assert(err, c.IsNil)
	defer os.Remove(tmp.Name())
	defer tmp.Close()
	t.Assert(exec.Command("mksquashfs", t.MkDir(), tmp.Name(), "-noappend").Run(), c.IsNil)
	slug, err := ioutil.ReadAll(tmp)
	t.Assert(err, c.IsNil)
	slugHash := sha512.Sum512(slug)
	slugs := []string{
		"http://blobstore.discoverd/layer/1.squashfs",
		"http://blobstore.discoverd/layer/2.squashfs",
		"http://blobstore.discoverd/layer/3.squashfs",
		"http://blobstore.discoverd/layer/4.squashfs",
		"http://blobstore.discoverd/layer/5.squashfs",
	}
	slugArtifacts := make([]*ct.Artifact, len(slugs))
	put := func(url string, data []byte) {
		req, err := http.NewRequest("PUT", url, bytes.NewReader(data))
		t.Assert(err, c.IsNil)
		res, err := http.DefaultClient.Do(req)
		t.Assert(err, c.IsNil)
		res.Body.Close()
		t.Assert(res.StatusCode, c.Equals, http.StatusOK)
	}
	for i, layerURL := range slugs {
		manifest := &ct.ImageManifest{
			Type: ct.ImageManifestTypeV1,
			Rootfs: []*ct.ImageRootfs{{
				Layers: []*ct.ImageLayer{{
					ID:     strconv.Itoa(i + 1),
					Type:   ct.ImageLayerTypeSquashfs,
					Length: int64(len(slug)),
					Hashes: map[string]string{"sha512": hex.EncodeToString(slugHash[:])},
				}},
			}},
		}
		data := manifest.RawManifest()
		url := fmt.Sprintf("http://blobstore.discoverd/image/%s.json", manifest.ID())
		put(url, data)
		put(layerURL, slug)
		artifact := &ct.Artifact{
			Type:             ct.ArtifactTypeFlynn,
			URI:              url,
			Meta:             map[string]string{"blobstore": "true"},
			RawManifest:      data,
			Hashes:           manifest.Hashes(),
			Size:             int64(len(data)),
			LayerURLTemplate: "http://blobstore.discoverd/layer/{id}.squashfs",
		}
		t.Assert(client.CreateArtifact(artifact), c.IsNil)
		slugArtifacts[i] = artifact
	}

	// create 6 releases, the second being scaled up and having the
	// same slug as the third (which prevents that slug being deleted)
	releases := make([]*ct.Release, 6)
	for i, r := range []struct {
		slug   *ct.Artifact
		active bool
	}{
		{slugArtifacts[0], false},
		{slugArtifacts[1], true},
		{slugArtifacts[1], false},
		{slugArtifacts[2], false},
		{slugArtifacts[3], false},
		{slugArtifacts[4], false},
	} {
		release := &ct.Release{
			ArtifactIDs: []string{imageArtifact.ID, r.slug.ID},
			Processes: map[string]ct.ProcessType{
				"app": {Args: []string{"/bin/pingserv"}, Ports: []ct.Port{{Proto: "tcp"}}},
			},
			Meta: map[string]string{"git": "true"},
		}
		t.Assert(client.CreateRelease(release), c.IsNil)
		procs := map[string]int{"app": 0}
		if r.active {
			procs["app"] = 1
		}
		t.Assert(client.PutFormation(&ct.Formation{
			AppID:     app.ID,
			ReleaseID: release.ID,
			Processes: procs,
		}), c.IsNil)
		releases[i] = release
	}

	// scale the last release so we can deploy it
	lastRelease := releases[len(releases)-1]
	watcher, err := client.WatchJobEvents(app.ID, lastRelease.ID)
	t.Assert(err, c.IsNil)
	defer watcher.Close()
	t.Assert(client.PutFormation(&ct.Formation{
		AppID:     app.ID,
		ReleaseID: lastRelease.ID,
		Processes: map[string]int{"app": 1},
	}), c.IsNil)
	t.Assert(watcher.WaitFor(ct.JobEvents{"app": ct.JobUpEvents(1)}, scaleTimeout, nil), c.IsNil)
	t.Assert(client.SetAppRelease(app.ID, lastRelease.ID), c.IsNil)

	// subscribe to garbage collection events
	gcEvents := make(chan *ct.Event)
	stream, err := client.StreamEvents(ct.StreamEventsOptions{
		AppID:       app.ID,
		ObjectTypes: []ct.EventType{ct.EventTypeAppGarbageCollection},
	}, gcEvents)
	t.Assert(err, c.IsNil)
	defer stream.Close()

	// deploy a new release with the same slug as the last release
	timeoutCh := make(chan struct{})
	time.AfterFunc(5*time.Minute, func() { close(timeoutCh) })
	newRelease := *lastRelease
	newRelease.ID = ""
	t.Assert(client.CreateRelease(&newRelease), c.IsNil)
	t.Assert(client.DeployAppRelease(app.ID, newRelease.ID, timeoutCh), c.IsNil)

	// wait for garbage collection
	select {
	case event, ok := <-gcEvents:
		if !ok {
			t.Fatalf("event stream closed unexpectedly: %s", stream.Err())
		}
		var e ct.AppGarbageCollectionEvent
		t.Assert(json.Unmarshal(event.Data, &e), c.IsNil)
		if e.Error != "" {
			t.Fatalf("garbage collection failed: %s", e.Error)
		}
	case <-time.After(60 * time.Second):
		t.Fatal("timed out waiting for garbage collection")
	}

	// check we have 4 distinct slugs across the 5 remaining releases
	// (only 3 of which are inactive)
	list, err := client.AppReleaseList(app.ID)
	t.Assert(err, c.IsNil)
	t.Assert(list, c.HasLen, maxInactiveSlugReleases+2)
	distinctSlugs := make(map[string]struct{}, len(list))
	for _, release := range list {
		t.Assert(release.ArtifactIDs, c.HasLen, 2)
		distinctSlugs[release.ArtifactIDs[1]] = struct{}{}
	}
	t.Assert(distinctSlugs, c.HasLen, maxInactiveSlugReleases+1)

	// check the first and third releases got deleted, but the rest remain
	assertDeleted := func(release *ct.Release, deleted bool) {
		_, err := client.GetRelease(release.ID)
		if deleted {
			t.Assert(err, c.Equals, controller.ErrNotFound)
		} else {
			t.Assert(err, c.IsNil)
		}
	}
	assertDeleted(releases[0], true)
	assertDeleted(releases[1], false)
	assertDeleted(releases[2], true)
	assertDeleted(releases[3], false)
	assertDeleted(releases[4], false)
	assertDeleted(releases[5], false)
	assertDeleted(&newRelease, false)

	// check the first slug got deleted, but the rest remain
	s.assertURI(t, slugs[0], http.StatusNotFound)
	for i := 1; i < len(slugs); i++ {
		s.assertURI(t, slugs[i], http.StatusOK)
	}
}
func (d *DeployJob) deployAllAtOnce() error {
	log := d.logger.New("fn", "deployAllAtOnce")
	log.Info("starting all-at-once deployment")

	expected := make(ct.JobEvents)
	newProcs := make(map[string]int, len(d.Processes))
	for typ, n := range d.Processes {
		// ignore processes which no longer exist in the new
		// release
		if _, ok := d.newRelease.Processes[typ]; !ok {
			continue
		}
		newProcs[typ] = n
		total := n
		if d.isOmni(typ) {
			total *= d.hostCount
		}
		existing := d.newReleaseState[typ]
		for i := existing; i < total; i++ {
			d.deployEvents <- ct.DeploymentEvent{
				ReleaseID: d.NewReleaseID,
				JobState:  ct.JobStateStarting,
				JobType:   typ,
			}
		}
		if total > existing {
			expected[typ] = ct.JobUpEvents(total - existing)
		}
	}

	if expected.Count() > 0 {
		log := log.New("release_id", d.NewReleaseID)
		log.Info("creating new formation", "processes", newProcs)
		if err := d.client.PutFormation(&ct.Formation{
			AppID:     d.AppID,
			ReleaseID: d.NewReleaseID,
			Processes: newProcs,
		}); err != nil {
			log.Error("error creating new formation", "err", err)
			return err
		}

		log.Info("waiting for job events", "expected", expected)
		if err := d.waitForJobEvents(d.NewReleaseID, expected, log); err != nil {
			log.Error("error waiting for job events", "err", err)
			return err
		}
	}

	expected = make(ct.JobEvents)
	for typ := range d.Processes {
		existing := d.oldReleaseState[typ]
		for i := 0; i < existing; i++ {
			d.deployEvents <- ct.DeploymentEvent{
				ReleaseID: d.OldReleaseID,
				JobState:  ct.JobStateStopping,
				JobType:   typ,
			}
		}
		if existing > 0 {
			expected[typ] = ct.JobDownEvents(existing)
		}
	}

	// the new jobs have now started and they are up, so return
	// ErrSkipRollback from here on out if an error occurs (rolling
	// back doesn't make a ton of sense because it involves
	// stopping the new working jobs).
	log = log.New("release_id", d.OldReleaseID)
	log.Info("scaling old formation to zero")
	if err := d.client.PutFormation(&ct.Formation{
		AppID:     d.AppID,
		ReleaseID: d.OldReleaseID,
	}); err != nil {
		log.Error("error scaling old formation to zero", "err", err)
		return ErrSkipRollback{err.Error()}
	}

	if expected.Count() > 0 {
		log.Info("waiting for job events", "expected", expected)
		if err := d.waitForJobEvents(d.OldReleaseID, expected, log); err != nil {
			log.Error("error waiting for job events", "err", err)
			return ErrSkipRollback{err.Error()}
		}
	}

	log.Info("finished all-at-once deployment")
	return nil
}
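// ErrSkipRollback is constructed above as ErrSkipRollback{err.Error()}, so a
// minimal definition consistent with that usage would be a single-field error
// type (assumed; the real definition lives elsewhere in the worker package).
// It signals that the deploy failed after the new jobs came up, where rolling
// back would mean stopping healthy jobs.
type ErrSkipRollback struct {
	Err string
}

func (e ErrSkipRollback) Error() string {
	return e.Err
}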
func (s *SchedulerSuite) TestScaleTags(t *c.C) {
	// ensure we have more than 1 host to test with
	hosts, err := s.clusterClient(t).Hosts()
	t.Assert(err, c.IsNil)
	if len(hosts) <= 1 {
		t.Skip("not enough hosts to test tag-based scheduling")
	}

	// watch service events so we can wait for tag changes
	events := make(chan *discoverd.Event)
	stream, err := s.discoverdClient(t).Service("flynn-host").Watch(events)
	t.Assert(err, c.IsNil)
	defer stream.Close()
	waitServiceEvent := func(kind discoverd.EventKind) *discoverd.Event {
		for {
			select {
			case event, ok := <-events:
				if !ok {
					t.Fatalf("service event stream closed unexpectedly: %s", stream.Err())
				}
				if event.Kind == kind {
					return event
				}
			case <-time.After(10 * time.Second):
				t.Fatalf("timed out waiting for service %s event", kind)
			}
		}
	}

	// wait for the watch to be current before changing tags
	waitServiceEvent(discoverd.EventKindCurrent)

	updateTags := func(host *cluster.Host, tags map[string]string) {
		debugf(t, "setting host tags: %s => %v", host.ID(), tags)
		t.Assert(host.UpdateTags(tags), c.IsNil)
		event := waitServiceEvent(discoverd.EventKindUpdate)
		t.Assert(event.Instance.Meta["id"], c.Equals, host.ID())
		for key, val := range tags {
			t.Assert(event.Instance.Meta["tag:"+key], c.Equals, val)
		}
	}

	// create an app with a tagged process and watch job events
	app, release := s.createApp(t)
	formation := &ct.Formation{
		AppID:     app.ID,
		ReleaseID: release.ID,
		Tags:      map[string]map[string]string{"printer": {"active": "true"}},
	}
	client := s.controllerClient(t)
	watcher, err := client.WatchJobEvents(app.ID, release.ID)
	t.Assert(err, c.IsNil)
	defer watcher.Close()

	// add tag to host 1
	host1 := hosts[0]
	updateTags(host1, map[string]string{"active": "true"})

	// start jobs
	debug(t, "scaling printer=2")
	formation.Processes = map[string]int{"printer": 2}
	t.Assert(client.PutFormation(formation), c.IsNil)
	t.Assert(watcher.WaitFor(ct.JobEvents{"printer": ct.JobUpEvents(2)}, scaleTimeout, nil), c.IsNil)

	assertHostJobCounts := func(expected map[string]int) {
		jobs, err := client.JobList(app.ID)
		t.Assert(err, c.IsNil)
		actual := make(map[string]int)
		for _, job := range jobs {
			if job.State == ct.JobStateUp {
				actual[job.HostID]++
			}
		}
		t.Assert(actual, c.DeepEquals, expected)
	}

	// check all jobs on host 1
	assertHostJobCounts(map[string]int{host1.ID(): 2})

	// add tag to host 2
	host2 := hosts[1]
	updateTags(host2, map[string]string{"active": "true"})

	// scale up
	debug(t, "scaling printer=4")
	formation.Processes["printer"] = 4
	t.Assert(client.PutFormation(formation), c.IsNil)
	t.Assert(watcher.WaitFor(ct.JobEvents{"printer": ct.JobUpEvents(2)}, scaleTimeout, nil), c.IsNil)

	// check jobs distributed across hosts 1 and 2
	assertHostJobCounts(map[string]int{host1.ID(): 2, host2.ID(): 2})

	// remove tag from host 2
	updateTags(host2, map[string]string{"active": ""})

	// check jobs are moved to host1
	jobEvents := ct.JobEvents{"printer": map[ct.JobState]int{
		ct.JobStateDown: 2,
		ct.JobStateUp:   2,
	}}
	t.Assert(watcher.WaitFor(jobEvents, scaleTimeout, nil), c.IsNil)
	assertHostJobCounts(map[string]int{host1.ID(): 4})

	// remove tag from host 1
	updateTags(host1, map[string]string{"active": ""})

	assertStateCounts := func(expected map[ct.JobState]int) {
		jobs, err := client.JobList(app.ID)
		t.Assert(err, c.IsNil)
		actual := make(map[ct.JobState]int)
		for _, job := range jobs {
			actual[job.State]++
		}
		t.Assert(actual, c.DeepEquals, expected)
	}

	// check 4 pending jobs, rest are stopped
	t.Assert(watcher.WaitFor(ct.JobEvents{"printer": ct.JobDownEvents(4)}, scaleTimeout, nil), c.IsNil)
	assertStateCounts(map[ct.JobState]int{ct.JobStatePending: 4, ct.JobStateDown: 6})

	// re-add tag to host 1
	updateTags(host1, map[string]string{"active": "true"})

	// check pending jobs are started on host 1
	t.Assert(watcher.WaitFor(ct.JobEvents{"printer": ct.JobUpEvents(4)}, scaleTimeout, nil), c.IsNil)
	assertHostJobCounts(map[string]int{host1.ID(): 4})
	assertStateCounts(map[ct.JobState]int{ct.JobStateUp: 4, ct.JobStateDown: 6})

	// add a different tag to host 2
	updateTags(host2, map[string]string{"disk": "ssd"})

	// update formation tags, check jobs are moved to host 2
	debug(t, "updating formation tags to disk=ssd")
	formation.Tags["printer"] = map[string]string{"disk": "ssd"}
	t.Assert(client.PutFormation(formation), c.IsNil)
	jobEvents = ct.JobEvents{"printer": map[ct.JobState]int{
		ct.JobStateDown: 4,
		ct.JobStateUp:   4,
	}}
	t.Assert(watcher.WaitFor(jobEvents, scaleTimeout, nil), c.IsNil)
	assertHostJobCounts(map[string]int{host2.ID(): 4})
	assertStateCounts(map[ct.JobState]int{ct.JobStateUp: 4, ct.JobStateDown: 10})

	// scale down stops the jobs
	debug(t, "scaling printer=0")
	formation.Processes = nil
	t.Assert(client.PutFormation(formation), c.IsNil)
	t.Assert(watcher.WaitFor(ct.JobEvents{"printer": ct.JobDownEvents(4)}, scaleTimeout, nil), c.IsNil)
	assertStateCounts(map[ct.JobState]int{ct.JobStateDown: 14})
}
func (s *CLISuite) TestSlugReleaseGarbageCollection(t *c.C) {
	client := s.controllerClient(t)

	// create app with gc.max_inactive_slug_releases=3
	maxInactiveSlugReleases := 3
	app := &ct.App{Meta: map[string]string{"gc.max_inactive_slug_releases": strconv.Itoa(maxInactiveSlugReleases)}}
	t.Assert(client.CreateApp(app), c.IsNil)

	// create an image artifact
	imageArtifact := &ct.Artifact{Type: host.ArtifactTypeDocker, URI: imageURIs["test-apps"]}
	t.Assert(client.CreateArtifact(imageArtifact), c.IsNil)

	// create 5 slug artifacts
	var slug bytes.Buffer
	gz := gzip.NewWriter(&slug)
	t.Assert(tar.NewWriter(gz).Close(), c.IsNil)
	t.Assert(gz.Close(), c.IsNil)
	slugs := []string{
		"http://blobstore.discoverd/1/slug.tgz",
		"http://blobstore.discoverd/2/slug.tgz",
		"http://blobstore.discoverd/3/slug.tgz",
		"http://blobstore.discoverd/4/slug.tgz",
		"http://blobstore.discoverd/5/slug.tgz",
	}
	slugArtifacts := make([]*ct.Artifact, len(slugs))
	for i, uri := range slugs {
		req, err := http.NewRequest("PUT", uri, bytes.NewReader(slug.Bytes()))
		t.Assert(err, c.IsNil)
		res, err := http.DefaultClient.Do(req)
		t.Assert(err, c.IsNil)
		res.Body.Close()
		t.Assert(res.StatusCode, c.Equals, http.StatusOK)
		artifact := &ct.Artifact{
			Type: host.ArtifactTypeFile,
			URI:  uri,
			Meta: map[string]string{"blobstore": "true"},
		}
		t.Assert(client.CreateArtifact(artifact), c.IsNil)
		slugArtifacts[i] = artifact
	}

	// create 6 releases, the second being scaled up and having the
	// same slug as the third (which prevents that slug being deleted)
	releases := make([]*ct.Release, 6)
	for i, r := range []struct {
		slug   *ct.Artifact
		active bool
	}{
		{slugArtifacts[0], false},
		{slugArtifacts[1], true},
		{slugArtifacts[1], false},
		{slugArtifacts[2], false},
		{slugArtifacts[3], false},
		{slugArtifacts[4], false},
	} {
		release := &ct.Release{
			ArtifactIDs: []string{imageArtifact.ID, r.slug.ID},
			Processes: map[string]ct.ProcessType{
				"app": {Args: []string{"/bin/pingserv"}, Ports: []ct.Port{{Proto: "tcp"}}},
			},
		}
		t.Assert(client.CreateRelease(release), c.IsNil)
		procs := map[string]int{"app": 0}
		if r.active {
			procs["app"] = 1
		}
		t.Assert(client.PutFormation(&ct.Formation{
			AppID:     app.ID,
			ReleaseID: release.ID,
			Processes: procs,
		}), c.IsNil)
		releases[i] = release
	}

	// scale the last release so we can deploy it
	lastRelease := releases[len(releases)-1]
	watcher, err := client.WatchJobEvents(app.ID, lastRelease.ID)
	t.Assert(err, c.IsNil)
	defer watcher.Close()
	t.Assert(client.PutFormation(&ct.Formation{
		AppID:     app.ID,
		ReleaseID: lastRelease.ID,
		Processes: map[string]int{"app": 1},
	}), c.IsNil)
	t.Assert(watcher.WaitFor(ct.JobEvents{"app": ct.JobUpEvents(1)}, scaleTimeout, nil), c.IsNil)
	t.Assert(client.SetAppRelease(app.ID, lastRelease.ID), c.IsNil)

	// subscribe to garbage collection events
	gcEvents := make(chan *ct.Event)
	stream, err := client.StreamEvents(ct.StreamEventsOptions{
		AppID:       app.ID,
		ObjectTypes: []ct.EventType{ct.EventTypeAppGarbageCollection},
	}, gcEvents)
	t.Assert(err, c.IsNil)
	defer stream.Close()

	// deploy a new release with the same slug as the last release
	timeoutCh := make(chan struct{})
	time.AfterFunc(5*time.Minute, func() { close(timeoutCh) })
	newRelease := *lastRelease
	newRelease.ID = ""
	t.Assert(client.CreateRelease(&newRelease), c.IsNil)
	t.Assert(client.DeployAppRelease(app.ID, newRelease.ID, timeoutCh), c.IsNil)

	// wait for garbage collection
	select {
	case event, ok := <-gcEvents:
		if !ok {
			t.Fatalf("event stream closed unexpectedly: %s", stream.Err())
		}
		var e ct.AppGarbageCollectionEvent
		t.Assert(json.Unmarshal(event.Data, &e), c.IsNil)
		if e.Error != "" {
			t.Fatalf("garbage collection failed: %s", e.Error)
		}
	case <-time.After(60 * time.Second):
		t.Fatal("timed out waiting for garbage collection")
	}

	// check we have 4 distinct slugs across the 5 remaining releases
	// (only 3 of which are inactive)
	list, err := client.AppReleaseList(app.ID)
	t.Assert(err, c.IsNil)
	t.Assert(list, c.HasLen, maxInactiveSlugReleases+2)
	distinctSlugs := make(map[string]struct{}, len(list))
	for _, release := range list {
		files := release.FileArtifactIDs()
		t.Assert(files, c.HasLen, 1)
		distinctSlugs[files[0]] = struct{}{}
	}
	t.Assert(distinctSlugs, c.HasLen, maxInactiveSlugReleases+1)

	// check the first and third releases got deleted, but the rest remain
	assertDeleted := func(release *ct.Release, deleted bool) {
		_, err := client.GetRelease(release.ID)
		if deleted {
			t.Assert(err, c.Equals, controller.ErrNotFound)
		} else {
			t.Assert(err, c.IsNil)
		}
	}
	assertDeleted(releases[0], true)
	assertDeleted(releases[1], false)
	assertDeleted(releases[2], true)
	assertDeleted(releases[3], false)
	assertDeleted(releases[4], false)
	assertDeleted(releases[5], false)
	assertDeleted(&newRelease, false)

	// check the first slug got deleted, but the rest remain
	s.assertURI(t, slugs[0], http.StatusNotFound)
	for i := 1; i < len(slugs); i++ {
		s.assertURI(t, slugs[i], http.StatusOK)
	}
}
func main() {
	client, err := controller.NewClient("", os.Getenv("CONTROLLER_KEY"))
	if err != nil {
		log.Fatalln("Unable to connect to controller:", err)
	}

	usage := `
Usage: flynn-receiver <app> <rev> [-e <var>=<val>]... [-m <key>=<val>]...

Options:
	-e,--env <var>=<val>
	-m,--meta <key>=<val>
`[1:]
	args, _ := docopt.Parse(usage, nil, true, version.String(), false)

	appName := args.String["<app>"]
	env, err := parsePairs(args, "--env")
	if err != nil {
		log.Fatal(err)
	}
	meta, err := parsePairs(args, "--meta")
	if err != nil {
		log.Fatal(err)
	}

	app, err := client.GetApp(appName)
	if err == controller.ErrNotFound {
		log.Fatalf("Unknown app %q", appName)
	} else if err != nil {
		log.Fatalln("Error retrieving app:", err)
	}
	prevRelease, err := client.GetAppRelease(app.Name)
	if err == controller.ErrNotFound {
		prevRelease = &ct.Release{}
	} else if err != nil {
		log.Fatalln("Error getting current app release:", err)
	}

	fmt.Printf("-----> Building %s...\n", app.Name)

	var output bytes.Buffer
	slugURL := fmt.Sprintf("%s/%s.tgz", blobstoreURL, random.UUID())
	cmd := exec.Command(exec.DockerImage(os.Getenv("SLUGBUILDER_IMAGE_URI")), slugURL)
	cmd.Stdout = io.MultiWriter(os.Stdout, &output)
	cmd.Stderr = os.Stderr
	cmd.Meta = map[string]string{
		"flynn-controller.app":      app.ID,
		"flynn-controller.app_name": app.Name,
		"flynn-controller.release":  prevRelease.ID,
	}
	if len(prevRelease.Env) > 0 {
		stdin, err := cmd.StdinPipe()
		if err != nil {
			log.Fatalln(err)
		}
		go appendEnvDir(os.Stdin, stdin, prevRelease.Env)
	} else {
		cmd.Stdin = os.Stdin
	}
	cmd.Env = make(map[string]string)
	cmd.Env["BUILD_CACHE_URL"] = fmt.Sprintf("%s/%s-cache.tgz", blobstoreURL, app.ID)
	if buildpackURL, ok := env["BUILDPACK_URL"]; ok {
		cmd.Env["BUILDPACK_URL"] = buildpackURL
	} else if buildpackURL, ok := prevRelease.Env["BUILDPACK_URL"]; ok {
		cmd.Env["BUILDPACK_URL"] = buildpackURL
	}
	for _, k := range []string{"SSH_CLIENT_KEY", "SSH_CLIENT_HOSTS"} {
		if v := os.Getenv(k); v != "" {
			cmd.Env[k] = v
		}
	}
	if err := cmd.Run(); err != nil {
		log.Fatalln("Build failed:", err)
	}

	var types []string
	if match := typesPattern.FindSubmatch(output.Bytes()); match != nil {
		types = strings.Split(string(match[1]), ", ")
	}

	fmt.Printf("-----> Creating release...\n")

	artifact := &ct.Artifact{Type: "docker", URI: os.Getenv("SLUGRUNNER_IMAGE_URI")}
	if err := client.CreateArtifact(artifact); err != nil {
		log.Fatalln("Error creating artifact:", err)
	}

	release := &ct.Release{
		ArtifactID: artifact.ID,
		Env:        prevRelease.Env,
		Meta:       prevRelease.Meta,
	}
	if release.Meta == nil {
		release.Meta = make(map[string]string, len(meta))
	}
	if release.Env == nil {
		release.Env = make(map[string]string, len(env))
	}
	for k, v := range env {
		release.Env[k] = v
	}
	for k, v := range meta {
		release.Meta[k] = v
	}
	procs := make(map[string]ct.ProcessType)
	for _, t := range types {
		proc := prevRelease.Processes[t]
		proc.Cmd = []string{"start", t}
		if t == "web" || strings.HasSuffix(t, "-web") {
			proc.Service = app.Name + "-" + t
			proc.Ports = []ct.Port{{
				Port:  8080,
				Proto: "tcp",
				Service: &host.Service{
					Name:   proc.Service,
					Create: true,
					Check:  &host.HealthCheck{Type: "tcp"},
				},
			}}
		}
		procs[t] = proc
	}
	release.Processes = procs
	release.Env["SLUG_URL"] = slugURL

	if err := client.CreateRelease(release); err != nil {
		log.Fatalln("Error creating release:", err)
	}
	if err := client.DeployAppRelease(app.Name, release.ID); err != nil {
		log.Fatalln("Error deploying app release:", err)
	}
	fmt.Println("=====> Application deployed")

	if needsDefaultScale(app.ID, prevRelease.ID, procs, client) {
		formation := &ct.Formation{
			AppID:     app.ID,
			ReleaseID: release.ID,
			Processes: map[string]int{"web": 1},
		}

		watcher, err := client.WatchJobEvents(app.ID, release.ID)
		if err != nil {
			log.Fatalln("Error streaming job events:", err)
		}
		defer watcher.Close()

		if err := client.PutFormation(formation); err != nil {
			log.Fatalln("Error putting formation:", err)
		}
		fmt.Println("=====> Waiting for web job to start...")

		err = watcher.WaitFor(ct.JobEvents{"web": ct.JobUpEvents(1)}, scaleTimeout, func(e *ct.Job) error {
			switch e.State {
			case ct.JobStateUp:
				fmt.Println("=====> Default web formation scaled to 1")
			case ct.JobStateDown:
				return fmt.Errorf("failed to scale web process type")
			}
			return nil
		})
		if err != nil {
			log.Fatalln(err.Error())
		}
	}
}
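// Hypothetical sketch of the parsePairs helper used by main above, inferred
// from its call sites: it collects repeated "key=val" options (e.g. --env)
// into a map. This assumes the docopt fork exposes repeated options as a
// []string via args.All; the real implementation may differ.
func parsePairs(args *docopt.Args, arg string) (map[string]string, error) {
	pairs := make(map[string]string)
	for _, s := range args.All[arg].([]string) {
		kv := strings.SplitN(s, "=", 2)
		if len(kv) != 2 {
			return nil, fmt.Errorf("invalid %s format: %q", arg, s)
		}
		pairs[kv[0]] = kv[1]
	}
	return pairs, nil
}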
func (d *DeployJob) deployOneByOneWithWaitFn(waitJobs WaitJobsFn) error {
	log := d.logger.New("fn", "deployOneByOne")
	log.Info("starting one-by-one deployment")

	oldScale := make(map[string]int, len(d.oldReleaseState))
	for typ, count := range d.oldReleaseState {
		oldScale[typ] = count
		if d.isOmni(typ) {
			oldScale[typ] /= d.hostCount
		}
	}

	newScale := make(map[string]int, len(d.newReleaseState))
	for typ, count := range d.newReleaseState {
		newScale[typ] = count
		if d.isOmni(typ) {
			newScale[typ] /= d.hostCount
		}
	}

	processTypes := make([]string, 0, len(d.Processes))
	for typ := range d.Processes {
		processTypes = append(processTypes, typ)
	}
	sort.Strings(processTypes)

	olog := log.New("release_id", d.OldReleaseID)
	nlog := log.New("release_id", d.NewReleaseID)

	for _, typ := range processTypes {
		num := d.Processes[typ]
		// don't scale processes which no longer exist in the new release
		if _, ok := d.newRelease.Processes[typ]; !ok {
			num = 0
		}
		diff := 1
		if d.isOmni(typ) {
			diff = d.hostCount
		}

		for i := newScale[typ]; i < num; i++ {
			nlog.Info("scaling new formation up by one", "type", typ)
			newScale[typ]++
			if err := d.client.PutFormation(&ct.Formation{
				AppID:     d.AppID,
				ReleaseID: d.NewReleaseID,
				Processes: newScale,
			}); err != nil {
				nlog.Error("error scaling new formation up by one", "type", typ, "err", err)
				return err
			}

			nlog.Info(fmt.Sprintf("waiting for %d job up event(s)", diff), "type", typ)
			if err := waitJobs(d.NewReleaseID, ct.JobEvents{typ: ct.JobUpEvents(diff)}, nlog); err != nil {
				nlog.Error("error waiting for job up events", "err", err)
				return err
			}

			olog.Info("scaling old formation down by one", "type", typ)
			oldScale[typ]--
			if err := d.client.PutFormation(&ct.Formation{
				AppID:     d.AppID,
				ReleaseID: d.OldReleaseID,
				Processes: oldScale,
			}); err != nil {
				olog.Error("error scaling old formation down by one", "type", typ, "err", err)
				return err
			}

			olog.Info(fmt.Sprintf("waiting for %d job down event(s)", diff), "type", typ)
			if err := waitJobs(d.OldReleaseID, ct.JobEvents{typ: ct.JobDownEvents(diff)}, olog); err != nil {
				olog.Error("error waiting for job down events", "err", err)
				return err
			}
		}
	}

	// ensure any old leftover jobs are stopped (this can happen when new
	// workers continue deployments from old workers and still see the
	// old worker running even though it has been scaled down), returning
	// ErrSkipRollback if an error occurs (rolling back doesn't make a ton
	// of sense because it involves stopping the new working jobs).
	log.Info("ensuring old formation is scaled down to zero")
	diff := make(ct.JobEvents, len(oldScale))
	for typ, count := range oldScale {
		if count > 0 {
			diff[typ] = ct.JobDownEvents(count)
		}
	}
	if err := d.client.PutFormation(&ct.Formation{
		AppID:     d.AppID,
		ReleaseID: d.OldReleaseID,
	}); err != nil {
		log.Error("error scaling old formation down to zero", "err", err)
		return ErrSkipRollback{err.Error()}
	}

	// treat the deployment as finished now (rather than potentially
	// waiting for the jobs to actually stop) as we can trust that the
	// scheduler will actually kill the jobs, so no need to delay the
	// deployment.
	log.Info("finished one-by-one deployment")
	return nil
}
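// Assumed shape of the pluggable WaitJobsFn used by deployOneByOneWithWaitFn,
// inferred from the calls above (the actual declaration lives elsewhere in
// the package). In production it would be backed by d.waitForJobEvents,
// while tests can substitute a stub that records the expected events.
type WaitJobsFn func(releaseID string, expected ct.JobEvents, log log15.Logger) error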
func (d *DeployJob) deployAllAtOnce() error {
	log := d.logger.New("fn", "deployAllAtOnce")
	log.Info("starting all-at-once deployment")

	expected := make(ct.JobEvents)
	newProcs := make(map[string]int, len(d.Processes))
	for typ, n := range d.Processes {
		// ignore processes which no longer exist in the new
		// release
		if _, ok := d.newRelease.Processes[typ]; !ok {
			continue
		}
		newProcs[typ] = n
		total := n
		if d.isOmni(typ) {
			total *= d.hostCount
		}
		existing := d.newReleaseState[typ]
		if total > existing {
			expected[typ] = ct.JobUpEvents(total - existing)
		}
	}

	if expected.Count() > 0 {
		log := log.New("release_id", d.NewReleaseID)
		log.Info("creating new formation", "processes", newProcs)
		if err := d.client.PutFormation(&ct.Formation{
			AppID:     d.AppID,
			ReleaseID: d.NewReleaseID,
			Processes: newProcs,
		}); err != nil {
			log.Error("error creating new formation", "err", err)
			return err
		}

		log.Info("waiting for job events", "expected", expected)
		if err := d.waitForJobEvents(d.NewReleaseID, expected, log); err != nil {
			log.Error("error waiting for job events", "err", err)
			return err
		}
	}

	expected = make(ct.JobEvents)
	for typ := range d.Processes {
		if existing := d.oldReleaseState[typ]; existing > 0 {
			expected[typ] = ct.JobDownEvents(existing)
		}
	}

	log = log.New("release_id", d.OldReleaseID)
	log.Info("scaling old formation to zero")
	if err := d.client.PutFormation(&ct.Formation{
		AppID:     d.AppID,
		ReleaseID: d.OldReleaseID,
	}); err != nil {
		// the new jobs have now started and they are up, so return
		// ErrSkipRollback (rolling back doesn't make a ton of sense
		// because it involves stopping the new working jobs).
		log.Error("error scaling old formation to zero", "err", err)
		return ErrSkipRollback{err.Error()}
	}

	// treat the deployment as finished now (rather than waiting for the
	// jobs to actually stop) as we can trust that the scheduler will
	// actually kill the jobs, so no need to delay the deployment.
	log.Info("finished all-at-once deployment")
	return nil
}
func (d *DeployJob) deployOneByOneWithWaitFn(waitJobs WaitJobsFn) error {
	log := d.logger.New("fn", "deployOneByOne")
	log.Info("starting one-by-one deployment")

	oldScale := make(map[string]int, len(d.oldReleaseState))
	for typ, count := range d.oldReleaseState {
		oldScale[typ] = count
		if d.isOmni(typ) {
			oldScale[typ] /= d.hostCount
		}
	}

	newScale := make(map[string]int, len(d.newReleaseState))
	for typ, count := range d.newReleaseState {
		newScale[typ] = count
		if d.isOmni(typ) {
			newScale[typ] /= d.hostCount
		}
	}

	processTypes := make([]string, 0, len(d.Processes))
	for typ := range d.Processes {
		processTypes = append(processTypes, typ)
	}
	sort.Strings(processTypes)

	olog := log.New("release_id", d.OldReleaseID)
	nlog := log.New("release_id", d.NewReleaseID)

	for _, typ := range processTypes {
		num := d.Processes[typ]
		diff := 1
		if d.isOmni(typ) {
			diff = d.hostCount
		}

		for i := newScale[typ]; i < num; i++ {
			nlog.Info("scaling new formation up by one", "type", typ)
			newScale[typ]++
			if err := d.client.PutFormation(&ct.Formation{
				AppID:     d.AppID,
				ReleaseID: d.NewReleaseID,
				Processes: newScale,
			}); err != nil {
				nlog.Error("error scaling new formation up by one", "type", typ, "err", err)
				return err
			}
			for j := 0; j < diff; j++ {
				d.deployEvents <- ct.DeploymentEvent{
					ReleaseID: d.NewReleaseID,
					JobState:  ct.JobStateStarting,
					JobType:   typ,
				}
			}

			nlog.Info(fmt.Sprintf("waiting for %d job up event(s)", diff), "type", typ)
			if err := waitJobs(d.NewReleaseID, ct.JobEvents{typ: ct.JobUpEvents(diff)}, nlog); err != nil {
				nlog.Error("error waiting for job up events", "err", err)
				return err
			}

			olog.Info("scaling old formation down by one", "type", typ)
			oldScale[typ]--
			if err := d.client.PutFormation(&ct.Formation{
				AppID:     d.AppID,
				ReleaseID: d.OldReleaseID,
				Processes: oldScale,
			}); err != nil {
				olog.Error("error scaling old formation down by one", "type", typ, "err", err)
				return err
			}
			for j := 0; j < diff; j++ {
				d.deployEvents <- ct.DeploymentEvent{
					ReleaseID: d.OldReleaseID,
					JobState:  ct.JobStateStopping,
					JobType:   typ,
				}
			}

			olog.Info(fmt.Sprintf("waiting for %d job down event(s)", diff), "type", typ)
			if err := waitJobs(d.OldReleaseID, ct.JobEvents{typ: ct.JobDownEvents(diff)}, olog); err != nil {
				olog.Error("error waiting for job down events", "err", err)
				return err
			}
		}
	}

	// ensure any old leftover jobs are stopped (this can happen when new
	// workers continue deployments from old workers and still see the
	// old worker running even though it has been scaled down).
	log.Info("ensuring old formation is scaled down to zero")
	diff := make(ct.JobEvents, len(oldScale))
	for typ, count := range oldScale {
		diff[typ] = ct.JobDownEvents(count)
	}
	if err := d.client.PutFormation(&ct.Formation{
		AppID:     d.AppID,
		ReleaseID: d.OldReleaseID,
	}); err != nil {
		log.Error("error scaling old formation down to zero", "err", err)
		return err
	}
	if diff.Count() > 0 {
		log.Info(fmt.Sprintf("waiting for %d job down event(s)", diff.Count()))
		if err := d.waitForJobEvents(d.OldReleaseID, diff, log); err != nil {
			log.Error("error waiting for job down events", "err", err)
			return err
		}
	}

	log.Info("finished one-by-one deployment")
	return nil
}
func (d *DeployJob) deployAllAtOnce() error {
	log := d.logger.New("fn", "deployAllAtOnce")
	log.Info("starting all-at-once deployment")

	expected := make(ct.JobEvents)
	for typ, n := range d.Processes {
		total := n
		if d.isOmni(typ) {
			total *= d.hostCount
		}
		existing := d.newReleaseState[typ]
		for i := existing; i < total; i++ {
			d.deployEvents <- ct.DeploymentEvent{
				ReleaseID: d.NewReleaseID,
				JobState:  ct.JobStateStarting,
				JobType:   typ,
			}
		}
		if total > existing {
			expected[typ] = ct.JobUpEvents(total - existing)
		}
	}

	if expected.Count() > 0 {
		log := log.New("release_id", d.NewReleaseID)
		log.Info("creating new formation", "processes", d.Processes)
		if err := d.client.PutFormation(&ct.Formation{
			AppID:     d.AppID,
			ReleaseID: d.NewReleaseID,
			Processes: d.Processes,
		}); err != nil {
			log.Error("error creating new formation", "err", err)
			return err
		}

		log.Info("waiting for job events", "expected", expected)
		if err := d.waitForJobEvents(d.NewReleaseID, expected, log); err != nil {
			log.Error("error waiting for job events", "err", err)
			return err
		}
	}

	expected = make(ct.JobEvents)
	for typ := range d.Processes {
		existing := d.oldReleaseState[typ]
		for i := 0; i < existing; i++ {
			d.deployEvents <- ct.DeploymentEvent{
				ReleaseID: d.OldReleaseID,
				JobState:  ct.JobStateStopping,
				JobType:   typ,
			}
		}
		if existing > 0 {
			expected[typ] = ct.JobDownEvents(existing)
		}
	}

	log = log.New("release_id", d.OldReleaseID)
	log.Info("scaling old formation to zero")
	if err := d.client.PutFormation(&ct.Formation{
		AppID:     d.AppID,
		ReleaseID: d.OldReleaseID,
	}); err != nil {
		log.Error("error scaling old formation to zero", "err", err)
		return err
	}

	if expected.Count() > 0 {
		log.Info("waiting for job events", "expected", expected)
		if err := d.waitForJobEvents(d.OldReleaseID, expected, log); err != nil {
			log.Error("error waiting for job events", "err", err)
			// we have started the new jobs (and they are up) and
			// requested that the old jobs stop. at this point there's
			// not much more we can do: rolling back doesn't make a
			// ton of sense because it involves stopping the new
			// (working) jobs.
			return ErrSkipRollback{err.Error()}
		}
	}

	log.Info("finished all-at-once deployment")
	return nil
}
func (s *SchedulerSuite) TestGracefulShutdown(t *c.C) {
	app, release := s.createApp(t)
	client := s.controllerClient(t)

	debug(t, "scaling to blocker=1")
	watcher, err := client.WatchJobEvents(app.ID, release.ID)
	t.Assert(err, c.IsNil)
	defer watcher.Close()
	t.Assert(client.PutFormation(&ct.Formation{
		AppID:     app.ID,
		ReleaseID: release.ID,
		Processes: map[string]int{"blocker": 1},
	}), c.IsNil)
	var jobID string
	err = watcher.WaitFor(ct.JobEvents{"blocker": ct.JobUpEvents(1)}, scaleTimeout, func(job *ct.Job) error {
		jobID = job.ID
		return nil
	})
	t.Assert(err, c.IsNil)
	jobs, err := s.discoverdClient(t).Instances("test-http-blocker", 10*time.Second)
	t.Assert(err, c.IsNil)
	t.Assert(jobs, c.HasLen, 1)
	jobAddr := jobs[0].Addr

	debug(t, "subscribing to backend events from all routers")
	routers, err := s.discoverdClient(t).Instances("router-api", 10*time.Second)
	t.Assert(err, c.IsNil)
	routerEvents := make(chan *router.StreamEvent)
	for _, r := range routers {
		events := make(chan *router.StreamEvent)
		stream, err := routerc.NewWithAddr(r.Addr).StreamEvents(&router.StreamEventsOptions{
			EventTypes: []router.EventType{
				router.EventTypeBackendUp,
				router.EventTypeBackendDown,
				router.EventTypeBackendDrained,
			},
		}, events)
		t.Assert(err, c.IsNil)
		defer stream.Close()
		go func(r *discoverd.Instance) {
			for event := range events {
				if event.Backend != nil && event.Backend.JobID == jobID {
					debugf(t, "got %s router event from %s", event.Event, r.Host())
					routerEvents <- event
				}
			}
		}(r)
	}

	debug(t, "adding HTTP route with backend drain enabled")
	route := &router.HTTPRoute{
		Domain:        random.String(32) + ".com",
		Service:       "test-http-blocker",
		DrainBackends: true,
	}
	t.Assert(client.CreateRoute(app.ID, route.ToRoute()), c.IsNil)

	waitForRouterEvents := func(typ router.EventType) {
		debugf(t, "waiting for %d router %s events", len(routers), typ)
		count := 0
		for {
			select {
			case event := <-routerEvents:
				if event.Event != typ {
					t.Fatalf("expected %s router event, got %s", typ, event.Event)
				}
				count++
				if count == len(routers) {
					return
				}
			case <-time.After(30 * time.Second):
				t.Fatalf("timed out waiting for router %s events", typ)
			}
		}
	}
	waitForRouterEvents(router.EventTypeBackendUp)

	debug(t, "making blocked HTTP request through each router")
	reqErrs := make(chan error)
	for _, r := range routers {
		req, err := http.NewRequest("GET", "http://"+r.Host()+"/block", nil)
		t.Assert(err, c.IsNil)
		req.Host = route.Domain
		res, err := http.DefaultClient.Do(req)
		t.Assert(err, c.IsNil)
		t.Assert(res.StatusCode, c.Equals, http.StatusOK)
		go func() {
			defer res.Body.Close()
			data, err := ioutil.ReadAll(res.Body)
			if err == nil && !bytes.Equal(data, []byte("done")) {
				err = fmt.Errorf("unexpected response: %q", data)
			}
			reqErrs <- err
		}()
	}

	debug(t, "scaling to blocker=0")
	t.Assert(client.PutFormation(&ct.Formation{
		AppID:     app.ID,
		ReleaseID: release.ID,
		Processes: map[string]int{"blocker": 0},
	}), c.IsNil)
	t.Assert(watcher.WaitFor(ct.JobEvents{"blocker": {ct.JobStateStopping: 1}}, scaleTimeout, nil), c.IsNil)
	waitForRouterEvents(router.EventTypeBackendDown)

	debug(t, "checking new HTTP requests return 503")
	for _, r := range routers {
		req, err := http.NewRequest("GET", "http://"+r.Host()+"/ping", nil)
		t.Assert(err, c.IsNil)
		req.Host = route.Domain
		res, err := http.DefaultClient.Do(req)
		t.Assert(err, c.IsNil)
		res.Body.Close()
		t.Assert(res.StatusCode, c.Equals, http.StatusServiceUnavailable)
	}

	debug(t, "checking blocked HTTP requests are still blocked")
	select {
	case err := <-reqErrs:
		t.Fatal(err)
	default:
	}

	debug(t, "unblocking HTTP requests")
	res, err := http.Get("http://" + jobAddr + "/unblock")
	t.Assert(err, c.IsNil)
	t.Assert(res.StatusCode, c.Equals, http.StatusOK)

	debug(t, "checking the blocked HTTP requests completed without error")
	for range routers {
		if err := <-reqErrs; err != nil {
			t.Fatal(err)
		}
	}
	waitForRouterEvents(router.EventTypeBackendDrained)

	debug(t, "waiting for the job to exit")
	t.Assert(watcher.WaitFor(ct.JobEvents{"blocker": ct.JobDownEvents(1)}, scaleTimeout, nil), c.IsNil)
}