Example #1
func waitForJobEvents(t *c.C, events chan *ct.JobEvent, expected jobEvents) (lastID int64, jobID string) {
	debugf(t, "waiting for job events: %v", expected)
	actual := make(jobEvents)
	for {
	inner:
		select {
		case event := <-events:
			debug(t, "got job event: ", event.Type, event.JobID, event.State)
			lastID = event.ID
			jobID = event.JobID
			if _, ok := actual[event.Type]; !ok {
				actual[event.Type] = make(map[string]int)
			}
			switch event.State {
			case "up":
				actual[event.Type]["up"] += 1
			case "down", "crashed":
				actual[event.Type]["down"] += 1
			default:
				break inner
			}
			if jobEventsEqual(expected, actual) {
				return
			}
		case <-time.After(60 * time.Second):
			t.Fatal("timed out waiting for job events: ", expected)
		}
	}
}
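Most of these helpers share one shape: drain events from a channel inside a for/select loop, tally what has arrived, and give up via time.After. Below is a minimal, self-contained sketch of that pattern, assuming a made-up Event type and returned errors instead of the test framework's t.Fatal; it is not Flynn code, just the loop reduced to its essentials (note that, as in waitForJobEvents, the timeout restarts on every received event).

package main

import (
	"fmt"
	"time"
)

// Event is a stand-in for the *ct.JobEvent values used by the real helpers.
type Event struct {
	Type  string
	State string
}

// waitForEvents drains events until every type has reached its wanted "up"
// count, or gives up after the timeout.
func waitForEvents(events <-chan Event, want map[string]int, timeout time.Duration) error {
	got := make(map[string]int)
	for {
		select {
		case e, ok := <-events:
			if !ok {
				return fmt.Errorf("event channel closed unexpectedly")
			}
			if e.State == "up" {
				got[e.Type]++
			}
			done := true
			for typ, n := range want {
				if got[typ] < n {
					done = false
					break
				}
			}
			if done {
				return nil
			}
		case <-time.After(timeout):
			return fmt.Errorf("timed out waiting for events: %v", want)
		}
	}
}

func main() {
	events := make(chan Event, 2)
	events <- Event{Type: "web", State: "up"}
	events <- Event{Type: "web", State: "up"}
	if err := waitForEvents(events, map[string]int{"web": 2}, time.Second); err != nil {
		fmt.Println("error:", err)
		return
	}
	fmt.Println("all expected events received")
}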
Example #2
func waitForDeploymentEvents(t *c.C, stream chan *ct.DeploymentEvent, expected []*ct.DeploymentEvent) {
	debugf(t, "waiting for %d deployment events", len(expected))
	actual := make([]*ct.DeploymentEvent, 0, len(expected))
loop:
	for {
		select {
		case e, ok := <-stream:
			if !ok {
				t.Fatal("unexpected close of deployment event stream")
			}
			actual = append(actual, e)
			if e.Status == "complete" || e.Status == "failed" {
				debugf(t, "got deployment event: %s", e.Status)
				break loop
			}
			debugf(t, "got deployment event: %s %s", e.JobType, e.JobState)
		case <-time.After(60 * time.Second):
			t.Fatal("timed out waiting for deployment event")
		}
	}
	compare := func(t *c.C, i *ct.DeploymentEvent, j *ct.DeploymentEvent) {
		t.Assert(i.ReleaseID, c.Equals, j.ReleaseID)
		t.Assert(i.JobType, c.Equals, j.JobType)
		t.Assert(i.JobState, c.Equals, j.JobState)
		t.Assert(i.Status, c.Equals, j.Status)
		t.Assert(i.Error, c.Equals, j.Error)
	}

	for i, e := range expected {
		compare(t, actual[i], e)
	}
}
Example #3
func (s *HostSuite) TestResourceLimits(t *c.C) {
	cmd := exec.JobUsingCluster(
		s.clusterClient(t),
		exec.DockerImage(imageURIs["test-apps"]),
		&host.Job{
			Config:    host.ContainerConfig{Cmd: []string{"sh", "-c", resourceCmd}},
			Resources: testResources(),
		},
	)
	var out bytes.Buffer
	cmd.Stdout = &out

	runErr := make(chan error)
	go func() {
		runErr <- cmd.Run()
	}()
	select {
	case err := <-runErr:
		t.Assert(err, c.IsNil)
	case <-time.After(30 * time.Second):
		t.Fatal("timed out waiting for resource limits job")
	}

	assertResourceLimits(t, out.String())
}
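TestResourceLimits (and several of the host tests that follow) run a blocking cmd.Run in a goroutine so the test can select between the result and a timeout. Here is a standalone sketch of that shape using the standard os/exec package rather than Flynn's cluster-aware exec; the buffered error channel and the Kill on timeout are my additions, not part of the original tests.

package main

import (
	"fmt"
	"os/exec"
	"time"
)

// runWithTimeout starts cmd in a goroutine and selects between its result
// and a timeout, the same shape as the runErr channel in TestResourceLimits.
func runWithTimeout(cmd *exec.Cmd, timeout time.Duration) error {
	runErr := make(chan error, 1) // buffered so the goroutine never blocks
	go func() {
		runErr <- cmd.Run()
	}()
	select {
	case err := <-runErr:
		return err
	case <-time.After(timeout):
		if cmd.Process != nil {
			cmd.Process.Kill() // best effort: don't leave the process lingering
		}
		return fmt.Errorf("timed out after %s waiting for %s", timeout, cmd.Path)
	}
}

func main() {
	if err := runWithTimeout(exec.Command("sleep", "5"), time.Second); err != nil {
		fmt.Println("error:", err)
		return
	}
	fmt.Println("command finished in time")
}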
Example #4
func (s *HostSuite) TestAttachFinishedInteractiveJob(t *c.C) {
	cluster := s.clusterClient(t)

	// run a quick interactive job
	cmd := exec.CommandUsingCluster(cluster, exec.DockerImage(imageURIs["test-apps"]), "/bin/true")
	cmd.TTY = true
	runErr := make(chan error)
	go func() {
		runErr <- cmd.Run()
	}()
	select {
	case err := <-runErr:
		t.Assert(err, c.IsNil)
	case <-time.After(30 * time.Second):
		t.Fatal("timed out waiting for interactive job")
	}

	h, err := cluster.Host(cmd.HostID)
	t.Assert(err, c.IsNil)

	// Getting the logs for the job should fail, as it has none because it was
	// interactive
	attachErr := make(chan error)
	go func() {
		_, err = h.Attach(&host.AttachReq{JobID: cmd.Job.ID, Flags: host.AttachFlagLogs}, false)
		attachErr <- err
	}()
	select {
	case err := <-attachErr:
		t.Assert(err, c.NotNil)
	case <-time.After(time.Second):
		t.Error("timed out waiting for attach")
	}
}
Example #5
func (s *SchedulerSuite) TestTCPApp(t *c.C) {
	app, _ := s.createApp(t)

	t.Assert(flynn(t, "/", "-a", app.Name, "scale", "echoer=1"), Succeeds)

	newRoute := flynn(t, "/", "-a", app.Name, "route", "add", "tcp", "-s", "echo-service")
	t.Assert(newRoute, Succeeds)
	t.Assert(newRoute.Output, Matches, `.+ on port \d+`)
	str := strings.Split(strings.TrimSpace(string(newRoute.Output)), " ")
	port := str[len(str)-1]

	// use Attempts to give the processes time to start
	if err := Attempts.Run(func() error {
		servAddr := routerIP + ":" + port
		conn, err := net.Dial("tcp", servAddr)
		if err != nil {
			return err
		}
		defer conn.Close()
		msg := []byte("hello there!\n")
		_, err = conn.Write(msg)
		if err != nil {
			return err
		}
		reply := make([]byte, len(msg))
		_, err = conn.Read(reply)
		if err != nil {
			return err
		}
		t.Assert(reply, c.DeepEquals, msg)
		return nil
	}); err != nil {
		t.Fatal(err)
	}
}
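Attempts here is the retry strategy from the test harness; it keeps retrying the closure until the echoer process is actually accepting connections. As a rough stand-in (a sketch, not the harness code), the loop below retries a TCP echo with a fixed delay; the address, retry count, and delay are invented for illustration.

package main

import (
	"fmt"
	"io"
	"net"
	"time"
)

// dialEcho retries a TCP echo round-trip until it succeeds or the retry
// budget runs out, standing in for the Attempts.Run closure above.
func dialEcho(addr string, msg []byte, retries int, delay time.Duration) error {
	var lastErr error
	for i := 0; i < retries; i++ {
		if err := tryEcho(addr, msg); err != nil {
			lastErr = err
			time.Sleep(delay)
			continue
		}
		return nil
	}
	return fmt.Errorf("echo to %s failed after %d attempts: %v", addr, retries, lastErr)
}

func tryEcho(addr string, msg []byte) error {
	conn, err := net.Dial("tcp", addr)
	if err != nil {
		return err
	}
	defer conn.Close()
	if _, err := conn.Write(msg); err != nil {
		return err
	}
	reply := make([]byte, len(msg))
	if _, err := io.ReadFull(conn, reply); err != nil { // ReadFull avoids a short read
		return err
	}
	if string(reply) != string(msg) {
		return fmt.Errorf("unexpected reply %q", reply)
	}
	return nil
}

func main() {
	if err := dialEcho("127.0.0.1:4500", []byte("hello there!\n"), 5, time.Second); err != nil {
		fmt.Println("error:", err)
	}
}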
Example #6
// TestAppEvents checks that streaming events for an app only receives events
// for that particular app.
func (s *ControllerSuite) TestAppEvents(t *c.C) {
	client := s.controllerClient(t)
	app1, release1 := s.createApp(t)
	app2, release2 := s.createApp(t)

	// stream events for app1
	events := make(chan *ct.Job)
	stream, err := client.StreamJobEvents(app1.ID, events)
	t.Assert(err, c.IsNil)
	defer stream.Close()

	runJob := func(appID, releaseID string) {
		rwc, err := client.RunJobAttached(appID, &ct.NewJob{
			ReleaseID:  releaseID,
			Cmd:        []string{"/bin/true"},
			DisableLog: true,
		})
		t.Assert(err, c.IsNil)
		rwc.Close()
	}

	// generate events for app2 and wait for them
	watcher, err := client.WatchJobEvents(app2.ID, release2.ID)
	t.Assert(err, c.IsNil)
	defer watcher.Close()
	runJob(app2.ID, release2.ID)
	t.Assert(watcher.WaitFor(
		ct.JobEvents{"": {ct.JobStateUp: 1, ct.JobStateDown: 1}},
		10*time.Second,
		func(e *ct.Job) error {
			debugf(t, "got %s job event for app2", e.State)
			return nil
		},
	), c.IsNil)

	// generate events for app1
	runJob(app1.ID, release1.ID)

	// check the stream only gets events for app1
	for {
		select {
		case e, ok := <-events:
			if !ok {
				t.Fatal("unexpected close of job event stream")
			}
			t.Assert(e.AppID, c.Equals, app1.ID)
			debugf(t, "got %s job event for app1", e.State)
			if e.State == ct.JobStateDown {
				return
			}
		case <-time.After(10 * time.Second):
			t.Fatal("timed out waiting for job events for app1")
		}
	}
}
Example #7
func waitForJobRestart(t *c.C, events chan *ct.JobEvent, typ string, timeout time.Duration) string {
	debug(t, "waiting for job restart")
	for {
		select {
		case event := <-events:
			debug(t, "got job event: ", event.Type, event.JobID, event.State)
			if event.Type == typ && event.State == "up" {
				return event.JobID
			}
		case <-time.After(timeout):
			t.Fatal("timed out waiting for job restart")
		}
	}
}
Example #8
func (s *HostSuite) TestUpdateTags(t *c.C) {
	events := make(chan *discoverd.Event)
	stream, err := s.discoverdClient(t).Service("flynn-host").Watch(events)
	t.Assert(err, c.IsNil)
	defer stream.Close()

	nextEvent := func() *discoverd.Event {
		select {
		case e, ok := <-events:
			if !ok {
				t.Fatal("unexpected close of discoverd stream")
			}
			return e
		case <-time.After(10 * time.Second):
			t.Fatal("timed out waiting for discoverd event")
		}
		return nil
	}

	var client *cluster.Host
	for {
		e := nextEvent()
		if e.Kind == discoverd.EventKindUp && client == nil {
			client = cluster.NewHost(e.Instance.Meta["id"], e.Instance.Addr, nil)
		}
		if e.Kind == discoverd.EventKindCurrent {
			break
		}
	}
	if client == nil {
		t.Fatal("did not initialize flynn-host client")
	}

	t.Assert(client.UpdateTags(map[string]string{"foo": "bar"}), c.IsNil)

	var meta map[string]string
	for {
		e := nextEvent()
		if e.Kind == discoverd.EventKindUpdate && e.Instance.Meta["id"] == client.ID() {
			meta = e.Instance.Meta
			break
		}
	}
	t.Assert(meta["tag:foo"], c.Equals, "bar")

	// setting to empty string should delete the tag
	t.Assert(client.UpdateTags(map[string]string{"foo": ""}), c.IsNil)

	for {
		e := nextEvent()
		if e.Kind == discoverd.EventKindUpdate && e.Instance.Meta["id"] == client.ID() {
			meta = e.Instance.Meta
			break
		}
	}
	if _, ok := meta["tag:foo"]; ok {
		t.Fatal("expected tag to be deleted but is still present")
	}
}
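The nextEvent closure above is a small but useful trick: it hides the receive-or-timeout select so the body of the test reads as straight-line event handling. A self-contained sketch of that helper, assuming a plain string channel and returned errors in place of t.Fatal:

package main

import (
	"errors"
	"fmt"
	"time"
)

// makeNextEvent wraps the "receive one event or give up" select from
// TestUpdateTags in a reusable closure.
func makeNextEvent(events <-chan string, timeout time.Duration) func() (string, error) {
	return func() (string, error) {
		select {
		case e, ok := <-events:
			if !ok {
				return "", errors.New("unexpected close of event stream")
			}
			return e, nil
		case <-time.After(timeout):
			return "", errors.New("timed out waiting for event")
		}
	}
}

func main() {
	events := make(chan string, 1)
	events <- "up"
	next := makeNextEvent(events, time.Second)
	e, err := next()
	fmt.Println(e, err)
	_, err = next() // nothing buffered, so this times out after a second
	fmt.Println(err)
}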
Example #9
func (s *CLISuite) TestRun(t *c.C) {
	app := s.newCliTestApp(t)
	defer app.cleanup()

	// this shouldn't be logged
	t.Assert(app.sh("echo foo"), Outputs, "foo\n")
	// drain the events
	app.waitFor(ct.JobEvents{"": {ct.JobStateUp: 1, ct.JobStateDown: 1}})

	// this should be logged due to the --enable-log flag
	t.Assert(app.flynn("run", "--enable-log", "echo", "hello"), Outputs, "hello\n")
	app.waitFor(ct.JobEvents{"": {ct.JobStateUp: 1, ct.JobStateDown: 1}})

	detached := app.flynn("run", "-d", "echo", "world")
	t.Assert(detached, Succeeds)
	t.Assert(detached, c.Not(Outputs), "world\n")

	id := strings.TrimSpace(detached.Output)
	jobID := app.waitFor(ct.JobEvents{"": {ct.JobStateUp: 1, ct.JobStateDown: 1}})
	t.Assert(jobID, c.Equals, id)
	t.Assert(app.flynn("log", "--raw-output"), Outputs, "hello\nworld\n")

	// test stdin and stderr
	streams := app.flynnCmd("run", "sh", "-c", "cat 1>&2")
	stdin, err := streams.StdinPipe()
	t.Assert(err, c.IsNil)
	go func() {
		stdin.Write([]byte("goto stderr"))
		stdin.Close()
	}()
	var stderr bytes.Buffer
	var stdout bytes.Buffer
	streams.Stderr = &stderr
	streams.Stdout = &stdout
	t.Assert(streams.Run(), c.IsNil)
	t.Assert(stderr.String(), c.Equals, "goto stderr")
	t.Assert(stdout.String(), c.Equals, "")

	// test exit code
	exit := app.sh("exit 42")
	t.Assert(exit, c.Not(Succeeds))
	if msg, ok := exit.Err.(*exec.ExitError); ok { // the command exited with a status code we can inspect
		code := msg.Sys().(syscall.WaitStatus).ExitStatus()
		t.Assert(code, c.Equals, 42)
	} else {
		t.Fatal("There was no error code!")
	}
}
Example #10
func (s *HostSuite) TestAddFailingJob(t *c.C) {
	// get a host and watch events
	hosts, err := s.clusterClient(t).Hosts()
	t.Assert(err, c.IsNil)
	t.Assert(hosts, c.Not(c.HasLen), 0)
	h := hosts[0]
	jobID := random.UUID()
	events := make(chan *host.Event)
	stream, err := h.StreamEvents(jobID, events)
	t.Assert(err, c.IsNil)
	defer stream.Close()

	// add a job with a non existent partition
	job := &host.Job{
		ID: jobID,
		ImageArtifact: &host.Artifact{
			Type: host.ArtifactTypeDocker,
			URI:  "http://example.com?name=foo&id=bar",
		},
		Partition: "nonexistent",
	}
	t.Assert(h.AddJob(job), c.IsNil)

	// check we get a create then error event
	actual := make([]*host.Event, 0, 2)
loop:
	for {
		select {
		case e, ok := <-events:
			if !ok {
				t.Fatalf("job event stream closed unexpectedly: %s", stream.Err())
			}
			actual = append(actual, e)
			if len(actual) >= 2 {
				break loop
			}
		case <-time.After(30 * time.Second):
			t.Fatal("timed out waiting for job event")
		}
	}
	t.Assert(actual, c.HasLen, 2)
	t.Assert(actual[0].Event, c.Equals, host.JobEventCreate)
	t.Assert(actual[1].Event, c.Equals, host.JobEventError)
	jobErr := actual[1].Job.Error
	t.Assert(jobErr, c.NotNil)
	t.Assert(*jobErr, c.Equals, `host: invalid job partition "nonexistent"`)
}
Example #11
func waitForJobRestart(t *c.C, stream stream.Stream, events chan *ct.JobEvent, typ string, timeout time.Duration) string {
	debug(t, "waiting for job restart")
	for {
		select {
		case event, ok := <-events:
			if !ok {
				t.Fatalf("job event stream closed: %s", stream.Err())
			}
			debug(t, "got job event: ", event.Type, event.JobID, event.State)
			if event.Type == typ && event.State == "up" {
				return event.JobID
			}
		case <-time.After(timeout):
			t.Fatal("timed out waiting for job restart")
		}
	}
}
Example #12
func (s *CLISuite) TestLimits(t *c.C) {
	app := s.newCliTestApp(t)
	t.Assert(app.flynn("limit", "set", "resources", "memory=512MB", "max_fd=12k"), Succeeds)

	release, err := s.controller.GetAppRelease(app.name)
	t.Assert(err, c.IsNil)
	proc, ok := release.Processes["resources"]
	if !ok {
		t.Fatal("missing resources process type")
	}
	r := proc.Resources
	t.Assert(*r[resource.TypeMemory].Limit, c.Equals, int64(536870912))
	t.Assert(*r[resource.TypeMaxFD].Limit, c.Equals, int64(12000))

	cmd := app.flynn("limit", "-t", "resources")
	t.Assert(cmd, Succeeds)
	t.Assert(cmd, OutputContains, "memory=512MB")
	t.Assert(cmd, OutputContains, "max_fd=12000")
}
Example #13
func (s *HostSuite) TestSignalJob(t *c.C) {
	cluster := s.clusterClient(t)

	// pick a host to run the job on
	hosts, err := cluster.Hosts()
	t.Assert(err, c.IsNil)
	client := schedutil.PickHost(hosts)

	// start a signal-service job
	cmd := exec.JobUsingCluster(cluster, exec.DockerImage(imageURIs["test-apps"]), &host.Job{
		Config: host.ContainerConfig{
			Cmd:        []string{"/bin/signal"},
			DisableLog: true,
		},
	})
	cmd.HostID = client.ID()
	var out bytes.Buffer
	cmd.Stdout = &out
	t.Assert(cmd.Start(), c.IsNil)
	_, err = s.discoverdClient(t).Instances("signal-service", 10*time.Second)
	t.Assert(err, c.IsNil)

	// send the job a signal
	t.Assert(client.SignalJob(cmd.Job.ID, int(syscall.SIGTERM)), c.IsNil)

	// wait for the job to exit
	done := make(chan error)
	go func() {
		done <- cmd.Wait()
	}()
	select {
	case err := <-done:
		t.Assert(err, c.IsNil)
	case <-time.After(12 * time.Second):
		t.Fatal("timed out waiting for job to stop")
	}

	// check the output
	t.Assert(out.String(), c.Equals, "got signal: terminated")
}
Example #14
func (s *SchedulerSuite) addHosts(t *c.C, count int) []string {
	debugf(t, "adding %d hosts", count)
	ch := make(chan *host.HostEvent)
	stream := s.clusterClient(t).StreamHostEvents(ch)
	defer stream.Close()

	hosts := make([]string, 0, count)
	for i := 0; i < count; i++ {
		res, err := httpClient.PostForm(args.ClusterAPI, url.Values{})
		if err != nil {
			t.Fatal("error in POST request to cluster api:", err)
		}
		res.Body.Close()
		if res.StatusCode != http.StatusOK {
			t.Fatal("expected 200 status, got", res.Status)
		}

		select {
		case event := <-ch:
			debug(t, "host added ", event.HostID)
			hosts = append(hosts, event.HostID)
		case <-time.After(20 * time.Second):
			t.Fatal("timed out waiting for new host")
		}
	}
	return hosts
}
Example #15
func (s *SchedulerSuite) TestTCPApp(t *c.C) {
	app, _ := s.createApp(t)

	stream, err := s.controllerClient(t).StreamJobEvents(app.ID, 0)
	t.Assert(err, c.IsNil)
	defer stream.Close()

	t.Assert(flynn(t, "/", "-a", app.Name, "scale", "echoer=1"), Succeeds)

	newRoute := flynn(t, "/", "-a", app.Name, "route", "add", "tcp", "-s", "echo-service")
	t.Assert(newRoute, Succeeds)
	t.Assert(newRoute.Output, Matches, `.+ on port \d+`)
	str := strings.Split(strings.TrimSpace(string(newRoute.Output)), " ")
	port := str[len(str)-1]

	waitForJobEvents(t, stream.Events, jobEvents{"echoer": {"up": 1}})
	// use Attempts to give the processes time to start
	if err := Attempts.Run(func() error {
		servAddr := routerIP + ":" + port
		conn, err := net.Dial("tcp", servAddr)
		if err != nil {
			return err
		}
		defer conn.Close()
		echo := random.Bytes(16)
		_, err = conn.Write(echo)
		if err != nil {
			return err
		}
		reply := make([]byte, 16)
		_, err = conn.Read(reply)
		if err != nil {
			return err
		}
		t.Assert(reply, c.DeepEquals, echo)
		return nil
	}); err != nil {
		t.Fatal(err)
	}
}
Example #16
func (s *HostSuite) TestDevSHM(t *c.C) {
	cmd := exec.CommandUsingCluster(
		s.clusterClient(t),
		exec.DockerImage(imageURIs["test-apps"]),
		"sh", "-c", "df -h /dev/shm && echo foo > /dev/shm/asdf",
	)
	var out bytes.Buffer
	cmd.Stdout = &out
	cmd.Stderr = &out

	runErr := make(chan error)
	go func() {
		runErr <- cmd.Run()
	}()
	select {
	case err := <-runErr:
		t.Assert(err, c.IsNil)
	case <-time.After(30 * time.Second):
		t.Fatal("timed out waiting for /dev/shm job")
	}

	t.Assert(out.String(), c.Equals, "Filesystem                Size      Used Available Use% Mounted on\ntmpfs                    64.0M         0     64.0M   0% /dev/shm\n")
}
Example #17
func (h *Helper) addHosts(t *c.C, count int, vanilla bool) []*tc.Instance {
	debugf(t, "adding %d hosts", count)

	// wait for the router-api to start on the host (rather than using
	// StreamHostEvents) as we wait for router-api when removing the
	// host (so that could fail if the router-api never starts).
	events := make(chan *discoverd.Event)
	stream, err := h.discoverdClient(t).Service("router-api").Watch(events)
	t.Assert(err, c.IsNil)
	defer stream.Close()

	// wait for the current state
loop:
	for {
		select {
		case e, ok := <-events:
			if !ok {
				t.Fatal("event stream closed unexpectedly")
			}
			if e.Kind == discoverd.EventKindCurrent {
				break loop
			}
		case <-time.After(10 * time.Second):
			t.Fatal("timed out waiting for current service state")
		}
	}

	hosts := make([]*tc.Instance, count)
	for i := 0; i < count; i++ {
		host, err := testCluster.AddHost(events, vanilla)
		t.Assert(err, c.IsNil)
		debugf(t, "host added: %s", host.ID)
		hosts[i] = host
	}
	return hosts
}
Example #18
func (s *SchedulerSuite) removeHosts(t *c.C, ids []string) {
	debugf(t, "removing %d hosts", len(ids))

	// Wait for router-api services to disappear to indicate host
	// removal (rather than using StreamHostEvents), so that other
	// tests won't try and connect to this host via service discovery.
	set, err := s.discoverdClient(t).NewServiceSet("router-api")
	t.Assert(err, c.IsNil)
	defer set.Close()
	updates := set.Watch(false)
	defer set.Unwatch(updates)

	for _, id := range ids {
		req, err := http.NewRequest("DELETE", args.ClusterAPI+"?host="+id, nil)
		if err != nil {
			t.Fatal("error in DELETE request to cluster api:", err)
		}
		res, err := httpClient.Do(req)
		if err != nil {
			t.Fatal("error in DELETE request to cluster api:", err)
		}
		res.Body.Close()
		if res.StatusCode != http.StatusOK {
			t.Fatal("expected 200 status, got", res.Status)
		}

	loop:
		for {
			select {
			case update := <-updates:
				if !update.Online {
					debug(t, "host removed ", update.Addr)
					break loop
				}
			case <-time.After(20 * time.Second):
				t.Fatal("timed out waiting for host removal")
			}
		}
	}
}
Example #19
func (s *ZDiscoverdSuite) TestDeploy(t *c.C) {
	// ensure we have enough hosts in the cluster
	hosts, err := s.clusterClient(t).Hosts()
	t.Assert(err, c.IsNil)
	if len(hosts) <= 1 {
		t.Skip("cannot deploy discoverd in a single node cluster")
	}

	client := s.controllerClient(t)
	app, err := client.GetApp("discoverd")
	t.Assert(err, c.IsNil)
	release, err := client.GetAppRelease(app.ID)
	t.Assert(err, c.IsNil)
	release.ID = ""
	t.Assert(client.CreateRelease(release), c.IsNil)
	deployment, err := client.CreateDeployment(app.ID, release.ID)
	t.Assert(err, c.IsNil)

	events := make(chan *ct.DeploymentEvent)
	stream, err := client.StreamDeployment(deployment, events)
	t.Assert(err, c.IsNil)
	defer stream.Close()

loop:
	for {
		select {
		case event, ok := <-events:
			if !ok {
				t.Fatal("unexpected close of deployment event stream")
			}
			if event.Status == "complete" {
				debugf(t, "got deployment event: %s", event.Status)
				break loop
			}
			if event.Status == "failed" {
				t.Fatal("the deployment failed")
			}
			debugf(t, "got deployment event: %s %s", event.JobType, event.JobState)
		case <-time.After(time.Duration(app.DeployTimeout) * time.Second):
			t.Fatal("timed out waiting for deployment event")
		}
	}
}
Example #20
func (s *HostSuite) TestUpdate(t *c.C) {
	dir := t.MkDir()
	flynnHost := filepath.Join(dir, "flynn-host")
	run(t, osexec.Command("cp", args.FlynnHost, flynnHost))

	// start flynn-host
	id := random.String(8)
	var out bytes.Buffer
	cmd := osexec.Command(
		flynnHost,
		"daemon",
		"--http-port", "11113",
		"--state", filepath.Join(dir, "host-state.bolt"),
		"--id", id,
		"--backend", "mock",
		"--vol-provider", "mock",
		"--volpath", filepath.Join(dir, "volumes"),
	)
	cmd.Stdout = &out
	cmd.Stderr = &out
	defer func() {
		debug(t, "*** flynn-host output ***")
		debug(t, out.String())
		debug(t, "*************************")
	}()
	t.Assert(cmd.Start(), c.IsNil)
	defer cmd.Process.Kill()

	httpClient := &http.Client{Transport: &http.Transport{Dial: dialer.Retry.Dial}}
	client := cluster.NewHost(id, "http://127.0.0.1:11113", httpClient)

	// exec a program which exits straight away
	_, err := client.Update("/bin/true")
	t.Assert(err, c.NotNil)
	status, err := client.GetStatus()
	t.Assert(err, c.IsNil)
	t.Assert(status.ID, c.Equals, id)
	t.Assert(status.PID, c.Equals, cmd.Process.Pid)

	// exec a program which reads the control socket but then exits
	_, err = client.Update("/bin/bash", "-c", "<&4; exit")
	t.Assert(err, c.NotNil)
	status, err = client.GetStatus()
	t.Assert(err, c.IsNil)
	t.Assert(status.ID, c.Equals, id)
	t.Assert(status.PID, c.Equals, cmd.Process.Pid)

	// exec flynn-host and check we get the status from the new daemon
	pid, err := client.Update(
		flynnHost,
		"daemon",
		"--http-port", "11113",
		"--state", filepath.Join(dir, "host-state.bolt"),
		"--id", id,
		"--backend", "mock",
		"--vol-provider", "mock",
		"--volpath", filepath.Join(dir, "volumes"),
	)
	t.Assert(err, c.IsNil)
	defer syscall.Kill(pid, syscall.SIGKILL)

	done := make(chan struct{})
	go func() {
		cmd.Process.Signal(syscall.SIGTERM)
		syscall.Wait4(cmd.Process.Pid, nil, 0, nil)
		close(done)
	}()
	select {
	case <-done:
	case <-time.After(15 * time.Second):
		t.Fatal("timed out waiting for flynn-host daemon to exit")
	}

	// client.GetStatus intermittently returns io.EOF right after the update. We
	// don't currently understand why (likely due to the way the listener is
	// passed around), so for now just retry the request.
	//
	// TODO(lmars): figure out why and remove this loop.
	delay := 100 * time.Millisecond
	for start := time.Now(); time.Since(start) < 10*time.Second; time.Sleep(delay) {
		status, err = client.GetStatus()
		if e, ok := err.(*url.Error); ok && strings.Contains(e.Err.Error(), "EOF") {
			debugf(t, "got io.EOF from flynn-host, trying again in %s", delay)
			continue
		}
		break
	}
	t.Assert(err, c.IsNil)
	t.Assert(status.ID, c.Equals, id)
	t.Assert(status.PID, c.Equals, pid)
}
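The tail of TestUpdate works around GetStatus intermittently returning io.EOF right after the update by retrying for a bounded window. Extracted into a standalone sketch, the pattern is a deadline-bounded loop with a fixed delay; the fetch closure below is hypothetical, not a Flynn API.

package main

import (
	"errors"
	"fmt"
	"io"
	"time"
)

// retryOnEOF keeps retrying a call that intermittently fails with io.EOF,
// but only for a bounded window, matching the shape of the GetStatus retry
// loop above.
func retryOnEOF(fetch func() error, window, delay time.Duration) error {
	var err error
	for start := time.Now(); time.Since(start) < window; time.Sleep(delay) {
		err = fetch()
		if errors.Is(err, io.EOF) {
			continue // transient: try again after the delay
		}
		break
	}
	return err
}

func main() {
	calls := 0
	err := retryOnEOF(func() error {
		calls++
		if calls < 3 {
			return io.EOF // simulate two transient failures
		}
		return nil
	}, 10*time.Second, 100*time.Millisecond)
	fmt.Println("calls:", calls, "err:", err)
}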
Example #21
func testSireniaDeploy(client controller.Client, disc *discoverd.Client, t *c.C, d *sireniaDeploy) {
	// create app
	app := &ct.App{Name: d.name, Strategy: "sirenia"}
	t.Assert(client.CreateApp(app), c.IsNil)

	// copy release from default app
	release, err := client.GetAppRelease(d.db.appName)
	t.Assert(err, c.IsNil)
	release.ID = ""
	release.Env[d.db.hostKey] = fmt.Sprintf("leader.%s.discoverd", d.name)
	release.Env[d.db.serviceKey] = d.name
	procName := release.Env["SIRENIA_PROCESS"]
	proc := release.Processes[procName]
	delete(proc.Env, "SINGLETON")
	proc.Service = d.name
	release.Processes[procName] = proc
	t.Assert(client.CreateRelease(release), c.IsNil)
	t.Assert(client.SetAppRelease(app.ID, release.ID), c.IsNil)
	oldRelease := release.ID

	// create formation
	discEvents := make(chan *discoverd.Event)
	discService := disc.Service(d.name)
	discStream, err := discService.Watch(discEvents)
	t.Assert(err, c.IsNil)
	defer discStream.Close()
	jobEvents := make(chan *ct.Job)
	jobStream, err := client.StreamJobEvents(d.name, jobEvents)
	t.Assert(err, c.IsNil)
	defer jobStream.Close()
	t.Assert(client.PutFormation(&ct.Formation{
		AppID:     app.ID,
		ReleaseID: release.ID,
		Processes: map[string]int{procName: d.sireniaJobs, "web": d.webJobs},
	}), c.IsNil)

	// watch cluster state changes
	type stateChange struct {
		state *state.State
		err   error
	}
	stateCh := make(chan stateChange)
	go func() {
		for event := range discEvents {
			if event.Kind != discoverd.EventKindServiceMeta {
				continue
			}
			var state state.State
			if err := json.Unmarshal(event.ServiceMeta.Data, &state); err != nil {
				stateCh <- stateChange{err: err}
				return
			}
			primary := ""
			if state.Primary != nil {
				primary = state.Primary.Addr
			}
			sync := ""
			if state.Sync != nil {
				sync = state.Sync.Addr
			}
			var async []string
			for _, a := range state.Async {
				async = append(async, a.Addr)
			}
			debugf(t, "got cluster state: index=%d primary=%s sync=%s async=%s",
				event.ServiceMeta.Index, primary, sync, strings.Join(async, ","))
			stateCh <- stateChange{state: &state}
		}
	}()

	// wait for correct cluster state and number of web processes
	var sireniaState state.State
	var webJobs int
	ready := func() bool {
		if webJobs != d.webJobs {
			return false
		}
		if sireniaState.Primary == nil {
			return false
		}
		if d.sireniaJobs > 1 && sireniaState.Sync == nil {
			return false
		}
		if d.sireniaJobs > 2 && len(sireniaState.Async) != d.sireniaJobs-2 {
			return false
		}
		return true
	}
	for {
		if ready() {
			break
		}
		select {
		case s := <-stateCh:
			t.Assert(s.err, c.IsNil)
			sireniaState = *s.state
		case e, ok := <-jobEvents:
			if !ok {
				t.Fatalf("job event stream closed: %s", jobStream.Err())
			}
			debugf(t, "got job event: %s %s %s", e.Type, e.ID, e.State)
			if e.Type == "web" && e.State == ct.JobStateUp {
				webJobs++
			}
		case <-time.After(30 * time.Second):
			t.Fatal("timed out waiting for formation")
		}
	}

	// wait for the primary to indicate downstream replication sync
	debug(t, "waiting for primary to indicate downstream replication sync")
	sireniaClient := sc.NewClient(sireniaState.Primary.Addr)
	t.Assert(sireniaClient.WaitForReplSync(sireniaState.Sync, 1*time.Minute), c.IsNil)

	// connect to the db so we can test writes
	d.db.initDb(t, release, d)

	// check currently writeable
	d.db.assertWriteable(t, release, d)

	// check a deploy completes with expected cluster state changes
	release.ID = ""
	t.Assert(client.CreateRelease(release), c.IsNil)
	newRelease := release.ID
	deployment, err := client.CreateDeployment(app.ID, newRelease)
	t.Assert(err, c.IsNil)
	deployEvents := make(chan *ct.DeploymentEvent)
	deployStream, err := client.StreamDeployment(deployment, deployEvents)
	t.Assert(err, c.IsNil)
	defer deployStream.Close()

	// assertNextState checks that the next state received is in the remaining states
	// that were expected, so handles the fact that some states don't happen, but the
	// states that do happen are expected and in-order.
	assertNextState := func(remaining []expectedSireniaState) int {
		var state state.State
	loop:
		for {
			select {
			case s := <-stateCh:
				t.Assert(s.err, c.IsNil)
				if len(s.state.Async) < d.expectedAsyncs() {
					// we shouldn't usually receive states with less asyncs than
					// expected, but they can occur as an intermediate state between
					// two expected states (e.g. when a sync does a takeover at the
					// same time as a new async is started) so just ignore them.
					debug(t, "ignoring state with too few asyncs")
					continue
				}
				state = *s.state
				break loop
			case <-time.After(60 * time.Second):
				t.Fatal("timed out waiting for cluster state")
			}
		}
		if state.Primary == nil {
			t.Fatal("no primary configured")
		}
		log := func(format string, v ...interface{}) {
			debugf(t, "skipping expected state: %s", fmt.Sprintf(format, v...))
		}
	outer:
		for i, expected := range remaining {
			if state.Primary.Meta["FLYNN_RELEASE_ID"] != expected.Primary {
				log("primary has incorrect release")
				continue
			}
			if state.Sync == nil {
				if expected.Sync == "" {
					return i
				}
				log("state has no sync node")
				continue
			}
			if state.Sync.Meta["FLYNN_RELEASE_ID"] != expected.Sync {
				log("sync has incorrect release")
				continue
			}
			if state.Async == nil {
				if expected.Async == nil {
					return i
				}
				log("state has no async nodes")
				continue
			}
			if len(state.Async) != len(expected.Async) {
				log("expected %d asyncs, got %d", len(expected.Async), len(state.Async))
				continue
			}
			for i, release := range expected.Async {
				if state.Async[i].Meta["FLYNN_RELEASE_ID"] != release {
					log("async[%d] has incorrect release", i)
					continue outer
				}
			}
			return i
		}
		t.Fatal("unexpected state")
		return -1
	}
	expected := d.expected(oldRelease, newRelease)
	var expectedIndex, newWebJobs int
loop:
	for {
		select {
		case e, ok := <-deployEvents:
			if !ok {
				t.Fatal("unexpected close of deployment event stream")
			}
			switch e.Status {
			case "complete":
				break loop
			case "failed":
				t.Fatalf("deployment failed: %s", e.Error)
			}
			debugf(t, "got deployment event: %s %s", e.JobType, e.JobState)
			if e.JobState != ct.JobStateUp && e.JobState != ct.JobStateDown {
				continue
			}
			switch e.JobType {
			case procName:
				// move on if we have seen all the expected events
				if expectedIndex >= len(expected) {
					continue
				}
				skipped := assertNextState(expected[expectedIndex:])
				expectedIndex += 1 + skipped
			case "web":
				if e.JobState == ct.JobStateUp && e.ReleaseID == newRelease {
					newWebJobs++
				}
			}
		case <-time.After(2 * time.Minute):
			t.Fatal("timed out waiting for deployment")
		}
	}

	// check we have the correct number of new web jobs
	t.Assert(newWebJobs, c.Equals, d.webJobs)

	// check writeable now deploy is complete
	d.db.assertWriteable(t, release, d)
}
Example #22
func (s *HostUpdateSuite) TestUpdateLogs(t *c.C) {
	if testCluster == nil {
		t.Skip("cannot boot new hosts")
	}

	instance := s.addHost(t)
	defer s.removeHost(t, instance)
	httpClient := &http.Client{Transport: &http.Transport{Dial: dialer.Retry.Dial}}
	client := cluster.NewHost(instance.ID, fmt.Sprintf("http://%s:1113", instance.IP), httpClient)

	// start partial logger job
	cmd := exec.JobUsingHost(
		client,
		exec.DockerImage(imageURIs["test-apps"]),
		&host.Job{
			Config: host.ContainerConfig{Cmd: []string{"/bin/partial-logger"}},
			Metadata: map[string]string{
				"flynn-controller.app": "partial-logger",
			},
		},
	)
	t.Assert(cmd.Start(), c.IsNil)
	defer cmd.Kill()

	// wait for partial line
	_, err := s.discoverdClient(t).Instances("partial-logger", 10*time.Second)
	t.Assert(err, c.IsNil)

	// update flynn-host
	pid, err := client.Update("/usr/local/bin/flynn-host", "daemon", "--id", cmd.HostID)
	t.Assert(err, c.IsNil)
	// update the pid file so removeHost works
	t.Assert(instance.Run(fmt.Sprintf("echo -n %d | sudo tee /var/run/flynn-host.pid", pid), nil), c.IsNil)

	// finish logging
	t.Assert(client.SignalJob(cmd.Job.ID, int(syscall.SIGUSR1)), c.IsNil)

	// check we get a single log line
	logc, err := logaggc.New("")
	t.Assert(err, c.IsNil)
	log, err := logc.GetLog("partial-logger", &logaggc.LogOpts{Follow: true})
	t.Assert(err, c.IsNil)
	defer log.Close()
	msgs := make(chan *logaggc.Message)
	go func() {
		defer close(msgs)
		dec := json.NewDecoder(log)
		for {
			var msg logaggc.Message
			if err := dec.Decode(&msg); err != nil {
				debugf(t, "error decoding message: %s", err)
				return
			}
			msgs <- &msg
		}
	}()
	for {
		select {
		case msg, ok := <-msgs:
			if !ok {
				t.Fatal("error getting log")
			}
			if msg.Stream == "stdout" {
				t.Assert(msg.Msg, c.Equals, "hello world")
				return
			}
		case <-time.After(10 * time.Second):
			t.Fatal("timed out waiting for log")
		}
	}
}
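TestUpdateLogs turns a blocking JSON log stream into a channel by decoding in a goroutine, which lets the main loop use select with a timeout. A reduced sketch of that decode loop follows, with an assumed message shape standing in for logaggc.Message (the field names and JSON tags are illustrative).

package main

import (
	"encoding/json"
	"fmt"
	"io"
	"strings"
	"time"
)

// message mirrors only the fields the test reads; the JSON tags are
// assumptions for this standalone sketch, not logaggc's actual schema.
type message struct {
	Stream string `json:"stream"`
	Msg    string `json:"msg"`
}

// decodeMessages reads JSON values from r and forwards each decoded message
// on a channel, closing it when the stream ends, so a select loop can apply
// a timeout to what is otherwise a blocking reader.
func decodeMessages(r io.Reader) <-chan *message {
	msgs := make(chan *message)
	go func() {
		defer close(msgs)
		dec := json.NewDecoder(r)
		for {
			var msg message
			if err := dec.Decode(&msg); err != nil {
				return // io.EOF or a real decode error: either way, stop
			}
			msgs <- &msg
		}
	}()
	return msgs
}

func main() {
	log := strings.NewReader(`{"stream":"stdout","msg":"hello world"}` + "\n")
	msgs := decodeMessages(log)
	select {
	case msg, ok := <-msgs:
		if !ok {
			fmt.Println("log stream ended")
			return
		}
		fmt.Printf("%s: %s\n", msg.Stream, msg.Msg)
	case <-time.After(time.Second):
		fmt.Println("timed out waiting for log message")
	}
}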
Example #23
func (s *DomainMigrationSuite) migrateDomain(t *c.C, dm *ct.DomainMigration) {
	debugf(t, "migrating domain from %s to %s", dm.OldDomain, dm.Domain)
	client := s.controllerClient(t)

	events := make(chan *ct.Event)
	stream, err := client.StreamEvents(controller.StreamEventsOptions{
		ObjectTypes: []ct.EventType{ct.EventTypeDomainMigration},
	}, events)
	t.Assert(err, c.IsNil)
	defer stream.Close()

	prevRouterRelease, err := client.GetAppRelease("router")
	t.Assert(err, c.IsNil)

	err = client.PutDomain(dm)
	t.Assert(err, c.IsNil)

	waitEvent := func(typ string, timeout time.Duration) (event ct.DomainMigrationEvent) {
		debugf(t, "waiting for %s domain migration event", typ)
		var e *ct.Event
		var ok bool
		select {
		case e, ok = <-events:
			if !ok {
				t.Fatal("event stream closed unexpectedly")
			}
			debugf(t, "got %s domain migration event", typ)
		case <-time.After(timeout):
			t.Fatalf("timed out waiting for %s domain migration event", typ)
		}
		t.Assert(e.Data, c.NotNil)
		t.Assert(json.Unmarshal(e.Data, &event), c.IsNil)
		return
	}

	// created
	event := waitEvent("initial", 2*time.Minute)
	t.Assert(event.Error, c.Equals, "")
	t.Assert(event.DomainMigration, c.NotNil)
	t.Assert(event.DomainMigration.ID, c.Equals, dm.ID)
	t.Assert(event.DomainMigration.OldDomain, c.Equals, dm.OldDomain)
	t.Assert(event.DomainMigration.Domain, c.Equals, dm.Domain)
	t.Assert(event.DomainMigration.TLSCert, c.IsNil)
	t.Assert(event.DomainMigration.OldTLSCert, c.NotNil)
	t.Assert(event.DomainMigration.CreatedAt, c.NotNil)
	t.Assert(event.DomainMigration.CreatedAt.Equal(*dm.CreatedAt), c.Equals, true)
	t.Assert(event.DomainMigration.FinishedAt, c.IsNil)

	// complete
	event = waitEvent("final", 3*time.Minute)
	t.Assert(event.Error, c.Equals, "")
	t.Assert(event.DomainMigration, c.NotNil)
	t.Assert(event.DomainMigration.ID, c.Equals, dm.ID)
	t.Assert(event.DomainMigration.OldDomain, c.Equals, dm.OldDomain)
	t.Assert(event.DomainMigration.Domain, c.Equals, dm.Domain)
	t.Assert(event.DomainMigration.TLSCert, c.NotNil)
	t.Assert(event.DomainMigration.OldTLSCert, c.NotNil)
	t.Assert(event.DomainMigration.CreatedAt, c.NotNil)
	t.Assert(event.DomainMigration.CreatedAt.Equal(*dm.CreatedAt), c.Equals, true)
	t.Assert(event.DomainMigration.FinishedAt, c.NotNil)

	cert := event.DomainMigration.TLSCert

	controllerRelease, err := client.GetAppRelease("controller")
	t.Assert(err, c.IsNil)
	t.Assert(controllerRelease.Env["DEFAULT_ROUTE_DOMAIN"], c.Equals, dm.Domain)
	t.Assert(controllerRelease.Env["CA_CERT"], c.Equals, cert.CACert)

	routerRelease, err := client.GetAppRelease("router")
	t.Assert(err, c.IsNil)
	t.Assert(routerRelease.Env["TLSCERT"], c.Equals, cert.Cert)
	t.Assert(routerRelease.Env["TLSKEY"], c.Not(c.Equals), "")
	t.Assert(routerRelease.Env["TLSKEY"], c.Not(c.Equals), prevRouterRelease.Env["TLSKEY"])

	dashboardRelease, err := client.GetAppRelease("dashboard")
	t.Assert(err, c.IsNil)
	t.Assert(dashboardRelease.Env["DEFAULT_ROUTE_DOMAIN"], c.Equals, dm.Domain)
	t.Assert(dashboardRelease.Env["CONTROLLER_DOMAIN"], c.Equals, fmt.Sprintf("controller.%s", dm.Domain))
	t.Assert(dashboardRelease.Env["URL"], c.Equals, fmt.Sprintf("dashboard.%s", dm.Domain))
	t.Assert(dashboardRelease.Env["CA_CERT"], c.Equals, cert.CACert)

	var doPing func(string, int)
	doPing = func(component string, retriesRemaining int) {
		url := fmt.Sprintf("http://%s.%s/ping", component, dm.Domain)
		res, err := (&http.Client{}).Get(url)
		if (err != nil || res.StatusCode != 200) && retriesRemaining > 0 {
			time.Sleep(100 * time.Millisecond)
			doPing(component, retriesRemaining-1)
			return
		}
		t.Assert(err, c.IsNil)
		t.Assert(res.StatusCode, c.Equals, 200, c.Commentf("failed to ping %s", component))
	}
	doPing("controller", 3)
	doPing("dashboard", 3)
}
Example #24
func (s *SchedulerSuite) TestDeployController(t *c.C) {
	// get the current controller release
	client := s.controllerClient(t)
	app, err := client.GetApp("controller")
	t.Assert(err, c.IsNil)
	release, err := client.GetAppRelease(app.ID)
	t.Assert(err, c.IsNil)

	// get the current controller formation
	formation, err := client.GetFormation(app.ID, release.ID)
	t.Assert(err, c.IsNil)

	// create a controller deployment
	release.ID = ""
	t.Assert(client.CreateRelease(release), c.IsNil)
	deployment, err := client.CreateDeployment(app.ID, release.ID)
	t.Assert(err, c.IsNil)

	events := make(chan *ct.DeploymentEvent)
	eventStream, err := client.StreamDeployment(deployment, events)
	t.Assert(err, c.IsNil)
	defer eventStream.Close()

	// wait for the deploy to complete (this doesn't wait for specific events
	// because when the deployer deploys itself, some events will not get sent)
loop:
	for {
		select {
		case e, ok := <-events:
			if !ok {
				t.Fatal("unexpected close of deployment event stream")
			}
			debugf(t, "got deployment event: %s %s", e.JobType, e.JobState)
			switch e.Status {
			case "complete":
				break loop
			case "failed":
				t.Fatal("the deployment failed")
			}
		case <-time.After(time.Duration(app.DeployTimeout) * time.Second):
			t.Fatal("timed out waiting for the deploy to complete")
		}
	}

	// check the correct controller jobs are running
	hosts, err := s.clusterClient(t).Hosts()
	t.Assert(err, c.IsNil)
	t.Assert(hosts, c.Not(c.HasLen), 0)
	actual := make(map[string]map[string]int)
	for _, h := range hosts {
		jobs, err := h.ListJobs()
		t.Assert(err, c.IsNil)
		for _, job := range jobs {
			if job.Status != host.StatusRunning {
				continue
			}
			appID := job.Job.Metadata["flynn-controller.app"]
			if appID != app.ID {
				continue
			}
			releaseID := job.Job.Metadata["flynn-controller.release"]
			if _, ok := actual[releaseID]; !ok {
				actual[releaseID] = make(map[string]int)
			}
			typ := job.Job.Metadata["flynn-controller.type"]
			actual[releaseID][typ]++
		}
	}
	expected := map[string]map[string]int{release.ID: {
		"web":       formation.Processes["web"],
		"worker":    formation.Processes["worker"],
		"scheduler": len(hosts),
	}}
	t.Assert(actual, c.DeepEquals, expected)
}
Example #25
func (s *SchedulerSuite) TestRollbackController(t *c.C) {
	// get the current controller release
	client := s.controllerClient(t)
	app, err := client.GetApp("controller")
	t.Assert(err, c.IsNil)
	release, err := client.GetAppRelease(app.ID)
	t.Assert(err, c.IsNil)

	watcher, err := s.controllerClient(t).WatchJobEvents(app.ID, release.ID)
	t.Assert(err, c.IsNil)
	defer watcher.Close()

	// get the current controller formation
	formation, err := client.GetFormation(app.ID, release.ID)
	t.Assert(err, c.IsNil)

	currentReleaseID := release.ID

	// create a controller deployment that will fail
	release.ID = ""
	worker := release.Processes["worker"]
	worker.Entrypoint = []string{"/i/dont/exist"}
	release.Processes["worker"] = worker
	t.Assert(client.CreateRelease(release), c.IsNil)
	deployment, err := client.CreateDeployment(app.ID, release.ID)
	t.Assert(err, c.IsNil)

	events := make(chan *ct.DeploymentEvent)
	eventStream, err := client.StreamDeployment(deployment, events)
	t.Assert(err, c.IsNil)
	defer eventStream.Close()

	// wait for the deploy to fail
loop:
	for {
		select {
		case e, ok := <-events:
			if !ok {
				t.Fatal("unexpected close of deployment event stream")
			}
			debugf(t, "got deployment event: %s %s", e.JobType, e.JobState)
			switch e.Status {
			case "complete":
				t.Fatal("the deployment succeeded when it should have failed")
			case "failed":
				break loop
			}
		case <-time.After(2 * time.Minute):
			t.Fatal("timed out waiting for the deploy to fail")
		}
	}

	// wait for jobs to come back up
	hosts, err := s.clusterClient(t).Hosts()
	t.Assert(err, c.IsNil)
	expected := map[string]map[ct.JobState]int{
		"web":       {ct.JobStateUp: formation.Processes["web"]},
		"scheduler": {ct.JobStateUp: len(hosts)},
	}
	t.Assert(watcher.WaitFor(expected, scaleTimeout, nil), c.IsNil)

	// check the correct controller jobs are running
	t.Assert(hosts, c.Not(c.HasLen), 0)
	actual := make(map[string]map[string]int)
	for _, h := range hosts {
		jobs, err := h.ListJobs()
		t.Assert(err, c.IsNil)
		for _, job := range jobs {
			if job.Status != host.StatusRunning {
				continue
			}
			appID := job.Job.Metadata["flynn-controller.app"]
			if appID != app.ID {
				continue
			}
			releaseID := job.Job.Metadata["flynn-controller.release"]
			if releaseID != currentReleaseID {
				continue
			}
			if _, ok := actual[releaseID]; !ok {
				actual[releaseID] = make(map[string]int)
			}
			typ := job.Job.Metadata["flynn-controller.type"]
			actual[releaseID][typ]++
		}
	}
	t.Assert(actual, c.DeepEquals, map[string]map[string]int{
		currentReleaseID: {
			"web":       formation.Processes["web"],
			"scheduler": formation.Processes["scheduler"] * len(hosts),
			"worker":    formation.Processes["worker"],
		},
	})
}
Example #26
func (s *SchedulerSuite) TestDeployController(t *c.C) {
	if testCluster == nil {
		t.Skip("cannot determine test cluster size")
	}

	// get the current controller release
	client := s.controllerClient(t)
	app, err := client.GetApp("controller")
	t.Assert(err, c.IsNil)
	release, err := client.GetAppRelease(app.ID)
	t.Assert(err, c.IsNil)

	// create a controller deployment
	release.ID = ""
	t.Assert(client.CreateRelease(release), c.IsNil)
	deployment, err := client.CreateDeployment(app.ID, release.ID)
	t.Assert(err, c.IsNil)

	// use a function to create the event stream as a new stream will be needed
	// after deploying the controller
	var events chan *ct.DeploymentEvent
	var eventStream stream.Stream
	connectStream := func() {
		events = make(chan *ct.DeploymentEvent)
		err := attempt.Strategy{
			Total: 10 * time.Second,
			Delay: 500 * time.Millisecond,
		}.Run(func() (err error) {
			eventStream, err = client.StreamDeployment(deployment.ID, events)
			return
		})
		t.Assert(err, c.IsNil)
	}
	connectStream()
	defer eventStream.Close()

	// wait for the deploy to complete (this doesn't wait for specific events
	// because when the deployer deploys itself, some events will not get sent)
loop:
	for {
		select {
		case e, ok := <-events:
			if !ok {
				// reconnect the stream as it may have been closed
				// due to the controller being deployed
				debug(t, "reconnecting deployment event stream")
				connectStream()
				continue
			}
			debugf(t, "got deployment event: %s %s", e.JobType, e.JobState)
			switch e.Status {
			case "complete":
				break loop
			case "failed":
				t.Fatal("the deployment failed")
			}
		case <-time.After(60 * time.Second):
			t.Fatal("timed out waiting for the deploy to complete")
		}
	}

	// check the correct controller jobs are running
	hosts, err := s.clusterClient(t).ListHosts()
	t.Assert(err, c.IsNil)
	actual := make(map[string]map[string]int)
	for _, host := range hosts {
		for _, job := range host.Jobs {
			appID := job.Metadata["flynn-controller.app"]
			if appID != app.ID {
				continue
			}
			releaseID := job.Metadata["flynn-controller.release"]
			if _, ok := actual[releaseID]; !ok {
				actual[releaseID] = make(map[string]int)
			}
			typ := job.Metadata["flynn-controller.type"]
			actual[releaseID][typ]++
		}
	}
	expected := map[string]map[string]int{release.ID: {
		"web":       2,
		"deployer":  2,
		"scheduler": testCluster.Size(),
	}}
	t.Assert(actual, c.DeepEquals, expected)
}
Example #27
func (s *SchedulerSuite) TestControllerRestart(t *c.C) {
	// get the current controller details
	app, err := s.controllerClient(t).GetApp("controller")
	t.Assert(err, c.IsNil)
	release, err := s.controllerClient(t).GetAppRelease("controller")
	t.Assert(err, c.IsNil)
	formation, err := s.controllerClient(t).GetFormation(app.ID, release.ID)
	t.Assert(err, c.IsNil)
	list, err := s.controllerClient(t).JobList("controller")
	t.Assert(err, c.IsNil)
	var jobs []*ct.Job
	for _, job := range list {
		if job.Type == "web" && job.State == ct.JobStateUp {
			jobs = append(jobs, job)
		}
	}
	t.Assert(jobs, c.HasLen, formation.Processes["web"])
	jobID := jobs[0].ID
	hostID, _ := cluster.ExtractHostID(jobID)
	t.Assert(hostID, c.Not(c.Equals), "")
	debugf(t, "current controller app[%s] host[%s] job[%s]", app.ID, hostID, jobID)

	// subscribe to service events, wait for current event
	events := make(chan *discoverd.Event)
	stream, err := s.discoverdClient(t).Service("controller").Watch(events)
	t.Assert(err, c.IsNil)
	defer stream.Close()
	type serviceEvents map[discoverd.EventKind]int
	wait := func(expected serviceEvents) {
		actual := make(serviceEvents)
	outer:
		for {
			select {
			case event := <-events:
				actual[event.Kind]++
				for kind, count := range expected {
					if actual[kind] != count {
						continue outer
					}
				}
				return
			case <-time.After(scaleTimeout):
				t.Fatal("timed out waiting for controller service event")
			}
		}
	}
	wait(serviceEvents{discoverd.EventKindCurrent: 1})

	// start another controller and wait for it to come up
	debug(t, "scaling the controller up")
	formation.Processes["web"]++
	t.Assert(s.controllerClient(t).PutFormation(formation), c.IsNil)
	wait(serviceEvents{discoverd.EventKindUp: 1})

	// kill the first controller and check the scheduler brings it back online
	cc := cluster.NewClientWithServices(s.discoverdClient(t).Service)
	hc, err := cc.Host(hostID)
	t.Assert(err, c.IsNil)
	debug(t, "stopping job ", jobID)
	t.Assert(hc.StopJob(jobID), c.IsNil)
	wait(serviceEvents{discoverd.EventKindUp: 1, discoverd.EventKindDown: 1})

	// scale back down
	debug(t, "scaling the controller down")
	formation.Processes["web"]--
	t.Assert(s.controllerClient(t).PutFormation(formation), c.IsNil)
	wait(serviceEvents{discoverd.EventKindDown: 1})

	// unset the suite's client so other tests use a new client
	s.controller = nil
}
Example #28
func (s *CLISuite) TestSlugReleaseGarbageCollection(t *c.C) {
	client := s.controllerClient(t)

	// create app with gc.max_inactive_slug_releases=3
	maxInactiveSlugReleases := 3
	app := &ct.App{Meta: map[string]string{"gc.max_inactive_slug_releases": strconv.Itoa(maxInactiveSlugReleases)}}
	t.Assert(client.CreateApp(app), c.IsNil)

	// create an image artifact
	imageArtifact := &ct.Artifact{Type: host.ArtifactTypeDocker, URI: imageURIs["test-apps"]}
	t.Assert(client.CreateArtifact(imageArtifact), c.IsNil)

	// create 5 slug artifacts
	var slug bytes.Buffer
	gz := gzip.NewWriter(&slug)
	t.Assert(tar.NewWriter(gz).Close(), c.IsNil)
	t.Assert(gz.Close(), c.IsNil)
	slugs := []string{
		"http://blobstore.discoverd/1/slug.tgz",
		"http://blobstore.discoverd/2/slug.tgz",
		"http://blobstore.discoverd/3/slug.tgz",
		"http://blobstore.discoverd/4/slug.tgz",
		"http://blobstore.discoverd/5/slug.tgz",
	}
	slugArtifacts := make([]*ct.Artifact, len(slugs))
	for i, uri := range slugs {
		req, err := http.NewRequest("PUT", uri, bytes.NewReader(slug.Bytes()))
		t.Assert(err, c.IsNil)
		res, err := http.DefaultClient.Do(req)
		t.Assert(err, c.IsNil)
		res.Body.Close()
		t.Assert(res.StatusCode, c.Equals, http.StatusOK)
		artifact := &ct.Artifact{
			Type: host.ArtifactTypeFile,
			URI:  uri,
			Meta: map[string]string{"blobstore": "true"},
		}
		t.Assert(client.CreateArtifact(artifact), c.IsNil)
		slugArtifacts[i] = artifact
	}

	// create 6 releases, the second being scaled up and having the
	// same slug as the third (so prevents the slug being deleted)
	releases := make([]*ct.Release, 6)
	for i, r := range []struct {
		slug   *ct.Artifact
		active bool
	}{
		{slugArtifacts[0], false},
		{slugArtifacts[1], true},
		{slugArtifacts[1], false},
		{slugArtifacts[2], false},
		{slugArtifacts[3], false},
		{slugArtifacts[4], false},
	} {
		release := &ct.Release{
			ArtifactIDs: []string{imageArtifact.ID, r.slug.ID},
			Processes: map[string]ct.ProcessType{
				"app": {Cmd: []string{"/bin/pingserv"}, Ports: []ct.Port{{Proto: "tcp"}}},
			},
		}
		t.Assert(client.CreateRelease(release), c.IsNil)
		procs := map[string]int{"app": 0}
		if r.active {
			procs["app"] = 1
		}
		t.Assert(client.PutFormation(&ct.Formation{
			AppID:     app.ID,
			ReleaseID: release.ID,
			Processes: procs,
		}), c.IsNil)
		releases[i] = release
	}

	// scale the last release so we can deploy it
	lastRelease := releases[len(releases)-1]
	watcher, err := client.WatchJobEvents(app.ID, lastRelease.ID)
	t.Assert(err, c.IsNil)
	defer watcher.Close()
	t.Assert(client.PutFormation(&ct.Formation{
		AppID:     app.ID,
		ReleaseID: lastRelease.ID,
		Processes: map[string]int{"app": 1},
	}), c.IsNil)
	t.Assert(watcher.WaitFor(ct.JobEvents{"app": ct.JobUpEvents(1)}, scaleTimeout, nil), c.IsNil)
	t.Assert(client.SetAppRelease(app.ID, lastRelease.ID), c.IsNil)

	// subscribe to garbage collection events
	gcEvents := make(chan *ct.Event)
	stream, err := client.StreamEvents(ct.StreamEventsOptions{
		AppID:       app.ID,
		ObjectTypes: []ct.EventType{ct.EventTypeAppGarbageCollection},
	}, gcEvents)
	t.Assert(err, c.IsNil)
	defer stream.Close()

	// deploy a new release with the same slug as the last release
	newRelease := *lastRelease
	newRelease.ID = ""
	t.Assert(client.CreateRelease(&newRelease), c.IsNil)
	t.Assert(client.DeployAppRelease(app.ID, newRelease.ID), c.IsNil)

	// wait for garbage collection
	select {
	case event, ok := <-gcEvents:
		if !ok {
			t.Fatalf("event stream closed unexpectedly: %s", stream.Err())
		}
		var e ct.AppGarbageCollectionEvent
		t.Assert(json.Unmarshal(event.Data, &e), c.IsNil)
		if e.Error != "" {
			t.Fatalf("garbage collection failed: %s", e.Error)
		}
	case <-time.After(60 * time.Second):
		t.Fatal("timed out waiting for garbage collection")
	}

	// check we have 4 distinct slug releases (so 5 in total, only 3 are
	// inactive)
	list, err := client.AppReleaseList(app.ID)
	t.Assert(err, c.IsNil)
	t.Assert(list, c.HasLen, maxInactiveSlugReleases+2)
	distinctSlugs := make(map[string]struct{}, len(list))
	for _, release := range list {
		files := release.FileArtifactIDs()
		t.Assert(files, c.HasLen, 1)
		distinctSlugs[files[0]] = struct{}{}
	}
	t.Assert(distinctSlugs, c.HasLen, maxInactiveSlugReleases+1)

	// check the first and third releases got deleted, but the rest remain
	assertDeleted := func(release *ct.Release, deleted bool) {
		_, err := client.GetRelease(release.ID)
		if deleted {
			t.Assert(err, c.Equals, controller.ErrNotFound)
		} else {
			t.Assert(err, c.IsNil)
		}
	}
	assertDeleted(releases[0], true)
	assertDeleted(releases[1], false)
	assertDeleted(releases[2], true)
	assertDeleted(releases[3], false)
	assertDeleted(releases[4], false)
	assertDeleted(releases[5], false)
	assertDeleted(&newRelease, false)

	// check the first slug got deleted, but the rest remain
	s.assertURI(t, slugs[0], http.StatusNotFound)
	for i := 1; i < len(slugs); i++ {
		s.assertURI(t, slugs[i], http.StatusOK)
	}
}
Example #29
func (s *PostgresSuite) testDeploy(t *c.C, d *pgDeploy) {
	// create postgres app
	client := s.controllerClient(t)
	app := &ct.App{Name: d.name, Strategy: "postgres"}
	t.Assert(client.CreateApp(app), c.IsNil)

	// copy release from default postgres app
	release, err := client.GetAppRelease("postgres")
	t.Assert(err, c.IsNil)
	release.ID = ""
	proc := release.Processes["postgres"]
	delete(proc.Env, "SINGLETON")
	proc.Env["FLYNN_POSTGRES"] = d.name
	proc.Service = d.name
	release.Processes["postgres"] = proc
	t.Assert(client.CreateRelease(release), c.IsNil)
	t.Assert(client.SetAppRelease(app.ID, release.ID), c.IsNil)
	oldRelease := release.ID

	// create formation
	discEvents := make(chan *discoverd.Event)
	discStream, err := s.discoverdClient(t).Service(d.name).Watch(discEvents)
	t.Assert(err, c.IsNil)
	defer discStream.Close()
	jobEvents := make(chan *ct.Job)
	jobStream, err := client.StreamJobEvents(d.name, jobEvents)
	t.Assert(err, c.IsNil)
	defer jobStream.Close()
	t.Assert(client.PutFormation(&ct.Formation{
		AppID:     app.ID,
		ReleaseID: release.ID,
		Processes: map[string]int{"postgres": d.pgJobs, "web": d.webJobs},
	}), c.IsNil)

	// watch cluster state changes
	type stateChange struct {
		state *state.State
		err   error
	}
	stateCh := make(chan stateChange)
	go func() {
		for event := range discEvents {
			if event.Kind != discoverd.EventKindServiceMeta {
				continue
			}
			var state state.State
			if err := json.Unmarshal(event.ServiceMeta.Data, &state); err != nil {
				stateCh <- stateChange{err: err}
				return
			}
			primary := ""
			if state.Primary != nil {
				primary = state.Primary.Addr
			}
			sync := ""
			if state.Sync != nil {
				sync = state.Sync.Addr
			}
			var async []string
			for _, a := range state.Async {
				async = append(async, a.Addr)
			}
			debugf(t, "got pg cluster state: index=%d primary=%s sync=%s async=%s",
				event.ServiceMeta.Index, primary, sync, strings.Join(async, ","))
			stateCh <- stateChange{state: &state}
		}
	}()

	// wait for correct cluster state and number of web processes
	var pgState state.State
	var webJobs int
	ready := func() bool {
		if webJobs != d.webJobs {
			return false
		}
		if pgState.Primary == nil {
			return false
		}
		if d.pgJobs > 1 && pgState.Sync == nil {
			return false
		}
		if d.pgJobs > 2 && len(pgState.Async) != d.pgJobs-2 {
			return false
		}
		return true
	}
	for {
		if ready() {
			break
		}
		select {
		case s := <-stateCh:
			t.Assert(s.err, c.IsNil)
			pgState = *s.state
		case e, ok := <-jobEvents:
			if !ok {
				t.Fatalf("job event stream closed: %s", jobStream.Err())
			}
			debugf(t, "got job event: %s %s %s", e.Type, e.ID, e.State)
			if e.Type == "web" && e.State == "up" {
				webJobs++
			}
		case <-time.After(30 * time.Second):
			t.Fatal("timed out waiting for postgres formation")
		}
	}

	// connect to the db so we can test writes
	db := postgres.Wait(d.name, fmt.Sprintf("dbname=postgres user=flynn password=%s", release.Env["PGPASSWORD"]))
	dbname := "deploy-test"
	t.Assert(db.Exec(fmt.Sprintf(`CREATE DATABASE "%s" WITH OWNER = "flynn"`, dbname)), c.IsNil)
	db.Close()
	db, err = postgres.Open(d.name, fmt.Sprintf("dbname=%s user=flynn password=%s", dbname, release.Env["PGPASSWORD"]))
	t.Assert(err, c.IsNil)
	defer db.Close()
	t.Assert(db.Exec(`CREATE TABLE deploy_test ( data text)`), c.IsNil)
	assertWriteable := func() {
		debug(t, "writing to postgres database")
		t.Assert(db.Exec(`INSERT INTO deploy_test (data) VALUES ('data')`), c.IsNil)
	}

	// check currently writeable
	assertWriteable()

	// check a deploy completes with expected cluster state changes
	release.ID = ""
	t.Assert(client.CreateRelease(release), c.IsNil)
	newRelease := release.ID
	deployment, err := client.CreateDeployment(app.ID, newRelease)
	t.Assert(err, c.IsNil)
	deployEvents := make(chan *ct.DeploymentEvent)
	deployStream, err := client.StreamDeployment(deployment, deployEvents)
	t.Assert(err, c.IsNil)
	defer deployStream.Close()

	// assertNextState checks that the next state received is in the remaining states
	// that were expected, so handles the fact that some states don't happen, but the
	// states that do happen are expected and in-order.
	assertNextState := func(remaining []expectedPgState) int {
		var state state.State
	loop:
		for {
			select {
			case s := <-stateCh:
				t.Assert(s.err, c.IsNil)
				if len(s.state.Async) < d.expectedAsyncs() {
					// we shouldn't usually receive states with less asyncs than
					// expected, but they can occur as an intermediate state between
					// two expected states (e.g. when a sync does a takeover at the
					// same time as a new async is started) so just ignore them.
					debug(t, "ignoring state with too few asyncs")
					continue
				}
				state = *s.state
				break loop
			case <-time.After(60 * time.Second):
				t.Fatal("timed out waiting for postgres cluster state")
			}
		}
		if state.Primary == nil {
			t.Fatal("no primary configured")
		}
		log := func(format string, v ...interface{}) {
			debugf(t, "skipping expected state: %s", fmt.Sprintf(format, v...))
		}
	outer:
		for i, expected := range remaining {
			if state.Primary.Meta["FLYNN_RELEASE_ID"] != expected.Primary {
				log("primary has incorrect release")
				continue
			}
			if state.Sync == nil {
				if expected.Sync == "" {
					return i
				}
				log("state has no sync node")
				continue
			}
			if state.Sync.Meta["FLYNN_RELEASE_ID"] != expected.Sync {
				log("sync has incorrect release")
				continue
			}
			if state.Async == nil {
				if expected.Async == nil {
					return i
				}
				log("state has no async nodes")
				continue
			}
			if len(state.Async) != len(expected.Async) {
				log("expected %d asyncs, got %d", len(expected.Async), len(state.Async))
				continue
			}
			for i, release := range expected.Async {
				if state.Async[i].Meta["FLYNN_RELEASE_ID"] != release {
					log("async[%d] has incorrect release", i)
					continue outer
				}
			}
			return i
		}
		t.Fatal("unexpected pg state")
		return -1
	}
	expected := d.expected(oldRelease, newRelease)
	var expectedIndex, newWebJobs int
loop:
	for {
		select {
		case e, ok := <-deployEvents:
			if !ok {
				t.Fatal("unexpected close of deployment event stream")
			}
			switch e.Status {
			case "complete":
				break loop
			case "failed":
				t.Fatalf("deployment failed: %s", e.Error)
			}
			debugf(t, "got deployment event: %s %s", e.JobType, e.JobState)
			if e.JobState != "up" && e.JobState != "down" {
				continue
			}
			switch e.JobType {
			case "postgres":
				// move on if we have seen all the expected events
				if expectedIndex >= len(expected) {
					continue
				}
				skipped := assertNextState(expected[expectedIndex:])
				expectedIndex += 1 + skipped
			case "web":
				if e.JobState == "up" && e.ReleaseID == newRelease {
					newWebJobs++
				}
			}
		case <-time.After(2 * time.Minute):
			t.Fatal("timed out waiting for deployment")
		}
	}

	// check we have the correct number of new web jobs
	t.Assert(newWebJobs, c.Equals, d.webJobs)

	// check writeable now deploy is complete
	assertWriteable()
}