Example 1
func (r *Runner) getBuildLog(w http.ResponseWriter, req *http.Request, ps httprouter.Params) {
	id := ps.ByName("build")
	b := &Build{}
	if err := r.db.View(func(tx *bolt.Tx) error {
		v := tx.Bucket(dbBucket).Get([]byte(id))
		if err := json.Unmarshal(v, b); err != nil {
			return fmt.Errorf("could not decode build %s: %s", v, err)
		}
		return nil
	}); err != nil {
		http.Error(w, err.Error(), 500)
		return
	}

	// if it's a V1 build, redirect to the log in S3
	if b.Version == BuildVersion1 {
		http.Redirect(w, req, b.LogURL, http.StatusMovedPermanently)
		return
	}

	// if it's a browser, serve the build-log.html template
	if strings.Contains(req.Header.Get("Accept"), "text/html") {
		tpl, err := template.ParseFiles(path.Join(args.AssetsDir, "build-log.html"))
		if err != nil {
			http.Error(w, err.Error(), 500)
			return
		}
		w.Header().Set("Content-Type", "text/html; charset=utf-8")
		if err := tpl.Execute(w, b); err != nil {
			log.Printf("error executing build-log template: %s", err)
		}
		return
	}

	// serve the build log as either an SSE or plain text stream
	ch := make(chan string)
	stream, err := getBuildLogStream(b, ch)
	if err != nil {
		http.Error(w, err.Error(), 500)
		return
	}
	if cn, ok := w.(http.CloseNotifier); ok {
		go func() {
			<-cn.CloseNotify()
			stream.Close()
		}()
	} else {
		defer stream.Close()
	}

	if strings.Contains(req.Header.Get("Accept"), "text/event-stream") {
		sse.ServeStream(w, ch, nil)
	} else {
		servePlainStream(w, ch)
	}

	if err := stream.Err(); err != nil {
		log.Println("error serving build log stream:", err)
	}
}
Example 2
// DeleteApp deletes an app.
func (c *Client) DeleteApp(appID string) (*ct.AppDeletion, error) {
	events := make(chan *ct.AppEvent)
	stream, err := c.ResumingStream("GET", fmt.Sprintf("/apps/%s/events?object_type=%s", appID, ct.EventTypeAppDeletion), events)
	if err != nil {
		return nil, err
	}
	defer stream.Close()

	if err := c.Delete(fmt.Sprintf("/apps/%s", appID)); err != nil {
		return nil, err
	}

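	// wait for the app deletion event, or time out after 60 seconds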
	select {
	case event, ok := <-events:
		if !ok {
			return nil, stream.Err()
		}
		var e ct.AppDeletionEvent
		if err := json.Unmarshal(event.Data, &e); err != nil {
			return nil, err
		}
		if e.Error != "" {
			return nil, errors.New(e.Error)
		}
		return e.AppDeletion, nil
	case <-time.After(60 * time.Second):
		return nil, errors.New("timed out waiting for app deletion")
	}
}
Example 3
// DeleteRelease deletes a release and any associated file artifacts.
func (c *Client) DeleteRelease(appID, releaseID string) (*ct.ReleaseDeletion, error) {
	events := make(chan *ct.Event)
	stream, err := c.StreamEvents(ct.StreamEventsOptions{
		AppID:       appID,
		ObjectID:    releaseID,
		ObjectTypes: []ct.EventType{ct.EventTypeReleaseDeletion},
	}, events)
	if err != nil {
		return nil, err
	}
	defer stream.Close()

	if err := c.Delete(fmt.Sprintf("/apps/%s/releases/%s", appID, releaseID), nil); err != nil {
		return nil, err
	}

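	// wait for the release deletion event, or time out after 60 seconds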
	select {
	case event, ok := <-events:
		if !ok {
			return nil, stream.Err()
		}
		var e ct.ReleaseDeletionEvent
		if err := json.Unmarshal(event.Data, &e); err != nil {
			return nil, err
		}
		if e.Error != "" {
			return nil, errors.New(e.Error)
		}
		return e.ReleaseDeletion, nil
	case <-time.After(60 * time.Second):
		return nil, errors.New("timed out waiting for release deletion")
	}
}
Example 4
func (s *SchedulerSuite) TestJobMeta(t *c.C) {
	app, release := s.createApp(t)

	events := make(chan *ct.JobEvent)
	stream, err := s.controllerClient(t).StreamJobEvents(app.ID, 0, events)
	t.Assert(err, c.IsNil)
	defer stream.Close()

	// start 1 one-off job
	_, err = s.controllerClient(t).RunJobDetached(app.ID, &ct.NewJob{
		ReleaseID: release.ID,
		Cmd:       []string{"sh", "-c", "while true; do echo one-off-job; sleep 1; done"},
		Meta: map[string]string{
			"foo": "baz",
		},
	})
	t.Assert(err, c.IsNil)
	waitForJobEvents(t, stream, events, jobEvents{"": {"up": 1}})

	list, err := s.controllerClient(t).JobList(app.ID)
	t.Assert(err, c.IsNil)
	t.Assert(list, c.HasLen, 1)
	t.Assert(list[0].Meta, c.DeepEquals, map[string]string{
		"foo": "baz",
	})
}
Example 5
func (s *HostSuite) TestUpdateTags(t *c.C) {
	events := make(chan *discoverd.Event)
	stream, err := s.discoverdClient(t).Service("flynn-host").Watch(events)
	t.Assert(err, c.IsNil)
	defer stream.Close()

	nextEvent := func() *discoverd.Event {
		select {
		case e, ok := <-events:
			if !ok {
				t.Fatal("unexpected close of discoverd stream")
			}
			return e
		case <-time.After(10 * time.Second):
			t.Fatal("timed out waiting for discoverd event")
		}
		return nil
	}

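	// create a client from the first host that comes up, then wait for the
	// "current" event so we know the initial service state has been delivered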
	var client *cluster.Host
	for {
		e := nextEvent()
		if e.Kind == discoverd.EventKindUp && client == nil {
			client = cluster.NewHost(e.Instance.Meta["id"], e.Instance.Addr, nil, nil)
		}
		if e.Kind == discoverd.EventKindCurrent {
			break
		}
	}
	if client == nil {
		t.Fatal("did not initialize flynn-host client")
	}

	t.Assert(client.UpdateTags(map[string]string{"foo": "bar"}), c.IsNil)

	var meta map[string]string
	for {
		e := nextEvent()
		if e.Kind == discoverd.EventKindUpdate && e.Instance.Meta["id"] == client.ID() {
			meta = e.Instance.Meta
			break
		}
	}
	t.Assert(meta["tag:foo"], c.Equals, "bar")

	// setting the tag to an empty string should delete it
	t.Assert(client.UpdateTags(map[string]string{"foo": ""}), c.IsNil)

	for {
		e := nextEvent()
		if e.Kind == discoverd.EventKindUpdate && e.Instance.Meta["id"] == client.ID() {
			meta = e.Instance.Meta
			break
		}
	}
	if _, ok := meta["tag:foo"]; ok {
		t.Fatal("expected tag to be deleted but is still present")
	}
}
Example 6
func (s *HostSuite) TestVolumeDeleteOnStop(t *c.C) {
	hosts, err := s.clusterClient(t).Hosts()
	t.Assert(err, c.IsNil)
	t.Assert(hosts, c.Not(c.HasLen), 0)
	h := hosts[0]

	// stream job events so we can wait for cleanup events
	events := make(chan *host.Event)
	stream, err := h.StreamEvents("all", events)
	t.Assert(err, c.IsNil)
	defer stream.Close()
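	// waitCleanup blocks until a cleanup event is received for the given job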
	waitCleanup := func(jobID string) {
		timeout := time.After(30 * time.Second)
		for {
			select {
			case event := <-events:
				if event.JobID == jobID && event.Event == host.JobEventCleanup {
					return
				}
			case <-timeout:
				t.Fatal("timed out waiting for cleanup event")
			}
		}
	}

	for _, deleteOnStop := range []bool{true, false} {
		job := &host.Job{
			Config: host.ContainerConfig{
				Args:       []string{"sh", "-c", "ls -d /foo"},
				DisableLog: true,
			},
		}

		// provision a volume
		req := &ct.VolumeReq{Path: "/foo", DeleteOnStop: deleteOnStop}
		vol, err := utils.ProvisionVolume(req, h, job)
		t.Assert(err, c.IsNil)
		defer h.DestroyVolume(vol.ID)

		// run the job
		cmd := exec.JobUsingCluster(s.clusterClient(t), s.createArtifact(t, "test-apps"), job)
		cmd.HostID = h.ID()
		out, err := cmd.CombinedOutput()
		t.Assert(err, c.IsNil)
		t.Assert(string(out), c.Equals, "/foo\n")

		// wait for a cleanup event
		waitCleanup(job.ID)

		// check if the volume was deleted or not
		vol, err = h.GetVolume(vol.ID)
		if deleteOnStop {
			t.Assert(hh.IsObjectNotFoundError(err), c.Equals, true)
		} else {
			t.Assert(err, c.IsNil)
		}
	}
}
Example 7
func main() {
	defer shutdown.Exit()

	grohl.AddContext("app", "controller-scheduler")
	grohl.Log(grohl.Data{"at": "start"})

	go startHTTPServer()

	if period := os.Getenv("BACKOFF_PERIOD"); period != "" {
		var err error
		backoffPeriod, err = time.ParseDuration(period)
		if err != nil {
			shutdown.Fatal(err)
		}
		grohl.Log(grohl.Data{"at": "backoff_period", "period": backoffPeriod.String()})
	}

	cc, err := controller.NewClient("", os.Getenv("AUTH_KEY"))
	if err != nil {
		shutdown.Fatal(err)
	}
	c := newContext(cc, cluster.NewClient())

	c.watchHosts()

	grohl.Log(grohl.Data{"at": "leaderwait"})
	hb, err := discoverd.AddServiceAndRegister("controller-scheduler", ":"+os.Getenv("PORT"))
	if err != nil {
		shutdown.Fatal(err)
	}
	shutdown.BeforeExit(func() { hb.Close() })

	leaders := make(chan *discoverd.Instance)
	stream, err := discoverd.NewService("controller-scheduler").Leaders(leaders)
	if err != nil {
		shutdown.Fatal(err)
	}
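	// block until this instance is elected leader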
	for leader := range leaders {
		if leader.Addr == hb.Addr() {
			break
		}
	}
	if err := stream.Err(); err != nil {
		// TODO: handle discoverd errors
		shutdown.Fatal(err)
	}
	stream.Close()
	// TODO: handle demotion

	grohl.Log(grohl.Data{"at": "leader"})

	// TODO: periodic full cluster sync for anti-entropy
	c.watchFormations()
}
Example 8
func (h *httpAPI) handleStream(w http.ResponseWriter, params httprouter.Params, kind discoverd.EventKind) {
	ch := make(chan *discoverd.Event, 64) // TODO: figure out how big this buffer should be
	stream := h.Store.Subscribe(params.ByName("service"), true, kind, ch)
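	// stream the subscribed events to the client as SSE until the connection closes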
	s := sse.NewStream(w, ch, nil)
	s.Serve()
	s.Wait()
	stream.Close()
	if err := stream.Err(); err != nil {
		s.CloseWithError(err)
	}
}
Example 9
func (s *HostSuite) TestAddFailingJob(t *c.C) {
	// get a host and watch events
	hosts, err := s.clusterClient(t).Hosts()
	t.Assert(err, c.IsNil)
	t.Assert(hosts, c.Not(c.HasLen), 0)
	h := hosts[0]
	jobID := random.UUID()
	events := make(chan *host.Event)
	stream, err := h.StreamEvents(jobID, events)
	t.Assert(err, c.IsNil)
	defer stream.Close()

	// add a job with a non-existent partition
	job := &host.Job{
		ID:         jobID,
		Mountspecs: []*host.Mountspec{{}},
		Partition:  "nonexistent",
	}
	t.Assert(h.AddJob(job), c.IsNil)

	// check we get a create then error event
	actual := make(map[host.JobEventType]*host.Event, 2)
loop:
	for {
		select {
		case e, ok := <-events:
			if !ok {
				t.Fatalf("job event stream closed unexpectedly: %s", stream.Err())
			}
			if _, ok := actual[e.Event]; ok {
				t.Fatalf("unexpected event: %v", e)
			}
			actual[e.Event] = e
			if len(actual) >= 2 {
				break loop
			}
		case <-time.After(30 * time.Second):
			t.Fatal("timed out waiting for job event")
		}
	}
	t.Assert(actual[host.JobEventCreate], c.NotNil)
	e := actual[host.JobEventError]
	t.Assert(e, c.NotNil)
	t.Assert(e.Job, c.NotNil)
	t.Assert(e.Job.Error, c.NotNil)
	t.Assert(*e.Job.Error, c.Equals, `host: invalid job partition "nonexistent"`)
}
Example 10
func (c *Client) DeployAppRelease(appID, releaseID string) error {
	d, err := c.CreateDeployment(appID, releaseID)
	if err != nil {
		return err
	}

	// if initial deploy, just stop here
	if d.FinishedAt != nil {
		return nil
	}

	events := make(chan *ct.DeploymentEvent)
	stream, err := c.StreamDeployment(d, events)
	if err != nil {
		return err
	}
	defer stream.Close()

	timeout := d.DeployTimeout
	if timeout == 0 {
		// although a non-zero timeout is set for all new apps, it
		// could still be zero when updating a cluster that doesn't
		// have deploy timeouts set (the controller migration may not
		// have run yet), so use the default
		timeout = ct.DefaultDeployTimeout
	}
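	// wait for the deployment to complete, fail, or time out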
outer:
	for {
		select {
		case e, ok := <-events:
			if !ok {
				return errors.New("unexpected close of deployment event stream")
			}
			switch e.Status {
			case "complete":
				break outer
			case "failed":
				return e.Err()
			}
		case <-time.After(time.Duration(timeout) * time.Second):
			return errors.New("timed out waiting for deployment completion")
		}
	}
	return nil
}
Example 11
func (s *SchedulerSuite) TestScale(t *c.C) {
	app, release := s.createApp(t)

	events := make(chan *ct.JobEvent)
	stream, err := s.controllerClient(t).StreamJobEvents(app.ID, 0, events)
	t.Assert(err, c.IsNil)
	defer stream.Close()

	formation := &ct.Formation{
		AppID:     app.ID,
		ReleaseID: release.ID,
		Processes: make(map[string]int),
	}

	current := make(map[string]int)
	updates := []map[string]int{
		{"printer": 2},
		{"printer": 3, "crasher": 1},
		{"printer": 1},
	}

	for _, procs := range updates {
		debugf(t, "scaling formation to %v", procs)
		formation.Processes = procs
		t.Assert(s.controllerClient(t).PutFormation(formation), c.IsNil)

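		// work out the expected up/down job events relative to the current processes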
		expected := make(jobEvents)
		for typ, count := range procs {
			diff := count - current[typ]
			if diff > 0 {
				expected[typ] = map[string]int{"up": diff}
			} else {
				expected[typ] = map[string]int{"down": -diff}
			}
		}
		for typ, count := range current {
			if _, ok := procs[typ]; !ok {
				expected[typ] = map[string]int{"down": count}
			}
		}
		waitForJobEvents(t, stream, events, expected)

		current = procs
	}
}
Example 12
func (s *SchedulerSuite) TestJobRestartBackoffPolicy(t *c.C) {
	if testCluster == nil {
		t.Skip("cannot determine scheduler backoff period")
	}
	backoffPeriod := testCluster.BackoffPeriod()
	startTimeout := 20 * time.Second
	debugf(t, "job restart backoff period: %s", backoffPeriod)

	app, release := s.createApp(t)

	events := make(chan *ct.JobEvent)
	stream, err := s.controllerClient(t).StreamJobEvents(app.ID, 0, events)
	t.Assert(err, c.IsNil)
	defer stream.Close()

	t.Assert(s.controllerClient(t).PutFormation(&ct.Formation{
		AppID:     app.ID,
		ReleaseID: release.ID,
		Processes: map[string]int{"printer": 1},
	}), c.IsNil)
	_, id := waitForJobEvents(t, stream, events, jobEvents{"printer": {"up": 1}})

	// First restart: scheduled immediately
	s.stopJob(t, id)
	id = waitForJobRestart(t, stream, events, "printer", startTimeout)

	// Second restart after 1 * backoffPeriod
	start := time.Now()
	s.stopJob(t, id)
	id = waitForJobRestart(t, stream, events, "printer", backoffPeriod+startTimeout)
	t.Assert(time.Since(start) > backoffPeriod, c.Equals, true)

	// Third restart after 2 * backoffPeriod
	start = time.Now()
	s.stopJob(t, id)
	id = waitForJobRestart(t, stream, events, "printer", 2*backoffPeriod+startTimeout)
	t.Assert(time.Since(start) > 2*backoffPeriod, c.Equals, true)

	// After backoffPeriod has elapsed: scheduled immediately
	time.Sleep(backoffPeriod)
	s.stopJob(t, id)
	waitForJobRestart(t, stream, events, "printer", startTimeout)
}
Example 13
func (s *DeployerSuite) TestRollbackNoService(t *c.C) {
	// create a running release
	app, release := s.createRelease(t, "printer", "all-at-once")

	// deploy a release which will not register the service
	client := s.controllerClient(t)
	release.ID = ""
	printer := release.Processes["printer"]
	printer.Service = "printer"
	printer.Ports = []ct.Port{{
		Port:  12345,
		Proto: "tcp",
		Service: &host.Service{
			Name:   "printer",
			Create: true,
			Check: &host.HealthCheck{
				Type:         "tcp",
				Interval:     100 * time.Millisecond,
				Threshold:    1,
				KillDown:     true,
				StartTimeout: 100 * time.Millisecond,
			},
		},
	}}
	release.Processes["printer"] = printer
	t.Assert(client.CreateRelease(release), c.IsNil)
	deployment, err := client.CreateDeployment(app.ID, release.ID)
	t.Assert(err, c.IsNil)

	// check the deployment fails
	events := make(chan *ct.DeploymentEvent)
	stream, err := client.StreamDeployment(deployment, events)
	t.Assert(err, c.IsNil)
	defer stream.Close()
	event := s.waitForDeploymentStatus(t, events, "failed")
	t.Assert(event.Error, c.Equals, "printer process type failed to start, got down job event")

	s.assertRolledBack(t, deployment, map[string]int{"printer": 2})

	// check a new deployment can be created
	_, err = client.CreateDeployment(app.ID, release.ID)
	t.Assert(err, c.IsNil)
}
Example 14
// serveStream creates a subscription and streams out events in SSE format.
func (h *Handler) serveStream(w http.ResponseWriter, params httprouter.Params, kind discoverd.EventKind) {
	// Create a buffered channel to receive events.
	ch := make(chan *discoverd.Event, StreamBufferSize)

	// Subscribe to events on the store.
	service := params.ByName("service")
	stream := h.Store.Subscribe(service, true, kind, ch)

	// Create and serve an SSE stream.
	s := sse.NewStream(w, ch, nil)
	s.Serve()
	s.Wait()
	stream.Close()

	// Check if there was an error while closing.
	if err := stream.Err(); err != nil {
		s.CloseWithError(err)
	}
}
Example 15
func (c *Client) Instances(service string, timeout time.Duration) ([]*Instance, error) {
	s := c.Service(service)
	instances, err := s.Instances()
	if len(instances) > 0 || (err != nil && !IsNotFound(err)) {
		return instances, err
	}

	events := make(chan *Event)
	stream, err := s.Watch(events)
	if err != nil {
		return nil, err
	}
	defer stream.Close()
	// get any current instances
outer:
	for event := range events {
		switch event.Kind {
		case EventKindCurrent:
			break outer
		case EventKindUp:
			instances = append(instances, event.Instance)
		}
	}
	if len(instances) > 0 {
		return instances, nil
	}
	// wait for an instance to come up
	for {
		select {
		case event, ok := <-events:
			if !ok {
				return nil, stream.Err()
			}
			if event.Kind != EventKindUp {
				continue
			}
			return []*Instance{event.Instance}, nil
		case <-time.After(timeout):
			return nil, ErrTimedOut
		}
	}
}
Example 16
func (c *Client) DeployAppRelease(appID, releaseID string, stopWait <-chan struct{}) error {
	d, err := c.CreateDeployment(appID, releaseID)
	if err != nil {
		return err
	}

	// if initial deploy, just stop here
	if d.FinishedAt != nil {
		return nil
	}

	events := make(chan *ct.DeploymentEvent)
	stream, err := c.StreamDeployment(d, events)
	if err != nil {
		return err
	}
	defer stream.Close()

outer:
	for {
		select {
		case e, ok := <-events:
			if !ok {
				return fmt.Errorf("unexpected close of deployment event stream: %s", stream.Err())
			}
			switch e.Status {
			case "complete":
				break outer
			case "failed":
				return e.Err()
			}
		case <-stopWait:
			return errors.New("deploy wait cancelled")
		}
	}
	return nil
}
Example 17
func (s *Scheduler) unfollowHost(id string) {
	log := logger.New("fn", "unfollowHost", "host.id", id)
	stream, ok := s.hostStreams[id]
	if !ok {
		log.Warn("ignoring host unfollow due to lack of existing stream")
		return
	}

	log.Info("unfollowing host")
	for jobID, job := range s.jobs {
		if job.HostID == id {
			log.Info("removing job", "job.id", jobID)
			s.jobs.SetState(job.JobID, JobStateStopped)
			s.triggerRectify(job.Formation.key())
		}
	}

	log.Info("closing job event stream")
	stream.Close()
	delete(s.hostStreams, id)

	s.triggerSyncFormations()
}
Example 18
func (c *Client) DeployAppReleaseWithTimeout(appID, releaseID string, timeout time.Duration) error {
	d, err := c.CreateDeployment(appID, releaseID)
	if err != nil {
		return err
	}

	// if initial deploy, just stop here
	if d.FinishedAt != nil {
		return nil
	}

	events := make(chan *ct.DeploymentEvent)
	stream, err := c.StreamDeployment(d, events)
	if err != nil {
		return err
	}
	defer stream.Close()
outer:
	for {
		select {
		case e, ok := <-events:
			if !ok {
				return errors.New("unexpected close of deployment event stream")
			}
			switch e.Status {
			case "complete":
				break outer
			case "failed":
				return e.Err()
			}
		case <-time.After(timeout):
			return errors.New("timed out waiting for deployment completion")
		}
	}
	return nil
}
Example 19
func (s *DeployerSuite) TestRollbackFailedJob(t *c.C) {
	// create a running release
	app, release := s.createRelease(t, "printer", "all-at-once")

	// deploy a release which will fail to start
	client := s.controllerClient(t)
	release.ID = ""
	printer := release.Processes["printer"]
	printer.Args = []string{"this-is-gonna-fail"}
	release.Processes["printer"] = printer
	t.Assert(client.CreateRelease(release), c.IsNil)
	deployment, err := client.CreateDeployment(app.ID, release.ID)
	t.Assert(err, c.IsNil)

	// check the deployment fails
	events := make(chan *ct.DeploymentEvent)
	stream, err := client.StreamDeployment(deployment, events)
	t.Assert(err, c.IsNil)
	defer stream.Close()
	event := s.waitForDeploymentStatus(t, events, "failed")
	t.Assert(event.Error, c.Equals, `deployer: printer job failed to start: exec: "this-is-gonna-fail": executable file not found in $PATH`)

	s.assertRolledBack(t, deployment, map[string]int{"printer": 2})
}
Example 20
func (s *HostSuite) TestNotifyOOM(t *c.C) {
	appID := random.UUID()

	// subscribe to init log messages from the logaggregator
	client, err := logaggc.New("")
	t.Assert(err, c.IsNil)
	opts := logagg.LogOpts{
		Follow:      true,
		StreamTypes: []logagg.StreamType{logagg.StreamTypeInit},
	}
	rc, err := client.GetLog(appID, &opts)
	t.Assert(err, c.IsNil)
	defer rc.Close()
	msgs := make(chan *logaggc.Message)
	stream := stream.New()
	defer stream.Close()
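	// decode log messages from the aggregator in the background and forward them on msgs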
	go func() {
		defer close(msgs)
		dec := json.NewDecoder(rc)
		for {
			var msg logaggc.Message
			if err := dec.Decode(&msg); err != nil {
				stream.Error = err
				return
			}
			select {
			case msgs <- &msg:
			case <-stream.StopCh:
				return
			}
		}
	}()

	// run the OOM job
	cmd := exec.CommandUsingCluster(
		s.clusterClient(t),
		s.createArtifact(t, "test-apps"),
		"/bin/oom",
	)
	cmd.Meta = map[string]string{"flynn-controller.app": appID}
	runErr := make(chan error)
	go func() {
		runErr <- cmd.Run()
	}()

	// wait for the OOM notification
	for {
		select {
		case err := <-runErr:
			t.Assert(err, c.IsNil)
		case msg, ok := <-msgs:
			if !ok {
				t.Fatalf("message stream closed unexpectedly: %s", stream.Err())
			}
			t.Log(msg.Msg)
			if strings.Contains(msg.Msg, "FATAL: a container process was killed due to lack of available memory") {
				return
			}
		case <-time.After(30 * time.Second):
			t.Fatal("timed out waiting for OOM notification")
		}
	}
}
Example 21
func (s *SchedulerSuite) TestJobStatus(t *c.C) {
	app, release := s.createApp(t)

	events := make(chan *ct.JobEvent)
	stream, err := s.controllerClient(t).StreamJobEvents(app.ID, 0, events)
	t.Assert(err, c.IsNil)
	defer stream.Close()

	// start 2 formation processes and 1 one-off job
	t.Assert(s.controllerClient(t).PutFormation(&ct.Formation{
		AppID:     app.ID,
		ReleaseID: release.ID,
		Processes: map[string]int{"printer": 1, "crasher": 1},
	}), c.IsNil)
	_, err = s.controllerClient(t).RunJobDetached(app.ID, &ct.NewJob{
		ReleaseID: release.ID,
		Cmd:       []string{"sh", "-c", "while true; do echo one-off-job; sleep 1; done"},
	})
	t.Assert(err, c.IsNil)
	waitForJobEvents(t, stream, events, jobEvents{"printer": {"up": 1}, "crasher": {"up": 1}, "": {"up": 1}})

	list, err := s.controllerClient(t).JobList(app.ID)
	t.Assert(err, c.IsNil)
	t.Assert(list, c.HasLen, 3)
	jobs := make(map[string]*ct.Job, len(list))
	for _, job := range list {
		debug(t, job.Type, "job started with ID ", job.ID)
		jobs[job.Type] = job
	}

	// Check jobs are marked as up once started
	t.Assert(jobs["printer"].State, c.Equals, "up")
	t.Assert(jobs["crasher"].State, c.Equals, "up")
	t.Assert(jobs[""].State, c.Equals, "up")

	// Check that when a formation's job is removed, it is marked as down and a new one is scheduled
	job := jobs["printer"]
	s.stopJob(t, job.ID)
	waitForJobEvents(t, stream, events, jobEvents{"printer": {"down": 1, "up": 1}})
	s.checkJobState(t, app.ID, job.ID, "down")
	list, err = s.controllerClient(t).JobList(app.ID)
	t.Assert(err, c.IsNil)
	t.Assert(list, c.HasLen, 4)

	// Check that when a one-off job is removed, it is marked as down but a new one is not scheduled
	job = jobs[""]
	s.stopJob(t, job.ID)
	waitForJobEvents(t, stream, events, jobEvents{"": {"down": 1}})
	s.checkJobState(t, app.ID, job.ID, "down")
	list, err = s.controllerClient(t).JobList(app.ID)
	t.Assert(err, c.IsNil)
	t.Assert(list, c.HasLen, 4)

	// Check that when a job errors, it is marked as crashed and a new one is started
	job = jobs["crasher"]
	s.stopJob(t, job.ID)
	waitForJobEvents(t, stream, events, jobEvents{"crasher": {"down": 1, "up": 1}})
	s.checkJobState(t, app.ID, job.ID, "crashed")
	list, err = s.controllerClient(t).JobList(app.ID)
	t.Assert(err, c.IsNil)
	t.Assert(list, c.HasLen, 5)
}
Example 22
func (s *SchedulerSuite) TestOmniJobs(t *c.C) {
	if testCluster == nil {
		t.Skip("cannot boot new hosts")
	}

	app, release := s.createApp(t)

	events := make(chan *ct.JobEvent)
	stream, err := s.controllerClient(t).StreamJobEvents(app.ID, 0, events)
	t.Assert(err, c.IsNil)
	defer stream.Close()

	formation := &ct.Formation{
		AppID:     app.ID,
		ReleaseID: release.ID,
		Processes: make(map[string]int),
	}

	current := make(map[string]int)
	updates := []map[string]int{
		{"printer": 2},
		{"printer": 3, "omni": 2},
		{"printer": 1, "omni": 1},
	}

	for _, procs := range updates {
		debugf(t, "scaling formation to %v", procs)
		formation.Processes = procs
		t.Assert(s.controllerClient(t).PutFormation(formation), c.IsNil)

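		// work out the expected job events, multiplying omni diffs by the cluster size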
		expected := make(jobEvents)
		for typ, count := range procs {
			diff := count - current[typ]
			if typ == "omni" {
				diff *= testCluster.Size()
			}
			if diff > 0 {
				expected[typ] = map[string]int{"up": diff}
			} else {
				expected[typ] = map[string]int{"down": -diff}
			}
		}
		for typ, count := range current {
			if _, ok := procs[typ]; !ok {
				diff := count
				if typ == "omni" {
					diff *= testCluster.Size()
				}
				expected[typ] = map[string]int{"down": diff}
			}
		}
		waitForJobEvents(t, stream, events, expected)

		current = procs
	}

	// Check that new hosts get omni jobs
	newHosts := s.addHosts(t, 2, false)
	defer s.removeHosts(t, newHosts)
	waitForJobEvents(t, stream, events, jobEvents{"omni": {"up": 2}})
}
Example 23
func (s *DeployerSuite) TestOmniProcess(t *c.C) {
	if testCluster == nil {
		t.Skip("cannot determine test cluster size")
	}

	// create and scale an omni release
	omniScale := 2
	totalJobs := omniScale * testCluster.Size()
	client := s.controllerClient(t)
	app, release := s.createApp(t)

	watcher, err := client.WatchJobEvents(app.Name, release.ID)
	t.Assert(err, c.IsNil)
	defer watcher.Close()

	t.Assert(client.PutFormation(&ct.Formation{
		AppID:     app.ID,
		ReleaseID: release.ID,
		Processes: map[string]int{"omni": omniScale},
	}), c.IsNil)
	err = watcher.WaitFor(ct.JobEvents{"omni": {ct.JobStateUp: totalJobs}}, scaleTimeout, nil)
	t.Assert(err, c.IsNil)

	// deploy using all-at-once and check we get the correct events
	app.Strategy = "all-at-once"
	t.Assert(client.UpdateApp(app), c.IsNil)
	release.ID = ""
	t.Assert(client.CreateRelease(release), c.IsNil)
	deployment, err := client.CreateDeployment(app.ID, release.ID)
	t.Assert(err, c.IsNil)
	events := make(chan *ct.DeploymentEvent)
	stream, err := client.StreamDeployment(deployment, events)
	t.Assert(err, c.IsNil)
	defer stream.Close()
	expected := make([]*ct.Job, 0, 3*totalJobs+1)
	appendEvents := func(releaseID string, state ct.JobState, count int) {
		for i := 0; i < count; i++ {
			expected = append(expected, &ct.Job{
				ReleaseID: releaseID,
				Type:      "omni",
				State:     state,
			})
		}
	}
	appendEvents(deployment.NewReleaseID, ct.JobStateUp, totalJobs)
	appendEvents(deployment.OldReleaseID, ct.JobStateDown, totalJobs)
	s.waitForDeploymentStatus(t, events, "complete")

	// deploy using one-by-one and check we get the correct events
	app.Strategy = "one-by-one"
	t.Assert(client.UpdateApp(app), c.IsNil)
	release.ID = ""
	t.Assert(client.CreateRelease(release), c.IsNil)
	// try creating the deployment multiple times to avoid getting a
	// "Cannot create deploy, one is already in progress" error (there
	// is no guarantee the previous deploy has finished yet)
	attempts := attempt.Strategy{Total: 10 * time.Second, Delay: 100 * time.Millisecond}
	err = attempts.Run(func() (err error) {
		deployment, err = client.CreateDeployment(app.ID, release.ID)
		return
	})
	t.Assert(err, c.IsNil)
	events = make(chan *ct.DeploymentEvent)
	stream, err = client.StreamDeployment(deployment, events)
	t.Assert(err, c.IsNil)
	expected = make([]*ct.Job, 0, 4*totalJobs+1)
	appendEvents(deployment.NewReleaseID, ct.JobStateUp, testCluster.Size())
	appendEvents(deployment.OldReleaseID, ct.JobStateDown, testCluster.Size())
	appendEvents(deployment.NewReleaseID, ct.JobStateUp, testCluster.Size())
	appendEvents(deployment.OldReleaseID, ct.JobStateDown, testCluster.Size())
	s.waitForDeploymentStatus(t, events, "complete")
}
Example 24
func (s *SchedulerSuite) TestControllerRestart(t *c.C) {
	// get the current controller details
	app, err := s.controllerClient(t).GetApp("controller")
	t.Assert(err, c.IsNil)
	release, err := s.controllerClient(t).GetAppRelease("controller")
	t.Assert(err, c.IsNil)
	formation, err := s.controllerClient(t).GetFormation(app.ID, release.ID)
	t.Assert(err, c.IsNil)
	list, err := s.controllerClient(t).JobList("controller")
	t.Assert(err, c.IsNil)
	var jobs []*ct.Job
	for _, job := range list {
		if job.Type == "web" && job.State == "up" {
			jobs = append(jobs, job)
		}
	}
	t.Assert(jobs, c.HasLen, 2)
	hostID, jobID, _ := cluster.ParseJobID(jobs[0].ID)
	t.Assert(hostID, c.Not(c.Equals), "")
	t.Assert(jobID, c.Not(c.Equals), "")
	debugf(t, "current controller app[%s] host[%s] job[%s]", app.ID, hostID, jobID)

	// start a second controller and wait for it to come up
	events := make(chan *ct.JobEvent)
	stream, err := s.controllerClient(t).StreamJobEvents("controller", 0, events)
	t.Assert(err, c.IsNil)
	debug(t, "scaling the controller up")
	formation.Processes["web"]++
	t.Assert(s.controllerClient(t).PutFormation(formation), c.IsNil)
	lastID, _ := waitForJobEvents(t, stream, events, jobEvents{"web": {"up": 1}})
	stream.Close()

	// get direct client for new controller
	var client *controller.Client
	attempts := attempt.Strategy{
		Total: 10 * time.Second,
		Delay: 500 * time.Millisecond,
	}
	t.Assert(attempts.Run(func() (err error) {
		addrs, err := s.discoverdClient(t).Service("flynn-controller").Addrs()
		if err != nil {
			return err
		}
		if len(addrs) != 3 {
			return fmt.Errorf("expected 3 controller processes, got %d", len(addrs))
		}
		addr := addrs[2]
		debug(t, "new controller address: ", addr)
		client, err = controller.NewClient("http://"+addr, s.clusterConf(t).Key)
		if err != nil {
			return err
		}
		events = make(chan *ct.JobEvent)
		stream, err = client.StreamJobEvents("controller", lastID, events)
		return
	}), c.IsNil)
	defer stream.Close()

	// kill the first controller and check the scheduler brings it back online
	cc, err := cluster.NewClientWithServices(s.discoverdClient(t).Service)
	t.Assert(err, c.IsNil)
	hc, err := cc.DialHost(hostID)
	t.Assert(err, c.IsNil)
	debug(t, "stopping job ", jobID)
	t.Assert(hc.StopJob(jobID), c.IsNil)
	waitForJobEvents(t, stream, events, jobEvents{"web": {"down": 1, "up": 1}})

	// scale back down
	debug(t, "scaling the controller down")
	formation.Processes["web"]--
	t.Assert(s.controllerClient(t).PutFormation(formation), c.IsNil)
	waitForJobEvents(t, stream, events, jobEvents{"web": {"down": 1}})

	// unset the suite's client so other tests use a new client
	s.controller = nil
}
Example 25
func (s *DeployerSuite) createDeployment(t *c.C, process, strategy, service string) *testDeploy {
	app, release := s.createRelease(t, process, strategy)

	if service != "" {
		debugf(t, "waiting for 2 %s services", service)
		events := make(chan *discoverd.Event)
		stream, err := s.discoverdClient(t).Service(service).Watch(events)
		t.Assert(err, c.IsNil)
		defer stream.Close()
		count := 0
	loop:
		for {
			select {
			case event, ok := <-events:
				if !ok {
					t.Fatalf("service discovery stream closed unexpectedly")
				}
				if event.Kind == discoverd.EventKindUp {
					if id, ok := event.Instance.Meta["FLYNN_RELEASE_ID"]; !ok || id != release.ID {
						continue
					}
					debugf(t, "got %s service up event", service)
					count++
				}
				if count == 2 {
					// although the services are up, give them a few more seconds
					// to make sure the deployer will also see them as up.
					time.Sleep(5 * time.Second)
					break loop
				}
			case <-time.After(10 * time.Second):
				t.Fatalf("timed out waiting for %s service to come up", service)
			}
		}
	}

	client := s.controllerClient(t)
	jobEvents := make(chan *ct.Job)
	jobStream, err := client.StreamJobEvents(app.ID, jobEvents)
	t.Assert(err, c.IsNil)

	// create a new release for the deployment
	release.ID = ""
	t.Assert(client.CreateRelease(release), c.IsNil)

	deployment, err := client.CreateDeployment(app.ID, release.ID)
	t.Assert(err, c.IsNil)
	debugf(t, "created deployment %s", deployment.ID)
	debugf(t, "deploying from release %s to %s", deployment.OldReleaseID, deployment.NewReleaseID)

	deployEvents := make(chan *ct.DeploymentEvent)
	deployStream, err := client.StreamDeployment(deployment, deployEvents)
	t.Assert(err, c.IsNil)

	return &testDeploy{
		s:            s,
		t:            t,
		deployment:   deployment,
		deployEvents: deployEvents,
		deployStream: deployStream,
		jobEvents:    jobEvents,
		jobStream:    jobStream,
	}
}
Example 26
func (c *context) watchHost(h *cluster.Host, ready chan struct{}, stop chan struct{}) {
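	// if the host is already being tracked, just signal readiness and return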
	if !c.hosts.Add(h.ID()) {
		if ready != nil {
			ready <- struct{}{}
		}
		return
	}
	defer c.hosts.Remove(h.ID())

	g := grohl.NewContext(grohl.Data{"fn": "watchHost", "host.id": h.ID()})

	c.hosts.Set(h.ID(), h)

	g.Log(grohl.Data{"at": "start"})

	ch := make(chan *host.Event)
	stream, err := h.StreamEvents("all", ch)
	if err != nil {
		panic(err)
	}
	go func() {
		<-stop
		stream.Close()
	}()
	if ready != nil {
		ready <- struct{}{}
	}

	// Call PutJob in a goroutine so we don't block receiving job events whilst potentially
	// making multiple requests to the controller (e.g. if the controller is down).
	//
	// Use a channel (rather than spawning a goroutine per event) so that events are delivered in order.
	jobs := make(chan *ct.Job, 10)
	go func() {
		for job := range jobs {
			putJobAttempts.Run(func() error {
				if err := c.PutJob(job); err != nil {
					g.Log(grohl.Data{"at": "put_job_error", "job.id": job.ID, "state": job.State, "err": err})
					// ignore validation / not found errors
					if httphelper.IsValidationError(err) || err == controller.ErrNotFound {
						return nil
					}
					return err
				}
				g.Log(grohl.Data{"at": "put_job", "job.id": job.ID, "state": job.State})
				return nil
			})
		}
	}()

	for event := range ch {
		meta := event.Job.Job.Metadata
		appID := meta["flynn-controller.app"]
		releaseID := meta["flynn-controller.release"]
		jobType := meta["flynn-controller.type"]

		if appID == "" || releaseID == "" {
			continue
		}

		job := &ct.Job{
			ID:        event.JobID,
			AppID:     appID,
			ReleaseID: releaseID,
			Type:      jobType,
			State:     jobState(event),
			Meta:      jobMetaFromMetadata(meta),
		}
		g.Log(grohl.Data{"at": "event", "job.id": event.JobID, "event": event.Event})
		jobs <- job

		// get a read lock on the mutex to ensure we are not currently
		// syncing with the cluster
		c.mtx.RLock()
		j := c.jobs.Get(h.ID(), event.JobID)
		c.mtx.RUnlock()
		if j == nil {
			continue
		}
		j.startedAt = event.Job.StartedAt

		if event.Event != "error" && event.Event != "stop" {
			continue
		}
		g.Log(grohl.Data{"at": "remove", "job.id": event.JobID, "event": event.Event})

		c.jobs.Remove(h.ID(), event.JobID)
		go func(event *host.Event) {
			c.mtx.RLock()
			j.Formation.RestartJob(jobType, h.ID(), event.JobID)
			c.mtx.RUnlock()
		}(event)
	}
	// TODO: check error/reconnect
}