Example #1
func (s *Scheduler) RunPutJobs() {
	log := logger.New("fn", "RunPutJobs")
	log.Info("starting job persistence loop")
	strategy := attempt.Strategy{Delay: 100 * time.Millisecond, Total: time.Minute}
	for job := range s.putJobs {
		err := strategy.RunWithValidator(func() error {
			return s.PutJob(job)
		}, httphelper.IsRetryableError)
		if err != nil {
			log.Error("error persisting job", "job.id", job.ID, "job.state", job.State, "err", err)
		}
	}
	log.Info("stopping job persistence loop")
}
Example #2
func (v *vm) Run(command string, attempts attempt.Strategy, out io.Writer, stderr io.Writer) error {
	var sc *ssh.Client
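	// redial SSH until the supplied strategy gives up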
	err := attempts.Run(func() (err error) {
		fmt.Fprintf(stderr, "Attempting to ssh to %s:22...\n", v.IP())
		sc, err = v.DialSSH()
		return
	})
	if err != nil {
		return err
	}
	defer sc.Close()
	sess, err := sc.NewSession()
	if err != nil {
		return err
	}
	defer sess.Close()
	sess.Stdin = bytes.NewBufferString(command)
	sess.Stdout = out
	sess.Stderr = stderr
	if err := sess.Run("bash"); err != nil {
		return fmt.Errorf("failed to run command on %s: %s", v.IP(), err)
	}
	return nil
}
Example #3
func (S) TestAttemptTiming(c *C) {
	testAttempt := attempt.Strategy{
		Total: 0.25e9,
		Delay: 0.1e9,
	}
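	// expect attempts at 0s, 0.1s and 0.2s; the final entry records when Next returns false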
	want := []time.Duration{0, 0.1e9, 0.2e9, 0.2e9}
	got := make([]time.Duration, 0, len(want)) // avoid allocation when testing timing
	t0 := time.Now()
	for a := testAttempt.Start(); a.Next(); {
		got = append(got, time.Since(t0))
	}
	got = append(got, time.Since(t0))
	c.Assert(got, HasLen, len(want))
	const margin = 0.01e9
	for i, got := range got {
		lo := want[i] - margin
		hi := want[i] + margin
		if got < lo || got > hi {
			c.Errorf("attempt %d want %g got %g", i, want[i].Seconds(), got.Seconds())
		}
	}
}
Example #4
func (s *DeployerSuite) TestOmniProcess(t *c.C) {
	if testCluster == nil {
		t.Skip("cannot determine test cluster size")
	}

	// create and scale an omni release
	omniScale := 2
	totalJobs := omniScale * testCluster.Size()
	client := s.controllerClient(t)
	app, release := s.createApp(t)

	watcher, err := client.WatchJobEvents(app.Name, release.ID)
	t.Assert(err, c.IsNil)
	defer watcher.Close()

	t.Assert(client.PutFormation(&ct.Formation{
		AppID:     app.ID,
		ReleaseID: release.ID,
		Processes: map[string]int{"omni": omniScale},
	}), c.IsNil)
	err = watcher.WaitFor(ct.JobEvents{"omni": {ct.JobStateUp: totalJobs}}, scaleTimeout, nil)
	t.Assert(err, c.IsNil)

	// deploy using all-at-once and check we get the correct events
	app.Strategy = "all-at-once"
	t.Assert(client.UpdateApp(app), c.IsNil)
	release.ID = ""
	t.Assert(client.CreateRelease(release), c.IsNil)
	deployment, err := client.CreateDeployment(app.ID, release.ID)
	t.Assert(err, c.IsNil)
	events := make(chan *ct.DeploymentEvent)
	stream, err := client.StreamDeployment(deployment, events)
	t.Assert(err, c.IsNil)
	defer stream.Close()
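	// build the list of job events the deployment is expected to emit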
	expected := make([]*ct.Job, 0, 3*totalJobs+1)
	appendEvents := func(releaseID string, state ct.JobState, count int) {
		for i := 0; i < count; i++ {
			expected = append(expected, &ct.Job{
				ReleaseID: releaseID,
				Type:      "omni",
				State:     state,
			})
		}
	}
	appendEvents(deployment.NewReleaseID, ct.JobStateUp, totalJobs)
	appendEvents(deployment.OldReleaseID, ct.JobStateDown, totalJobs)
	s.waitForDeploymentEvents(t, events, expected)

	// deploy using one-by-one and check we get the correct events
	app.Strategy = "one-by-one"
	t.Assert(client.UpdateApp(app), c.IsNil)
	release.ID = ""
	t.Assert(client.CreateRelease(release), c.IsNil)
	// try creating the deployment multiple times to avoid getting a
	// "Cannot create deploy, one is already in progress" error (there
	// is no guarantee the previous deploy has finished yet)
	attempts := attempt.Strategy{Total: 10 * time.Second, Delay: 100 * time.Millisecond}
	err = attempts.Run(func() (err error) {
		deployment, err = client.CreateDeployment(app.ID, release.ID)
		return
	})
	t.Assert(err, c.IsNil)
	events = make(chan *ct.DeploymentEvent)
	stream, err = client.StreamDeployment(deployment, events)
	t.Assert(err, c.IsNil)
	expected = make([]*ct.Job, 0, 4*totalJobs+1)
	appendEvents(deployment.NewReleaseID, ct.JobStateUp, testCluster.Size())
	appendEvents(deployment.OldReleaseID, ct.JobStateDown, testCluster.Size())
	appendEvents(deployment.NewReleaseID, ct.JobStateUp, testCluster.Size())
	appendEvents(deployment.OldReleaseID, ct.JobStateDown, testCluster.Size())
	s.waitForDeploymentEvents(t, events, expected)
}
Example #5
func (S) TestAttemptNextHasNext(c *C) {
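	// a zero Strategy permits exactly one attempt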
	a := attempt.Strategy{}.Start()
	c.Assert(a.Next(), Equals, true)
	c.Assert(a.Next(), Equals, false)

	a = attempt.Strategy{}.Start()
	c.Assert(a.Next(), Equals, true)
	c.Assert(a.HasNext(), Equals, false)
	c.Assert(a.Next(), Equals, false)

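	// HasNext returning true guarantees the following Next also returns true, even once Total expires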
	a = attempt.Strategy{Total: 2e8}.Start()
	c.Assert(a.Next(), Equals, true)
	c.Assert(a.HasNext(), Equals, true)
	time.Sleep(2e8)
	c.Assert(a.HasNext(), Equals, true)
	c.Assert(a.Next(), Equals, true)
	c.Assert(a.Next(), Equals, false)

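	// Min guarantees a second attempt even though Total has already expired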
	a = attempt.Strategy{Total: 1e8, Min: 2}.Start()
	time.Sleep(1e8)
	c.Assert(a.Next(), Equals, true)
	c.Assert(a.HasNext(), Equals, true)
	c.Assert(a.Next(), Equals, true)
	c.Assert(a.HasNext(), Equals, false)
	c.Assert(a.Next(), Equals, false)
}
Example #6
func (s *Scheduler) streamFormationEvents() error {
	log := logger.New("fn", "streamFormationEvents")

	var events chan *ct.ExpandedFormation
	var stream stream.Stream
	var since *time.Time
	connect := func() (err error) {
		log.Info("connecting formation event stream")
		events = make(chan *ct.ExpandedFormation, eventBufferSize)
		stream, err = s.StreamFormations(since, events)
		if err != nil {
			log.Error("error connecting formation event stream", "err", err)
		}
		return
	}
	strategy := attempt.Strategy{Delay: 100 * time.Millisecond, Total: time.Minute}
	if err := strategy.Run(connect); err != nil {
		return err
	}

	current := make(chan struct{})
	go func() {
		var isCurrent bool
	outer:
		for {
			for formation := range events {
				// an empty formation indicates we now have the
				// current list of formations.
				if formation.App == nil {
					if !isCurrent {
						isCurrent = true
						close(current)
					}
					continue
				}
				since = &formation.UpdatedAt
				// if we are not current, explicitly handle the event
				// so that the scheduler has the current list of
				// formations before starting the main loop.
				if !isCurrent {
					s.HandleFormationChange(formation)
					continue
				}
				s.formationEvents <- formation
			}
			log.Warn("formation event stream disconnected", "err", stream.Err())
			for {
				if err := connect(); err == nil {
					continue outer
				}
				time.Sleep(100 * time.Millisecond)
			}
		}
	}()

	select {
	case <-current:
		return nil
	case <-time.After(30 * time.Second):
		return errors.New("timed out waiting for current formation list")
	}
}
Example #7
func (s *SchedulerSuite) TestControllerRestart(t *c.C) {
	// get the current controller details
	app, err := s.controllerClient(t).GetApp("controller")
	t.Assert(err, c.IsNil)
	release, err := s.controllerClient(t).GetAppRelease("controller")
	t.Assert(err, c.IsNil)
	list, err := s.controllerClient(t).JobList("controller")
	t.Assert(err, c.IsNil)
	var jobs []*ct.Job
	for _, job := range list {
		if job.Type == "web" {
			jobs = append(jobs, job)
		}
	}
	t.Assert(jobs, c.HasLen, 1)
	hostID, jobID, _ := cluster.ParseJobID(jobs[0].ID)
	t.Assert(hostID, c.Not(c.Equals), "")
	t.Assert(jobID, c.Not(c.Equals), "")
	debugf(t, "current controller app[%s] host[%s] job[%s]", app.ID, hostID, jobID)

	// start a second controller and wait for it to come up
	stream, err := s.controllerClient(t).StreamJobEvents("controller", 0)
	t.Assert(err, c.IsNil)
	debug(t, "scaling the controller up")
	t.Assert(s.controllerClient(t).PutFormation(&ct.Formation{
		AppID:     app.ID,
		ReleaseID: release.ID,
		Processes: map[string]int{"web": 2, "scheduler": 1},
	}), c.IsNil)
	lastID, _ := waitForJobEvents(t, stream.Events, jobEvents{"web": {"up": 1}})
	stream.Close()

	// get direct client for new controller
	var client *controller.Client
	attempts := attempt.Strategy{
		Total: 10 * time.Second,
		Delay: 500 * time.Millisecond,
	}
	t.Assert(attempts.Run(func() (err error) {
		set, err := s.discoverdClient(t).NewServiceSet("flynn-controller")
		if err != nil {
			return err
		}
		defer set.Close()
		addrs := set.Addrs()
		if len(addrs) != 2 {
			return fmt.Errorf("expected 2 controller processes, got %d", len(addrs))
		}
		addr := addrs[1]
		debug(t, "new controller address: ", addr)
		client, err = controller.NewClient("http://"+addr, s.clusterConf(t).Key)
		return
	}), c.IsNil)

	// kill the first controller and check the scheduler brings it back online
	stream, err = client.StreamJobEvents("controller", lastID)
	t.Assert(err, c.IsNil)
	defer stream.Close()
	cc, err := cluster.NewClientWithDial(nil, s.discoverdClient(t).NewServiceSet)
	t.Assert(err, c.IsNil)
	defer cc.Close()
	hc, err := cc.DialHost(hostID)
	t.Assert(err, c.IsNil)
	defer hc.Close()
	debug(t, "stopping job ", jobID)
	t.Assert(hc.StopJob(jobID), c.IsNil)
	waitForJobEvents(t, stream.Events, jobEvents{"web": {"down": 1, "up": 1}})

	// scale back down
	debug(t, "scaling the controller down")
	t.Assert(s.controllerClient(t).PutFormation(&ct.Formation{
		AppID:     app.ID,
		ReleaseID: release.ID,
		Processes: map[string]int{"web": 1, "scheduler": 1},
	}), c.IsNil)
	waitForJobEvents(t, stream.Events, jobEvents{"web": {"down": 1}})

	// unset the suite's client so other tests use a new client
	s.controller = nil
}
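All seven examples exercise the same retry helper: the attempt package from the Flynn codebase (github.com/flynn/flynn/pkg/attempt). For reference, here is a minimal sketch of the API implied by the call sites above. The field and method names (Strategy{Total, Delay, Min}, Start, Next, HasNext, Run, RunWithValidator) are taken from the examples; the implementation itself is illustrative only, inferred from usage, not the package's actual source.

package attempt

import "time"

// Strategy configures how often and for how long an operation is retried.
type Strategy struct {
	Total time.Duration // total time to keep trying
	Delay time.Duration // pause between consecutive attempts
	Min   int           // minimum number of attempts, even after Total expires
}

// Attempt iterates once per retry until the Strategy is exhausted.
type Attempt struct {
	strategy Strategy
	last     time.Time
	end      time.Time
	force    bool
	count    int
}

// Start begins a new run of attempts.
func (s Strategy) Start() *Attempt {
	now := time.Now()
	return &Attempt{strategy: s, last: now, end: now.Add(s.Total), force: true}
}

// Next sleeps until the next attempt is due and reports whether one remains.
func (a *Attempt) Next() bool {
	now := time.Now()
	sleep := a.nextSleep(now)
	// Stop once the Total budget is spent, unless Min attempts are still
	// owed or a previous HasNext call promised another attempt.
	if !a.force && !now.Add(sleep).Before(a.end) && a.strategy.Min <= a.count {
		return false
	}
	a.force = false
	if sleep > 0 && a.count > 0 {
		time.Sleep(sleep)
		now = time.Now()
	}
	a.count++
	a.last = now
	return true
}

// HasNext reports whether another attempt will follow, without consuming it.
// If it returns true, the next call to Next is guaranteed to return true.
func (a *Attempt) HasNext() bool {
	if a.force || a.strategy.Min > a.count {
		return true
	}
	now := time.Now()
	if now.Add(a.nextSleep(now)).Before(a.end) {
		a.force = true
		return true
	}
	return false
}

func (a *Attempt) nextSleep(now time.Time) time.Duration {
	if sleep := a.strategy.Delay - now.Sub(a.last); sleep > 0 {
		return sleep
	}
	return 0
}

// Run retries f until it returns nil or the Strategy is exhausted.
func (s Strategy) Run(f func() error) error {
	return s.RunWithValidator(f, func(error) bool { return true })
}

// RunWithValidator is like Run, but stops early when validator reports
// that the latest error is not retryable (as with
// httphelper.IsRetryableError in Example #1).
func (s Strategy) RunWithValidator(f func() error, validator func(error) bool) error {
	var err error
	for a := s.Start(); a.Next(); {
		if err = f(); err == nil || !validator(err) {
			break
		}
	}
	return err
}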