func (s *Scheduler) RunPutJobs() {
	log := logger.New("fn", "RunPutJobs")
	log.Info("starting job persistence loop")
	strategy := attempt.Strategy{Delay: 100 * time.Millisecond, Total: time.Minute}
	for job := range s.putJobs {
		err := strategy.RunWithValidator(func() error {
			return s.PutJob(job)
		}, httphelper.IsRetryableError)
		if err != nil {
			log.Error("error persisting job", "job.id", job.ID, "job.state", job.State, "err", err)
		}
	}
	log.Info("stopping job persistence loop")
}
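// A minimal sketch (not part of the scheduler above) of the RunWithValidator
// pattern, assuming the signature implied by RunPutJobs: the first argument is
// the operation to retry and the second is a predicate that reports whether an
// error is worth retrying (httphelper.IsRetryableError plays that role above).
// State, fetchState and isTemporary are hypothetical placeholders.
func fetchStateWithRetry() (*State, error) {
	strategy := attempt.Strategy{Delay: 100 * time.Millisecond, Total: time.Minute}
	var state *State
	err := strategy.RunWithValidator(func() (err error) {
		state, err = fetchState()
		return
	}, isTemporary)
	return state, err
}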
func (v *vm) Run(command string, attempts attempt.Strategy, out io.Writer, stderr io.Writer) error {
	var sc *ssh.Client
	err := attempts.Run(func() (err error) {
		fmt.Fprintf(stderr, "Attempting to ssh to %s:22...\n", v.IP())
		sc, err = v.DialSSH()
		return
	})
	if err != nil {
		return err
	}
	defer sc.Close()
	sess, err := sc.NewSession()
	if err != nil {
		return err
	}
	sess.Stdin = bytes.NewBufferString(command)
	sess.Stdout = out
	sess.Stderr = stderr
	if err := sess.Run("bash"); err != nil {
		return fmt.Errorf("failed to run command on %s: %s", v.IP(), err)
	}
	return nil
}
func (S) TestAttemptTiming(c *C) {
	testAttempt := attempt.Strategy{
		Total: 0.25e9,
		Delay: 0.1e9,
	}
	want := []time.Duration{0, 0.1e9, 0.2e9, 0.2e9}
	got := make([]time.Duration, 0, len(want)) // avoid allocation when testing timing
	t0 := time.Now()
	for a := testAttempt.Start(); a.Next(); {
		got = append(got, time.Now().Sub(t0))
	}
	got = append(got, time.Now().Sub(t0))
	c.Assert(got, HasLen, len(want))
	const margin = 0.01e9
	for i, got := range got {
		lo := want[i] - margin
		hi := want[i] + margin
		if got < lo || got > hi {
			c.Errorf("attempt %d want %g got %g", i, want[i].Seconds(), got.Seconds())
		}
	}
}
func (s *DeployerSuite) TestOmniProcess(t *c.C) {
	if testCluster == nil {
		t.Skip("cannot determine test cluster size")
	}

	// create and scale an omni release
	omniScale := 2
	totalJobs := omniScale * testCluster.Size()
	client := s.controllerClient(t)
	app, release := s.createApp(t)

	watcher, err := client.WatchJobEvents(app.Name, release.ID)
	t.Assert(err, c.IsNil)
	defer watcher.Close()

	t.Assert(client.PutFormation(&ct.Formation{
		AppID:     app.ID,
		ReleaseID: release.ID,
		Processes: map[string]int{"omni": omniScale},
	}), c.IsNil)

	err = watcher.WaitFor(ct.JobEvents{"omni": {ct.JobStateUp: totalJobs}}, scaleTimeout, nil)
	t.Assert(err, c.IsNil)

	// deploy using all-at-once and check we get the correct events
	app.Strategy = "all-at-once"
	t.Assert(client.UpdateApp(app), c.IsNil)
	release.ID = ""
	t.Assert(client.CreateRelease(release), c.IsNil)
	deployment, err := client.CreateDeployment(app.ID, release.ID)
	t.Assert(err, c.IsNil)
	events := make(chan *ct.DeploymentEvent)
	stream, err := client.StreamDeployment(deployment, events)
	t.Assert(err, c.IsNil)
	defer stream.Close()

	expected := make([]*ct.Job, 0, 3*totalJobs+1)
	appendEvents := func(releaseID string, state ct.JobState, count int) {
		for i := 0; i < count; i++ {
			expected = append(expected, &ct.Job{
				ReleaseID: releaseID,
				Type:      "omni",
				State:     state,
			})
		}
	}
	appendEvents(deployment.NewReleaseID, ct.JobStateUp, totalJobs)
	appendEvents(deployment.OldReleaseID, ct.JobStateDown, totalJobs)
	s.waitForDeploymentStatus(t, events, "complete")

	// deploy using one-by-one and check we get the correct events
	app.Strategy = "one-by-one"
	t.Assert(client.UpdateApp(app), c.IsNil)
	release.ID = ""
	t.Assert(client.CreateRelease(release), c.IsNil)
	// try creating the deployment multiple times to avoid getting a
	// "Cannot create deploy, one is already in progress" error (there
	// is no guarantee the previous deploy has finished yet)
	attempts := attempt.Strategy{Total: 10 * time.Second, Delay: 100 * time.Millisecond}
	err = attempts.Run(func() (err error) {
		deployment, err = client.CreateDeployment(app.ID, release.ID)
		return
	})
	t.Assert(err, c.IsNil)
	events = make(chan *ct.DeploymentEvent)
	stream, err = client.StreamDeployment(deployment, events)
	t.Assert(err, c.IsNil)
	defer stream.Close()

	expected = make([]*ct.Job, 0, 4*totalJobs+1)
	appendEvents(deployment.NewReleaseID, ct.JobStateUp, testCluster.Size())
	appendEvents(deployment.OldReleaseID, ct.JobStateDown, testCluster.Size())
	appendEvents(deployment.NewReleaseID, ct.JobStateUp, testCluster.Size())
	appendEvents(deployment.OldReleaseID, ct.JobStateDown, testCluster.Size())
	s.waitForDeploymentStatus(t, events, "complete")
}
func (S) TestAttemptNextHasNext(c *C) {
	a := attempt.Strategy{}.Start()
	c.Assert(a.Next(), Equals, true)
	c.Assert(a.Next(), Equals, false)

	a = attempt.Strategy{}.Start()
	c.Assert(a.Next(), Equals, true)
	c.Assert(a.HasNext(), Equals, false)
	c.Assert(a.Next(), Equals, false)

	a = attempt.Strategy{Total: 2e8}.Start()
	c.Assert(a.Next(), Equals, true)
	c.Assert(a.HasNext(), Equals, true)
	time.Sleep(2e8)
	c.Assert(a.HasNext(), Equals, true)
	c.Assert(a.Next(), Equals, true)
	c.Assert(a.Next(), Equals, false)

	a = attempt.Strategy{Total: 1e8, Min: 2}.Start()
	time.Sleep(1e8)
	c.Assert(a.Next(), Equals, true)
	c.Assert(a.HasNext(), Equals, true)
	c.Assert(a.Next(), Equals, true)
	c.Assert(a.HasNext(), Equals, false)
	c.Assert(a.Next(), Equals, false)
}
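// A minimal sketch (not from the Flynn source) of how the Start/Next iterator
// exercised by the two tests above is typically consumed: as the timing test
// suggests, Next returns immediately on the first attempt, then waits Delay
// between attempts and stops once Total has elapsed, while HasNext reports
// whether another attempt will be allowed. dialWithAttempts and its addr
// parameter are hypothetical; the sketch assumes the standard net, fmt, os
// and time imports plus the attempt package used throughout this file.
func dialWithAttempts(addr string) (conn net.Conn, err error) {
	strategy := attempt.Strategy{Total: 10 * time.Second, Delay: 250 * time.Millisecond}
	for a := strategy.Start(); a.Next(); {
		conn, err = net.Dial("tcp", addr)
		if err == nil {
			return conn, nil
		}
		// only announce a retry if another attempt will actually be made
		if a.HasNext() {
			fmt.Fprintf(os.Stderr, "dial %s failed (%s), retrying...\n", addr, err)
		}
	}
	return nil, err
}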
func (s *Scheduler) streamFormationEvents() error {
	log := logger.New("fn", "streamFormationEvents")

	var events chan *ct.ExpandedFormation
	var stream stream.Stream
	var since *time.Time
	connect := func() (err error) {
		log.Info("connecting formation event stream")
		events = make(chan *ct.ExpandedFormation, eventBufferSize)
		stream, err = s.StreamFormations(since, events)
		if err != nil {
			log.Error("error connecting formation event stream", "err", err)
		}
		return
	}
	strategy := attempt.Strategy{Delay: 100 * time.Millisecond, Total: time.Minute}
	if err := strategy.Run(connect); err != nil {
		return err
	}

	current := make(chan struct{})
	go func() {
		var isCurrent bool
	outer:
		for {
			for formation := range events {
				// an empty formation indicates we now have the
				// current list of formations.
				if formation.App == nil {
					if !isCurrent {
						isCurrent = true
						close(current)
					}
					continue
				}
				since = &formation.UpdatedAt
				// if we are not current, explicitly handle the event
				// so that the scheduler has the current list of
				// formations before starting the main loop.
				if !isCurrent {
					s.HandleFormationChange(formation)
					continue
				}
				s.formationEvents <- formation
			}
			log.Warn("formation event stream disconnected", "err", stream.Err())
			for {
				if err := connect(); err == nil {
					continue outer
				}
				time.Sleep(100 * time.Millisecond)
			}
		}
	}()

	select {
	case <-current:
		return nil
	case <-time.After(30 * time.Second):
		return errors.New("timed out waiting for current formation list")
	}
}
func (s *SchedulerSuite) TestControllerRestart(t *c.C) {
	// get the current controller details
	app, err := s.controllerClient(t).GetApp("controller")
	t.Assert(err, c.IsNil)
	release, err := s.controllerClient(t).GetAppRelease("controller")
	t.Assert(err, c.IsNil)
	list, err := s.controllerClient(t).JobList("controller")
	t.Assert(err, c.IsNil)
	var jobs []*ct.Job
	for _, job := range list {
		if job.Type == "web" {
			jobs = append(jobs, job)
		}
	}
	t.Assert(jobs, c.HasLen, 1)
	hostID, jobID, _ := cluster.ParseJobID(jobs[0].ID)
	t.Assert(hostID, c.Not(c.Equals), "")
	t.Assert(jobID, c.Not(c.Equals), "")
	debugf(t, "current controller app[%s] host[%s] job[%s]", app.ID, hostID, jobID)

	// start a second controller and wait for it to come up
	stream, err := s.controllerClient(t).StreamJobEvents("controller", 0)
	t.Assert(err, c.IsNil)
	debug(t, "scaling the controller up")
	t.Assert(s.controllerClient(t).PutFormation(&ct.Formation{
		AppID:     app.ID,
		ReleaseID: release.ID,
		Processes: map[string]int{"web": 2, "scheduler": 1},
	}), c.IsNil)
	lastID, _ := waitForJobEvents(t, stream.Events, jobEvents{"web": {"up": 1}})
	stream.Close()

	// get direct client for new controller
	var client *controller.Client
	attempts := attempt.Strategy{
		Total: 10 * time.Second,
		Delay: 500 * time.Millisecond,
	}
	t.Assert(attempts.Run(func() (err error) {
		set, err := s.discoverdClient(t).NewServiceSet("flynn-controller")
		if err != nil {
			return err
		}
		defer set.Close()
		addrs := set.Addrs()
		if len(addrs) != 2 {
			return fmt.Errorf("expected 2 controller processes, got %d", len(addrs))
		}
		addr := addrs[1]
		debug(t, "new controller address: ", addr)
		client, err = controller.NewClient("http://"+addr, s.clusterConf(t).Key)
		return
	}), c.IsNil)

	// kill the first controller and check the scheduler brings it back online
	stream, err = client.StreamJobEvents("controller", lastID)
	t.Assert(err, c.IsNil)
	defer stream.Close()
	cc, err := cluster.NewClientWithDial(nil, s.discoverdClient(t).NewServiceSet)
	t.Assert(err, c.IsNil)
	defer cc.Close()
	hc, err := cc.DialHost(hostID)
	t.Assert(err, c.IsNil)
	defer hc.Close()
	debug(t, "stopping job ", jobID)
	t.Assert(hc.StopJob(jobID), c.IsNil)
	waitForJobEvents(t, stream.Events, jobEvents{"web": {"down": 1, "up": 1}})

	// scale back down
	debug(t, "scaling the controller down")
	t.Assert(s.controllerClient(t).PutFormation(&ct.Formation{
		AppID:     app.ID,
		ReleaseID: release.ID,
		Processes: map[string]int{"web": 1, "scheduler": 1},
	}), c.IsNil)
	waitForJobEvents(t, stream.Events, jobEvents{"web": {"down": 1}})

	// unset the suite's client so other tests use a new client
	s.controller = nil
}