func (*runnerSuite) TestOneWorkerStartWhenStopping(c *C) {
	worker.RestartDelay = 3 * time.Second
	runner := worker.NewRunner(allFatal, noImportance)
	starter := newTestWorkerStarter()
	starter.stopWait = make(chan struct{})

	err := runner.StartWorker("id", testWorkerStart(starter))
	c.Assert(err, IsNil)
	starter.assertStarted(c, true)
	err = runner.StopWorker("id")
	c.Assert(err, IsNil)
	err = runner.StartWorker("id", testWorkerStart(starter))
	c.Assert(err, IsNil)

	close(starter.stopWait)
	starter.assertStarted(c, false)
	// Check that the task is restarted immediately without
	// the usual restart timeout delay.
	t0 := time.Now()
	starter.assertStarted(c, true)
	restartDuration := time.Since(t0)
	if restartDuration > 1*time.Second {
		c.Fatalf("task did not restart immediately")
	}
	c.Assert(worker.Stop(runner), IsNil)
}
func (*runnerSuite) TestOneWorkerStartFatalError(c *C) {
	runner := worker.NewRunner(allFatal, noImportance)
	starter := newTestWorkerStarter()
	starter.startErr = errors.New("cannot start test task")
	err := runner.StartWorker("id", testWorkerStart(starter))
	c.Assert(err, IsNil)
	err = runner.Wait()
	c.Assert(err, Equals, starter.startErr)
}
func (*runnerSuite) TestOneWorkerStart(c *C) {
	runner := worker.NewRunner(noneFatal, noImportance)
	starter := newTestWorkerStarter()
	err := runner.StartWorker("id", testWorkerStart(starter))
	c.Assert(err, IsNil)
	starter.assertStarted(c, true)
	c.Assert(worker.Stop(runner), IsNil)
	starter.assertStarted(c, false)
}
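// The runner tests above and below drive a small test double whose real
// definition is not part of this excerpt. The following is a hedged sketch
// reconstructed purely from how the tests use it (startNotify, die, stopWait,
// hook, startErr and stopErr are all exercised), not the exact original.
// It assumes the imports already used in these tests plus launchpad.net/tomb.
type testWorkerStarter struct {
	startNotify chan bool     // receives true on start, false on stop
	die         chan error    // an error sent here makes the worker die
	stopWait    chan struct{} // if non-nil, stopping blocks until it is closed
	hook        func()        // if non-nil, called inside the start function
	startErr    error         // if non-nil, returned by the start function
	stopErr     error         // if non-nil, returned when the worker stops
}

func newTestWorkerStarter() *testWorkerStarter {
	return &testWorkerStarter{
		die:         make(chan error),
		startNotify: make(chan bool, 100),
	}
}

// assertStarted waits for the next start/stop notification and checks it.
func (s *testWorkerStarter) assertStarted(c *C, started bool) {
	select {
	case isStarted := <-s.startNotify:
		c.Assert(isStarted, Equals, started)
	case <-time.After(5 * time.Second):
		c.Fatalf("timed out waiting for start notification")
	}
}

// testWorkerStart adapts a starter to the func() (worker.Worker, error)
// signature that Runner.StartWorker expects.
func testWorkerStart(s *testWorkerStarter) func() (worker.Worker, error) {
	return func() (worker.Worker, error) {
		if s.hook != nil {
			s.hook()
		}
		if s.startErr != nil {
			return nil, s.startErr
		}
		w := &testWorker{starter: s}
		// Notifying inside the start function lets tests that make
		// startNotify synchronous hold the worker in start().
		s.startNotify <- true
		go w.run()
		return w, nil
	}
}

type testWorker struct {
	starter *testWorkerStarter
	tomb    tomb.Tomb
}

func (w *testWorker) Kill()       { w.tomb.Kill(nil) }
func (w *testWorker) Wait() error { return w.tomb.Wait() }

func (w *testWorker) run() {
	defer w.tomb.Done()
	var err error
	select {
	case <-w.tomb.Dying():
		err = w.starter.stopErr
	case err = <-w.starter.die:
	}
	if w.starter.stopWait != nil {
		<-w.starter.stopWait
	}
	w.starter.startNotify <- false
	w.tomb.Kill(err)
}

// The suite-wide fatality and importance policies are equally simple:
func allFatal(error) bool                { return true }
func noneFatal(error) bool               { return false }
func noImportance(err0, err1 error) bool { return false }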
// Init initializes the command for running.
func (a *MachineAgent) Init(args []string) error {
	if !state.IsMachineId(a.MachineId) {
		return fmt.Errorf("--machine-id option must be set, and expects a non-negative integer")
	}
	if err := a.Conf.checkArgs(args); err != nil {
		return err
	}
	a.runner = worker.NewRunner(isFatal, moreImportant)
	return nil
}
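// isFatal, moreImportant and fatalError are defined elsewhere in cmd/jujud
// and are not part of this excerpt. A minimal sketch consistent with their
// use here (fatalError carries a message that should bring the whole agent
// down, and a fatal error always wins the importance comparison) might look
// like the following; the real implementation recognizes more fatal
// conditions than this:
type fatalError struct {
	Err string
}

func (e *fatalError) Error() string {
	return e.Err
}

func isFatal(err error) bool {
	_, ok := err.(*fatalError)
	return ok
}

func moreImportant(err0, err1 error) bool {
	return isFatal(err0)
}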
// APIWorker returns a Worker that connects to the API and starts any
// workers that need an API connection.
//
// If a state worker is necessary, APIWorker calls ensureStateWorker.
func (a *MachineAgent) APIWorker(ensureStateWorker func()) (worker.Worker, error) {
	st, entity, err := openAPIState(a.Conf.Conf, a)
	if err != nil {
		// There was an error connecting to the API,
		// https://launchpad.net/bugs/1199915 means that we may just
		// not have an API password set. So force a state connection at
		// this point.
		// TODO(jam): Once we can reliably trust that we have API
		//            passwords set, and we no longer need state
		//            connections (and possibly agents will be blocked
		//            from connecting directly to state) we can remove
		//            this. Currently needed because 1.10 does not set
		//            the API password and 1.11 requires it.
		ensureStateWorker()
		return nil, err
	}
	needsStateWorker := false
	for _, job := range entity.Jobs() {
		needsStateWorker = needsStateWorker || stateJobs[job]
	}
	if needsStateWorker {
		ensureStateWorker()
	}
	runner := worker.NewRunner(allFatal, moreImportant)
	// Start the workers that can already use the API.
	// Add others here as they are converted.
	runner.StartWorker("machiner", func() (worker.Worker, error) {
		return machiner.NewMachiner(st.Machiner(), a.Tag()), nil
	})
	runner.StartWorker("upgrader", func() (worker.Worker, error) {
		// TODO(rog) use id instead of *Machine (or introduce Clone method)
		return upgrader.New(st.Upgrader(), a.Tag(), a.Conf.DataDir), nil
	})
	for _, job := range entity.Jobs() {
		switch job {
		case params.JobHostUnits:
			deployerTask, err := newDeployer(st.Deployer(), a.Tag(), a.Conf.DataDir)
			if err != nil {
				return nil, err
			}
			runner.StartWorker("deployer", func() (worker.Worker, error) {
				return deployerTask, nil
			})
		case params.JobManageEnviron:
			// Not yet implemented with the API.
		case params.JobManageState:
			// Not yet implemented with the API.
		default:
			// TODO(dimitern): Once all workers are moved over to using
			// the API, report "unknown job type" here.
		}
	}
	// Note: a worker.Runner is itself a worker.Worker.
	return newCloseWorker(runner, st), nil
}
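// newCloseWorker is used by every agent entry point in this file but is
// defined elsewhere. For reference, worker.Worker is the two-method interface
// implemented by everything the Runner manages (a Runner is itself a Worker):
//
//	type Worker interface {
//		Kill()       // ask the worker to stop, without waiting
//		Wait() error // wait for it to exit, returning its error
//	}
//
// Below is a minimal sketch of newCloseWorker, not the exact original,
// assuming the import "io" and that the wrapped connection satisfies
// io.Closer (both *state.State and *api.State have a Close method):
type closeWorker struct {
	worker worker.Worker
	closer io.Closer
}

func newCloseWorker(w worker.Worker, closer io.Closer) worker.Worker {
	return &closeWorker{worker: w, closer: closer}
}

func (w *closeWorker) Kill() {
	w.worker.Kill()
}

func (w *closeWorker) Wait() error {
	err := w.worker.Wait()
	if closeErr := w.closer.Close(); closeErr != nil {
		log.Warningf("closeWorker: close error: %v", closeErr)
	}
	return err
}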
func (*runnerSuite) TestOneWorkerDieFatalError(c *C) {
	runner := worker.NewRunner(allFatal, noImportance)
	starter := newTestWorkerStarter()
	err := runner.StartWorker("id", testWorkerStart(starter))
	c.Assert(err, IsNil)
	starter.assertStarted(c, true)
	dieErr := errors.New("error when running")
	starter.die <- dieErr
	err = runner.Wait()
	c.Assert(err, Equals, dieErr)
	starter.assertStarted(c, false)
}
func (*runnerSuite) TestOneWorkerStopFatalError(c *C) {
	runner := worker.NewRunner(allFatal, noImportance)
	starter := newTestWorkerStarter()
	starter.stopErr = errors.New("stop error")
	err := runner.StartWorker("id", testWorkerStart(starter))
	c.Assert(err, IsNil)
	starter.assertStarted(c, true)
	err = runner.StopWorker("id")
	c.Assert(err, IsNil)
	err = runner.Wait()
	c.Assert(err, Equals, starter.stopErr)
}
// APIWorkers returns a worker that runs the unit agent workers
// that need an API connection.
func (a *UnitAgent) APIWorkers() (worker.Worker, error) {
	st, entity, err := openAPIState(a.Conf.Conf, a)
	if err != nil {
		return nil, err
	}
	dataDir := a.Conf.DataDir
	runner := worker.NewRunner(allFatal, moreImportant)
	runner.StartWorker("upgrader", func() (worker.Worker, error) {
		return upgrader.New(st.Upgrader(), entity.Tag(), dataDir), nil
	})
	return newCloseWorker(runner, st), nil
}
// Init initializes the command for running.
func (a *UnitAgent) Init(args []string) error {
	if a.UnitName == "" {
		return requiredError("unit-name")
	}
	if !names.IsUnit(a.UnitName) {
		return fmt.Errorf(`--unit-name option expects "<service>/<n>" argument`)
	}
	if err := a.Conf.checkArgs(args); err != nil {
		return err
	}
	a.runner = worker.NewRunner(isFatal, moreImportant)
	return nil
}
// StateWorkers returns a worker that runs the unit agent workers
// that require a *state.State connection.
func (a *UnitAgent) StateWorkers() (worker.Worker, error) {
	st, entity, err := openState(a.Conf.Conf, a)
	if err != nil {
		return nil, err
	}
	unit := entity.(*state.Unit)
	dataDir := a.Conf.DataDir
	runner := worker.NewRunner(allFatal, moreImportant)
	runner.StartWorker("uniter", func() (worker.Worker, error) {
		return uniter.NewUniter(st, unit.Name(), dataDir), nil
	})
	return newCloseWorker(runner, st), nil
}
func (*runnerSuite) TestOneWorkerRestartDelay(c *C) {
	worker.RestartDelay = 100 * time.Millisecond
	runner := worker.NewRunner(noneFatal, noImportance)
	starter := newTestWorkerStarter()
	err := runner.StartWorker("id", testWorkerStart(starter))
	c.Assert(err, IsNil)
	starter.assertStarted(c, true)
	starter.die <- fmt.Errorf("non-fatal error")
	starter.assertStarted(c, false)
	t0 := time.Now()
	starter.assertStarted(c, true)
	restartDuration := time.Since(t0)
	if restartDuration < worker.RestartDelay {
		c.Fatalf("restart delay was not respected; got %v want %v", restartDuration, worker.RestartDelay)
	}
}
func (*runnerSuite) TestErrorImportance(c *C) {
	moreImportant := func(err0, err1 error) bool {
		return err0.(errorLevel) > err1.(errorLevel)
	}
	id := func(i int) string { return fmt.Sprint(i) }
	runner := worker.NewRunner(allFatal, moreImportant)
	for i := 0; i < 10; i++ {
		starter := newTestWorkerStarter()
		starter.stopErr = errorLevel(i)
		err := runner.StartWorker(id(i), testWorkerStart(starter))
		c.Assert(err, IsNil)
	}
	err := runner.StopWorker(id(4))
	c.Assert(err, IsNil)
	err = runner.Wait()
	c.Assert(err, Equals, errorLevel(9))
}
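// errorLevel is the error type the importance comparison above relies on.
// Its definition is not shown in this excerpt; a minimal sketch consistent
// with the test (an integer-valued error, ordered by value) is:
type errorLevel int

func (e errorLevel) Error() string {
	return fmt.Sprintf("error with importance %d", int(e))
}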
func (*runnerSuite) TestOneWorkerRestart(c *C) {
	runner := worker.NewRunner(noneFatal, noImportance)
	starter := newTestWorkerStarter()
	err := runner.StartWorker("id", testWorkerStart(starter))
	c.Assert(err, IsNil)
	starter.assertStarted(c, true)

	// Check that it restarts a few times.
	for i := 0; i < 3; i++ {
		starter.die <- fmt.Errorf("an error")
		starter.assertStarted(c, false)
		starter.assertStarted(c, true)
	}
	c.Assert(worker.Stop(runner), IsNil)
	starter.assertStarted(c, false)
}
func (*runnerSuite) TestAllWorkersStoppedWhenOneDiesWithFatalError(c *C) {
	runner := worker.NewRunner(allFatal, noImportance)
	var starters []*testWorkerStarter
	for i := 0; i < 10; i++ {
		starter := newTestWorkerStarter()
		err := runner.StartWorker(fmt.Sprint(i), testWorkerStart(starter))
		c.Assert(err, IsNil)
		starters = append(starters, starter)
	}
	for _, starter := range starters {
		starter.assertStarted(c, true)
	}
	dieErr := errors.New("fatal error")
	starters[4].die <- dieErr
	err := runner.Wait()
	c.Assert(err, Equals, dieErr)
	for _, starter := range starters {
		starter.assertStarted(c, false)
	}
}
func (*runnerSuite) TestFatalErrorWhileSelfStartWorker(c *C) {
	// Original deadlock problem that this tests for:
	// a worker tries to call StartWorker in its start function
	// at the same time another worker dies with a fatal error.
	// It might not be able to send on startc.
	runner := worker.NewRunner(allFatal, noImportance)

	selfStarter := newTestWorkerStarter()
	// Make the startNotify channel synchronous so
	// we can delay the start indefinitely.
	selfStarter.startNotify = make(chan bool)
	selfStarter.hook = func() {
		runner.StartWorker("another", func() (worker.Worker, error) {
			return nil, fmt.Errorf("no worker started")
		})
	}
	err := runner.StartWorker("self starter", testWorkerStart(selfStarter))
	c.Assert(err, IsNil)

	fatalStarter := newTestWorkerStarter()
	fatalStarter.startErr = fmt.Errorf("a fatal error")
	err = runner.StartWorker("fatal worker", testWorkerStart(fatalStarter))
	c.Assert(err, IsNil)

	// Wait for the runner loop to react to the fatal
	// error and go into final shutdown mode.
	time.Sleep(10 * time.Millisecond)

	// At this point, the loop is in shutdown mode, but the
	// selfStarter's worker is still in its start function.
	// When the start function continues (the first assertStarted
	// allows that to happen) it will try to create a new
	// worker. This failed in an earlier version of the code because the
	// loop was not ready to receive start requests.
	selfStarter.assertStarted(c, true)
	selfStarter.assertStarted(c, false)
	err = runner.Wait()
	c.Assert(err, Equals, fatalStarter.startErr)
}
// Workers returns a worker that runs the unit agent workers.
func (a *UnitAgent) Workers() (worker.Worker, error) {
	st, entity, err := openState(a.Conf.Conf, a)
	if err != nil {
		return nil, err
	}
	if err := EnsureAPIInfo(a.Conf.Conf, st, entity); err != nil {
		// We suppress this error because other failures are probably
		// more interesting, but we log it in case it is a root cause.
		agentLogger.Warningf("error while calling EnsureAPIInfo: %v", err)
	}
	unit := entity.(*state.Unit)
	dataDir := a.Conf.DataDir
	runner := worker.NewRunner(allFatal, moreImportant)
	runner.StartWorker("upgrader", func() (worker.Worker, error) {
		return NewUpgrader(st, unit, dataDir), nil
	})
	runner.StartWorker("uniter", func() (worker.Worker, error) {
		return uniter.NewUniter(st, unit.Name(), dataDir), nil
	})
	return newCloseWorker(runner, st), nil
}
func (*runnerSuite) TestFatalErrorWhileStarting(c *C) {
	// Original deadlock problem that this tests for:
	// a worker dies with a fatal error while another worker
	// is inside start(), so runWorker can't send startInfo on startedc.
	runner := worker.NewRunner(allFatal, noImportance)

	slowStarter := newTestWorkerStarter()
	// Make the startNotify channel synchronous so
	// we can delay the start indefinitely.
	slowStarter.startNotify = make(chan bool)
	err := runner.StartWorker("slow starter", testWorkerStart(slowStarter))
	c.Assert(err, IsNil)

	fatalStarter := newTestWorkerStarter()
	fatalStarter.startErr = fmt.Errorf("a fatal error")
	err = runner.StartWorker("fatal worker", testWorkerStart(fatalStarter))
	c.Assert(err, IsNil)

	// Wait for the runner loop to react to the fatal
	// error and go into final shutdown mode.
	time.Sleep(10 * time.Millisecond)

	// At this point, the loop is in shutdown mode, but the
	// slowStarter's worker is still in its start function.
	// When the start function continues (the first assertStarted
	// allows that to happen) and returns the new worker,
	// runWorker will try to send it on runner.startedc.
	// This test makes sure that that send succeeds.
	slowStarter.assertStarted(c, true)
	slowStarter.assertStarted(c, false)
	err = runner.Wait()
	c.Assert(err, Equals, fatalStarter.startErr)
}
// StateWorker returns a worker running all the workers that require
// a *state.State connection.
func (a *MachineAgent) StateWorker() (worker.Worker, error) {
	st, entity, err := openState(a.Conf.Conf, a)
	if err != nil {
		return nil, err
	}
	// If this fails, other bits will fail too, so we just log the error
	// and let the other failures actually restart runners.
	if err := EnsureAPIInfo(a.Conf.Conf, st, entity); err != nil {
		log.Warningf("failed to EnsureAPIInfo: %v", err)
	}
	reportOpenedState(st)
	m := entity.(*state.Machine)
	// TODO(rog) use a more discriminating test for errors
	// rather than taking everything down indiscriminately.
	dataDir := a.Conf.DataDir
	runner := worker.NewRunner(allFatal, moreImportant)
	runner.StartWorker("upgrader", func() (worker.Worker, error) {
		// TODO(rog) use id instead of *Machine (or introduce Clone method)
		return NewUpgrader(st, m, dataDir), nil
	})
	// At this stage, since we don't embed lxc containers, just start an lxc
	// provisioner task for non-lxc containers. Since we have only LXC
	// containers and normal machines, this effectively means that we only
	// have an LXC provisioner when we have a normally provisioned machine
	// (through the environ-provisioner). With the upcoming advent of KVM
	// containers, it is likely that we will want an LXC provisioner on a KVM
	// machine, and once we get nested LXC containers, we can remove this
	// check.
	providerType := os.Getenv("JUJU_PROVIDER_TYPE")
	if providerType != provider.Local && m.ContainerType() != instance.LXC {
		workerName := fmt.Sprintf("%s-provisioner", provisioner.LXC)
		runner.StartWorker(workerName, func() (worker.Worker, error) {
			return provisioner.NewProvisioner(provisioner.LXC, st, a.MachineId, dataDir), nil
		})
	}
	// Take advantage of special knowledge here in that we will only ever want
	// the storage provider on one machine, and that is the "bootstrap" node.
	if providerType == provider.Local && m.Id() == bootstrapMachineId {
		runner.StartWorker("local-storage", func() (worker.Worker, error) {
			return localstorage.NewWorker(), nil
		})
	}
	for _, job := range m.Jobs() {
		switch job {
		case state.JobHostUnits:
			runner.StartWorker("deployer", func() (worker.Worker, error) {
				return newDeployer(st, m.Id(), dataDir), nil
			})
		case state.JobManageEnviron:
			runner.StartWorker("environ-provisioner", func() (worker.Worker, error) {
				return provisioner.NewProvisioner(provisioner.ENVIRON, st, a.MachineId, dataDir), nil
			})
			runner.StartWorker("firewaller", func() (worker.Worker, error) {
				return firewaller.NewFirewaller(st), nil
			})
		case state.JobManageState:
			runner.StartWorker("apiserver", func() (worker.Worker, error) {
				// If the configuration does not have the required information,
				// it is currently not a recoverable error, so we kill the whole
				// agent, potentially enabling human intervention to fix
				// the agent's configuration file. In the future, we may retrieve
				// the state server certificate and key from the state, and
				// this should then change.
				if len(a.Conf.StateServerCert) == 0 || len(a.Conf.StateServerKey) == 0 {
					return nil, &fatalError{"configuration does not have state server cert/key"}
				}
				return apiserver.NewServer(st, fmt.Sprintf(":%d", a.Conf.APIPort), a.Conf.StateServerCert, a.Conf.StateServerKey)
			})
			runner.StartWorker("cleaner", func() (worker.Worker, error) {
				return cleaner.NewCleaner(st), nil
			})
			runner.StartWorker("resumer", func() (worker.Worker, error) {
				// The action of resumer is so subtle that it is not tested,
				// because we can't figure out how to do so without brutalising
				// the transaction log.
				return resumer.NewResumer(st), nil
			})
		default:
			log.Warningf("ignoring unknown job %q", job)
		}
	}
	return newCloseWorker(runner, st), nil
}
func (*runnerSuite) TestStopWorkerWhenDead(c *C) {
	runner := worker.NewRunner(allFatal, noImportance)
	c.Assert(worker.Stop(runner), IsNil)
	c.Assert(runner.StopWorker("foo"), Equals, worker.ErrDead)
}
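// worker.Stop, used throughout the tests and agent code above, is a small
// convenience whose behavior they depend on: it kills a worker and waits for
// it to exit. A sketch consistent with that usage (in package worker):
func Stop(w Worker) error {
	w.Kill()
	return w.Wait()
}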