Example #1
// APIWorker returns a Worker that connects to the API and starts any
// workers that need an API connection.
//
// If a state worker is necessary, APIWorker calls ensureStateWorker.
func (a *MachineAgent) APIWorker(ensureStateWorker func()) (worker.Worker, error) {
	st, entity, err := openAPIState(a.Conf.Conf, a)
	if err != nil {
		// There was an error connecting to the API. Per
		// https://launchpad.net/bugs/1199915, we may just not have an
		// API password set, so force a state connection at this point.
		// TODO(jam): Once we can reliably trust that we have API
		//            passwords set, and we no longer need state
		//            connections (and possibly agents will be blocked
		//            from connecting directly to state), we can remove
		//            this. Currently needed because 1.10 does not set
		//            the API password and 1.11 requires it.
		ensureStateWorker()
		return nil, err
	}
	m := entity.(*machineagent.Machine)
	needsStateWorker := false
	for _, job := range m.Jobs() {
		needsStateWorker = needsStateWorker || stateJobs[job]
	}
	if needsStateWorker {
		ensureStateWorker()
	}
	runner := worker.NewRunner(allFatal, moreImportant)
	// Only the machiner currently connects to the API.
	// Add other workers here as they are ready.
	runner.StartWorker("machiner", func() (worker.Worker, error) {
		return machiner.NewMachiner(st.Machiner(), a.Tag()), nil
	})
	return newCloseWorker(runner, st), nil // Note: a worker.Runner is itself a worker.Worker.
}
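// Neither the isFatal-style predicates (allFatal, noneFatal, isFatal) nor the
// importance comparisons (moreImportant, noImportance) passed to
// worker.NewRunner appear in these excerpts. The following is only a sketch of
// the expected signatures, inferred from how they are used above; it is not
// the project's actual definitions.

// allFatal treats every worker error as fatal to the whole Runner.
func allFatal(error) bool { return true }

// noneFatal treats no error as fatal, so failed workers are simply restarted.
func noneFatal(error) bool { return false }

// noImportance imposes no ordering between fatal errors, so the Runner keeps
// the first one it sees.
func noImportance(err0, err1 error) bool { return false }

// moreImportant decides which of two fatal errors the Runner should report.
// The real comparison presumably inspects concrete agent error types; that
// logic is elided here.
func moreImportant(err0, err1 error) bool { return false }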
func (*runnerSuite) TestOneWorkerStartWhenStopping(c *C) {
	worker.RestartDelay = 3 * time.Second
	runner := worker.NewRunner(allFatal, noImportance)
	starter := newTestWorkerStarter()
	starter.stopWait = make(chan struct{})

	err := runner.StartWorker("id", testWorkerStart(starter))
	c.Assert(err, IsNil)
	starter.assertStarted(c, true)
	err = runner.StopWorker("id")
	c.Assert(err, IsNil)
	err = runner.StartWorker("id", testWorkerStart(starter))
	c.Assert(err, IsNil)

	close(starter.stopWait)
	starter.assertStarted(c, false)
	// Check that the task is restarted immediately without
	// the usual restart timeout delay.
	t0 := time.Now()
	starter.assertStarted(c, true)
	restartDuration := time.Since(t0)
	if restartDuration > 1*time.Second {
		c.Fatalf("task did not restart immediately")
	}
	c.Assert(worker.Stop(runner), IsNil)
}
func (*runnerSuite) TestOneWorkerStartFatalError(c *C) {
	runner := worker.NewRunner(allFatal, noImportance)
	starter := newTestWorkerStarter()
	starter.startErr = errors.New("cannot start test task")
	err := runner.StartWorker("id", testWorkerStart(starter))
	c.Assert(err, IsNil)
	err = runner.Wait()
	c.Assert(err, Equals, starter.startErr)
}
func (*runnerSuite) TestOneWorkerStart(c *C) {
	runner := worker.NewRunner(noneFatal, noImportance)
	starter := newTestWorkerStarter()
	err := runner.StartWorker("id", testWorkerStart(starter))
	c.Assert(err, IsNil)
	starter.assertStarted(c, true)

	c.Assert(worker.Stop(runner), IsNil)
	starter.assertStarted(c, false)
}
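// The runner tests' helpers (newTestWorkerStarter, testWorkerStart,
// assertStarted) are not included in these excerpts. The sketch below is only
// an approximation of how such a helper could look, based on how the tests use
// it; the suite's real helper may differ. It assumes gocheck's *C with the
// Equals checker, the worker package shown above, and "sync"/"time".

// testWorkerStarter drives a fake worker: startNotify carries true/false
// start/stop events for assertStarted, die injects a runtime error into the
// running worker, stopWait (if set) delays shutdown until it is closed, and
// hook runs inside the start function.
type testWorkerStarter struct {
	startErr    error
	stopErr     error
	die         chan error
	stopWait    chan struct{}
	startNotify chan bool
	hook        func()
}

func newTestWorkerStarter() *testWorkerStarter {
	return &testWorkerStarter{
		die:         make(chan error),
		startNotify: make(chan bool, 100),
		hook:        func() {},
	}
}

// testWorkerStart adapts a starter into the start function expected by
// Runner.StartWorker.
func testWorkerStart(starter *testWorkerStarter) func() (worker.Worker, error) {
	return func() (worker.Worker, error) {
		if starter.startErr != nil {
			return nil, starter.startErr
		}
		w := &testWorker{
			starter: starter,
			killed:  make(chan struct{}),
			done:    make(chan struct{}),
		}
		starter.startNotify <- true
		starter.hook()
		go w.run()
		return w, nil
	}
}

// assertStarted waits for the next start (true) or stop (false) notification.
func (starter *testWorkerStarter) assertStarted(c *C, started bool) {
	select {
	case isStarted := <-starter.startNotify:
		c.Assert(isStarted, Equals, started)
	case <-time.After(5 * time.Second):
		c.Fatalf("timed out waiting for start/stop notification")
	}
}

type testWorker struct {
	starter *testWorkerStarter
	killed  chan struct{}
	done    chan struct{}
	once    sync.Once
	err     error
}

func (w *testWorker) Kill()       { w.once.Do(func() { close(w.killed) }) }
func (w *testWorker) Wait() error { <-w.done; return w.err }

func (w *testWorker) run() {
	defer close(w.done)
	select {
	case err := <-w.starter.die:
		w.err = err
	case <-w.killed:
		w.err = w.starter.stopErr
	}
	if w.starter.stopWait != nil {
		<-w.starter.stopWait
	}
	w.starter.startNotify <- false
}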
Example #5
// Init initializes the command for running.
func (a *MachineAgent) Init(args []string) error {
	if !state.IsMachineId(a.MachineId) {
		return fmt.Errorf("--machine-id option must be set, and expects a non-negative integer")
	}
	if err := a.Conf.checkArgs(args); err != nil {
		return err
	}
	a.runner = worker.NewRunner(isFatal, moreImportant)
	return nil
}
Example #6
// APIWorker returns a Worker that connects to the API and starts any
// workers that need an API connection.
//
// If a state worker is necessary, APIWorker calls ensureStateWorker.
func (a *MachineAgent) APIWorker(ensureStateWorker func()) (worker.Worker, error) {
	st, entity, err := openAPIState(a.Conf.Conf, a)
	if err != nil {
		// There was an error connecting to the API. Per
		// https://launchpad.net/bugs/1199915, we may just not have an
		// API password set, so force a state connection at this point.
		// TODO(jam): Once we can reliably trust that we have API
		//            passwords set, and we no longer need state
		//            connections (and possibly agents will be blocked
		//            from connecting directly to state), we can remove
		//            this. Currently needed because 1.10 does not set
		//            the API password and 1.11 requires it.
		ensureStateWorker()
		return nil, err
	}
	needsStateWorker := false
	for _, job := range entity.Jobs() {
		needsStateWorker = needsStateWorker || stateJobs[job]
	}
	if needsStateWorker {
		ensureStateWorker()
	}
	runner := worker.NewRunner(allFatal, moreImportant)
	// Only the machiner currently connects to the API.
	// Add other workers here as they are ready.
	runner.StartWorker("machiner", func() (worker.Worker, error) {
		return machiner.NewMachiner(st.Machiner(), a.Tag()), nil
	})
	runner.StartWorker("upgrader", func() (worker.Worker, error) {
		// TODO(rog) use id instead of *Machine (or introduce Clone method)
		return upgrader.New(st.Upgrader(), a.Tag(), a.Conf.DataDir), nil
	})
	for _, job := range entity.Jobs() {
		switch job {
		case params.JobHostUnits:
			deployerTask, err := newDeployer(st.Deployer(), a.Tag(), a.Conf.DataDir)
			if err != nil {
				return nil, err
			}
			runner.StartWorker("deployer", func() (worker.Worker, error) {
				return deployerTask, nil
			})
		case params.JobManageEnviron:
			// Not yet implemented with the API.
		case params.JobManageState:
			// Not yet implemented with the API.
		default:
			// TODO(dimitern): Once all workers moved over to using
			// the API, report "unknown job type" here.
		}
	}
	return newCloseWorker(runner, st), nil // Note: a worker.Runner is itself a worker.Worker.
}
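// newCloseWorker, used by the agent examples, is not shown in these excerpts
// either. A plausible sketch (assuming the real helper behaves along these
// lines): it wraps the runner and closes the state/API connection once the
// runner has finished, so the connection lives exactly as long as the workers
// that use it. Assumes "io" and the worker package.

// closeWorker wraps a worker and closes the given closer when the worker
// completes.
type closeWorker struct {
	worker worker.Worker
	closer io.Closer
}

func newCloseWorker(w worker.Worker, closer io.Closer) worker.Worker {
	return &closeWorker{worker: w, closer: closer}
}

func (w *closeWorker) Kill() {
	w.worker.Kill()
}

func (w *closeWorker) Wait() error {
	err := w.worker.Wait()
	// The close error is deliberately dropped; the worker's own error is
	// the interesting one for the caller.
	w.closer.Close()
	return err
}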
func (*runnerSuite) TestOneWorkerDieFatalError(c *C) {
	runner := worker.NewRunner(allFatal, noImportance)
	starter := newTestWorkerStarter()
	err := runner.StartWorker("id", testWorkerStart(starter))
	c.Assert(err, IsNil)
	starter.assertStarted(c, true)
	dieErr := errors.New("error when running")
	starter.die <- dieErr
	err = runner.Wait()
	c.Assert(err, Equals, dieErr)
	starter.assertStarted(c, false)
}
func (*runnerSuite) TestOneWorkerStopFatalError(c *C) {
	runner := worker.NewRunner(allFatal, noImportance)
	starter := newTestWorkerStarter()
	starter.stopErr = errors.New("stop error")
	err := runner.StartWorker("id", testWorkerStart(starter))
	c.Assert(err, IsNil)
	starter.assertStarted(c, true)
	err = runner.StopWorker("id")
	c.Assert(err, IsNil)
	err = runner.Wait()
	c.Assert(err, Equals, starter.stopErr)
}
Example #9
func (a *UnitAgent) APIWorkers() (worker.Worker, error) {
	st, entity, err := openAPIState(a.Conf.Conf, a)
	if err != nil {
		return nil, err
	}
	dataDir := a.Conf.DataDir
	runner := worker.NewRunner(allFatal, moreImportant)
	runner.StartWorker("upgrader", func() (worker.Worker, error) {
		return upgrader.New(st.Upgrader(), entity.Tag(), dataDir), nil
	})
	return newCloseWorker(runner, st), nil
}
Example #10
// Init initializes the command for running.
func (a *UnitAgent) Init(args []string) error {
	if a.UnitName == "" {
		return requiredError("unit-name")
	}
	if !names.IsUnit(a.UnitName) {
		return fmt.Errorf(`--unit-name option expects "<service>/<n>" argument`)
	}
	if err := a.Conf.checkArgs(args); err != nil {
		return err
	}
	a.runner = worker.NewRunner(isFatal, moreImportant)
	return nil
}
Example #11
// StateWorkers returns a worker that runs the unit agent workers.
func (a *UnitAgent) StateWorkers() (worker.Worker, error) {
	st, entity, err := openState(a.Conf.Conf, a)
	if err != nil {
		return nil, err
	}
	unit := entity.(*state.Unit)
	dataDir := a.Conf.DataDir
	runner := worker.NewRunner(allFatal, moreImportant)
	runner.StartWorker("uniter", func() (worker.Worker, error) {
		return uniter.NewUniter(st, unit.Name(), dataDir), nil
	})
	return newCloseWorker(runner, st), nil
}
Example #12
func (*runnerSuite) TestOneWorkerRestartDelay(c *C) {
	worker.RestartDelay = 100 * time.Millisecond
	runner := worker.NewRunner(noneFatal, noImportance)
	starter := newTestWorkerStarter()
	err := runner.StartWorker("id", testWorkerStart(starter))
	c.Assert(err, IsNil)
	starter.assertStarted(c, true)
	starter.die <- fmt.Errorf("non-fatal error")
	starter.assertStarted(c, false)
	t0 := time.Now()
	starter.assertStarted(c, true)
	restartDuration := time.Since(t0)
	if restartDuration < worker.RestartDelay {
		c.Fatalf("restart delay was not respected; got %v want %v", restartDuration, worker.RestartDelay)
	}
}
Example #13
func (*runnerSuite) TestErrorImportance(c *C) {
	moreImportant := func(err0, err1 error) bool {
		return err0.(errorLevel) > err1.(errorLevel)
	}
	id := func(i int) string { return fmt.Sprint(i) }
	runner := worker.NewRunner(allFatal, moreImportant)
	for i := 0; i < 10; i++ {
		starter := newTestWorkerStarter()
		starter.stopErr = errorLevel(i)
		err := runner.StartWorker(id(i), testWorkerStart(starter))
		c.Assert(err, IsNil)
	}
	err := runner.StopWorker(id(4))
	c.Assert(err, IsNil)
	err = runner.Wait()
	c.Assert(err, Equals, errorLevel(9))
}
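// errorLevel is not defined in these excerpts. TestErrorImportance needs an
// error type with a total order, so a sketch along these lines would do (the
// suite's actual definition may differ):
type errorLevel int

func (e errorLevel) Error() string {
	return fmt.Sprintf("error with importance %d", int(e))
}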
Example #14
func (*runnerSuite) TestOneWorkerRestart(c *C) {
	runner := worker.NewRunner(noneFatal, noImportance)
	starter := newTestWorkerStarter()
	err := runner.StartWorker("id", testWorkerStart(starter))
	c.Assert(err, IsNil)
	starter.assertStarted(c, true)

	// Check that it restarts a few times.
	for i := 0; i < 3; i++ {
		starter.die <- fmt.Errorf("an error")
		starter.assertStarted(c, false)
		starter.assertStarted(c, true)
	}

	c.Assert(worker.Stop(runner), IsNil)
	starter.assertStarted(c, false)
}
Example #15
func (*runnerSuite) TestAllWorkersStoppedWhenOneDiesWithFatalError(c *C) {
	runner := worker.NewRunner(allFatal, noImportance)
	var starters []*testWorkerStarter
	for i := 0; i < 10; i++ {
		starter := newTestWorkerStarter()
		err := runner.StartWorker(fmt.Sprint(i), testWorkerStart(starter))
		c.Assert(err, IsNil)
		starters = append(starters, starter)
	}
	for _, starter := range starters {
		starter.assertStarted(c, true)
	}
	dieErr := errors.New("fatal error")
	starters[4].die <- dieErr
	err := runner.Wait()
	c.Assert(err, Equals, dieErr)
	for _, starter := range starters {
		starter.assertStarted(c, false)
	}
}
Example #16
func (*runnerSuite) TestFatalErrorWhileSelfStartWorker(c *C) {
	// Original deadlock problem that this tests for:
	// A worker tries to call StartWorker in its start function
	// at the same time another worker dies with a fatal error.
	// It might not be able to send on startc.
	runner := worker.NewRunner(allFatal, noImportance)

	selfStarter := newTestWorkerStarter()
	// make the startNotify channel synchronous so
	// we can delay the start indefinitely.
	selfStarter.startNotify = make(chan bool)
	selfStarter.hook = func() {
		runner.StartWorker("another", func() (worker.Worker, error) {
			return nil, fmt.Errorf("no worker started")
		})
	}
	err := runner.StartWorker("self starter", testWorkerStart(selfStarter))
	c.Assert(err, IsNil)

	fatalStarter := newTestWorkerStarter()
	fatalStarter.startErr = fmt.Errorf("a fatal error")

	err = runner.StartWorker("fatal worker", testWorkerStart(fatalStarter))
	c.Assert(err, IsNil)

	// Wait for the runner loop to react to the fatal
	// error and go into final shutdown mode.
	time.Sleep(10 * time.Millisecond)

	// At this point, the loop is in shutdown mode, but the
	// selfStarter's worker is still in its start function.
	// When the start function continues (the first assertStarted
	// allows that to happen) it will try to create a new
	// worker. This failed in an earlier version of the code because the
	// loop was not ready to receive start requests.

	selfStarter.assertStarted(c, true)
	selfStarter.assertStarted(c, false)
	err = runner.Wait()
	c.Assert(err, Equals, fatalStarter.startErr)
}
Example #17
// Workers returns a worker that runs the unit agent workers.
func (a *UnitAgent) Workers() (worker.Worker, error) {
	st, entity, err := openState(a.Conf.Conf, a)
	if err != nil {
		return nil, err
	}
	if err := EnsureAPIInfo(a.Conf.Conf, st, entity); err != nil {
		// We suppress this error because it is probably more interesting
		// to see other failures, but we log it in case it is a root cause.
		agentLogger.Warningf("error while calling EnsureAPIInfo: %v", err)
	}
	unit := entity.(*state.Unit)
	dataDir := a.Conf.DataDir
	runner := worker.NewRunner(allFatal, moreImportant)
	runner.StartWorker("upgrader", func() (worker.Worker, error) {
		return NewUpgrader(st, unit, dataDir), nil
	})
	runner.StartWorker("uniter", func() (worker.Worker, error) {
		return uniter.NewUniter(st, unit.Name(), dataDir), nil
	})
	return newCloseWorker(runner, st), nil
}
Example #18
func (*runnerSuite) TestFatalErrorWhileStarting(c *C) {
	// Original deadlock problem that this tests for:
	// A worker dies with fatal error while another worker
	// is inside start(). runWorker can't send startInfo on startedc.
	runner := worker.NewRunner(allFatal, noImportance)

	slowStarter := newTestWorkerStarter()
	// make the startNotify channel synchronous so
	// we can delay the start indefinitely.
	slowStarter.startNotify = make(chan bool)

	err := runner.StartWorker("slow starter", testWorkerStart(slowStarter))
	c.Assert(err, IsNil)

	fatalStarter := newTestWorkerStarter()
	fatalStarter.startErr = fmt.Errorf("a fatal error")

	err = runner.StartWorker("fatal worker", testWorkerStart(fatalStarter))
	c.Assert(err, IsNil)

	// Wait for the runner loop to react to the fatal
	// error and go into final shutdown mode.
	time.Sleep(10 * time.Millisecond)

	// At this point, the loop is in shutdown mode, but the
	// slowStarter's worker is still in its start function.
	// When the start function continues (the first assertStarted
	// allows that to happen) and returns the new Worker,
	// runWorker will try to send it on runner.startedc.
	// This test makes sure that succeeds ok.

	slowStarter.assertStarted(c, true)
	slowStarter.assertStarted(c, false)
	err = runner.Wait()
	c.Assert(err, Equals, fatalStarter.startErr)
}
Example #19
// StateWorker returns a worker running all the workers that require
// a *state.State connection.
func (a *MachineAgent) StateWorker() (worker.Worker, error) {
	st, entity, err := openState(a.Conf.Conf, a)
	if err != nil {
		return nil, err
	}
	// If this fails, other bits will fail, so we just log the error and
	// let the other failures actually restart runners.
	if err := EnsureAPIInfo(a.Conf.Conf, st, entity); err != nil {
		log.Warningf("failed to EnsureAPIInfo: %v", err)
	}
	reportOpenedState(st)
	m := entity.(*state.Machine)
	// TODO(rog) use more discriminating test for errors
	// rather than taking everything down indiscriminately.
	dataDir := a.Conf.DataDir
	runner := worker.NewRunner(allFatal, moreImportant)
	runner.StartWorker("upgrader", func() (worker.Worker, error) {
		// TODO(rog) use id instead of *Machine (or introduce Clone method)
		return NewUpgrader(st, m, dataDir), nil
	})
	// At this stage, since we don't embed lxc containers, just start an lxc
	// provisioner task for non-lxc containers.  Since we have only LXC
	// containers and normal machines, this effectively means that we only
	// have an LXC provisioner when we have a normally provisioned machine
	// (through the environ-provisioner).  With the upcoming advent of KVM
	// containers, it is likely that we will want an LXC provisioner on a KVM
	// machine, and once we get nested LXC containers, we can remove this
	// check.
	providerType := os.Getenv("JUJU_PROVIDER_TYPE")
	if providerType != provider.Local && m.ContainerType() != instance.LXC {
		workerName := fmt.Sprintf("%s-provisioner", provisioner.LXC)
		runner.StartWorker(workerName, func() (worker.Worker, error) {
			return provisioner.NewProvisioner(provisioner.LXC, st, a.MachineId, dataDir), nil
		})
	}
	// Take advantage of special knowledge here in that we will only ever want
	// the storage provider on one machine, and that is the "bootstrap" node.
	if providerType == provider.Local && m.Id() == bootstrapMachineId {
		runner.StartWorker("local-storage", func() (worker.Worker, error) {
			return localstorage.NewWorker(), nil
		})
	}
	for _, job := range m.Jobs() {
		switch job {
		case state.JobHostUnits:
			runner.StartWorker("deployer", func() (worker.Worker, error) {
				return newDeployer(st, m.Id(), dataDir), nil
			})
		case state.JobManageEnviron:
			runner.StartWorker("environ-provisioner", func() (worker.Worker, error) {
				return provisioner.NewProvisioner(provisioner.ENVIRON, st, a.MachineId, dataDir), nil
			})
			runner.StartWorker("firewaller", func() (worker.Worker, error) {
				return firewaller.NewFirewaller(st), nil
			})
		case state.JobManageState:
			runner.StartWorker("apiserver", func() (worker.Worker, error) {
				// If the configuration does not have the required information,
				// it is currently not a recoverable error, so we kill the whole
				// agent, potentially enabling human intervention to fix
				// the agent's configuration file. In the future, we may retrieve
				// the state server certificate and key from the state, and
				// this should then change.
				if len(a.Conf.StateServerCert) == 0 || len(a.Conf.StateServerKey) == 0 {
					return nil, &fatalError{"configuration does not have state server cert/key"}
				}
				return apiserver.NewServer(st, fmt.Sprintf(":%d", a.Conf.APIPort), a.Conf.StateServerCert, a.Conf.StateServerKey)
			})
			runner.StartWorker("cleaner", func() (worker.Worker, error) {
				return cleaner.NewCleaner(st), nil
			})
			runner.StartWorker("resumer", func() (worker.Worker, error) {
				// The action of resumer is so subtle that it is not tested,
				// because we can't figure out how to do so without brutalising
				// the transaction log.
				return resumer.NewResumer(st), nil
			})
		default:
			log.Warningf("ignoring unknown job %q", job)
		}
	}
	return newCloseWorker(runner, st), nil
}
Example #20
func (*runnerSuite) TestStopWorkerWhenDead(c *C) {
	runner := worker.NewRunner(allFatal, noImportance)
	c.Assert(worker.Stop(runner), IsNil)
	c.Assert(runner.StopWorker("foo"), Equals, worker.ErrDead)
}
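// Taken together, these examples show the Runner pattern: register workers by
// name, let non-fatal failures trigger restarts, treat fatal errors as reason
// to stop everything, and stop the whole set as one unit. A minimal,
// hypothetical caller might look like this (newPinger is an invented
// constructor, not part of the project):
func runWorkers() error {
	runner := worker.NewRunner(allFatal, noImportance)
	if err := runner.StartWorker("pinger", func() (worker.Worker, error) {
		return newPinger(), nil // hypothetical worker constructor
	}); err != nil {
		return err
	}
	// The Runner is itself a worker.Worker: Wait blocks until a fatal error
	// occurs or worker.Stop(runner) is called, and reports the most
	// important fatal error seen.
	return runner.Wait()
}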