Beispiel #1
func (self *Engine) ResolveJobOffer(jobName string, machBootId string) error {
	log.V(2).Infof("Attempting to lock JobOffer(%s)", jobName)
	mutex := self.lockJobOffer(jobName)

	if mutex == nil {
		log.V(2).Infof("Could not lock JobOffer(%s)", jobName)
		return errors.New("Could not lock JobOffer")
	defer mutex.Unlock()

	log.V(2).Infof("Claimed JobOffer(%s)", jobName)

	log.V(2).Infof("Resolving JobOffer(%s), scheduling to Machine(%s)", jobName, machBootId)
	err := self.registry.ResolveJobOffer(jobName)
	if err != nil {
		log.Errorf("Failed resolving JobOffer(%s): %v", jobName, err)
		return err

	err = self.registry.ScheduleJob(jobName, machBootId)
	if err != nil {
		log.Errorf("Failed scheduling Job(%s): %v", jobName, err)
		return err

	log.Infof("Scheduled Job(%s) to Machine(%s)", jobName, machBootId)
	return nil
Beispiel #2
func (e *Engine) ResolveJobOffer(jobName string, machID string) error {
	log.V(1).Infof("Attempting to lock JobOffer(%s)", jobName)
	mutex := e.registry.LockJobOffer(jobName, e.machine.State().ID)

	if mutex == nil {
		log.V(1).Infof("Could not lock JobOffer(%s)", jobName)
		return errors.New("Could not lock JobOffer")
	defer mutex.Unlock()

	log.V(1).Infof("Claimed JobOffer(%s)", jobName)

	err := e.registry.ResolveJobOffer(jobName)
	if err != nil {
		log.Errorf("Failed resolving JobOffer(%s): %v", jobName, err)
		return err

	err = e.registry.ScheduleJob(jobName, machID)
	if err != nil {
		log.Errorf("Failed scheduling Job(%s): %v", jobName, err)
		return err

	log.Infof("Scheduled Job(%s) to Machine(%s)", jobName, machID)
	return nil
Beispiel #3
func UpdateLoggingFlagsFromConfig(conf *Config) {
	err := flag.Lookup("v").Value.Set(strconv.Itoa(conf.Verbosity))
	if err != nil {
		glog.Errorf("Failed to apply config.Verbosity to flag.v: %v", err)

	err = flag.Lookup("logtostderr").Value.Set("true")
	if err != nil {
		glog.Errorf("Failed to set flag.logtostderr to true: %v", err)
Beispiel #4
// HandleEventJobScheduled reacts to an EventJobScheduled event. The event
// payload is read as the job name and the event context as the ID of the
// machine the job was scheduled to. Both are unchecked string type
// assertions, so any other dynamic type panics.
//
// NOTE(review): this listing has lost its closing braces and, apparently,
// several statements: no Lock call matching the deferred Unlock is visible,
// several logged steps have no code behind them, and the final `if` has an
// empty body. The comments below describe only what is visible; restore the
// missing lines from the original source before relying on this function.
func (eh *EventHandler) HandleEventJobScheduled(ev event.Event) {
	jobName := ev.Payload.(string)
	target := ev.Context.(string)

	// Releases the agent state lock on return. The acquiring call is not
	// visible in this snippet.
	defer eh.agent.state.Unlock()

	log.V(1).Infof("EventJobScheduled(%s): Dropping outstanding offers and bids", jobName)

	// The job was scheduled to a different machine than this agent's.
	if target != eh.agent.Machine().State().ID {
		log.Infof("EventJobScheduled(%s): Job not scheduled to this Agent, purging related data from cache", jobName)

		log.Infof("EventJobScheduled(%s): Checking outstanding job offers", jobName)

	log.Infof("EventJobScheduled(%s): Job scheduled to this Agent", jobName)

	j := eh.agent.FetchJob(jobName)
	if j == nil {
		log.Errorf("EventJobScheduled(%s): Failed to fetch Job", jobName)

	// NOTE(review): j is dereferenced below (j.Name); presumably the nil
	// branch above returned early in the original — confirm.
	if !eh.agent.VerifyJob(j) {
		log.Errorf("EventJobScheduled(%s): Failed to verify Job", j.Name)

	// A job this agent cannot run has its target cleared in the registry.
	if !eh.agent.AbleToRun(j) {
		log.Infof("EventJobScheduled(%s): Unable to run scheduled Job, unscheduling.", jobName)
		eh.agent.registry.ClearJobTarget(jobName, target)

	log.Infof("EventJobScheduled(%s): Loading Job", j.Name)

	log.Infof("EventJobScheduled(%s): Bidding for all possible peers of Job", j.Name)

	// The target state is read from the registry; the branch taken when it
	// is absent or not JobStateLaunched has no visible body.
	ts := eh.agent.registry.GetJobTargetState(j.Name)
	if ts == nil || *ts != job.JobStateLaunched {

	log.Infof("EventJobScheduled(%s): Starting Job", j.Name)
Beispiel #5
// Heartbeat periodically reports the Agent's machine state to the Registry
// using the given ttl. One report is attempted immediately; after that a
// report is attempted on every tick of a ticker whose period is
// ttl / refreshInterval (refreshInterval is declared outside this snippet —
// presumably 2, i.e. half the ttl; confirm). Each report is attempted up to
// three times before moving on to the next reporting interval. The stop
// channel is meant to end the reporting.
//
// NOTE(review): this listing has lost its closing braces and, apparently,
// some statements: the `if` inside the retry loop has no body, no sleep call
// is visible although a delay is computed and logged, and nothing visible
// leaves the loop in the stop case. The comments describe only what is
// visible.
func (a *Agent) Heartbeat(ttl time.Duration, stop chan bool) {
	// attempt calls f up to `attempts` times and returns the last error.
	attempt := func(attempts int, f func() error) (err error) {
		if attempts < 1 {
			return fmt.Errorf("attempts argument must be 1 or greater, got %d", attempts)

		// The amount of time the retry mechanism waits after a failed attempt
		// doubles following each failure. This is a simple exponential backoff.
		sleep := time.Second

		for i := 1; i <= attempts; i++ {
			err = f()
			// Success, or the final attempt: the body of this branch is not
			// visible (presumably it leaves the loop — confirm).
			if err == nil || i == attempts {

			sleep = sleep * 2
			log.V(2).Infof("function returned err, retrying in %v: %v", sleep, err)

		return err

	// heartbeat pushes the current machine state to the Registry with ttl.
	heartbeat := func() error {
		return a.registry.SetMachineState(a.machine.State(), ttl)

	// Explicitly heartbeat immediately to push state to the
	// Registry as quickly as possible
	if err := attempt(3, heartbeat); err != nil {
		log.Errorf("Failed heartbeat after 3 attempts: %v", err)

	interval := ttl / refreshInterval
	// time.Tick returns only the channel, so this ticker cannot be stopped.
	ticker := time.Tick(interval)
	for {
		select {
		case <-stop:
			log.V(2).Info("MachineHeartbeat exiting due to stop signal")
		case <-ticker:
			log.V(2).Info("MachineHeartbeat tick")
			if err := attempt(3, heartbeat); err != nil {
				log.Errorf("Failed heartbeat after 3 attempts: %v", err)
Beispiel #6
// HandleEventJobScheduled reacts to an EventJobScheduled event. The payload
// is read as the job name (string) and the context as a
// machine.MachineState; both are unchecked type assertions, so any other
// dynamic type panics.
//
// NOTE(review): this listing has lost its closing braces and, apparently,
// several statements: the logged steps (dropping offers, starting the job,
// bidding) have no code behind them. The comments describe only what is
// visible.
func (eh *EventHandler) HandleEventJobScheduled(ev event.Event) {
	jobName := ev.Payload.(string)
	log.V(1).Infof("EventJobScheduled(%s): Dropping outstanding offers and bids", jobName)


	// Compare the BootId the job was scheduled to with this agent's machine.
	if ev.Context.(machine.MachineState).BootId != eh.agent.Machine().State().BootId {
		log.V(1).Infof("EventJobScheduled(%s): Job not scheduled to this Agent, checking unbade offers", jobName)

	log.V(1).Infof("EventJobScheduled(%s): Job scheduled to this Agent", jobName)

	j := eh.agent.FetchJob(jobName)
	if j == nil {
		// NOTE(review): the format string has a %s verb but no argument is
		// passed; jobName is most likely missing here.
		log.Errorf("EventJobScheduled(%s): Failed to fetch Job")

	// NOTE(review): j is dereferenced below (j.Name); presumably the nil
	// branch above returned early in the original — confirm.
	if !eh.agent.AbleToRun(j) {
		log.V(1).Infof("EventJobScheduled(%s): Unable to run scheduled Job, rescheduling.", jobName)

	log.V(1).Infof("EventJobScheduled(%s): Starting Job", j.Name)

	log.V(1).Infof("EventJobScheduled(%s): Bidding for all possible peers of Job", j.Name)
Beispiel #7
func (e *Engine) OfferJob(j job.Job) error {
	log.V(1).Infof("Attempting to lock Job(%s)", j.Name)

	mutex := e.registry.LockJob(j.Name, e.machine.State().ID)
	if mutex == nil {
		log.V(1).Infof("Could not lock Job(%s)", j.Name)
		return errors.New("Could not lock Job")
	defer mutex.Unlock()

	log.V(1).Infof("Claimed Job(%s)", j.Name)

	machineIDs, err := e.partitionCluster(&j)
	if err != nil {
		log.Errorf("Failed partitioning cluster for Job(%s): %v", j.Name, err)
		return err

	offer := job.NewOfferFromJob(j, machineIDs)

	err = e.registry.CreateJobOffer(offer)
	if err == nil {
		log.Infof("Published JobOffer(%s)", offer.Job.Name)

	return err
Beispiel #8
// Publish is a long-running function that streams dbus events through
// a translation layer and on to the EventBus. It subscribes via
// m.subscriptions, hands the change channel and bus.Channel to a new event
// stream, and then loops, logging whatever arrives on the error channel.
//
// NOTE(review): this listing has lost its closing braces, and the stopchan
// case has no visible body, so as shown a stop signal does not end the loop.
// Check the original source for the intended exit.
func (m *SystemdManager) Publish(bus *event.EventBus, stopchan chan bool) {

	changechan, errchan := m.subscriptions.Subscribe()

	stream := NewEventStream()
	stream.Stream(changechan, bus.Channel)

	for true {
		select {
		case <-stopchan:
		case err := <-errchan:
			// A nil error received from the channel is logged as "N/A".
			var errString string
			if err != nil {
				errString = err.Error()
			} else {
				errString = "N/A"
			log.Errorf("Received error from dbus: err=%s", errString)

Beispiel #9
// JobScheduledLocally clears all state related to the indicated
// job's offers/bids before attempting to load and possibly start
// the job. The ability to run the job will be revalidated before
// loading, and unscheduled if such validation fails.
//
// NOTE(review): this listing has lost its closing braces and, apparently,
// several statements: no Lock call matching the deferred Unlock is visible,
// the logged steps (dropping the offer, bidding, starting) have no code
// behind them, and the final `if` has an empty body. The description above
// is the original author's; only the steps commented below are visible.
func (a *Agent) JobScheduledLocally(jobName string) {
	// Releases the agent state lock on return. The acquiring call is not
	// visible in this snippet.
	defer a.state.Unlock()

	log.Infof("Dropping offer and bid for Job(%s) from cache", jobName)

	j := a.fetchJob(jobName)
	if j == nil {
		log.Errorf("Failed to fetch Job(%s)", jobName)

	// A job that cannot run here has its target cleared in the registry,
	// using this agent's machine ID.
	if !a.ableToRun(j) {
		log.Infof("Unable to run locally-scheduled Job(%s), unscheduling", jobName)
		a.registry.ClearJobTarget(jobName, a.Machine.State().ID)


	// NOTE(review): j is dereferenced from here on (j.Name); presumably the
	// nil branch above returned early in the original — confirm.
	log.Infof("Bidding for all possible peers of Job(%s)", j.Name)

	// The error from GetJobTargetState is discarded. The branch taken when
	// the target state is absent or not JobStateLaunched has no visible body.
	ts, _ := a.registry.GetJobTargetState(j.Name)
	if ts == nil || *ts != job.JobStateLaunched {

	log.Infof("Job(%s) loaded, now starting it", j.Name)
Beispiel #10
func (m *SystemdManager) stopUnit(name string) {
	if stat, err := m.Systemd.StopUnit(name, "replace"); err != nil {
		log.Errorf("Failed to stop systemd unit %s: %v", name, err)
	} else {
		log.Infof("Stopped systemd unit %s(%s)", name, stat)
Beispiel #11
// GetAllPayloads reads the nodes under the payload prefix from etcd and
// returns them decoded as job.JobPayload values. If the etcd read fails,
// the (nil) result slice is returned without reporting the error.
//
// NOTE(review): this listing has lost its closing braces, and the `if`
// inside the loop has no visible body.
func (r *Registry) GetAllPayloads() []job.JobPayload {
	var payloads []job.JobPayload

	key := path.Join(keyPrefix, payloadPrefix)
	resp, err := r.etcd.Get(key, true, true)

	if err != nil {
		return payloads

	for _, node := range resp.Node.Nodes {
		var jp job.JobPayload
		//TODO: Handle the error generated by unmarshal
		unmarshal(node.Value, &jp)

		// NOTE(review): the result of unmarshal above is discarded, so err
		// here is still the value from the etcd Get. On this path it was
		// already found to be nil, and the check can never fire as written.
		if err != nil {

		payloads = append(payloads, jp)

	return payloads
Beispiel #12
func (self *Engine) OfferJob(j job.Job) error {
	log.V(2).Infof("Attempting to lock Job(%s)", j.Name)

	mutex := self.lockJob(j.Name)
	if mutex == nil {
		log.V(1).Infof("Could not lock Job(%s)", j.Name)
		return errors.New("Could not lock Job")
	defer mutex.Unlock()

	log.V(1).Infof("Claimed Job", j.Name)

	machineBootIds, err := self.partitionCluster(&j)
	if err != nil {
		log.Errorf("Failed partitioning cluster for Job(%s): %v", j.Name, err)
		return err

	offer := job.NewOfferFromJob(j, machineBootIds)

	log.V(2).Infof("Publishing JobOffer(%s)", offer.Job.Name)
	log.Infof("Published JobOffer(%s)", offer.Job.Name)

	return nil
Beispiel #13
// Purge removes the Agent's state from the Registry: the target of every
// job reported by a.state.ScheduledJobs() is cleared for this machine, then
// the machine state itself is removed. A failure to remove the machine
// state is only logged.
//
// NOTE(review): this listing has lost its closing braces and, apparently,
// some statements: the logged "Unloading" step has no code behind it, and
// the `purged` channel is never closed or written to in the visible code,
// although the comment below says the heartbeat can stop.
func (a *Agent) Purge() {
	// Continue heartbeating the agent's machine state while attempting to
	// stop all the locally-running jobs
	purged := make(chan bool)
	go a.heartbeatAgent(a.ttl, purged)

	scheduled := a.state.ScheduledJobs()

	machID := a.Machine.State().ID
	for _, jobName := range scheduled {
		log.Infof("Unloading Job(%s) from local machine", jobName)
		log.Infof("Unscheduling Job(%s) from local machine", jobName)
		a.registry.ClearJobTarget(jobName, machID)

	// Jobs have been stopped, the heartbeat can stop

	log.Info("Removing Agent from Registry")
	if err := a.registry.RemoveMachineState(machID); err != nil {
		log.Errorf("Failed to remove Machine %s from Registry: %s", machID, err.Error())
Beispiel #14
// UnresolvedJobOffers lists the nodes under the offer prefix in etcd and,
// for each, fetches the "object" child and decodes it as a job.JobOffer.
// If the initial listing fails, the (nil) result slice is returned without
// reporting the error.
//
// NOTE(review): this listing has lost its closing braces, and both error
// checks inside the loop have no visible body (presumably they skip the
// offending node — confirm against the original source).
func (r *Registry) UnresolvedJobOffers() []job.JobOffer {
	var offers []job.JobOffer

	key := path.Join(keyPrefix, offerPrefix)
	resp, err := r.etcd.Get(key, true, true)

	if err != nil {
		return offers

	for _, node := range resp.Node.Nodes {
		// key, resp and err below shadow the outer variables for the
		// duration of one iteration.
		key := path.Join(node.Key, "object")
		resp, err := r.etcd.Get(key, true, true)

		// The object was probably handled between when we attempted to
		// start resolving offers and when we actually tried to get it
		if err != nil {

		var jo job.JobOffer
		err = unmarshal(resp.Node.Value, &jo)
		if err != nil {

		offers = append(offers, jo)

	return offers
Beispiel #15
// HandleEventMachineRemoved reacts to the removal of a machine. The event
// payload is read as the machine ID (an unchecked string assertion). After
// locking the machine in the registry on behalf of this engine's machine,
// the unit state of every job returned by getJobsScheduledToMachine is
// removed and the job's target is cleared. A second pass over the same jobs
// logs that each offer is being re-published.
//
// NOTE(review): this listing has lost its closing braces and, apparently,
// some statements: nothing visible stops execution after a failed lock, so
// the deferred Unlock would be called on a nil mutex as shown, and the
// second loop contains only a log line.
func (eh *EventHandler) HandleEventMachineRemoved(ev event.Event) {
	machID := ev.Payload.(string)
	mutex := eh.engine.registry.LockMachine(machID, eh.engine.machine.State().ID)
	if mutex == nil {
		log.V(1).Infof("EventMachineRemoved(%s): failed to lock Machine, ignoring event", machID)
	defer mutex.Unlock()

	jobs := getJobsScheduledToMachine(eh.engine.registry, machID)

	for _, j := range jobs {
		log.Infof("EventMachineRemoved(%s): clearing UnitState(%s)", machID, j.Name)
		err := eh.engine.registry.RemoveUnitState(j.Name)
		// A failed removal is logged; the job is still unscheduled below.
		if err != nil {
			log.Errorf("Failed removing UnitState(%s) from Registry: %v", j.Name, err)

		log.Infof("EventMachineRemoved(%s): unscheduling Job(%s)", machID, j.Name)
		eh.engine.registry.ClearJobTarget(j.Name, machID)

	for _, j := range jobs {
		log.Infof("EventMachineRemoved(%s): re-publishing JobOffer(%s)", machID, j.Name)
Beispiel #16
func (m *SystemdManager) startUnit(name string) {
	log.V(1).Infof("Starting systemd unit %s", name)

	files := []string{name}
	if ok, _, err := m.Systemd.EnableUnitFiles(files, true, false); !ok {
		log.Errorf("Failed to enable systemd unit %s: %v", name, err)
	} else {
		log.V(1).Infof("Enabled systemd unit %s", name)

	if stat, err := m.Systemd.StartUnit(name, "replace"); err != nil {
		log.Errorf("Failed to start systemd unit %s: %v", name, err)
	} else {
		log.Infof("Started systemd unit %s(%s)", name, stat)
Beispiel #17
// UpdateLoggingFlagsFromConfig extracts the logging-related options from
// the provided config and sets flags in the given flagset: flag "v" is set
// to conf.Verbosity and flag "logtostderr" is set to "true". Failures from
// Set are logged, not returned.
//
// NOTE(review): this listing has lost its closing braces, and both branches
// of the final verbosity check have no visible body. Also, Lookup returns
// nil for a flag that is not defined in flagset, which would panic on the
// .Value access — confirm callers always register "v" and "logtostderr".
func UpdateLoggingFlagsFromConfig(flagset *flag.FlagSet, conf *Config) {
	err := flagset.Lookup("v").Value.Set(strconv.Itoa(conf.Verbosity))
	if err != nil {
		glog.Errorf("Failed to apply config.Verbosity to flag.v: %v", err)

	err = flagset.Lookup("logtostderr").Value.Set("true")
	if err != nil {
		glog.Errorf("Failed to set flag.logtostderr to true: %v", err)

	if conf.Verbosity > 2 {
	} else {
Beispiel #18
// UpdateLoggingFlagsFromConfig extracts the logging-related options from
// the provided config and sets flags in the given flagset
func UpdateLoggingFlagsFromConfig(flagset *flag.FlagSet, conf *Config) {
	err := flagset.Lookup("v").Value.Set(strconv.Itoa(conf.Verbosity))
	if err != nil {
		glog.Errorf("Failed to apply config.Verbosity to flag.v: %v", err)

	err = flagset.Lookup("logtostderr").Value.Set("true")
	if err != nil {
		glog.Errorf("Failed to set flag.logtostderr to true: %v", err)

	if conf.Verbosity > 2 {
		etcd.SetLogger(log.New(os.Stdout, "go-etcd", log.LstdFlags))
	} else {
		etcd.SetLogger(log.New(ioutil.Discard, "go-etcd", log.LstdFlags))
Beispiel #19
// CreateJobOffer serializes the given JobOffer with marshal and stores it
// in etcd under the offer prefix at <job name>/object, with a TTL argument
// of 0. Nothing is returned, and the result of the etcd Set is ignored.
//
// NOTE(review): this listing has lost its closing braces, and the body of
// the marshal error check is not visible. As shown, a marshal failure would
// still fall through to the Set — confirm against the original source.
func (r *Registry) CreateJobOffer(jo *job.JobOffer) {
	key := path.Join(keyPrefix, offerPrefix, jo.Job.Name, "object")
	// The local name json would shadow an imported encoding/json package
	// inside this function, if the file imports one.
	json, err := marshal(jo)
	if err != nil {
	r.etcd.Set(key, json, 0)
Beispiel #20
// checkVersion makes a best-effort attempt to verify that fleetctl is at least as new as the
// latest fleet version found registered in the cluster. If any errors are encountered or fleetctl
// is >= the latest version found, it returns true. If it is < the latest found version, it returns
// false and a scary warning to the user.
func checkVersion() (string, bool) {
	fv := version.SemVersion
	lv, err := registryCtl.GetLatestVersion()
	if err != nil {
		log.Errorf("error attempting to check latest fleet version in Registry: %v", err)
	} else if lv != nil && fv.LessThan(*lv) {
		return fmt.Sprintf(oldVersionWarning, fv.String(), lv.String()), false
	return "", true
Beispiel #21
// loadJob hands the given Job to systemd without acquiring the
// state mutex. The caller is responsible for acquiring it.
// The job's target state is first recorded as JobStateLoaded, and the
// unit's state is reported through ReportUnitState at the end.
//
// NOTE(review): this listing is garbled. The call expressions on the two
// `err :=` lines are missing (only ", j.Unit)" and nothing remain), and
// the closing braces are lost. The code below does not parse as shown;
// restore it from the original source.
func (a *Agent) loadJob(j *job.Job) {
	log.Infof("Loading Job(%s)", j.Name)
	a.state.SetTargetState(j.Name, job.JobStateLoaded)
	err :=, j.Unit)
	if err != nil {
		log.Errorf("Failed loading Job(%s): %v", j.Name, err)

	// We must explicitly refresh the payload state, as the dbus
	// event listener does not send an event when we write a unit
	// file to disk.
	us, err :=
	if err != nil {
		log.Errorf("Failed fetching state of Unit(%s): %v", j.Name, err)
	a.ReportUnitState(j.Name, us)
Beispiel #22
// Persist the state of the given Job into the Registry
func (a *Agent) ReportUnitState(jobName string, us *unit.UnitState) {
	if us == nil {
		err := a.registry.RemoveUnitState(jobName)
		if err != nil {
			log.Errorf("Failed to remove UnitState for job %s from Registry: %s", jobName, err.Error())
	} else {
		a.registry.SaveUnitState(jobName, us)
Beispiel #23
func (r *EtcdRegistry) getJobFromModel(jm jobModel) *job.Job {
	var err error
	var unit *unit.Unit

	// New-style Jobs should have a populated UnitHash, and the contents of the Unit are stored separately in the Registry
	if !jm.UnitHash.Empty() {
		unit = r.getUnitByHash(jm.UnitHash)
		if unit == nil {
			log.Warningf("No Unit found in Registry for Job(%s)", jm.Name)
			return nil
		if unit.Hash() != jm.UnitHash {
			log.Errorf("Unit Hash %s does not match expected %s for Job(%s)!", unit.Hash(), jm.UnitHash, jm.Name)
			return nil
		log.V(2).Infof("Got Unit for Job(%s) from registry", jm.Name)
	} else {
		// Old-style Jobs had "Payloads" instead of Units, also stored separately in the Registry
		log.V(2).Infof("Legacy Job(%s) has no PayloadHash - looking for associated Payload", jm.Name)
		unit, err = r.getUnitFromLegacyPayload(jm.Name)
		if err != nil {
			log.Errorf("Error retrieving legacy payload for Job(%s)", jm.Name)
			return nil
		} else if unit == nil {
			log.Warningf("No Payload found in Registry for Job(%s)", jm.Name)
			return nil

		log.Infof("Migrating legacy Payload(%s)", jm.Name)
		if err := r.storeOrGetUnit(*unit); err != nil {
			log.Warningf("Unable to migrate legacy Payload: %v", err)

	j := job.NewJob(jm.Name, *unit)

	j.UnitState = r.getUnitState(jm.Name)
	j.State = r.determineJobState(jm.Name)

	return j
Beispiel #24
func (r *EtcdRegistry) GetJobTargetState(jobName string) (*job.JobState, error) {
	key := r.jobTargetStatePath(jobName)
	resp, err := r.etcd.Get(key, false, false)
	if err != nil {
		if err.(*goetcd.EtcdError).ErrorCode != etcd.EcodeNodeExist {
			log.Errorf("Unable to determine target-state of Job(%s): %v", jobName, err)
		return nil, err

	return job.ParseJobState(resp.Node.Value), nil
Beispiel #25
func (j *Job) resourceFromKey(resKey string) int {
	valStr, ok := j.Requirements()[resKey]
	if ok && len(valStr) > 0 {
		val, err := strconv.Atoi(valStr[0])
		if err != nil {
			log.Errorf("failed to parse resource requirement %s from %s: %v", resKey, j.Name, err)
			return 0
		return val
	return 0
Beispiel #26
// getUnitByHash retrieves from the Registry the Unit associated with the given Hash
func (r *EtcdRegistry) getUnitByHash(hash unit.Hash) *unit.Unit {
	key := r.hashedUnitPath(hash)
	resp, err := r.etcd.Get(key, false, true)
	if err != nil {
		return nil
	var u unit.Unit
	if err := unmarshal(resp.Node.Value, &u); err != nil {
		log.Errorf("Error unmarshaling Unit(%s): %v", hash, err)
		return nil
	return &u
Beispiel #27
func (m *SystemdManager) stopUnit(name string) {
	log.V(1).Infof("Stopping systemd unit %s", name)

	if stat, err := m.Systemd.StopUnit(name, "replace"); err != nil {
		log.Errorf("Failed to stop systemd unit %s: %v", name, err)
	} else {
		log.Infof("Stopped systemd unit %s(%s)", name, stat)

	// go-systemd does not yet have this implemented
	//files := []string{name}
	//Systemd.DisableUnitFiles(files, true, false)
Beispiel #28
// Check is called during the handshake to check the server's public key for
// unexpected changes. The key argument is in SSH wire format. It can be parsed
// using ssh.ParsePublicKey. The address before DNS resolution is passed in the
// addr argument, so the key can also be checked against the hostname.
// It returns any error encountered while checking the public key. A nil return
// value indicates that the key was either successfully verified (against an
// existing known_hosts entry), or accepted by the user as a new key.
//
// In the visible code addr is not used; matching is done on the host:port
// derived from remote. ErrUnmatchKey is returned when keys were found but
// none matched, and ErrUntrustHost when the user declines a new key.
//
// NOTE(review): this listing has lost its closing braces, and the body of
// the `if !matchHost(...)` check is not visible (presumably it skips the
// pattern — confirm against the original source).
func (kc *HostKeyChecker) Check(addr string, remote net.Addr, key gossh.PublicKey) error {
	remoteAddr, err := kc.addrToHostPort(remote.String())
	if err != nil {
		return err

	algoStr := algoString(key.Type())
	keyFingerprintStr := md5String(md5.Sum(key.Marshal()))

	// An *os.PathError from reading the host keys is not logged; any other
	// error is. In both cases execution continues with whatever hostKeys
	// was returned.
	hostKeys, err := kc.m.GetHostKeys()
	_, ok := err.(*os.PathError)
	if err != nil && !ok {
		log.Errorf("Failed to read known_hosts file %v: %v", kc.m.String(), err)

	mismatched := false
	for pattern, keys := range hostKeys {
		if !matchHost(remoteAddr, pattern) {
		for _, hostKey := range keys {
			// Any matching key is considered a success, irrespective of previous failures
			if hostKey.Type() == key.Type() && bytes.Compare(hostKey.Marshal(), key.Marshal()) == 0 {
				return nil
			// TODO(jonboulle): could be super friendly like the OpenSSH client
			// and note exactly which key failed (file + line number)
			mismatched = true

	if mismatched {
		fmt.Fprintf(os.Stderr, warningRemoteHostChanged, algoStr, keyFingerprintStr, kc.m.String())
		return ErrUnmatchKey

	// If we get this far, we haven't matched on any of the hostname patterns,
	// so it's considered a new key. Prompt the user to trust it.
	if !kc.trustHost(remoteAddr, algoStr, keyFingerprintStr) {
		fmt.Fprintln(os.Stderr, "Host key verification failed.")
		return ErrUntrustHost

	// Failing to persist the accepted key is reported on stderr, but the
	// check still returns nil.
	if err := kc.m.PutHostKey(remoteAddr, key); err != nil {
		fmt.Fprintf(os.Stderr, "Failed to add the host to the list of known hosts (%v).\n", kc.m)
		return nil

	fmt.Fprintf(os.Stderr, "Warning: Permanently added '%v' (%v) to the list of known hosts.\n", remoteAddr, algoStr)
	return nil
Beispiel #29
// ReportUnitState attaches the current state of the Agent's Machine to the given
// unit.UnitState object, then persists that state in the Registry
func (a *Agent) ReportUnitState(jobName string, us *unit.UnitState) {
	if us == nil {
		log.V(1).Infof("Job(%s): purging UnitState from Registry", jobName)
		err := a.registry.RemoveUnitState(jobName)
		if err != nil {
			log.Errorf("Failed to remove UnitState for job %s from Registry: %s", jobName, err.Error())
	} else {
		ms := a.Machine.State()
		us.MachineState = &ms
		log.V(1).Infof("Job(%s): pushing UnitState (loadState=%s, activeState=%s, subState=%s) to Registry", jobName, us.LoadState, us.ActiveState, us.SubState)
		a.registry.SaveUnitState(jobName, us)
Beispiel #30
// stopJobUnlocked stops the indicated Job without acquiring the state
// mutex. The caller is responsible for acquiring it.
// The job's target state is first recorded as JobStateLoaded. The remaining
// work happens in a goroutine, which reports the unit's state through
// ReportUnitState.
//
// NOTE(review): this listing is garbled. The call expression on the
// `us, err :=` line is missing, the statement that actually stops the unit
// is not visible, and the closing braces and the goroutine invocation are
// lost. The code below does not parse as shown; restore it from the
// original source.
func (a *Agent) stopJobUnlocked(jobName string) {
	a.state.SetTargetState(jobName, job.JobStateLoaded)

	go func() {

		// We must explicitly refresh the payload state, as the dbus
		// event listener sends a nil event when a unit deactivates.
		us, err :=
		if err != nil {
			log.Errorf("Failed fetching state of Unit(%s): %v", jobName, err)
		a.ReportUnitState(jobName, us)