Пример #1
0
// Purge removes the Agent's state from the Registry
func (a *Agent) Purge() {
	// Continue heartbeating the agent's machine state while attempting to
	// stop all the locally-running jobs
	purged := make(chan bool)
	go a.heartbeatAgent(a.ttl, purged)

	a.state.Lock()
	scheduled := a.state.ScheduledJobs()
	a.state.Unlock()

	machID := a.Machine.State().ID
	for _, jobName := range scheduled {
		log.Infof("Unloading Job(%s) from local machine", jobName)
		a.unloadJob(jobName)
		log.Infof("Unscheduling Job(%s) from local machine", jobName)
		a.registry.ClearJobTarget(jobName, machID)
	}

	// Jobs have been stopped, the heartbeat can stop
	close(purged)

	log.Info("Removing Agent from Registry")
	if err := a.registry.RemoveMachineState(machID); err != nil {
		log.Errorf("Failed to remove Machine %s from Registry: %s", machID, err.Error())
	}
}
Пример #2
0
func (eh *EventHandler) handleLossOfMachineEvents(ev event.Event) {
	machID := ev.Payload.(string)
	mutex := eh.engine.registry.LockMachine(machID, eh.engine.machine.State().ID)
	if mutex == nil {
		log.V(1).Infof("%s(%s): failed to lock Machine, ignoring event", ev.Type, machID)
		return
	}
	defer mutex.Unlock()

	jobs := getJobsScheduledToMachine(eh.engine.registry, machID)

	for _, j := range jobs {
		log.Infof("%s(%s): clearing UnitState(%s)", ev.Type, machID, j.Name)
		err := eh.engine.registry.RemoveUnitState(j.Name)
		if err != nil {
			log.Errorf("Failed removing UnitState(%s) from Registry: %v", j.Name, err)
		}

		log.Infof("%s(%s): unscheduling Job(%s)", ev.Type, machID, j.Name)
		eh.engine.registry.ClearJobTarget(j.Name, machID)
	}

	for _, j := range jobs {
		log.Infof("%s(%s): re-publishing JobOffer(%s)", ev.Type, machID, j.Name)
		eh.engine.OfferJob(j)
	}
	eh.engine.clust.machineRemoved(machID)
}
Пример #3
0
func (s *Server) Run() {
	log.Infof("Establishing etcd connectivity")

	var err error
	for sleep := time.Second; ; sleep = pkg.ExpBackoff(sleep, time.Minute) {
		_, err = s.hrt.Beat(s.mon.TTL)
		if err == nil {
			break
		}
		time.Sleep(sleep)
	}

	log.Infof("Starting server components")

	s.stop = make(chan bool)

	go s.Monitor()
	go s.api.Available(s.stop)
	go s.mach.PeriodicRefresh(machineStateRefreshInterval, s.stop)
	go s.agent.Heartbeat(s.stop)
	go s.aReconciler.Run(s.agent, s.stop)
	go s.engine.Run(s.engineReconcileInterval, s.stop)

	beatchan := make(chan *unit.UnitStateHeartbeat)
	go s.usGen.Run(beatchan, s.stop)
	go s.usPub.Run(beatchan, s.stop)
}
Пример #4
0
func (eh *EventHandler) HandleEventJobBidSubmitted(ev event.Event) {
	jb := ev.Payload.(job.JobBid)

	err := eh.engine.ResolveJobOffer(jb.JobName, jb.MachineID)
	if err == nil {
		log.Infof("EventJobBidSubmitted(%s): successfully scheduled Job to Machine(%s)", jb.JobName, jb.MachineID)
	} else {
		log.Infof("EventJobBidSubmitted(%s): failed to schedule Job to Machine(%s)", jb.JobName, jb.MachineID)
	}
}
Пример #5
0
func (eh *EventHandler) HandleCommandLoadJob(ev event.Event) {
	jobName := ev.Payload.(string)

	j, _ := eh.engine.registry.Job(jobName)
	if j == nil {
		log.Infof("CommandLoadJob(%s): asked to offer job that could not be found")
		return
	}

	log.Infof("CommandLoadJob(%s): publishing JobOffer", jobName)
	eh.engine.OfferJob(*j)
}
Пример #6
0
func (eh *EventHandler) HandleEventJobScheduled(ev event.Event) {
	jobName := ev.Payload.(string)
	target := ev.Context.(string)

	if target != eh.agent.Machine.State().ID {
		log.Infof("EventJobScheduled(%s): Job scheduled to other Machine(%s), informing Agent", jobName, target)
		eh.agent.JobScheduledElsewhere(jobName)
	} else {
		log.Infof("EventJobScheduled(%s): Job scheduled here, informing Agent", jobName)
		eh.agent.JobScheduledLocally(jobName)
	}
}
Пример #7
0
func getConfig(flagset *flag.FlagSet, userCfgFile string) (*config.Config, error) {
	opts := globalconf.Options{EnvPrefix: "FLEET_"}

	if userCfgFile != "" {
		// Fail hard if a user-provided config is not usable
		fi, err := os.Stat(userCfgFile)
		if err != nil {
			log.Fatalf("Unable to use config file %s: %v", userCfgFile, err)
		}
		if fi.IsDir() {
			log.Fatalf("Provided config %s is a directory, not a file", userCfgFile)
		}

		log.Infof("Using provided config file %s", userCfgFile)
		opts.Filename = userCfgFile

	} else if _, err := os.Stat(DefaultConfigFile); err == nil {
		log.Infof("Using default config file %s", DefaultConfigFile)
		opts.Filename = DefaultConfigFile
	} else {
		log.Infof("No provided or default config file found - proceeding without")
	}

	gconf, err := globalconf.NewWithOptions(&opts)
	if err != nil {
		return nil, err
	}

	gconf.ParseSet("", flagset)

	cfg := config.Config{
		Verbosity:          (*flagset.Lookup("verbosity")).Value.(flag.Getter).Get().(int),
		EtcdServers:        (*flagset.Lookup("etcd_servers")).Value.(flag.Getter).Get().(stringSlice),
		EtcdKeyPrefix:      (*flagset.Lookup("etcd_key_prefix")).Value.(flag.Getter).Get().(string),
		EtcdKeyFile:        (*flagset.Lookup("etcd_keyfile")).Value.(flag.Getter).Get().(string),
		EtcdCertFile:       (*flagset.Lookup("etcd_certfile")).Value.(flag.Getter).Get().(string),
		EtcdCAFile:         (*flagset.Lookup("etcd_cafile")).Value.(flag.Getter).Get().(string),
		PublicIP:           (*flagset.Lookup("public_ip")).Value.(flag.Getter).Get().(string),
		RawMetadata:        (*flagset.Lookup("metadata")).Value.(flag.Getter).Get().(string),
		AgentTTL:           (*flagset.Lookup("agent_ttl")).Value.(flag.Getter).Get().(string),
		VerifyUnits:        (*flagset.Lookup("verify_units")).Value.(flag.Getter).Get().(bool),
		AuthorizedKeysFile: (*flagset.Lookup("authorized_keys_file")).Value.(flag.Getter).Get().(string),
	}

	if cfg.VerifyUnits {
		log.Warning("WARNING: The signed/verified units feature is DEPRECATED and should not be used. It will be completely removed from fleet and fleetctl.")
	}

	config.UpdateLoggingFlagsFromConfig(flag.CommandLine, &cfg)

	return &cfg, nil
}
Пример #8
0
// JobScheduledElsewhere clears all state related to the indicated
// job before bidding for all oustanding jobs that can be run locally.
func (a *Agent) JobScheduledElsewhere(jobName string) {
	a.state.Lock()
	defer a.state.Unlock()

	log.Infof("Dropping offer and bid for Job(%s) from cache", jobName)
	a.state.PurgeOffer(jobName)

	log.Infof("Purging Job(%s) data from cache", jobName)
	a.state.PurgeJob(jobName)

	log.Infof("Checking outstanding job offers")
	a.bidForPossibleJobs()
}
Пример #9
0
// ReportUnitState attaches the current state of the Agent's Machine to the given
// unit.UnitState object, then persists that state in the Registry
func (a *Agent) ReportUnitState(jobName string, us *unit.UnitState) {
	if us == nil {
		log.Infof("Job(%s): purging UnitState from Registry", jobName)
		err := a.registry.RemoveUnitState(jobName)
		if err != nil {
			log.Errorf("Failed to remove UnitState for job %s from Registry: %s", jobName, err.Error())
		}
	} else {
		ms := a.Machine.State()
		us.MachineState = &ms
		log.Infof("Job(%s): pushing UnitState (loadState=%s, activeState=%s, subState=%s) to Registry", jobName, us.LoadState, us.ActiveState, us.SubState)
		a.registry.SaveUnitState(jobName, us)
	}
}
Пример #10
0
// Resolve attempts to yield a result from the configured action and endpoint. If a usable
// Result or error was not attained, nil values are returned.
func (ar *actionResolver) Resolve(cancel <-chan bool) (*Result, error) {
	resp, body, err := ar.exhaust(cancel)
	if err != nil {
		log.Infof("Failed getting response from %v: %v", ar.endpoint, err)
		return nil, nil
	}

	hdlr, ok := handlers[resp.StatusCode]
	if !ok {
		log.Infof("Response %s from %v unusable", resp.Status, ar.endpoint)
		return nil, nil
	}

	return hdlr(resp, body)
}
Пример #11
0
// JobUnscheduled attempts to unload the indicated job only
// if it were scheduled here in the first place, otherwise
// the event is ignored. If unloading is necessary, all jobs
// that can be run locally will also be bid upon.
func (a *Agent) JobUnscheduled(jobName string) {
	a.state.Lock()
	defer a.state.Unlock()

	if !a.state.ScheduledHere(jobName) {
		log.V(1).Infof("Job(%s) not scheduled here, ignoring", jobName)
		return
	}

	log.Infof("Unloading Job(%s)", jobName)
	a.unloadJob(jobName)

	log.Infof("Checking outstanding JobOffers")
	a.bidForPossibleJobs()
}
Пример #12
0
func (e *Engine) ResolveJobOffer(jobName string, machID string) error {
	log.V(1).Infof("Attempting to lock JobOffer(%s)", jobName)
	mutex := e.registry.LockJobOffer(jobName, e.machine.State().ID)

	if mutex == nil {
		log.V(1).Infof("Could not lock JobOffer(%s)", jobName)
		return errors.New("could not lock JobOffer")
	}
	defer mutex.Unlock()

	log.V(1).Infof("Claimed JobOffer(%s)", jobName)

	err := e.registry.ResolveJobOffer(jobName)
	if err != nil {
		log.Errorf("Failed resolving JobOffer(%s): %v", jobName, err)
		return err
	}

	err = e.registry.ScheduleJob(jobName, machID)
	if err != nil {
		log.Errorf("Failed scheduling Job(%s): %v", jobName, err)
		return err
	}

	log.Infof("Scheduled Job(%s) to Machine(%s)", jobName, machID)
	return nil
}
Пример #13
0
func (e *Engine) OfferJob(j job.Job) error {
	log.V(1).Infof("Attempting to lock Job(%s)", j.Name)

	mutex := e.registry.LockJob(j.Name, e.machine.State().ID)
	if mutex == nil {
		log.V(1).Infof("Could not lock Job(%s)", j.Name)
		return errors.New("could not lock Job")
	}
	defer mutex.Unlock()

	log.V(1).Infof("Claimed Job(%s)", j.Name)

	machineIDs, err := e.partitionCluster(&j)
	if err != nil {
		log.Errorf("failed partitioning cluster for Job(%s): %v", j.Name, err)
		return err
	}

	offer := job.NewOfferFromJob(j, machineIDs)

	err = e.registry.CreateJobOffer(offer)
	if err == nil {
		log.Infof("Published JobOffer(%s)", offer.Job.Name)
	}

	return err
}
Пример #14
0
func (m *SystemdUnitManager) stopUnit(name string) {
	if stat, err := m.systemd.StopUnit(name, "replace"); err != nil {
		log.Errorf("Failed to stop systemd unit %s: %v", name, err)
	} else {
		log.Infof("Stopped systemd unit %s(%s)", name, stat)
	}
}
Пример #15
0
func (r *EtcdRegistry) getJobFromModel(jm jobModel) *job.Job {
	var err error
	var unit *unit.Unit

	// New-style Jobs should have a populated UnitHash, and the contents of the Unit are stored separately in the Registry
	if !jm.UnitHash.Empty() {
		unit = r.getUnitByHash(jm.UnitHash)
		if unit == nil {
			log.Warningf("No Unit found in Registry for Job(%s)", jm.Name)
			return nil
		}
		if unit.Hash() != jm.UnitHash {
			log.Errorf("Unit Hash %s does not match expected %s for Job(%s)!", unit.Hash(), jm.UnitHash, jm.Name)
			return nil
		}
	} else {
		// Old-style Jobs had "Payloads" instead of Units, also stored separately in the Registry
		unit, err = r.getUnitFromLegacyPayload(jm.Name)
		if err != nil {
			log.Errorf("Error retrieving legacy payload for Job(%s)", jm.Name)
			return nil
		} else if unit == nil {
			log.Warningf("No Payload found in Registry for Job(%s)", jm.Name)
			return nil
		}

		log.Infof("Migrating legacy Payload(%s)", jm.Name)
		if err := r.storeOrGetUnit(*unit); err != nil {
			log.Warningf("Unable to migrate legacy Payload: %v", err)
		}
	}

	return job.NewJob(jm.Name, *unit)
}
Пример #16
0
// Submit a bid for the given Job
func (a *Agent) bid(jobName string) {
	log.Infof("Submitting JobBid for Job(%s)", jobName)

	jb := job.NewBid(jobName, a.Machine.State().ID)
	a.registry.SubmitJobBid(jb)

	a.state.TrackBid(jb.JobName)
}
Пример #17
0
// MaybeBid bids for the given JobOffer only if the Agent determines that it is able
// to run the JobOffer's Job
func (a *Agent) MaybeBid(jo job.JobOffer) {
	a.state.Lock()
	defer a.state.Unlock()

	// Everything we check against could change over time, so we track all
	// offers starting here for future bidding even if we can't bid now
	a.state.TrackOffer(jo)
	a.state.TrackJob(&jo.Job)

	if !a.ableToRun(&jo.Job) {
		log.Infof("EventJobOffered(%s): not all criteria met, not bidding", jo.Job.Name)
		return
	}

	log.Infof("EventJobOffered(%s): passed all criteria, submitting JobBid", jo.Job.Name)
	a.bid(jo.Job.Name)
}
Пример #18
0
func (m *SystemdUnitManager) removeUnit(name string) {
	log.Infof("Removing systemd unit %s", name)

	m.systemd.DisableUnitFiles([]string{name}, true)

	ufPath := m.getUnitFilePath(name)
	os.Remove(ufPath)
}
Пример #19
0
func (r *EtcdRegistry) getJobFromObjectNode(node *etcd.Node) (*job.Job, error) {
	var err error
	var jm jobModel
	if err = unmarshal(node.Value, &jm); err != nil {
		return nil, err
	}

	var unit *unit.Unit

	// New-style Jobs should have a populated UnitHash, and the contents of the Unit are stored separately in the Registry
	if !jm.UnitHash.Empty() {
		unit = r.getUnitByHash(jm.UnitHash)
		if unit == nil {
			log.Warningf("No Unit found in Registry for Job(%s)", jm.Name)
			return nil, nil
		}
		if unit.Hash() != jm.UnitHash {
			log.Errorf("Unit Hash %s does not match expected %s for Job(%s)!", unit.Hash(), jm.UnitHash, jm.Name)
			return nil, nil
		}
	} else {
		// Old-style Jobs had "Payloads" instead of Units, also stored separately in the Registry
		unit, err = r.getUnitFromLegacyPayload(jm.Name)
		if err != nil {
			log.Errorf("Error retrieving legacy payload for Job(%s)", jm.Name)
			return nil, nil
		} else if unit == nil {
			log.Warningf("No Payload found in Registry for Job(%s)", jm.Name)
			return nil, nil
		}

		log.Infof("Migrating legacy Payload(%s)", jm.Name)
		if err := r.storeOrGetUnit(*unit); err != nil {
			log.Warningf("Unable to migrate legacy Payload: %v", err)
		}

		jm.UnitHash = unit.Hash()
		log.Infof("Updating Job(%s) with legacy payload Hash(%s)", jm.Name, jm.UnitHash)
		if err := r.updateJobObjectNode(&jm, node.ModifiedIndex); err != nil {
			log.Warningf("Unable to update Job(%s) with legacy payload Hash(%s): %v", jm.Name, jm.UnitHash, err)
		}
	}

	return job.NewJob(jm.Name, *unit), nil
}
Пример #20
0
// ableToRun determines if the Agent can run the provided Job, and returns a boolean indicating
// whether this is the case. There are five criteria for an Agent to be eligible to run a Job:
//   - Job must pass signature verification
//   - agent must have all of the Job's required metadata (if any)
//   - agent must meet the Job's machine target requirement (if any)
//   - agent must have all required Peers of the Job scheduled locally (if any)
//   - Job must not conflict with any other Jobs scheduled to the agent
func (a *Agent) ableToRun(j *job.Job) bool {
	if !a.verifyJobSignature(j) {
		log.V(1).Infof("Failed to verify Job(%s)", j.Name)
		return false
	}

	log.Infof("Job(%s) has requirements: %s", j.Name, j.Requirements())

	metadata := j.RequiredTargetMetadata()
	if len(metadata) == 0 {
		log.V(1).Infof("Job(%s) has no required machine metadata", j.Name)
	} else {
		log.V(1).Infof("Job(%s) requires machine metadata: %v", j.Name, metadata)
		ms := a.Machine.State()
		if !machine.HasMetadata(&ms, metadata) {
			log.Infof("Unable to run Job(%s): local Machine metadata insufficient", j.Name)
			return false
		}
	}

	if tgt, ok := j.RequiredTarget(); ok && !a.Machine.State().MatchID(tgt) {
		log.Infof("Unable to run Job(%s): agent does not meet machine target requirement (%s)", j.Name, tgt)
		return false
	}

	peers := j.Peers()
	if len(peers) == 0 {
		log.V(1).Infof("Job(%s) has no required peers", j.Name)
	} else {
		log.V(1).Infof("Job(%s) requires peers: %v", j.Name, peers)
		for _, peer := range peers {
			if !a.peerScheduledHere(j.Name, peer) {
				log.Infof("Unable to run Job(%s): required Peer(%s) is not scheduled locally", j.Name, peer)
				return false
			}
		}
	}

	if conflicted, conflictedJobName := a.HasConflict(j.Name, j.Conflicts()); conflicted {
		log.Infof("Unable to run Job(%s): conflict with Job(%s)", j.Name, conflictedJobName)
		return false
	}

	return true
}
Пример #21
0
func (e *Engine) resolveJobOffer(jName string) (err error) {
	err = e.registry.ResolveJobOffer(jName)
	if err != nil {
		log.Errorf("Failed resolving JobOffer(%s): %v", jName, err)
	} else {
		log.Infof("Resolved JobOffer(%s)", jName)
	}
	return
}
Пример #22
0
func (e *Engine) unscheduleJob(jName, machID string) (err error) {
	err = e.registry.ClearJobTarget(jName, machID)
	if err != nil {
		log.Errorf("Failed clearing target Machine(%s) of Job(%s): %v", machID, jName, err)
	} else {
		log.Infof("Unscheduled Job(%s) from Machine(%s)", jName, machID)
	}
	return
}
Пример #23
0
func (eh *EventHandler) HandleCommandUnloadJob(ev event.Event) {
	jobName := ev.Payload.(string)
	target := ev.Context.(string)

	if target != "" {
		log.Infof("CommandUnloadJob(%s): clearing scheduling decision", jobName)
		eh.engine.registry.ClearJobTarget(jobName, target)
	}
}
Пример #24
0
func (e *Engine) offerJob(j *job.Job) (err error) {
	offer := job.NewOfferFromJob(*j)
	err = e.registry.CreateJobOffer(offer)
	if err != nil {
		log.Errorf("Failed publishing JobOffer(%s): %v", j.Name, err)
	} else {
		log.Infof("Published JobOffer(%s)", j.Name)
	}
	return
}
Пример #25
0
func (eh *EventHandler) HandleEventJobOffered(ev event.Event) {
	jo := ev.Payload.(job.JobOffer)

	if !jo.OfferedTo(eh.agent.Machine.State().ID) {
		log.V(1).Infof("EventJobOffered(%s): not offered to this machine, ignoring", jo.Job.Name)
		return
	}

	log.Infof("EventJobOffered(%s): deciding whether to bid or not", jo.Job.Name)
	eh.agent.MaybeBid(jo)
}
Пример #26
0
func (eh *EventHandler) HandleEventUnitStateUpdated(ev event.Event) {
	jobName := ev.Context.(string)
	state := ev.Payload.(*unit.UnitState)

	if state == nil {
		log.V(1).Infof("EventUnitStateUpdated(%s): received nil UnitState object, ignoring", jobName)
		return
	}

	log.Infof("EventUnitStateUpdated(%s): reporting state to Registry", jobName)
	eh.agent.ReportUnitState(jobName, state)
}
Пример #27
0
func (eh *EventHandler) HandleCommandStopJob(ev event.Event) {
	jobName := ev.Payload.(string)
	target := ev.Context.(string)

	if target != eh.agent.Machine.State().ID {
		log.V(1).Infof("CommandStopJob(%s): scheduled elsewhere, ignoring", jobName)
		return
	}

	log.Infof("CommandStopJob(%s): instructing Agent to stop Job", jobName)
	eh.agent.StopJob(jobName)
}
Пример #28
0
func (m *SystemdUnitManager) writeUnit(name string, contents string) error {
	log.Infof("Writing systemd unit %s", name)

	ufPath := m.getUnitFilePath(name)
	err := ioutil.WriteFile(ufPath, []byte(contents), os.FileMode(0644))
	if err != nil {
		return err
	}

	_, err = m.systemd.LinkUnitFiles([]string{ufPath}, true, true)
	return err
}
Пример #29
0
// JobScheduledLocally clears all state related to the indicated
// job's offers/bids before attempting to load and possibly start
// the job. The ability to run the job will be revalidated before
// loading, and unscheduled if such validation fails.
func (a *Agent) JobScheduledLocally(jobName string) {
	a.state.Lock()
	defer a.state.Unlock()

	log.Infof("Dropping offer and bid for Job(%s) from cache", jobName)
	a.state.PurgeOffer(jobName)

	j, err := a.registry.Job(jobName)
	if err != nil {
		log.Errorf("Failed fetching Job(%s) from Registry: %v", jobName, err)
		return
	}

	if j == nil {
		log.Errorf("Unable to find Job(%s) in Registry", jobName)
		return
	}

	if !a.ableToRun(j) {
		log.Infof("Unable to run locally-scheduled Job(%s), unscheduling", jobName)
		a.registry.ClearJobTarget(jobName, a.Machine.State().ID)
		a.state.PurgeJob(jobName)
		return
	}

	a.loadJob(j)

	log.Infof("Bidding for all possible peers of Job(%s)", j.Name)
	a.bidForPossiblePeers(j.Name)

	if j.TargetState == nil || *j.TargetState != job.JobStateLaunched {
		return
	}

	log.Infof("Job(%s) loaded, now starting it", j.Name)
	a.startJobUnlocked(j.Name)
}
Пример #30
0
// Jobs lists all Jobs known by the Registry, ordered by job name
func (r *EtcdRegistry) Jobs() ([]job.Job, error) {
	var jobs []job.Job

	req := etcd.Get{
		Key:       path.Join(r.keyPrefix, jobPrefix),
		Sorted:    true,
		Recursive: true,
	}

	resp, err := r.etcd.Do(&req)
	if err != nil {
		if isKeyNotFound(err) {
			err = nil
		}
		return jobs, err
	}

	for _, dir := range resp.Node.Nodes {
		objKey := path.Join(dir.Key, "object")
		var obj *etcd.Node
		for _, node := range dir.Nodes {
			if node.Key != objKey {
				continue
			}
			node := node
			obj = &node
		}

		if obj == nil {
			continue
		}

		j, err := r.getJobFromObjectNode(obj)
		if j == nil || err != nil {
			log.Infof("Unable to parse Job in Registry at key %s", obj.Key)
			continue
		}

		if err = r.parseJobDir(j, &dir); err != nil {
			log.Errorf("Failed to parse Job(%s) model: %v", j.Name, err)
			continue
		}

		jobs = append(jobs, *j)
	}

	return jobs, nil
}