// Purge removes the Agent's state from the Registry
func (a *Agent) Purge() {
	// Continue heartbeating the agent's machine state while attempting to
	// stop all the locally-running jobs
	purged := make(chan bool)
	go a.heartbeatAgent(a.ttl, purged)

	// Snapshot the scheduled-job list under the lock; the actual
	// unloading below happens without holding it.
	a.state.Lock()
	scheduled := a.state.ScheduledJobs()
	a.state.Unlock()

	machID := a.Machine.State().ID

	// Unload each job locally and clear its scheduling decision in the
	// Registry so it can be picked up elsewhere.
	for _, jobName := range scheduled {
		log.Infof("Unloading Job(%s) from local machine", jobName)
		a.unloadJob(jobName)
		log.Infof("Unscheduling Job(%s) from local machine", jobName)
		a.registry.ClearJobTarget(jobName, machID)
	}

	// Jobs have been stopped, the heartbeat can stop
	close(purged)

	// Finally drop this machine's presence from the Registry; a failure
	// here is logged but not retried.
	log.Info("Removing Agent from Registry")
	if err := a.registry.RemoveMachineState(machID); err != nil {
		log.Errorf("Failed to remove Machine %s from Registry: %s", machID, err.Error())
	}
}
func (eh *EventHandler) HandleEventJobOffered(ev event.Event) { jo := ev.Payload.(job.JobOffer) log.Infof("EventJobOffered(%s): verifying ability to run Job", jo.Job.Name) if !jo.OfferedTo(eh.agent.Machine().State().ID) { log.Infof("EventJobOffered(%s): not offered to this machine", jo.Job.Name) return } eh.agent.state.Lock() defer eh.agent.state.Unlock() // Everything we check against could change over time, so we track all // offers starting here for future bidding even if we can't bid now eh.agent.state.TrackOffer(jo) eh.agent.state.TrackJob(&jo.Job) if !eh.agent.AbleToRun(&jo.Job) { log.Infof("EventJobOffered(%s): not all criteria met, not bidding", jo.Job.Name) return } log.Infof("EventJobOffered(%s): passed all criteria, submitting JobBid", jo.Job.Name) eh.agent.Bid(jo.Job.Name) }
func (eh *EventHandler) HandleEventMachineRemoved(ev event.Event) { machID := ev.Payload.(string) mutex := eh.engine.registry.LockMachine(machID, eh.engine.machine.State().ID) if mutex == nil { log.V(1).Infof("EventMachineRemoved(%s): failed to lock Machine, ignoring event", machID) return } defer mutex.Unlock() jobs := getJobsScheduledToMachine(eh.engine.registry, machID) for _, j := range jobs { log.Infof("EventMachineRemoved(%s): clearing UnitState(%s)", machID, j.Name) err := eh.engine.registry.RemoveUnitState(j.Name) if err != nil { log.Errorf("Failed removing UnitState(%s) from Registry: %v", j.Name, err) } log.Infof("EventMachineRemoved(%s): unscheduling Job(%s)", machID, j.Name) eh.engine.registry.ClearJobTarget(j.Name, machID) } for _, j := range jobs { log.Infof("EventMachineRemoved(%s): re-publishing JobOffer(%s)", machID, j.Name) eh.engine.OfferJob(j) } eh.engine.clust.machineRemoved(machID) }
// JobScheduledLocally clears all state related to the indicated
// job's offers/bids before attempting to load and possibly start
// the job. The ability to run the job will be revalidated before
// loading, and unscheduled if such validation fails.
func (a *Agent) JobScheduledLocally(jobName string) {
	a.state.Lock()
	defer a.state.Unlock()

	log.Infof("Dropping offer and bid for Job(%s) from cache", jobName)
	a.state.PurgeOffer(jobName)

	// The authoritative Job definition comes from the Registry, not the
	// local cache
	j := a.fetchJob(jobName)
	if j == nil {
		log.Errorf("Failed to fetch Job(%s)", jobName)
		return
	}

	// Revalidate: conditions may have changed since the bid was made.
	// On failure, hand the Job back to the cluster and forget it locally.
	if !a.ableToRun(j) {
		log.Infof("Unable to run locally-scheduled Job(%s), unscheduling", jobName)
		a.registry.ClearJobTarget(jobName, a.Machine.State().ID)
		a.state.PurgeJob(jobName)
		return
	}

	a.loadJob(j)

	log.Infof("Bidding for all possible peers of Job(%s)", j.Name)
	a.bidForPossiblePeers(j.Name)

	// Only start the unit if the Job's desired state is "launched";
	// a lookup error is treated the same as no target state.
	ts, _ := a.registry.GetJobTargetState(j.Name)
	if ts == nil || *ts != job.JobStateLaunched {
		return
	}

	log.Infof("Job(%s) loaded, now starting it", j.Name)
	a.startJobUnlocked(j.Name)
}
func (m *SystemdManager) removeUnit(name string) { log.Infof("Unlinking systemd unit %s from target %s", name, m.Target.Name()) link := m.getLocalPath(path.Join(m.Target.Name()+".wants", name)) syscall.Unlink(link) file := m.getLocalPath(name) log.Infof("Removing systemd unit file %s", file) syscall.Unlink(file) }
func (eh *EventHandler) HandleEventJobBidSubmitted(ev event.Event) { jb := ev.Payload.(job.JobBid) err := eh.engine.ResolveJobOffer(jb.JobName, jb.MachineID) if err == nil { log.Infof("EventJobBidSubmitted(%s): successfully scheduled Job to Machine(%s)", jb.JobName, jb.MachineID) } else { log.Infof("EventJobBidSubmitted(%s): failed to schedule Job to Machine(%s)", jb.JobName, jb.MachineID) } }
func (eh *EventHandler) HandleCommandLoadJob(ev event.Event) { jobName := ev.Payload.(string) j := eh.engine.registry.GetJob(jobName) if j == nil { log.Infof("CommandLoadJob(%s): asked to offer job that could not be found") return } log.Infof("CommandLoadJob(%s): publishing JobOffer", jobName) eh.engine.OfferJob(*j) }
func (eh *EventHandler) HandleEventJobScheduled(ev event.Event) { jobName := ev.Payload.(string) target := ev.Context.(string) if target != eh.agent.Machine.State().ID { log.Infof("EventJobScheduled(%s): Job scheduled to other Machine(%s), informing Agent", jobName, target) eh.agent.JobScheduledElsewhere(jobName) } else { log.Infof("EventJobScheduled(%s): Job scheduled here, informing Agent", jobName) eh.agent.JobScheduledLocally(jobName) } }
// JobScheduledElsewhere clears all state related to the indicated // job before bidding for all oustanding jobs that can be run locally. func (a *Agent) JobScheduledElsewhere(jobName string) { a.state.Lock() defer a.state.Unlock() log.Infof("Dropping offer and bid for Job(%s) from cache", jobName) a.state.PurgeOffer(jobName) log.Infof("Purging Job(%s) data from cache", jobName) a.state.PurgeJob(jobName) log.Infof("Checking outstanding job offers") a.bidForPossibleJobs() }
// JobUnscheduled attempts to unload the indicated job only // if it were scheduled here in the first place, otherwise // the event is ignored. If unloading is necessary, all jobs // that can be run locally will also be bid upon. func (a *Agent) JobUnscheduled(jobName string) { a.state.Lock() defer a.state.Unlock() if !a.state.ScheduledHere(jobName) { log.V(1).Infof("Job(%s) not scheduled here, ignoring", jobName) return } log.Infof("Unloading Job(%s)", jobName) a.unloadJob(jobName) log.Infof("Checking outstanding JobOffers") a.bidForPossibleJobs() }
func getConfig(flagset *flag.FlagSet, file string) (*config.Config, error) { if _, err := os.Stat(file); err != nil { glog.Infof("Config file %s does not appear to exist - ignoring") file = "" } opts := globalconf.Options{ EnvPrefix: "FLEET_", ConfigFile: file, } gconf, err := globalconf.NewWithOptions(opts) if err != nil { return nil, err } gconf.ParseSet("", flagset) cfg := config.NewConfig() cfg.Verbosity = (*flagset.Lookup("verbosity")).Value.(flag.Getter).Get().(int) cfg.EtcdServers = (*flagset.Lookup("etcd_servers")).Value.(flag.Getter).Get().(stringSlice) cfg.BootId = (*flagset.Lookup("boot_id")).Value.(flag.Getter).Get().(string) cfg.PublicIP = (*flagset.Lookup("public_ip")).Value.(flag.Getter).Get().(string) cfg.RawMetadata = (*flagset.Lookup("metadata")).Value.(flag.Getter).Get().(string) cfg.UnitPrefix = (*flagset.Lookup("unit_prefix")).Value.(flag.Getter).Get().(string) cfg.AgentTTL = (*flagset.Lookup("agent_ttl")).Value.(flag.Getter).Get().(string) return cfg, nil }
// Instruct the Agent to stop the provided Job and // all of its peers func (a *Agent) StopJob(jobName string) { log.Infof("Stopping Job(%s)", jobName) a.systemd.StopJob(jobName) a.ReportJobState(jobName, nil) a.state.Lock() reversePeers := a.state.GetJobsByPeer(jobName) a.state.DropPeersJob(jobName) a.state.DropJobConflicts(jobName) a.state.Unlock() for _, peer := range reversePeers { log.Infof("Stopping Peer(%s) of Job(%s)", peer, jobName) a.registry.StopJob(peer) } }
func (e *Engine) ResolveJobOffer(jobName string, machID string) error { log.V(1).Infof("Attempting to lock JobOffer(%s)", jobName) mutex := e.registry.LockJobOffer(jobName, e.machine.State().ID) if mutex == nil { log.V(1).Infof("Could not lock JobOffer(%s)", jobName) return errors.New("Could not lock JobOffer") } defer mutex.Unlock() log.V(1).Infof("Claimed JobOffer(%s)", jobName) err := e.registry.ResolveJobOffer(jobName) if err != nil { log.Errorf("Failed resolving JobOffer(%s): %v", jobName, err) return err } err = e.registry.ScheduleJob(jobName, machID) if err != nil { log.Errorf("Failed scheduling Job(%s): %v", jobName, err) return err } log.Infof("Scheduled Job(%s) to Machine(%s)", jobName, machID) return nil }
func (e *Engine) OfferJob(j job.Job) error { log.V(1).Infof("Attempting to lock Job(%s)", j.Name) mutex := e.registry.LockJob(j.Name, e.machine.State().ID) if mutex == nil { log.V(1).Infof("Could not lock Job(%s)", j.Name) return errors.New("Could not lock Job") } defer mutex.Unlock() log.V(1).Infof("Claimed Job(%s)", j.Name) machineIDs, err := e.partitionCluster(&j) if err != nil { log.Errorf("Failed partitioning cluster for Job(%s): %v", j.Name, err) return err } offer := job.NewOfferFromJob(j, machineIDs) err = e.registry.CreateJobOffer(offer) if err == nil { log.Infof("Published JobOffer(%s)", offer.Job.Name) } return err }
func (self *Engine) OfferJob(j job.Job) error { log.V(2).Infof("Attempting to lock Job(%s)", j.Name) mutex := self.lockJob(j.Name) if mutex == nil { log.V(1).Infof("Could not lock Job(%s)", j.Name) return errors.New("Could not lock Job") } defer mutex.Unlock() log.V(1).Infof("Claimed Job", j.Name) machineBootIds, err := self.partitionCluster(&j) if err != nil { log.Errorf("Failed partitioning cluster for Job(%s): %v", j.Name, err) return err } offer := job.NewOfferFromJob(j, machineBootIds) log.V(2).Infof("Publishing JobOffer(%s)", offer.Job.Name) self.registry.CreateJobOffer(offer) log.Infof("Published JobOffer(%s)", offer.Job.Name) return nil }
func (self *Engine) ResolveJobOffer(jobName string, machBootId string) error { log.V(2).Infof("Attempting to lock JobOffer(%s)", jobName) mutex := self.lockJobOffer(jobName) if mutex == nil { log.V(2).Infof("Could not lock JobOffer(%s)", jobName) return errors.New("Could not lock JobOffer") } defer mutex.Unlock() log.V(2).Infof("Claimed JobOffer(%s)", jobName) log.V(2).Infof("Resolving JobOffer(%s), scheduling to Machine(%s)", jobName, machBootId) err := self.registry.ResolveJobOffer(jobName) if err != nil { log.Errorf("Failed resolving JobOffer(%s): %v", jobName, err) return err } err = self.registry.ScheduleJob(jobName, machBootId) if err != nil { log.Errorf("Failed scheduling Job(%s): %v", jobName, err) return err } log.Infof("Scheduled Job(%s) to Machine(%s)", jobName, machBootId) return nil }
// stopUnit asks systemd to stop the named unit in "replace" mode,
// logging success or failure.
func (m *SystemdManager) stopUnit(name string) {
	stat, err := m.Systemd.StopUnit(name, "replace")
	if err != nil {
		log.Errorf("Failed to stop systemd unit %s: %v", name, err)
		return
	}
	log.Infof("Stopped systemd unit %s(%s)", name, stat)
}
func (eh *EventHandler) HandleEventUnitStateUpdated(ev event.Event) { jobName := ev.Context.(string) state := ev.Payload.(*unit.UnitState) if state == nil { log.Infof("EventUnitStateUpdated(%s): received nil UnitState object", jobName) state, _ = eh.agent.systemd.GetUnitState(jobName) } log.Infof("EventUnitStateUpdated(%s): pushing state (loadState=%s, activeState=%s, subState=%s) to Registry", jobName, state.LoadState, state.ActiveState, state.SubState) // FIXME: This should probably be set in the underlying event-generation code ms := eh.agent.Machine().State() state.MachineState = &ms eh.agent.ReportUnitState(jobName, state) }
// Determine if the Agent can run the provided Job func (a *Agent) ableToRun(j *job.Job) bool { if !a.verifyJob(j) { log.V(1).Infof("Failed to verify Job(%s)", j.Name) return false } requirements := j.Requirements() if len(requirements) == 0 { log.V(1).Infof("Job(%s) has no requirements", j.Name) } log.Infof("Job(%s) has requirements: %s", j.Name, requirements) metadata := j.RequiredTargetMetadata() log.V(1).Infof("Job(%s) requires machine metadata: %v", j.Name, metadata) ms := a.Machine.State() if !machine.HasMetadata(&ms, metadata) { log.Infof("Unable to run Job(%s), local Machine metadata insufficient", j.Name) return false } if tgt, ok := j.RequiredTarget(); ok && !a.Machine.State().MatchID(tgt) { log.Infof("Agent does not meet machine target requirement for Job(%s)", j.Name) return false } peers := j.Peers() if len(peers) > 0 { log.V(1).Infof("Asserting required Peers %v of Job(%s) are scheduled locally", peers, j.Name) for _, peer := range peers { if !a.peerScheduledHere(j.Name, peer) { log.Infof("Required Peer(%s) of Job(%s) is not scheduled locally", peer, j.Name) return false } } } else { log.V(1).Infof("Job(%s) has no peers to worry about", j.Name) } if conflicted, conflictedJobName := a.HasConflict(j.Name, j.Conflicts()); conflicted { log.Infof("Job(%s) has conflict with Job(%s)", j.Name, conflictedJobName) return false } return true }
// Submit a bid for the given Job func (a *Agent) bid(jobName string) { log.Infof("Submitting JobBid for Job(%s)", jobName) jb := job.NewBid(jobName, a.Machine.State().ID) a.registry.SubmitJobBid(jb) a.state.TrackBid(jb.JobName) }
// MaybeBid determines bids for the given JobOffer only if it the Agent // determines that it is able to run the JobOffer's Job func (a *Agent) MaybeBid(jo job.JobOffer) { a.state.Lock() defer a.state.Unlock() // Everything we check against could change over time, so we track all // offers starting here for future bidding even if we can't bid now a.state.TrackOffer(jo) a.state.TrackJob(&jo.Job) if !a.ableToRun(&jo.Job) { log.Infof("EventJobOffered(%s): not all criteria met, not bidding", jo.Job.Name) return } log.Infof("EventJobOffered(%s): passed all criteria, submitting JobBid", jo.Job.Name) a.bid(jo.Job.Name) }
func (m *SystemdManager) removeUnit(name string) { log.Infof("Removing systemd unit %s", name) m.Systemd.DisableUnitFiles([]string{name}, true) ufPath := getUnitFilePath(name) os.Remove(ufPath) }
// Determine if the Agent can run the provided Job func (a *Agent) AbleToRun(j *job.Job) bool { if !a.VerifyJob(j) { log.V(1).Infof("Failed to verify Job(%s)", j.Name) return false } requirements := j.Requirements() if len(requirements) == 0 { log.V(1).Infof("Job(%s) has no requirements", j.Name) return true } if log.V(1) { var reqString string for key, slice := range requirements { reqString += fmt.Sprintf("%s = [", key) for _, val := range slice { reqString += fmt.Sprintf("%s, ", val) } reqString += fmt.Sprint("] ") } log.Infof("Job(%s) has requirements: %s", j.Name, reqString) } metadata := extractMachineMetadata(requirements) log.V(1).Infof("Job(%s) requires machine metadata: %v", j.Name, metadata) if !a.machine.HasMetadata(metadata) { log.V(1).Infof("Unable to run Job(%s), local Machine metadata insufficient", j.Name) return false } bootID, ok := requirements[unit.FleetXConditionMachineBootID] if ok && len(bootID) > 0 && !a.machine.State().MatchBootID(bootID[0]) { log.V(1).Infof("Agent does not pass MachineBootID condition for Job(%s)", j.Name) return false } peers := j.Payload.Peers() if len(peers) > 0 { log.V(1).Infof("Asserting required Peers %v of Job(%s) are scheduled locally", peers, j.Name) for _, peer := range peers { if !a.peerScheduledHere(j.Name, peer) { log.V(1).Infof("Required Peer(%s) of Job(%s) is not scheduled locally", peer, j.Name) return false } } } else { log.V(2).Infof("Job(%s) has no peers to worry about", j.Name) } if conflicted, conflictedJobName := a.state.HasConflict(j.Name, j.Payload.Conflicts()); conflicted { log.V(1).Infof("Job(%s) has conflict with Job(%s)", j.Name, conflictedJobName) return false } return true }
func getConfig(flagset *flag.FlagSet, userCfgFile string) (*config.Config, error) { opts := globalconf.Options{EnvPrefix: "FLEET_"} if userCfgFile != "" { // Fail hard if a user-provided config is not usable fi, err := os.Stat(userCfgFile) if err != nil { log.Fatalf("Unable to use config file %s: %v", userCfgFile, err) } if fi.IsDir() { log.Fatalf("Provided config %s is a directory, not a file", userCfgFile) } log.Infof("Using provided config file %s", userCfgFile) opts.Filename = userCfgFile } else if _, err := os.Stat(DefaultConfigFile); err == nil { log.Infof("Using default config file %s", DefaultConfigFile) opts.Filename = DefaultConfigFile } else { log.Infof("No provided or default config file found - proceeding without") } gconf, err := globalconf.NewWithOptions(&opts) if err != nil { return nil, err } gconf.ParseSet("", flagset) cfg := config.Config{ Verbosity: (*flagset.Lookup("verbosity")).Value.(flag.Getter).Get().(int), EtcdServers: (*flagset.Lookup("etcd_servers")).Value.(flag.Getter).Get().(stringSlice), EtcdKeyPrefix: (*flagset.Lookup("etcd_key_prefix")).Value.(flag.Getter).Get().(string), PublicIP: (*flagset.Lookup("public_ip")).Value.(flag.Getter).Get().(string), RawMetadata: (*flagset.Lookup("metadata")).Value.(flag.Getter).Get().(string), AgentTTL: (*flagset.Lookup("agent_ttl")).Value.(flag.Getter).Get().(string), VerifyUnits: (*flagset.Lookup("verify_units")).Value.(flag.Getter).Get().(bool), AuthorizedKeysFile: (*flagset.Lookup("authorized_keys_file")).Value.(flag.Getter).Get().(string), } config.UpdateLoggingFlagsFromConfig(flag.CommandLine, &cfg) return &cfg, nil }
// ableToRun determines if the Agent can run the provided Job, and returns a boolean indicating // whether this is the case. There are five criteria for an Agent to be eligible to run a Job: // - Job must pass signature verification // - agent must have all of the Job's required metadata (if any) // - agent must meet the Job's machine target requirement (if any) // - agent must have all required Peers of the Job scheduled locally (if any) // - Job must not conflict with any other Jobs scheduled to the agent func (a *Agent) ableToRun(j *job.Job) bool { if !a.verifyJobSignature(j) { log.V(1).Infof("Failed to verify Job(%s)", j.Name) return false } log.Infof("Job(%s) has requirements: %s", j.Name, j.Requirements()) metadata := j.RequiredTargetMetadata() if len(metadata) == 0 { log.V(1).Infof("Job(%s) has no required machine metadata", j.Name) } else { log.V(1).Infof("Job(%s) requires machine metadata: %v", j.Name, metadata) ms := a.Machine.State() if !machine.HasMetadata(&ms, metadata) { log.Infof("Unable to run Job(%s): local Machine metadata insufficient", j.Name) return false } } if tgt, ok := j.RequiredTarget(); ok && !a.Machine.State().MatchID(tgt) { log.Infof("Unable to run Job(%s): agent does not meet machine target requirement (%s)", j.Name, tgt) return false } peers := j.Peers() if len(peers) == 0 { log.V(1).Infof("Job(%s) has no required peers", j.Name) } else { log.V(1).Infof("Job(%s) requires peers: %v", j.Name, peers) for _, peer := range peers { if !a.peerScheduledHere(j.Name, peer) { log.Infof("Unable to run Job(%s): required Peer(%s) is not scheduled locally", j.Name, peer) return false } } } if conflicted, conflictedJobName := a.HasConflict(j.Name, j.Conflicts()); conflicted { log.Infof("Unable to run Job(%s): conflict with Job(%s)", j.Name, conflictedJobName) return false } return true }
func (eh *EventHandler) HandleEventJobUnscheduled(ev event.Event) { jobName := ev.Payload.(string) target := ev.Context.(string) if target != eh.agent.Machine().State().ID { log.Infof("EventJobUnscheduled(%s): not scheduled here, ignoring ", jobName) return } eh.agent.state.Lock() defer eh.agent.state.Unlock() log.Infof("EventJobUnscheduled(%s): unloading job", jobName) eh.agent.UnloadJob(jobName) log.Infof("EventJobUnscheduled(%s): checking outstanding job offers", jobName) eh.agent.BidForPossibleJobs() }
// Inform the Registry that a Job must be rescheduled func (a *Agent) RescheduleJob(j *job.Job) { log.V(2).Infof("Stopping Job(%s)", j.Name) a.registry.UnscheduleJob(j.Name) offer := job.NewOfferFromJob(*j) log.V(2).Infof("Publishing JobOffer(%s)", offer.Job.Name) a.registry.CreateJobOffer(offer) log.Infof("Published JobOffer(%s)", offer.Job.Name) }
func (eh *EventHandler) HandleCommandUnloadJob(ev event.Event) { jobName := ev.Payload.(string) target := ev.Context.(string) if target != "" { log.Infof("CommandUnloadJob(%s): clearing scheduling decision", jobName) eh.engine.registry.ClearJobTarget(jobName, target) } }
func (eh *EventHandler) HandleEventJobDestroyed(ev event.Event) { jobName := ev.Payload.(string) eh.agent.state.Lock() defer eh.agent.state.Unlock() log.Infof("EventJobDestroyed(%s): unloading corresponding unit", jobName) eh.agent.UnloadJob(jobName) }
func (m *SystemdManager) stopUnit(name string) { log.V(1).Infof("Stopping systemd unit %s", name) m.Systemd.StopUnit(name, "replace") log.Infof("Stopped systemd unit %s", name) // go-systemd does not yet have this implemented //files := []string{name} //Systemd.DisableUnitFiles(files, true, false) }