// Purge removes the Agent's state from the Registry func (a *Agent) Purge() { // Continue heartbeating the agent's machine state while attempting to // stop all the locally-running jobs purged := make(chan bool) go a.heartbeatAgent(a.ttl, purged) a.state.Lock() scheduled := a.state.ScheduledJobs() a.state.Unlock() machID := a.Machine.State().ID for _, jobName := range scheduled { log.Infof("Unloading Job(%s) from local machine", jobName) a.unloadJob(jobName) log.Infof("Unscheduling Job(%s) from local machine", jobName) a.registry.ClearJobTarget(jobName, machID) } // Jobs have been stopped, the heartbeat can stop close(purged) log.Info("Removing Agent from Registry") if err := a.registry.RemoveMachineState(machID); err != nil { log.Errorf("Failed to remove Machine %s from Registry: %s", machID, err.Error()) } }
func (eh *EventHandler) handleLossOfMachineEvents(ev event.Event) { machID := ev.Payload.(string) mutex := eh.engine.registry.LockMachine(machID, eh.engine.machine.State().ID) if mutex == nil { log.V(1).Infof("%s(%s): failed to lock Machine, ignoring event", ev.Type, machID) return } defer mutex.Unlock() jobs := getJobsScheduledToMachine(eh.engine.registry, machID) for _, j := range jobs { log.Infof("%s(%s): clearing UnitState(%s)", ev.Type, machID, j.Name) err := eh.engine.registry.RemoveUnitState(j.Name) if err != nil { log.Errorf("Failed removing UnitState(%s) from Registry: %v", j.Name, err) } log.Infof("%s(%s): unscheduling Job(%s)", ev.Type, machID, j.Name) eh.engine.registry.ClearJobTarget(j.Name, machID) } for _, j := range jobs { log.Infof("%s(%s): re-publishing JobOffer(%s)", ev.Type, machID, j.Name) eh.engine.OfferJob(j) } eh.engine.clust.machineRemoved(machID) }
func (s *Server) Run() { log.Infof("Establishing etcd connectivity") var err error for sleep := time.Second; ; sleep = pkg.ExpBackoff(sleep, time.Minute) { _, err = s.hrt.Beat(s.mon.TTL) if err == nil { break } time.Sleep(sleep) } log.Infof("Starting server components") s.stop = make(chan bool) go s.Monitor() go s.api.Available(s.stop) go s.mach.PeriodicRefresh(machineStateRefreshInterval, s.stop) go s.agent.Heartbeat(s.stop) go s.aReconciler.Run(s.agent, s.stop) go s.engine.Run(s.engineReconcileInterval, s.stop) beatchan := make(chan *unit.UnitStateHeartbeat) go s.usGen.Run(beatchan, s.stop) go s.usPub.Run(beatchan, s.stop) }
func (eh *EventHandler) HandleEventJobBidSubmitted(ev event.Event) { jb := ev.Payload.(job.JobBid) err := eh.engine.ResolveJobOffer(jb.JobName, jb.MachineID) if err == nil { log.Infof("EventJobBidSubmitted(%s): successfully scheduled Job to Machine(%s)", jb.JobName, jb.MachineID) } else { log.Infof("EventJobBidSubmitted(%s): failed to schedule Job to Machine(%s)", jb.JobName, jb.MachineID) } }
func (eh *EventHandler) HandleCommandLoadJob(ev event.Event) { jobName := ev.Payload.(string) j, _ := eh.engine.registry.Job(jobName) if j == nil { log.Infof("CommandLoadJob(%s): asked to offer job that could not be found") return } log.Infof("CommandLoadJob(%s): publishing JobOffer", jobName) eh.engine.OfferJob(*j) }
func (eh *EventHandler) HandleEventJobScheduled(ev event.Event) { jobName := ev.Payload.(string) target := ev.Context.(string) if target != eh.agent.Machine.State().ID { log.Infof("EventJobScheduled(%s): Job scheduled to other Machine(%s), informing Agent", jobName, target) eh.agent.JobScheduledElsewhere(jobName) } else { log.Infof("EventJobScheduled(%s): Job scheduled here, informing Agent", jobName) eh.agent.JobScheduledLocally(jobName) } }
func getConfig(flagset *flag.FlagSet, userCfgFile string) (*config.Config, error) { opts := globalconf.Options{EnvPrefix: "FLEET_"} if userCfgFile != "" { // Fail hard if a user-provided config is not usable fi, err := os.Stat(userCfgFile) if err != nil { log.Fatalf("Unable to use config file %s: %v", userCfgFile, err) } if fi.IsDir() { log.Fatalf("Provided config %s is a directory, not a file", userCfgFile) } log.Infof("Using provided config file %s", userCfgFile) opts.Filename = userCfgFile } else if _, err := os.Stat(DefaultConfigFile); err == nil { log.Infof("Using default config file %s", DefaultConfigFile) opts.Filename = DefaultConfigFile } else { log.Infof("No provided or default config file found - proceeding without") } gconf, err := globalconf.NewWithOptions(&opts) if err != nil { return nil, err } gconf.ParseSet("", flagset) cfg := config.Config{ Verbosity: (*flagset.Lookup("verbosity")).Value.(flag.Getter).Get().(int), EtcdServers: (*flagset.Lookup("etcd_servers")).Value.(flag.Getter).Get().(stringSlice), EtcdKeyPrefix: (*flagset.Lookup("etcd_key_prefix")).Value.(flag.Getter).Get().(string), EtcdKeyFile: (*flagset.Lookup("etcd_keyfile")).Value.(flag.Getter).Get().(string), EtcdCertFile: (*flagset.Lookup("etcd_certfile")).Value.(flag.Getter).Get().(string), EtcdCAFile: (*flagset.Lookup("etcd_cafile")).Value.(flag.Getter).Get().(string), PublicIP: (*flagset.Lookup("public_ip")).Value.(flag.Getter).Get().(string), RawMetadata: (*flagset.Lookup("metadata")).Value.(flag.Getter).Get().(string), AgentTTL: (*flagset.Lookup("agent_ttl")).Value.(flag.Getter).Get().(string), VerifyUnits: (*flagset.Lookup("verify_units")).Value.(flag.Getter).Get().(bool), AuthorizedKeysFile: (*flagset.Lookup("authorized_keys_file")).Value.(flag.Getter).Get().(string), } if cfg.VerifyUnits { log.Warning("WARNING: The signed/verified units feature is DEPRECATED and should not be used. 
It will be completely removed from fleet and fleetctl.") } config.UpdateLoggingFlagsFromConfig(flag.CommandLine, &cfg) return &cfg, nil }
// JobScheduledElsewhere clears all state related to the indicated // job before bidding for all oustanding jobs that can be run locally. func (a *Agent) JobScheduledElsewhere(jobName string) { a.state.Lock() defer a.state.Unlock() log.Infof("Dropping offer and bid for Job(%s) from cache", jobName) a.state.PurgeOffer(jobName) log.Infof("Purging Job(%s) data from cache", jobName) a.state.PurgeJob(jobName) log.Infof("Checking outstanding job offers") a.bidForPossibleJobs() }
// ReportUnitState attaches the current state of the Agent's Machine to the given // unit.UnitState object, then persists that state in the Registry func (a *Agent) ReportUnitState(jobName string, us *unit.UnitState) { if us == nil { log.Infof("Job(%s): purging UnitState from Registry", jobName) err := a.registry.RemoveUnitState(jobName) if err != nil { log.Errorf("Failed to remove UnitState for job %s from Registry: %s", jobName, err.Error()) } } else { ms := a.Machine.State() us.MachineState = &ms log.Infof("Job(%s): pushing UnitState (loadState=%s, activeState=%s, subState=%s) to Registry", jobName, us.LoadState, us.ActiveState, us.SubState) a.registry.SaveUnitState(jobName, us) } }
// Resolve attempts to yield a result from the configured action and endpoint. If a usable // Result or error was not attained, nil values are returned. func (ar *actionResolver) Resolve(cancel <-chan bool) (*Result, error) { resp, body, err := ar.exhaust(cancel) if err != nil { log.Infof("Failed getting response from %v: %v", ar.endpoint, err) return nil, nil } hdlr, ok := handlers[resp.StatusCode] if !ok { log.Infof("Response %s from %v unusable", resp.Status, ar.endpoint) return nil, nil } return hdlr(resp, body) }
// JobUnscheduled attempts to unload the indicated job only // if it were scheduled here in the first place, otherwise // the event is ignored. If unloading is necessary, all jobs // that can be run locally will also be bid upon. func (a *Agent) JobUnscheduled(jobName string) { a.state.Lock() defer a.state.Unlock() if !a.state.ScheduledHere(jobName) { log.V(1).Infof("Job(%s) not scheduled here, ignoring", jobName) return } log.Infof("Unloading Job(%s)", jobName) a.unloadJob(jobName) log.Infof("Checking outstanding JobOffers") a.bidForPossibleJobs() }
func (e *Engine) ResolveJobOffer(jobName string, machID string) error { log.V(1).Infof("Attempting to lock JobOffer(%s)", jobName) mutex := e.registry.LockJobOffer(jobName, e.machine.State().ID) if mutex == nil { log.V(1).Infof("Could not lock JobOffer(%s)", jobName) return errors.New("could not lock JobOffer") } defer mutex.Unlock() log.V(1).Infof("Claimed JobOffer(%s)", jobName) err := e.registry.ResolveJobOffer(jobName) if err != nil { log.Errorf("Failed resolving JobOffer(%s): %v", jobName, err) return err } err = e.registry.ScheduleJob(jobName, machID) if err != nil { log.Errorf("Failed scheduling Job(%s): %v", jobName, err) return err } log.Infof("Scheduled Job(%s) to Machine(%s)", jobName, machID) return nil }
func (e *Engine) OfferJob(j job.Job) error { log.V(1).Infof("Attempting to lock Job(%s)", j.Name) mutex := e.registry.LockJob(j.Name, e.machine.State().ID) if mutex == nil { log.V(1).Infof("Could not lock Job(%s)", j.Name) return errors.New("could not lock Job") } defer mutex.Unlock() log.V(1).Infof("Claimed Job(%s)", j.Name) machineIDs, err := e.partitionCluster(&j) if err != nil { log.Errorf("failed partitioning cluster for Job(%s): %v", j.Name, err) return err } offer := job.NewOfferFromJob(j, machineIDs) err = e.registry.CreateJobOffer(offer) if err == nil { log.Infof("Published JobOffer(%s)", offer.Job.Name) } return err }
// stopUnit asks systemd to stop the named unit using the "replace" mode and
// logs the outcome. Failures are logged, not returned.
func (m *SystemdUnitManager) stopUnit(name string) {
	stat, err := m.systemd.StopUnit(name, "replace")
	if err != nil {
		log.Errorf("Failed to stop systemd unit %s: %v", name, err)
		return
	}

	log.Infof("Stopped systemd unit %s(%s)", name, stat)
}
func (r *EtcdRegistry) getJobFromModel(jm jobModel) *job.Job { var err error var unit *unit.Unit // New-style Jobs should have a populated UnitHash, and the contents of the Unit are stored separately in the Registry if !jm.UnitHash.Empty() { unit = r.getUnitByHash(jm.UnitHash) if unit == nil { log.Warningf("No Unit found in Registry for Job(%s)", jm.Name) return nil } if unit.Hash() != jm.UnitHash { log.Errorf("Unit Hash %s does not match expected %s for Job(%s)!", unit.Hash(), jm.UnitHash, jm.Name) return nil } } else { // Old-style Jobs had "Payloads" instead of Units, also stored separately in the Registry unit, err = r.getUnitFromLegacyPayload(jm.Name) if err != nil { log.Errorf("Error retrieving legacy payload for Job(%s)", jm.Name) return nil } else if unit == nil { log.Warningf("No Payload found in Registry for Job(%s)", jm.Name) return nil } log.Infof("Migrating legacy Payload(%s)", jm.Name) if err := r.storeOrGetUnit(*unit); err != nil { log.Warningf("Unable to migrate legacy Payload: %v", err) } } return job.NewJob(jm.Name, *unit) }
// Submit a bid for the given Job func (a *Agent) bid(jobName string) { log.Infof("Submitting JobBid for Job(%s)", jobName) jb := job.NewBid(jobName, a.Machine.State().ID) a.registry.SubmitJobBid(jb) a.state.TrackBid(jb.JobName) }
// MaybeBid bids for the given JobOffer only if the Agent determines that it is able // to run the JobOffer's Job func (a *Agent) MaybeBid(jo job.JobOffer) { a.state.Lock() defer a.state.Unlock() // Everything we check against could change over time, so we track all // offers starting here for future bidding even if we can't bid now a.state.TrackOffer(jo) a.state.TrackJob(&jo.Job) if !a.ableToRun(&jo.Job) { log.Infof("EventJobOffered(%s): not all criteria met, not bidding", jo.Job.Name) return } log.Infof("EventJobOffered(%s): passed all criteria, submitting JobBid", jo.Job.Name) a.bid(jo.Job.Name) }
func (m *SystemdUnitManager) removeUnit(name string) { log.Infof("Removing systemd unit %s", name) m.systemd.DisableUnitFiles([]string{name}, true) ufPath := m.getUnitFilePath(name) os.Remove(ufPath) }
func (r *EtcdRegistry) getJobFromObjectNode(node *etcd.Node) (*job.Job, error) { var err error var jm jobModel if err = unmarshal(node.Value, &jm); err != nil { return nil, err } var unit *unit.Unit // New-style Jobs should have a populated UnitHash, and the contents of the Unit are stored separately in the Registry if !jm.UnitHash.Empty() { unit = r.getUnitByHash(jm.UnitHash) if unit == nil { log.Warningf("No Unit found in Registry for Job(%s)", jm.Name) return nil, nil } if unit.Hash() != jm.UnitHash { log.Errorf("Unit Hash %s does not match expected %s for Job(%s)!", unit.Hash(), jm.UnitHash, jm.Name) return nil, nil } } else { // Old-style Jobs had "Payloads" instead of Units, also stored separately in the Registry unit, err = r.getUnitFromLegacyPayload(jm.Name) if err != nil { log.Errorf("Error retrieving legacy payload for Job(%s)", jm.Name) return nil, nil } else if unit == nil { log.Warningf("No Payload found in Registry for Job(%s)", jm.Name) return nil, nil } log.Infof("Migrating legacy Payload(%s)", jm.Name) if err := r.storeOrGetUnit(*unit); err != nil { log.Warningf("Unable to migrate legacy Payload: %v", err) } jm.UnitHash = unit.Hash() log.Infof("Updating Job(%s) with legacy payload Hash(%s)", jm.Name, jm.UnitHash) if err := r.updateJobObjectNode(&jm, node.ModifiedIndex); err != nil { log.Warningf("Unable to update Job(%s) with legacy payload Hash(%s): %v", jm.Name, jm.UnitHash, err) } } return job.NewJob(jm.Name, *unit), nil }
// ableToRun determines if the Agent can run the provided Job, and returns a boolean indicating // whether this is the case. There are five criteria for an Agent to be eligible to run a Job: // - Job must pass signature verification // - agent must have all of the Job's required metadata (if any) // - agent must meet the Job's machine target requirement (if any) // - agent must have all required Peers of the Job scheduled locally (if any) // - Job must not conflict with any other Jobs scheduled to the agent func (a *Agent) ableToRun(j *job.Job) bool { if !a.verifyJobSignature(j) { log.V(1).Infof("Failed to verify Job(%s)", j.Name) return false } log.Infof("Job(%s) has requirements: %s", j.Name, j.Requirements()) metadata := j.RequiredTargetMetadata() if len(metadata) == 0 { log.V(1).Infof("Job(%s) has no required machine metadata", j.Name) } else { log.V(1).Infof("Job(%s) requires machine metadata: %v", j.Name, metadata) ms := a.Machine.State() if !machine.HasMetadata(&ms, metadata) { log.Infof("Unable to run Job(%s): local Machine metadata insufficient", j.Name) return false } } if tgt, ok := j.RequiredTarget(); ok && !a.Machine.State().MatchID(tgt) { log.Infof("Unable to run Job(%s): agent does not meet machine target requirement (%s)", j.Name, tgt) return false } peers := j.Peers() if len(peers) == 0 { log.V(1).Infof("Job(%s) has no required peers", j.Name) } else { log.V(1).Infof("Job(%s) requires peers: %v", j.Name, peers) for _, peer := range peers { if !a.peerScheduledHere(j.Name, peer) { log.Infof("Unable to run Job(%s): required Peer(%s) is not scheduled locally", j.Name, peer) return false } } } if conflicted, conflictedJobName := a.HasConflict(j.Name, j.Conflicts()); conflicted { log.Infof("Unable to run Job(%s): conflict with Job(%s)", j.Name, conflictedJobName) return false } return true }
func (e *Engine) resolveJobOffer(jName string) (err error) { err = e.registry.ResolveJobOffer(jName) if err != nil { log.Errorf("Failed resolving JobOffer(%s): %v", jName, err) } else { log.Infof("Resolved JobOffer(%s)", jName) } return }
func (e *Engine) unscheduleJob(jName, machID string) (err error) { err = e.registry.ClearJobTarget(jName, machID) if err != nil { log.Errorf("Failed clearing target Machine(%s) of Job(%s): %v", machID, jName, err) } else { log.Infof("Unscheduled Job(%s) from Machine(%s)", jName, machID) } return }
func (eh *EventHandler) HandleCommandUnloadJob(ev event.Event) { jobName := ev.Payload.(string) target := ev.Context.(string) if target != "" { log.Infof("CommandUnloadJob(%s): clearing scheduling decision", jobName) eh.engine.registry.ClearJobTarget(jobName, target) } }
func (e *Engine) offerJob(j *job.Job) (err error) { offer := job.NewOfferFromJob(*j) err = e.registry.CreateJobOffer(offer) if err != nil { log.Errorf("Failed publishing JobOffer(%s): %v", j.Name, err) } else { log.Infof("Published JobOffer(%s)", j.Name) } return }
func (eh *EventHandler) HandleEventJobOffered(ev event.Event) { jo := ev.Payload.(job.JobOffer) if !jo.OfferedTo(eh.agent.Machine.State().ID) { log.V(1).Infof("EventJobOffered(%s): not offered to this machine, ignoring", jo.Job.Name) return } log.Infof("EventJobOffered(%s): deciding whether to bid or not", jo.Job.Name) eh.agent.MaybeBid(jo) }
func (eh *EventHandler) HandleEventUnitStateUpdated(ev event.Event) { jobName := ev.Context.(string) state := ev.Payload.(*unit.UnitState) if state == nil { log.V(1).Infof("EventUnitStateUpdated(%s): received nil UnitState object, ignoring", jobName) return } log.Infof("EventUnitStateUpdated(%s): reporting state to Registry", jobName) eh.agent.ReportUnitState(jobName, state) }
func (eh *EventHandler) HandleCommandStopJob(ev event.Event) { jobName := ev.Payload.(string) target := ev.Context.(string) if target != eh.agent.Machine.State().ID { log.V(1).Infof("CommandStopJob(%s): scheduled elsewhere, ignoring", jobName) return } log.Infof("CommandStopJob(%s): instructing Agent to stop Job", jobName) eh.agent.StopJob(jobName) }
func (m *SystemdUnitManager) writeUnit(name string, contents string) error { log.Infof("Writing systemd unit %s", name) ufPath := m.getUnitFilePath(name) err := ioutil.WriteFile(ufPath, []byte(contents), os.FileMode(0644)) if err != nil { return err } _, err = m.systemd.LinkUnitFiles([]string{ufPath}, true, true) return err }
// JobScheduledLocally clears all state related to the indicated // job's offers/bids before attempting to load and possibly start // the job. The ability to run the job will be revalidated before // loading, and unscheduled if such validation fails. func (a *Agent) JobScheduledLocally(jobName string) { a.state.Lock() defer a.state.Unlock() log.Infof("Dropping offer and bid for Job(%s) from cache", jobName) a.state.PurgeOffer(jobName) j, err := a.registry.Job(jobName) if err != nil { log.Errorf("Failed fetching Job(%s) from Registry: %v", jobName, err) return } if j == nil { log.Errorf("Unable to find Job(%s) in Registry", jobName) return } if !a.ableToRun(j) { log.Infof("Unable to run locally-scheduled Job(%s), unscheduling", jobName) a.registry.ClearJobTarget(jobName, a.Machine.State().ID) a.state.PurgeJob(jobName) return } a.loadJob(j) log.Infof("Bidding for all possible peers of Job(%s)", j.Name) a.bidForPossiblePeers(j.Name) if j.TargetState == nil || *j.TargetState != job.JobStateLaunched { return } log.Infof("Job(%s) loaded, now starting it", j.Name) a.startJobUnlocked(j.Name) }
// Jobs lists all Jobs known by the Registry, ordered by job name func (r *EtcdRegistry) Jobs() ([]job.Job, error) { var jobs []job.Job req := etcd.Get{ Key: path.Join(r.keyPrefix, jobPrefix), Sorted: true, Recursive: true, } resp, err := r.etcd.Do(&req) if err != nil { if isKeyNotFound(err) { err = nil } return jobs, err } for _, dir := range resp.Node.Nodes { objKey := path.Join(dir.Key, "object") var obj *etcd.Node for _, node := range dir.Nodes { if node.Key != objKey { continue } node := node obj = &node } if obj == nil { continue } j, err := r.getJobFromObjectNode(obj) if j == nil || err != nil { log.Infof("Unable to parse Job in Registry at key %s", obj.Key) continue } if err = r.parseJobDir(j, &dir); err != nil { log.Errorf("Failed to parse Job(%s) model: %v", j.Name, err) continue } jobs = append(jobs, *j) } return jobs, nil }