func (self *Engine) OfferJob(j job.Job) error {
	log.V(2).Infof("Attempting to lock Job(%s)", j.Name)
	mutex := self.lockJob(j.Name)
	if mutex == nil {
		log.V(1).Infof("Could not lock Job(%s)", j.Name)
		return errors.New("Could not lock Job")
	}
	defer mutex.Unlock()

	log.V(1).Infof("Claimed Job(%s)", j.Name)

	machineBootIds, err := self.partitionCluster(&j)
	if err != nil {
		log.Errorf("Failed partitioning cluster for Job(%s): %v", j.Name, err)
		return err
	}

	offer := job.NewOfferFromJob(j, machineBootIds)

	log.V(2).Infof("Publishing JobOffer(%s)", offer.Job.Name)
	self.registry.CreateJobOffer(offer)
	log.Infof("Published JobOffer(%s)", offer.Job.Name)

	return nil
}
func (e *Engine) OfferJob(j job.Job) error {
	log.V(1).Infof("Attempting to lock Job(%s)", j.Name)
	mutex := e.registry.LockJob(j.Name, e.machine.State().ID)
	if mutex == nil {
		log.V(1).Infof("Could not lock Job(%s)", j.Name)
		return errors.New("Could not lock Job")
	}
	defer mutex.Unlock()

	log.V(1).Infof("Claimed Job(%s)", j.Name)

	machineIDs, err := e.partitionCluster(&j)
	if err != nil {
		log.Errorf("Failed partitioning cluster for Job(%s): %v", j.Name, err)
		return err
	}

	offer := job.NewOfferFromJob(j, machineIDs)

	err = e.registry.CreateJobOffer(offer)
	if err == nil {
		log.Infof("Published JobOffer(%s)", offer.Job.Name)
	}

	return err
}
// ParseFilepath expands ~ and ~user constructions.
// If user or $HOME is unknown, do nothing.
func ParseFilepath(path string) string {
	if !strings.HasPrefix(path, "~") {
		return path
	}
	i := strings.Index(path, "/")
	if i < 0 {
		i = len(path)
	}
	var home string
	if i == 1 {
		if home = os.Getenv("HOME"); home == "" {
			usr, err := user.Current()
			if err != nil {
				log.V(1).Infof("Failed to get current home directory: %v", err)
				return path
			}
			home = usr.HomeDir
		}
	} else {
		usr, err := user.Lookup(path[1:i])
		if err != nil {
			log.V(1).Infof("Failed to get %v's home directory: %v", path[1:i], err)
			return path
		}
		home = usr.HomeDir
	}
	return filepath.Join(home, path[i:])
}
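// ExampleParseFilepath is a hypothetical, godoc-style sketch of ParseFilepath
// above (not part of the original source). It assumes the "fmt" and "os"
// imports and that no user named "nosuchuser" exists on the host.
func ExampleParseFilepath() {
	os.Setenv("HOME", "/home/core")
	fmt.Println(ParseFilepath("~/foo.service")) // ~ expands via $HOME
	fmt.Println(ParseFilepath("/etc/fstab"))    // no leading ~, returned unchanged
	fmt.Println(ParseFilepath("~nosuchuser/x")) // unknown user, returned unchanged
	// Output:
	// /home/core/foo.service
	// /etc/fstab
	// ~nosuchuser/x
}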
func runUnloadUnit(args []string) (exit int) {
	jobs, err := findJobs(args)
	if err != nil {
		fmt.Fprintf(os.Stderr, "%v\n", err)
		return 1
	}

	wait := make([]string, 0)
	for _, j := range jobs {
		if j.State == nil {
			fmt.Fprintf(os.Stderr, "Unable to determine state of %q\n", j.Name)
			return 1
		}

		if *(j.State) == job.JobStateInactive {
			log.V(1).Infof("Job(%s) already %s, skipping.", j.Name, job.JobStateInactive)
			continue
		}

		log.V(1).Infof("Unloading Job(%s)", j.Name)
		registryCtl.SetJobTargetState(j.Name, job.JobStateInactive)
		wait = append(wait, j.Name)
	}

	if !sharedFlags.NoBlock {
		errchan := waitForJobStates(wait, job.JobStateInactive, sharedFlags.BlockAttempts, os.Stdout)
		for err := range errchan {
			fmt.Fprintf(os.Stderr, "%v\n", err)
			exit = 1
		}
	}

	return
}
func (self *Engine) ResolveJobOffer(jobName string, machBootId string) error {
	log.V(2).Infof("Attempting to lock JobOffer(%s)", jobName)
	mutex := self.lockJobOffer(jobName)
	if mutex == nil {
		log.V(2).Infof("Could not lock JobOffer(%s)", jobName)
		return errors.New("Could not lock JobOffer")
	}
	defer mutex.Unlock()

	log.V(2).Infof("Claimed JobOffer(%s)", jobName)

	log.V(2).Infof("Resolving JobOffer(%s), scheduling to Machine(%s)", jobName, machBootId)
	err := self.registry.ResolveJobOffer(jobName)
	if err != nil {
		log.Errorf("Failed resolving JobOffer(%s): %v", jobName, err)
		return err
	}

	err = self.registry.ScheduleJob(jobName, machBootId)
	if err != nil {
		log.Errorf("Failed scheduling Job(%s): %v", jobName, err)
		return err
	}

	log.Infof("Scheduled Job(%s) to Machine(%s)", jobName, machBootId)
	return nil
}
// HasMetadata determines if a Machine fulfills the given requirements
// based on its current state.
func (m *Machine) HasMetadata(metadata map[string][]string) bool {
	state := m.State()

	for key, values := range metadata {
		local, ok := state.Metadata[key]
		if !ok {
			log.V(1).Infof("No local values found for Metadata(%s)", key)
			return false
		}

		log.V(2).Infof("Asserting local Metadata(%s) meets requirements", key)

		var localMatch bool
		for _, val := range values {
			if local == val {
				log.V(1).Infof("Local Metadata(%s) meets requirement", key)
				localMatch = true
			}
		}

		if !localMatch {
			log.V(1).Infof("Local Metadata(%s) does not match requirement", key)
			return false
		}
	}

	return true
}
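// A hypothetical sketch of calling HasMetadata (not in the original source).
// It assumes the static metadata passed to New survives RefreshState and is
// what State().Metadata returns; the machine ID and values are illustrative.
func ExampleMachine_HasMetadata() {
	m := New("mach1", "10.0.0.1", map[string]string{"region": "us-east-1", "disk": "ssd"})

	// Each requirement key maps to a set of acceptable values; the local
	// value must be a member of the set for every key.
	fmt.Println(m.HasMetadata(map[string][]string{"region": {"us-east-1", "us-west-2"}})) // true
	fmt.Println(m.HasMetadata(map[string][]string{"disk": {"hdd"}}))                      // false
}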
func (e *Engine) ResolveJobOffer(jobName string, machID string) error {
	log.V(1).Infof("Attempting to lock JobOffer(%s)", jobName)
	mutex := e.registry.LockJobOffer(jobName, e.machine.State().ID)
	if mutex == nil {
		log.V(1).Infof("Could not lock JobOffer(%s)", jobName)
		return errors.New("Could not lock JobOffer")
	}
	defer mutex.Unlock()

	log.V(1).Infof("Claimed JobOffer(%s)", jobName)

	err := e.registry.ResolveJobOffer(jobName)
	if err != nil {
		log.Errorf("Failed resolving JobOffer(%s): %v", jobName, err)
		return err
	}

	err = e.registry.ScheduleJob(jobName, machID)
	if err != nil {
		log.Errorf("Failed scheduling Job(%s): %v", jobName, err)
		return err
	}

	log.Infof("Scheduled Job(%s) to Machine(%s)", jobName, machID)
	return nil
}
// addrToHostPort takes the given address and parses it into a string suitable
// for use in the 'hostnames' field in a known_hosts file. For more details,
// see the `SSH_KNOWN_HOSTS FILE FORMAT` section of `man 8 sshd`.
func (kc *HostKeyChecker) addrToHostPort(a string) (string, error) {
	if !strings.Contains(a, ":") {
		// No port, so return unadulterated
		return a, nil
	}
	host, p, err := net.SplitHostPort(a)
	if err != nil {
		log.V(1).Infof("Unable to parse addr %s: %v", a, err)
		return "", err
	}

	port, err := strconv.Atoi(p)
	if err != nil {
		log.V(1).Infof("Error parsing port %s: %v", p, err)
		return "", err
	}

	// Default port should be omitted from the entry.
	// (see `put_host_port` in openssh/misc.c)
	if port == 0 || port == sshDefaultPort {
		// IPv6 addresses must be enclosed in square brackets
		if strings.Contains(host, ":") {
			host = fmt.Sprintf("[%s]", host)
		}
		return host, nil
	}

	return fmt.Sprintf("[%s]:%d", host, port), nil
}
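// Hypothetical examples of addrToHostPort output (not in the original
// source). The method never touches its receiver, so a zero-value
// HostKeyChecker is assumed usable here, and sshDefaultPort is assumed to
// be 22.
func ExampleHostKeyChecker_addrToHostPort() {
	kc := &HostKeyChecker{}
	for _, addr := range []string{"example.com", "example.com:22", "example.com:2222", "[fe80::1]:2222"} {
		s, _ := kc.addrToHostPort(addr)
		fmt.Println(s)
	}
	// Output:
	// example.com
	// example.com
	// [example.com]:2222
	// [fe80::1]:2222
}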
// Pull a Job and its payload from the Registry
func (a *Agent) FetchJob(jobName string) *job.Job {
	log.V(1).Infof("Fetching Job(%s) from Registry", jobName)
	j := a.registry.GetJob(jobName)
	if j == nil {
		log.V(1).Infof("Job(%s) not found in Registry", jobName)
	}
	return j
}
// Determine if the Agent can run the provided Job
func (a *Agent) AbleToRun(j *job.Job) bool {
	if !a.VerifyJob(j) {
		log.V(1).Infof("Failed to verify Job(%s)", j.Name)
		return false
	}

	requirements := j.Requirements()
	if len(requirements) == 0 {
		log.V(1).Infof("Job(%s) has no requirements", j.Name)
		return true
	}

	if log.V(1) {
		var reqString string
		for key, slice := range requirements {
			reqString += fmt.Sprintf("%s = [", key)
			for _, val := range slice {
				reqString += fmt.Sprintf("%s, ", val)
			}
			reqString += "] "
		}

		log.Infof("Job(%s) has requirements: %s", j.Name, reqString)
	}

	metadata := extractMachineMetadata(requirements)
	log.V(1).Infof("Job(%s) requires machine metadata: %v", j.Name, metadata)
	if !a.machine.HasMetadata(metadata) {
		log.V(1).Infof("Unable to run Job(%s), local Machine metadata insufficient", j.Name)
		return false
	}

	bootID, ok := requirements[unit.FleetXConditionMachineBootID]
	if ok && len(bootID) > 0 && !a.machine.State().MatchBootID(bootID[0]) {
		log.V(1).Infof("Agent does not pass MachineBootID condition for Job(%s)", j.Name)
		return false
	}

	peers := j.Payload.Peers()
	if len(peers) > 0 {
		log.V(1).Infof("Asserting required Peers %v of Job(%s) are scheduled locally", peers, j.Name)
		for _, peer := range peers {
			if !a.peerScheduledHere(j.Name, peer) {
				log.V(1).Infof("Required Peer(%s) of Job(%s) is not scheduled locally", peer, j.Name)
				return false
			}
		}
	} else {
		log.V(2).Infof("Job(%s) has no peers to worry about", j.Name)
	}

	if conflicted, conflictedJobName := a.state.HasConflict(j.Name, j.Payload.Conflicts()); conflicted {
		log.V(1).Infof("Job(%s) has conflict with Job(%s)", j.Name, conflictedJobName)
		return false
	}

	return true
}
// Inform the Registry that a Job must be rescheduled
func (a *Agent) RescheduleJob(j *job.Job) {
	log.V(2).Infof("Stopping Job(%s)", j.Name)
	a.registry.UnscheduleJob(j.Name)

	offer := job.NewOfferFromJob(*j)
	log.V(2).Infof("Publishing JobOffer(%s)", offer.Job.Name)
	a.registry.CreateJobOffer(offer)
	log.Infof("Published JobOffer(%s)", offer.Job.Name)
}
func (m *SystemdManager) startUnit(name string) {
	log.V(1).Infof("Starting systemd unit %s", name)

	files := []string{name}
	m.Systemd.EnableUnitFiles(files, true, false)
	log.V(1).Infof("Enabled systemd unit %s", name)

	m.Systemd.StartUnit(name, "replace")
	log.Infof("Started systemd unit %s", name)
}
// Inform the Registry that a Job must be rescheduled
func (a *Agent) RescheduleJob(j *job.Job) {
	log.V(2).Infof("Stopping Job(%s)", j.Name)
	a.registry.UnscheduleJob(j.Name)

	// TODO(uwedeportivo): agent placing offer ?
	offer := job.NewOfferFromJob(*j, nil)
	log.V(2).Infof("Publishing JobOffer(%s)", offer.Job.Name)
	a.registry.CreateJobOffer(offer)
	log.Infof("Published JobOffer(%s)", offer.Job.Name)
}
// Instruct the Agent that an offer has been created and must
// be tracked until it is resolved
func (a *Agent) TrackOffer(jo job.JobOffer) {
	a.state.Lock()
	defer a.state.Unlock()

	log.V(2).Infof("Tracking JobOffer(%s)", jo.Job.Name)
	a.state.TrackOffer(jo)

	peers := jo.Job.Payload.Peers()
	log.V(2).Infof("Tracking peers of JobOffer(%s): %v", jo.Job.Name, peers)
	a.state.TrackJobPeers(jo.Job.Name, peers)
}
func (self *EventHandler) HandleEventJobBidSubmitted(ev event.Event) {
	jb := ev.Payload.(job.JobBid)

	log.V(1).Infof("EventJobBidSubmitted(%s): attempting to schedule Job to Machine(%s)", jb.JobName, jb.MachineBootId)
	err := self.engine.ResolveJobOffer(jb.JobName, jb.MachineBootId)
	if err == nil {
		log.V(1).Infof("EventJobBidSubmitted(%s): successfully scheduled Job to Machine(%s)", jb.JobName, jb.MachineBootId)
	} else {
		log.V(1).Infof("EventJobBidSubmitted(%s): failed to schedule Job to Machine(%s): %v", jb.JobName, jb.MachineBootId, err)
	}
}
// Determine if all necessary peers of a Job are scheduled to this Agent
func (a *Agent) peerScheduledHere(jobName, peerName string) bool {
	log.V(1).Infof("Looking for target of Peer(%s)", peerName)

	// FIXME: ideally the machine would use its own knowledge rather than calling GetJobTarget
	if tgt, _ := a.registry.GetJobTarget(peerName); tgt == "" || tgt != a.Machine.State().ID {
		log.V(1).Infof("Peer(%s) of Job(%s) not scheduled here", peerName, jobName)
		return false
	}

	log.V(1).Infof("Peer(%s) of Job(%s) scheduled here", peerName, jobName)
	return true
}
// Distribute an Event to all listeners registered to Event.Type
func (eb *EventBus) dispatch(ev *Event) {
	log.V(1).Infof("Dispatching %s to listeners", ev.Type)
	handlerFuncName := fmt.Sprintf("Handle%s", ev.Type)
	for name, listener := range eb.listeners {
		log.V(1).Infof("Looking for event handler func %s on listener %s", handlerFuncName, name)
		handlerFunc := reflect.ValueOf(listener).MethodByName(handlerFuncName)
		if handlerFunc.IsValid() {
			log.V(1).Infof("Calling event handler for %s on listener %s", ev.Type, name)
			go handlerFunc.Call([]reflect.Value{reflect.ValueOf(*ev)})
		}
	}
}
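// A hypothetical listener illustrating the convention dispatch relies on
// (not in the original source): any exported method named "Handle"+Event.Type
// is found via reflection and invoked with the Event value. The type name and
// log message are illustrative.
type loggingListener struct{}

// Invoked via MethodByName("HandleEventJobOffered") whenever an Event with
// Type == "EventJobOffered" is dispatched on the bus.
func (l *loggingListener) HandleEventJobOffered(ev Event) {
	log.Infof("observed %s", ev.Type)
}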
// Periodically report to the Registry at an interval equal to
// half of the provided ttl. Stop reporting when the provided
// channel is closed. Failed attempts to report state to the
// Registry are retried twice before moving on to the next
// reporting interval.
func (a *Agent) Heartbeat(ttl time.Duration, stop chan bool) {
	attempt := func(attempts int, f func() error) (err error) {
		if attempts < 1 {
			return fmt.Errorf("attempts argument must be 1 or greater, got %d", attempts)
		}

		// The amount of time the retry mechanism waits after a failed attempt
		// doubles following each failure. This is a simple exponential backoff.
		sleep := time.Second

		for i := 1; i <= attempts; i++ {
			err = f()
			if err == nil || i == attempts {
				break
			}

			sleep = sleep * 2
			log.V(2).Infof("function returned err, retrying in %v: %v", sleep, err)
			time.Sleep(sleep)
		}

		return err
	}

	heartbeat := func() error {
		return a.registry.SetMachineState(a.machine.State(), ttl)
	}

	// Explicitly heartbeat immediately to push state to the
	// Registry as quickly as possible
	a.machine.RefreshState()
	if err := attempt(3, heartbeat); err != nil {
		log.Errorf("Failed heartbeat after 3 attempts: %v", err)
	}

	interval := ttl / refreshInterval
	ticker := time.Tick(interval)
	for {
		select {
		case <-stop:
			log.V(2).Info("MachineHeartbeat exiting due to stop signal")
			return
		case <-ticker:
			log.V(2).Info("MachineHeartbeat tick")
			a.machine.RefreshState()
			if err := attempt(3, heartbeat); err != nil {
				log.Errorf("Failed heartbeat after 3 attempts: %v", err)
			}
		}
	}
}
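// retryWithBackoff is a hypothetical, standalone version of the inline
// `attempt` closure above (not in the original source), shown for reuse
// outside Heartbeat. Semantics are identical: up to `attempts` calls of f,
// with the wait doubling from two seconds after each failure.
func retryWithBackoff(attempts int, f func() error) (err error) {
	if attempts < 1 {
		return fmt.Errorf("attempts argument must be 1 or greater, got %d", attempts)
	}
	sleep := time.Second
	for i := 1; i <= attempts; i++ {
		if err = f(); err == nil || i == attempts {
			break
		}
		sleep *= 2 // first retry waits 2s, then 4s, 8s, ...
		time.Sleep(sleep)
	}
	return err
}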
// bidForPossiblePeers submits bids for all known peers of the provided job that can
// be run locally
func (a *Agent) bidForPossiblePeers(jobName string) {
	peers := a.state.GetJobsByPeer(jobName)

	for _, peer := range peers {
		log.V(1).Infof("Found unresolved offer for Peer(%s) of Job(%s)", peer, jobName)

		peerJob := a.fetchJob(peer)
		if peerJob != nil && a.ableToRun(peerJob) {
			a.bid(peer)
		} else {
			log.V(1).Infof("Unable to bid for Peer(%s) of Job(%s)", peer, jobName)
		}
	}
}
// ReportUnitState attaches the current state of the Agent's Machine to the given
// unit.UnitState object, then persists that state in the Registry
func (a *Agent) ReportUnitState(jobName string, us *unit.UnitState) {
	if us == nil {
		log.V(1).Infof("Job(%s): purging UnitState from Registry", jobName)
		err := a.registry.RemoveUnitState(jobName)
		if err != nil {
			log.Errorf("Failed to remove UnitState for job %s from Registry: %s", jobName, err.Error())
		}
	} else {
		ms := a.Machine.State()
		us.MachineState = &ms
		log.V(1).Infof("Job(%s): pushing UnitState (loadState=%s, activeState=%s, subState=%s) to Registry", jobName, us.LoadState, us.ActiveState, us.SubState)
		a.registry.SaveUnitState(jobName, us)
	}
}
func (eh *EventHandler) HandleEventJobUpdated(ev event.Event) {
	j := ev.Payload.(job.Job)

	localBootId := eh.agent.Machine().State().BootId
	targetBootId := ev.Context.(string)

	if targetBootId != localBootId {
		log.V(1).Infof("EventJobUpdated(%s): Job not scheduled to Agent %s, skipping", j.Name, localBootId)
		return
	}

	log.V(1).Infof("EventJobUpdated(%s): Starting Job", j.Name)
	eh.agent.StartJob(&j)
}
// bidForPossibleJobs submits bids for all unresolved offers whose Jobs
// can be run locally
func (a *Agent) bidForPossibleJobs() {
	offers := a.state.GetOffersWithoutBids()

	log.V(1).Infof("Checking %d unbid offers", len(offers))
	for i := range offers {
		offer := offers[i]

		log.V(1).Infof("Checking ability to run Job(%s)", offer.Job.Name)
		if a.ableToRun(&offer.Job) {
			log.V(1).Infof("Able to run Job(%s), submitting bid", offer.Job.Name)
			a.bid(offer.Job.Name)
		} else {
			log.V(1).Infof("Still unable to run Job(%s)", offer.Job.Name)
		}
	}
}
func (eh *EventHandler) HandleEventJobOffered(ev event.Event) {
	jo := ev.Payload.(job.JobOffer)

	log.V(1).Infof("EventJobOffered(%s): verifying ability to run Job", jo.Job.Name)

	// Everything we check against could change over time, so we track all
	// offers starting here for future bidding even if we can't bid now
	eh.agent.TrackOffer(jo)

	if eh.agent.AbleToRun(&jo.Job) {
		log.Infof("EventJobOffered(%s): passed all criteria, submitting JobBid", jo.Job.Name)
		eh.agent.Bid(jo.Job.Name)
	} else {
		log.V(1).Infof("EventJobOffered(%s): not all criteria met, not bidding", jo.Job.Name)
	}
}
// lockResource will attempt to lock a mutex on a resource defined by the
// provided class and id. The context will be persisted to the Registry to
// track by whom the mutex is currently locked.
func (r *Registry) lockResource(class, id, context string) *TimedResourceMutex {
	mutexName := fmt.Sprintf("%s-%s", class, id)
	log.V(2).Infof("Attempting to acquire mutex on %s", mutexName)

	key := path.Join(keyPrefix, mutexPrefix, mutexName)
	resp, err := r.etcd.Create(key, context, uint64(ResourceMutexTTL))
	if err != nil {
		log.V(2).Infof("Failed to acquire mutex on %s", mutexName)
		return nil
	}

	log.V(2).Infof("Successfully acquired mutex on %s", mutexName)
	return &TimedResourceMutex{r.etcd, *resp.Node}
}
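// withLockedResource is a hypothetical helper (not in the original source)
// sketching the intended use of lockResource: etcd's Create is atomic and
// fails when the key already exists, so exactly one caller wins the mutex,
// and the TTL reclaims it if the holder dies before Unlock. The "job" class
// and the fn callback are illustrative.
func withLockedResource(r *Registry, id, context string, fn func()) error {
	mutex := r.lockResource("job", id, context)
	if mutex == nil {
		return errors.New("resource already locked")
	}
	defer mutex.Unlock()
	fn()
	return nil
}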
// globMatches reports whether target matches the shell-style glob pattern.
// A malformed pattern is logged and treated as a non-match.
func globMatches(pattern, target string) bool {
	matched, err := path.Match(pattern, target)
	if err != nil {
		log.V(2).Infof("Received error while matching pattern '%s': %v", pattern, err)
	}
	return matched
}
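// Hypothetical examples of the patterns globMatches accepts via path.Match
// (not in the original source); '*' matches any run of non-separator
// characters, and a malformed pattern is reported as no match.
func Example_globMatches() {
	fmt.Println(globMatches("web@*.service", "web@1.service")) // true
	fmt.Println(globMatches("web@*.service", "db@1.service"))  // false
	fmt.Println(globMatches("[invalid", "anything"))           // false, pattern error logged
}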
// verifyJob attempts to verify the integrity of the given Job by checking the
// signature against a SignatureSet stored in its repository.
func (a *Agent) verifyJob(j *job.Job) bool {
	if a.verifier == nil {
		return true
	}

	ss, _ := a.registry.GetSignatureSetOfJob(j.Name)
	ok, err := a.verifier.VerifyJob(j, ss)
	if err != nil {
		log.V(1).Infof("Error verifying signature of Job(%s): %v", j.Name, err)
		return false
	} else if !ok {
		log.V(1).Infof("Job(%s) does not match signature", j.Name)
		return false
	}

	return true
}
// New creates a new Machine object. The provided parameters will override
// those that might be dynamically generated by the Machine on the fly.
func New(bootId string, publicIP string, metadata map[string]string) *Machine {
	static := MachineState{bootId, publicIP, metadata}
	log.V(2).Infof("Created Machine with static state %s", static)
	m := &Machine{staticState: static}
	m.RefreshState()
	return m
}
func (eh *EventHandler) HandleEventMachineRemoved(ev event.Event) {
	machID := ev.Payload.(string)
	mutex := eh.engine.registry.LockMachine(machID, eh.engine.machine.State().ID)
	if mutex == nil {
		log.V(1).Infof("EventMachineRemoved(%s): failed to lock Machine, ignoring event", machID)
		return
	}
	defer mutex.Unlock()

	jobs := getJobsScheduledToMachine(eh.engine.registry, machID)

	for _, j := range jobs {
		log.Infof("EventMachineRemoved(%s): clearing UnitState(%s)", machID, j.Name)
		err := eh.engine.registry.RemoveUnitState(j.Name)
		if err != nil {
			log.Errorf("Failed removing UnitState(%s) from Registry: %v", j.Name, err)
		}

		log.Infof("EventMachineRemoved(%s): unscheduling Job(%s)", machID, j.Name)
		eh.engine.registry.ClearJobTarget(j.Name, machID)
	}

	for _, j := range jobs {
		log.Infof("EventMachineRemoved(%s): re-publishing JobOffer(%s)", machID, j.Name)
		eh.engine.OfferJob(j)
	}

	eh.engine.clust.machineRemoved(machID)
}
func (eh *EventHandler) HandleEventJobStateUpdated(ev event.Event) {
	jobName := ev.Context.(string)
	state := ev.Payload.(*job.JobState)

	if state == nil {
		log.V(1).Infof("EventJobStateUpdated(%s): received nil JobState object", jobName)
	} else {
		log.V(1).Infof("EventJobStateUpdated(%s): pushing state (loadState=%s, activeState=%s, subState=%s) to Registry", jobName, state.LoadState, state.ActiveState, state.SubState)

		// FIXME: This should probably be set in the underlying event-generation code
		ms := eh.agent.Machine().State()
		state.MachineState = &ms
	}

	eh.agent.ReportJobState(jobName, state)
}
func lazyCreateJobs(args []string, signAndVerify bool) error {
	for _, arg := range args {
		jobName := unitNameMangle(arg)
		if j := registryCtl.GetJob(jobName); j != nil {
			log.V(1).Infof("Found Job(%s) in Registry, no need to recreate it", jobName)
			if signAndVerify {
				if err := verifyJob(j); err != nil {
					return err
				}
			}
			continue
		}

		unit, err := getUnitFromFile(arg)
		if err != nil {
			return fmt.Errorf("Failed getting Unit(%s) from file: %v", jobName, err)
		}

		j, err := createJob(jobName, unit)
		if err != nil {
			return err
		}

		if signAndVerify {
			if err := signJob(j); err != nil {
				return err
			}
		}
	}
	return nil
}