Beispiel #1
0
func (self *Engine) OfferJob(j job.Job) error {
	log.V(2).Infof("Attempting to lock Job(%s)", j.Name)

	mutex := self.lockJob(j.Name)
	if mutex == nil {
		log.V(1).Infof("Could not lock Job(%s)", j.Name)
		return errors.New("Could not lock Job")
	}
	defer mutex.Unlock()

	log.V(1).Infof("Claimed Job", j.Name)

	machineBootIds, err := self.partitionCluster(&j)
	if err != nil {
		log.Errorf("Failed partitioning cluster for Job(%s): %v", j.Name, err)
		return err
	}

	offer := job.NewOfferFromJob(j, machineBootIds)

	log.V(2).Infof("Publishing JobOffer(%s)", offer.Job.Name)
	self.registry.CreateJobOffer(offer)
	log.Infof("Published JobOffer(%s)", offer.Job.Name)

	return nil
}
Beispiel #2
0
func (e *Engine) OfferJob(j job.Job) error {
	log.V(1).Infof("Attempting to lock Job(%s)", j.Name)

	mutex := e.registry.LockJob(j.Name, e.machine.State().ID)
	if mutex == nil {
		log.V(1).Infof("Could not lock Job(%s)", j.Name)
		return errors.New("Could not lock Job")
	}
	defer mutex.Unlock()

	log.V(1).Infof("Claimed Job(%s)", j.Name)

	machineIDs, err := e.partitionCluster(&j)
	if err != nil {
		log.Errorf("Failed partitioning cluster for Job(%s): %v", j.Name, err)
		return err
	}

	offer := job.NewOfferFromJob(j, machineIDs)

	err = e.registry.CreateJobOffer(offer)
	if err == nil {
		log.Infof("Published JobOffer(%s)", offer.Job.Name)
	}

	return err
}
Beispiel #3
0
// ParseFilepath expands ~ and ~user constructions.
// If user or $HOME is unknown, do nothing.
func ParseFilepath(path string) string {
	if !strings.HasPrefix(path, "~") {
		return path
	}
	i := strings.Index(path, "/")
	if i < 0 {
		i = len(path)
	}
	var home string
	if i == 1 {
		if home = os.Getenv("HOME"); home == "" {
			usr, err := user.Current()
			if err != nil {
				log.V(1).Infof("Failed to get current home directory: %v", err)
				return path
			}
			home = usr.HomeDir
		}
	} else {
		usr, err := user.Lookup(path[1:i])
		if err != nil {
			log.V(1).Infof("Failed to get %v's home directory: %v", path[1:i], err)
			return path
		}
		home = usr.HomeDir
	}
	path = filepath.Join(home, path[i:])
	return path
}
Beispiel #4
0
func runUnloadUnit(args []string) (exit int) {
	jobs, err := findJobs(args)
	if err != nil {
		fmt.Fprintf(os.Stderr, "%v\n", err)
		return 1
	}

	wait := make([]string, 0)
	for _, j := range jobs {
		if j.State == nil {
			fmt.Fprintf(os.Stderr, "Unable to determine state of %q\n", *(j.State))
			return 1
		}

		if *(j.State) == job.JobStateInactive {
			log.V(1).Infof("Job(%s) already %s, skipping.", j.Name, job.JobStateInactive)
			continue
		}

		log.V(1).Infof("Unloading Job(%s)", j.Name)
		registryCtl.SetJobTargetState(j.Name, job.JobStateInactive)
		wait = append(wait, j.Name)
	}

	if !sharedFlags.NoBlock {
		errchan := waitForJobStates(wait, job.JobStateInactive, sharedFlags.BlockAttempts, os.Stdout)
		for err := range errchan {
			fmt.Fprintf(os.Stderr, "%v\n", err)
			exit = 1
		}
	}

	return
}
Beispiel #5
0
func (self *Engine) ResolveJobOffer(jobName string, machBootId string) error {
	log.V(2).Infof("Attempting to lock JobOffer(%s)", jobName)
	mutex := self.lockJobOffer(jobName)

	if mutex == nil {
		log.V(2).Infof("Could not lock JobOffer(%s)", jobName)
		return errors.New("Could not lock JobOffer")
	}
	defer mutex.Unlock()

	log.V(2).Infof("Claimed JobOffer(%s)", jobName)

	log.V(2).Infof("Resolving JobOffer(%s), scheduling to Machine(%s)", jobName, machBootId)
	err := self.registry.ResolveJobOffer(jobName)
	if err != nil {
		log.Errorf("Failed resolving JobOffer(%s): %v", jobName, err)
		return err
	}

	err = self.registry.ScheduleJob(jobName, machBootId)
	if err != nil {
		log.Errorf("Failed scheduling Job(%s): %v", jobName, err)
		return err
	}

	log.Infof("Scheduled Job(%s) to Machine(%s)", jobName, machBootId)
	return nil
}
Beispiel #6
0
// HasMetadata determine if a Machine fulfills the given requirements
// based on its current state.
func (m *Machine) HasMetadata(metadata map[string][]string) bool {
	state := m.State()

	for key, values := range metadata {
		local, ok := state.Metadata[key]
		if !ok {
			log.V(1).Infof("No local values found for Metadata(%s)", key)
			return false
		}

		log.V(2).Infof("Asserting local Metadata(%s) meets requirements", key)

		var localMatch bool
		for _, val := range values {
			if local == val {
				log.V(1).Infof("Local Metadata(%s) meets requirement", key)
				localMatch = true
			}
		}

		if !localMatch {
			log.V(1).Infof("Local Metadata(%s) does not match requirement", key)
			return false
		}
	}

	return true
}
Beispiel #7
0
func (e *Engine) ResolveJobOffer(jobName string, machID string) error {
	log.V(1).Infof("Attempting to lock JobOffer(%s)", jobName)
	mutex := e.registry.LockJobOffer(jobName, e.machine.State().ID)

	if mutex == nil {
		log.V(1).Infof("Could not lock JobOffer(%s)", jobName)
		return errors.New("Could not lock JobOffer")
	}
	defer mutex.Unlock()

	log.V(1).Infof("Claimed JobOffer(%s)", jobName)

	err := e.registry.ResolveJobOffer(jobName)
	if err != nil {
		log.Errorf("Failed resolving JobOffer(%s): %v", jobName, err)
		return err
	}

	err = e.registry.ScheduleJob(jobName, machID)
	if err != nil {
		log.Errorf("Failed scheduling Job(%s): %v", jobName, err)
		return err
	}

	log.Infof("Scheduled Job(%s) to Machine(%s)", jobName, machID)
	return nil
}
Beispiel #8
0
// addrToHostPort takes the given address and parses it into a string suitable
// for use in the 'hostnames' field in a known_hosts file.  For more details,
// see the `SSH_KNOWN_HOSTS FILE FORMAT` section of `man 8 sshd`
func (kc *HostKeyChecker) addrToHostPort(a string) (string, error) {
	if !strings.Contains(a, ":") {
		// No port, so return unadulterated
		return a, nil
	}
	host, p, err := net.SplitHostPort(a)
	if err != nil {
		log.V(1).Infof("Unable to parse addr %s: %v", a, err)
		return "", err
	}

	port, err := strconv.Atoi(p)
	if err != nil {
		log.V(1).Infof("Error parsing port %s: %v", p, err)
		return "", err
	}

	// Default port should be omitted from the entry.
	// (see `put_host_port` in openssh/misc.c)
	if port == 0 || port == sshDefaultPort {
		// IPv6 addresses must be enclosed in square brackets
		if strings.Contains(host, ":") {
			host = fmt.Sprintf("[%s]", host)
		}
		return host, nil
	}

	return fmt.Sprintf("[%s]:%d", host, port), nil
}
Beispiel #9
0
// Pull a Job and its payload from the Registry
func (a *Agent) FetchJob(jobName string) *job.Job {
	log.V(1).Infof("Fetching Job(%s) from Registry", jobName)
	j := a.registry.GetJob(jobName)
	if j == nil {
		log.V(1).Infof("Job not found in Registry")
	}
	return j
}
Beispiel #10
0
// Determine if the Agent can run the provided Job
func (a *Agent) AbleToRun(j *job.Job) bool {
	if !a.VerifyJob(j) {
		log.V(1).Infof("Failed to verify Job(%s)", j.Name)
		return false
	}

	requirements := j.Requirements()
	if len(requirements) == 0 {
		log.V(1).Infof("Job(%s) has no requirements", j.Name)
		return true
	}

	if log.V(1) {
		var reqString string
		for key, slice := range requirements {
			reqString += fmt.Sprintf("%s = [", key)
			for _, val := range slice {
				reqString += fmt.Sprintf("%s, ", val)
			}
			reqString += fmt.Sprint("] ")
		}

		log.Infof("Job(%s) has requirements: %s", j.Name, reqString)
	}

	metadata := extractMachineMetadata(requirements)
	log.V(1).Infof("Job(%s) requires machine metadata: %v", j.Name, metadata)
	if !a.machine.HasMetadata(metadata) {
		log.V(1).Infof("Unable to run Job(%s), local Machine metadata insufficient", j.Name)
		return false
	}

	bootID, ok := requirements[unit.FleetXConditionMachineBootID]
	if ok && len(bootID) > 0 && !a.machine.State().MatchBootID(bootID[0]) {
		log.V(1).Infof("Agent does not pass MachineBootID condition for Job(%s)", j.Name)
		return false
	}

	peers := j.Payload.Peers()
	if len(peers) > 0 {
		log.V(1).Infof("Asserting required Peers %v of Job(%s) are scheduled locally", peers, j.Name)
		for _, peer := range peers {
			if !a.peerScheduledHere(j.Name, peer) {
				log.V(1).Infof("Required Peer(%s) of Job(%s) is not scheduled locally", peer, j.Name)
				return false
			}
		}
	} else {
		log.V(2).Infof("Job(%s) has no peers to worry about", j.Name)
	}

	if conflicted, conflictedJobName := a.state.HasConflict(j.Name, j.Payload.Conflicts()); conflicted {
		log.V(1).Infof("Job(%s) has conflict with Job(%s)", j.Name, conflictedJobName)
		return false
	}

	return true
}
Beispiel #11
0
// Inform the Registry that a Job must be rescheduled
func (a *Agent) RescheduleJob(j *job.Job) {
	log.V(2).Infof("Stopping Job(%s)", j.Name)
	a.registry.UnscheduleJob(j.Name)

	offer := job.NewOfferFromJob(*j)
	log.V(2).Infof("Publishing JobOffer(%s)", offer.Job.Name)
	a.registry.CreateJobOffer(offer)
	log.Infof("Published JobOffer(%s)", offer.Job.Name)
}
Beispiel #12
0
func (m *SystemdManager) startUnit(name string) {
	log.V(1).Infof("Starting systemd unit %s", name)

	files := []string{name}
	m.Systemd.EnableUnitFiles(files, true, false)
	log.V(1).Infof("Enabled systemd unit %s", name)

	m.Systemd.StartUnit(name, "replace")
	log.Infof("Started systemd unit %s", name)
}
Beispiel #13
0
// Inform the Registry that a Job must be rescheduled
func (a *Agent) RescheduleJob(j *job.Job) {
	log.V(2).Infof("Stopping Job(%s)", j.Name)
	a.registry.UnscheduleJob(j.Name)

	// TODO(uwedeportivo): agent placing offer ?
	offer := job.NewOfferFromJob(*j, nil)
	log.V(2).Infof("Publishing JobOffer(%s)", offer.Job.Name)
	a.registry.CreateJobOffer(offer)
	log.Infof("Published JobOffer(%s)", offer.Job.Name)
}
Beispiel #14
0
// Instruct the Agent that an offer has been created and must
// be tracked until it is resolved
func (a *Agent) TrackOffer(jo job.JobOffer) {
	a.state.Lock()
	defer a.state.Unlock()

	log.V(2).Infof("Tracking JobOffer(%s)", jo.Job.Name)
	a.state.TrackOffer(jo)

	peers := jo.Job.Payload.Peers()
	log.V(2).Infof("Tracking peers of JobOffer(%s): %v", jo.Job.Name, peers)
	a.state.TrackJobPeers(jo.Job.Name, jo.Job.Payload.Peers())
}
Beispiel #15
0
func (self *EventHandler) HandleEventJobBidSubmitted(ev event.Event) {
	jb := ev.Payload.(job.JobBid)

	log.V(1).Infof("EventJobBidSubmitted(%s): attempting to schedule Job to Machine(%s)", jb.JobName, jb.MachineBootId)
	err := self.engine.ResolveJobOffer(jb.JobName, jb.MachineBootId)
	if err == nil {
		log.V(1).Infof("EventJobBidSubmitted(%s): successfully scheduled Job to Machine(%s)", jb.JobName, jb.MachineBootId)
	} else {
		log.V(1).Infof("EventJobBidSubmitted(%s): failed to schedule Job to Machine(%s)", jb.JobName, jb.MachineBootId)
	}
}
Beispiel #16
0
// Determine if all necessary peers of a Job are scheduled to this Agent
func (a *Agent) peerScheduledHere(jobName, peerName string) bool {
	log.V(1).Infof("Looking for target of Peer(%s)", peerName)

	//FIXME: ideally the machine would use its own knowledge rather than calling GetJobTarget
	if tgt, _ := a.registry.GetJobTarget(peerName); tgt == "" || tgt != a.Machine.State().ID {
		log.V(1).Infof("Peer(%s) of Job(%s) not scheduled here", peerName, jobName)
		return false
	}

	log.V(1).Infof("Peer(%s) of Job(%s) scheduled here", peerName, jobName)
	return true
}
Beispiel #17
0
// Distribute an Event to all listeners registered to Event.Type
func (eb *EventBus) dispatch(ev *Event) {
	log.V(1).Infof("Dispatching %s to listeners", ev.Type)
	handlerFuncName := fmt.Sprintf("Handle%s", ev.Type)
	for name, listener := range eb.listeners {
		log.V(1).Infof("Looking for event handler func %s on listener %s", handlerFuncName, name)
		handlerFunc := reflect.ValueOf(listener).MethodByName(handlerFuncName)
		if handlerFunc.IsValid() {
			log.V(1).Infof("Calling event handler for %s on listener %s", ev.Type, name)
			go handlerFunc.Call([]reflect.Value{reflect.ValueOf(*ev)})
		}
	}
}
Beispiel #18
0
// Periodically report to the Registry at an interval equal to
// half of the provided ttl. Stop reporting when the provided
// channel is closed. Failed attempts to report state to the
// Registry are retried twice before moving on to the next
// reporting interval.
func (a *Agent) Heartbeat(ttl time.Duration, stop chan bool) {
	attempt := func(attempts int, f func() error) (err error) {
		if attempts < 1 {
			return fmt.Errorf("attempts argument must be 1 or greater, got %d", attempts)
		}

		// The amount of time the retry mechanism waits after a failed attempt
		// doubles following each failure. This is a simple exponential backoff.
		sleep := time.Second

		for i := 1; i <= attempts; i++ {
			err = f()
			if err == nil || i == attempts {
				break
			}

			sleep = sleep * 2
			log.V(2).Infof("function returned err, retrying in %v: %v", sleep, err)
			time.Sleep(sleep)
		}

		return err
	}

	heartbeat := func() error {
		return a.registry.SetMachineState(a.machine.State(), ttl)
	}

	// Explicitly heartbeat immediately to push state to the
	// Registry as quickly as possible
	a.machine.RefreshState()
	if err := attempt(3, heartbeat); err != nil {
		log.Errorf("Failed heartbeat after 3 attempts: %v", err)
	}

	interval := ttl / refreshInterval
	ticker := time.Tick(interval)
	for {
		select {
		case <-stop:
			log.V(2).Info("MachineHeartbeat exiting due to stop signal")
			return
		case <-ticker:
			log.V(2).Info("MachineHeartbeat tick")
			a.machine.RefreshState()
			if err := attempt(3, heartbeat); err != nil {
				log.Errorf("Failed heartbeat after 3 attempts: %v", err)
			}
		}
	}
}
Beispiel #19
0
// bidForPossiblePeers submits bids for all known peers of the provided job that can
// be run locally
func (a *Agent) bidForPossiblePeers(jobName string) {
	peers := a.state.GetJobsByPeer(jobName)

	for _, peer := range peers {
		log.V(1).Infof("Found unresolved offer for Peer(%s) of Job(%s)", peer, jobName)

		peerJob := a.fetchJob(peer)
		if peerJob != nil && a.ableToRun(peerJob) {
			a.bid(peer)
		} else {
			log.V(1).Infof("Unable to bid for Peer(%s) of Job(%s)", peer, jobName)
		}
	}
}
Beispiel #20
0
// ReportUnitState attaches the current state of the Agent's Machine to the given
// unit.UnitState object, then persists that state in the Registry
func (a *Agent) ReportUnitState(jobName string, us *unit.UnitState) {
	if us == nil {
		log.V(1).Infof("Job(%s): purging UnitState from Registry", jobName)
		err := a.registry.RemoveUnitState(jobName)
		if err != nil {
			log.Errorf("Failed to remove UnitState for job %s from Registry: %s", jobName, err.Error())
		}
	} else {
		ms := a.Machine.State()
		us.MachineState = &ms
		log.V(1).Infof("Job(%s): pushing UnitState (loadState=%s, activeState=%s, subState=%s) to Registry", jobName, us.LoadState, us.ActiveState, us.SubState)
		a.registry.SaveUnitState(jobName, us)
	}
}
Beispiel #21
0
func (eh *EventHandler) HandleEventJobUpdated(ev event.Event) {
	j := ev.Payload.(job.Job)

	localBootId := eh.agent.Machine().State().BootId
	targetBootId := ev.Context.(string)

	if targetBootId != localBootId {
		log.V(1).Infof("EventJobUpdated(%s): Job not scheduled to Agent %s, skipping", j.Name, localBootId)
		return
	}

	log.V(1).Infof("EventJobUpdated(%s): Starting Job", j.Name)
	eh.agent.StartJob(&j)
}
Beispiel #22
0
// bidForPossibleJobs submits bids for all unresolved offers whose Jobs
// can be run locally
func (a *Agent) bidForPossibleJobs() {
	offers := a.state.GetOffersWithoutBids()

	log.V(1).Infof("Checking %d unbade offers", len(offers))
	for i := range offers {
		offer := offers[i]
		log.V(1).Infof("Checking ability to run Job(%s)", offer.Job.Name)
		if a.ableToRun(&offer.Job) {
			log.V(1).Infof("Able to run Job(%s), submitting bid", offer.Job.Name)
			a.bid(offer.Job.Name)
		} else {
			log.V(1).Infof("Still unable to run Job(%s)", offer.Job.Name)
		}
	}
}
Beispiel #23
0
func (eh *EventHandler) HandleEventJobOffered(ev event.Event) {
	jo := ev.Payload.(job.JobOffer)
	log.V(1).Infof("EventJobOffered(%s): verifying ability to run Job", jo.Job.Name)

	// Everything we check against could change over time, so we track all
	// offers starting here for future bidding even if we can't bid now
	eh.agent.TrackOffer(jo)

	if eh.agent.AbleToRun(&jo.Job) {
		log.Infof("EventJobOffered(%s): passed all criteria, submitting JobBid", jo.Job.Name)
		eh.agent.Bid(jo.Job.Name)
	} else {
		log.V(1).Infof("EventJobOffered(%s): not all criteria met, not bidding", jo.Job.Name)
	}
}
Beispiel #24
0
// lockResource will attempt to lock a mutex on a resource defined by the
// provided class and id. The context will be persisted to the Registry to
// track by whom the mutex is currently locked.
func (r *Registry) lockResource(class, id, context string) *TimedResourceMutex {
	mutexName := fmt.Sprintf("%s-%s", class, id)
	log.V(2).Infof("Attempting to acquire mutex on %s", mutexName)

	key := path.Join(keyPrefix, mutexPrefix, mutexName)
	resp, err := r.etcd.Create(key, context, uint64(ResourceMutexTTL))

	if err != nil {
		log.V(2).Infof("Failed to acquire mutex on %s", mutexName)
		return nil
	}

	log.V(2).Infof("Successfully acquired mutex on %s", mutexName)
	return &TimedResourceMutex{r.etcd, *resp.Node}
}
Beispiel #25
0
func globMatches(pattern, target string) bool {
	matched, err := path.Match(pattern, target)
	if err != nil {
		log.V(2).Infof("Received error while matching pattern '%s': %v", pattern, err)
	}
	return matched
}
Beispiel #26
0
// verifyJob attempts to verify the integrity of the given Job by checking the
// signature against a SignatureSet stored in its repository.
func (a *Agent) verifyJob(j *job.Job) bool {
	if a.verifier == nil {
		return true
	}
	ss, _ := a.registry.GetSignatureSetOfJob(j.Name)
	ok, err := a.verifier.VerifyJob(j, ss)
	if err != nil {
		log.V(1).Infof("Error verifying signature of Job(%s): %v", j.Name, err)
		return false
	} else if !ok {
		log.V(1).Infof("Job(%s) does not match signature", j.Name)
		return false
	}

	return true
}
Beispiel #27
0
// New creates a new Machine object. The provided parameters will override
// those that might be dynamically generated by the Machine on the fly.
func New(bootId string, publicIP string, metadata map[string]string) *Machine {
	static := MachineState{bootId, publicIP, metadata}
	log.V(2).Infof("Created Machine with static state %s", static)
	m := &Machine{staticState: static}
	m.RefreshState()
	return m
}
Beispiel #28
0
func (eh *EventHandler) HandleEventMachineRemoved(ev event.Event) {
	machID := ev.Payload.(string)
	mutex := eh.engine.registry.LockMachine(machID, eh.engine.machine.State().ID)
	if mutex == nil {
		log.V(1).Infof("EventMachineRemoved(%s): failed to lock Machine, ignoring event", machID)
		return
	}
	defer mutex.Unlock()

	jobs := getJobsScheduledToMachine(eh.engine.registry, machID)

	for _, j := range jobs {
		log.Infof("EventMachineRemoved(%s): clearing UnitState(%s)", machID, j.Name)
		err := eh.engine.registry.RemoveUnitState(j.Name)
		if err != nil {
			log.Errorf("Failed removing UnitState(%s) from Registry: %v", j.Name, err)
		}

		log.Infof("EventMachineRemoved(%s): unscheduling Job(%s)", machID, j.Name)
		eh.engine.registry.ClearJobTarget(j.Name, machID)
	}

	for _, j := range jobs {
		log.Infof("EventMachineRemoved(%s): re-publishing JobOffer(%s)", machID, j.Name)
		eh.engine.OfferJob(j)
	}
	eh.engine.clust.machineRemoved(machID)
}
Beispiel #29
0
func (eh *EventHandler) HandleEventJobStateUpdated(ev event.Event) {
	jobName := ev.Context.(string)
	state := ev.Payload.(*job.JobState)

	if state == nil {
		log.V(1).Infof("EventJobStateUpdated(%s): received nil JobState object", jobName)
	} else {
		log.V(1).Infof("EventJobStateUpdated(%s): pushing state (loadState=%s, activeState=%s, subState=%s) to Registry", jobName, state.LoadState, state.ActiveState, state.SubState)

		// FIXME: This should probably be set in the underlying event-generation code
		ms := eh.agent.Machine().State()
		state.MachineState = &ms
	}

	eh.agent.ReportJobState(jobName, state)
}
Beispiel #30
0
func lazyCreateJobs(args []string, signAndVerify bool) error {
	for _, arg := range args {
		jobName := unitNameMangle(arg)
		if j := registryCtl.GetJob(jobName); j != nil {
			log.V(1).Infof("Found Job(%s) in Registry, no need to recreate it", jobName)
			if signAndVerify {
				if err := verifyJob(j); err != nil {
					return err
				}
			}
			continue
		}

		unit, err := getUnitFromFile(arg)
		if err != nil {
			return fmt.Errorf("Failed getting Unit(%s) from file: %v", jobName, err)
		}

		j, err := createJob(jobName, unit)
		if err != nil {
			return err
		}

		if signAndVerify {
			if err := signJob(j); err != nil {
				return err
			}
		}
	}
	return nil
}