Example #1
func runUnloadUnit(args []string) (exit int) {
	jobs, err := findJobs(args)
	if err != nil {
		fmt.Fprintf(os.Stderr, "%v\n", err)
		return 1
	}

	wait := make([]string, 0)
	for _, j := range jobs {
		if j.State == nil {
			fmt.Fprintf(os.Stderr, "Unable to determine state of %q\n", *(j.State))
			return 1
		}

		if *(j.State) == job.JobStateInactive {
			log.V(1).Infof("Job(%s) already %s, skipping.", j.Name, job.JobStateInactive)
			continue
		}

		log.V(1).Infof("Unloading Job(%s)", j.Name)
		cAPI.SetJobTargetState(j.Name, job.JobStateInactive)
		wait = append(wait, j.Name)
	}

	if !sharedFlags.NoBlock {
		errchan := waitForJobStates(wait, job.JobStateInactive, sharedFlags.BlockAttempts, os.Stdout)
		for err := range errchan {
			fmt.Fprintf(os.Stderr, "%v\n", err)
			exit = 1
		}
	}

	return
}
Example #2
// addrToHostPort takes the given address and parses it into a string suitable
// for use in the 'hostnames' field in a known_hosts file.  For more details,
// see the `SSH_KNOWN_HOSTS FILE FORMAT` section of `man 8 sshd`
func (kc *HostKeyChecker) addrToHostPort(a string) (string, error) {
	if !strings.Contains(a, ":") {
		// No port, so return unadulterated
		return a, nil
	}
	host, p, err := net.SplitHostPort(a)
	if err != nil {
		log.V(1).Infof("Unable to parse addr %s: %v", a, err)
		return "", err
	}

	port, err := strconv.Atoi(p)
	if err != nil {
		log.V(1).Infof("Error parsing port %s: %v", p, err)
		return "", err
	}

	// Default port should be omitted from the entry.
	// (see `put_host_port` in openssh/misc.c)
	if port == 0 || port == sshDefaultPort {
		// IPv6 addresses must be enclosed in square brackets
		if strings.Contains(host, ":") {
			host = fmt.Sprintf("[%s]", host)
		}
		return host, nil
	}

	return fmt.Sprintf("[%s]:%d", host, port), nil
}
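
A small standalone sketch (not fleet code) showing why the bracketing step above is needed: net.SplitHostPort strips the square brackets from an IPv6 "[host]:port" address, so they have to be re-added before writing a known_hosts entry.

package main

import (
	"fmt"
	"net"
)

func main() {
	// SplitHostPort handles both "host:port" and "[ipv6]:port" forms and
	// returns the host without brackets.
	for _, addr := range []string{"example.com:22", "[fe80::1]:2222"} {
		host, port, err := net.SplitHostPort(addr)
		fmt.Printf("%-16s -> host=%q port=%q err=%v\n", addr, host, port, err)
	}
}
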
Example #3
// bidForPossiblePeers submits bids for all known peers of the provided job that can
// be run locally
func (a *Agent) bidForPossiblePeers(jobName string) {
	peers := a.state.GetJobsByPeer(jobName)

	for _, peer := range peers {
		log.V(1).Infof("Found unresolved offer for Peer(%s) of Job(%s)", peer, jobName)

		peerJob, err := a.registry.Job(peer)
		if err != nil {
			log.Errorf("Failed fetching Job(%s) from Registry: %v", peer, err)
			return
		}

		if peerJob == nil {
			log.V(1).Infof("Unable to find Peer(%s) of Job(%s) in Registry", peer, jobName)
			return
		}

		if !a.ableToRun(peerJob) {
			log.V(1).Infof("Unable to run Peer(%s) of Job(%s), not bidding", peer, jobName)
			return
		}

		a.bid(peer)
	}
}
Example #4
// HasMetadata determines whether the Metadata of a given MachineState
// matches the indicated values.
func HasMetadata(state *MachineState, metadata map[string][]string) bool {
	for key, values := range metadata {
		local, ok := state.Metadata[key]
		if !ok {
			log.V(1).Infof("No local values found for Metadata(%s)", key)
			return false
		}

		log.V(1).Infof("Asserting local Metadata(%s) meets requirements", key)

		var localMatch bool
		for _, val := range values {
			if local == val {
				log.V(1).Infof("Local Metadata(%s) meets requirement", key)
				localMatch = true
			}
		}

		if !localMatch {
			log.V(1).Infof("Local Metadata(%s) does not match requirement", key)
			return false
		}
	}

	return true
}
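
The matching rule above is: every required key must be present locally, and the local value must equal at least one accepted value for that key. A minimal standalone illustration of that rule with plain maps (hasMetadata here is an illustrative stand-in, not fleet's machine.HasMetadata):

package main

import "fmt"

// hasMetadata mirrors the matching rule for illustration: all required keys
// must exist locally, and each local value must be one of the accepted values.
func hasMetadata(local map[string]string, required map[string][]string) bool {
	for key, values := range required {
		lv, ok := local[key]
		if !ok {
			return false
		}
		match := false
		for _, v := range values {
			if lv == v {
				match = true
				break
			}
		}
		if !match {
			return false
		}
	}
	return true
}

func main() {
	local := map[string]string{"region": "us-east-1", "disk": "ssd"}
	fmt.Println(hasMetadata(local, map[string][]string{"region": {"us-east-1", "us-west-1"}})) // true
	fmt.Println(hasMetadata(local, map[string][]string{"disk": {"hdd"}}))                      // false
}
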
Example #5
func (e *Engine) ResolveJobOffer(jobName string, machID string) error {
	log.V(1).Infof("Attempting to lock JobOffer(%s)", jobName)
	mutex := e.registry.LockJobOffer(jobName, e.machine.State().ID)

	if mutex == nil {
		log.V(1).Infof("Could not lock JobOffer(%s)", jobName)
		return errors.New("could not lock JobOffer")
	}
	defer mutex.Unlock()

	log.V(1).Infof("Claimed JobOffer(%s)", jobName)

	err := e.registry.ResolveJobOffer(jobName)
	if err != nil {
		log.Errorf("Failed resolving JobOffer(%s): %v", jobName, err)
		return err
	}

	err = e.registry.ScheduleJob(jobName, machID)
	if err != nil {
		log.Errorf("Failed scheduling Job(%s): %v", jobName, err)
		return err
	}

	log.Infof("Scheduled Job(%s) to Machine(%s)", jobName, machID)
	return nil
}
Example #6
func (e *Engine) OfferJob(j job.Job) error {
	log.V(1).Infof("Attempting to lock Job(%s)", j.Name)

	mutex := e.registry.LockJob(j.Name, e.machine.State().ID)
	if mutex == nil {
		log.V(1).Infof("Could not lock Job(%s)", j.Name)
		return errors.New("could not lock Job")
	}
	defer mutex.Unlock()

	log.V(1).Infof("Claimed Job(%s)", j.Name)

	machineIDs, err := e.partitionCluster(&j)
	if err != nil {
		log.Errorf("failed partitioning cluster for Job(%s): %v", j.Name, err)
		return err
	}

	offer := job.NewOfferFromJob(j, machineIDs)

	err = e.registry.CreateJobOffer(offer)
	if err == nil {
		log.Infof("Published JobOffer(%s)", offer.Job.Name)
	}

	return err
}
Example #7
// ParseFilepath expands ~ and ~user constructions.
// If user or $HOME is unknown, do nothing.
func ParseFilepath(path string) string {
	if !strings.HasPrefix(path, "~") {
		return path
	}
	i := strings.Index(path, "/")
	if i < 0 {
		i = len(path)
	}
	var home string
	if i == 1 {
		if home = os.Getenv("HOME"); home == "" {
			usr, err := user.Current()
			if err != nil {
				log.V(1).Infof("Failed to get current home directory: %v", err)
				return path
			}
			home = usr.HomeDir
		}
	} else {
		usr, err := user.Lookup(path[1:i])
		if err != nil {
			log.V(1).Infof("Failed to get %v's home directory: %v", path[1:i], err)
			return path
		}
		home = usr.HomeDir
	}
	path = filepath.Join(home, path[i:])
	return path
}
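
For illustration, a runnable sketch of the two lookup paths ParseFilepath relies on: $HOME (with user.Current as a fallback) for a bare "~", and user.Lookup for "~name". The path suffix and the "root" user here are just examples.

package main

import (
	"fmt"
	"os"
	"os/user"
	"path/filepath"
)

func main() {
	// "~" alone: prefer $HOME, fall back to the current user's home directory.
	home := os.Getenv("HOME")
	if home == "" {
		if u, err := user.Current(); err == nil {
			home = u.HomeDir
		}
	}
	fmt.Println(filepath.Join(home, ".ssh/known_hosts"))

	// "~name": resolve that user's home directory explicitly.
	if u, err := user.Lookup("root"); err == nil {
		fmt.Println(filepath.Join(u.HomeDir, ".ssh/known_hosts"))
	}
}
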
Example #8
func (ar *actionResolver) one(req *http.Request, cancel <-chan bool) (resp *http.Response, body []byte, err error) {
	log.V(1).Infof("etcd: sending HTTP request %s %s", req.Method, req.URL)
	resp, body, err = ar.requestFunc(req, cancel)
	if err != nil {
		log.V(1).Infof("etcd: recv error response from %s %s: %v", req.Method, req.URL, err)
		return
	}

	log.V(1).Infof("etcd: recv response from %s %s: %s", req.Method, req.URL, resp.Status)
	return
}
Example #9
func (e *Engine) Run(stop chan bool) {
	ticker := time.Tick(reconcileInterval)
	machID := e.machine.State().ID

	reconcile := func() {
		done := make(chan struct{})
		defer close(done)
		// While the reconciliation is running, flush the trigger channel in the background
		go func() {
			for {
				select {
				case <-done:
					return
				default:
					select {
					case <-e.trigger:
					case <-done:
						return
					}
				}
			}
		}()

		e.lease = ensureLeader(e.lease, e.registry, machID)
		if e.lease == nil {
			return
		}

		start := time.Now()
		e.rec.Reconcile(e)
		elapsed := time.Since(start)

		msg := fmt.Sprintf("Engine completed reconciliation in %s", elapsed)
		if elapsed > reconcileInterval {
			log.Warning(msg)
		} else {
			log.V(1).Info(msg)
		}
	}

	for {
		select {
		case <-stop:
			log.V(1).Info("Engine exiting due to stop signal")
			return
		case <-ticker:
			log.V(1).Info("Engine tick")
			reconcile()
		case <-e.trigger:
			log.V(1).Info("Engine reconcilation triggered by job state change")
			reconcile()
		}
	}
}
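
The done/trigger pair used in reconcile is a small reusable pattern: while the long-running work executes, a helper goroutine discards anything arriving on the trigger channel, so triggers that fire mid-run do not queue up and cause an immediate redundant pass. A standalone sketch of just that pattern (names and timings are illustrative):

package main

import (
	"fmt"
	"time"
)

func main() {
	trigger := make(chan struct{}, 16)

	work := func() {
		done := make(chan struct{})
		defer close(done)
		// Flush the trigger channel for as long as the work is running.
		go func() {
			for {
				select {
				case <-done:
					return
				case <-trigger:
				}
			}
		}()
		time.Sleep(100 * time.Millisecond) // stand-in for Reconcile
	}

	// Fire several triggers while the work is in progress.
	go func() {
		for i := 0; i < 5; i++ {
			trigger <- struct{}{}
			time.Sleep(10 * time.Millisecond)
		}
	}()

	work()
	fmt.Println("triggers left in channel after the run:", len(trigger))
}
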
Example #10
// bidForPossibleJobs submits bids for all unresolved offers whose Jobs
// can be run locally
func (a *Agent) bidForPossibleJobs() {
	offers := a.state.GetOffersWithoutBids()

	log.V(1).Infof("Checking %d unbade offers", len(offers))
	for i := range offers {
		offer := offers[i]
		log.V(1).Infof("Checking ability to run Job(%s)", offer.Job.Name)
		if a.ableToRun(&offer.Job) {
			log.V(1).Infof("Able to run Job(%s), submitting bid", offer.Job.Name)
			a.bid(offer.Job.Name)
		} else {
			log.V(1).Infof("Still unable to run Job(%s)", offer.Job.Name)
		}
	}
}
Example #11
func watch(client etcd.Client, idx uint64, etcdchan chan *etcd.Result, key string, stop chan bool) {
	for {
		select {
		case <-stop:
			log.V(1).Infof("Gracefully closing etcd watch loop: key=%s", key)
			return
		default:
			req := &etcd.Watch{
				Key:       key,
				WaitIndex: idx,
				Recursive: true,
			}

			log.V(1).Infof("Creating etcd watcher: %v", req)

			resp, err := client.Wait(req, stop)
			if err == nil {
				if resp.Node != nil {
					idx = resp.Node.ModifiedIndex + 1
				}
				etcdchan <- resp
				continue
			}

			log.Errorf("etcd watcher %v returned error: %v", req, err)

			etcdError, ok := err.(etcd.Error)
			if !ok {
				// Let's not slam the etcd server when we hit an error
				// we don't know how to interpret.
				time.Sleep(time.Second)
				continue
			}

			switch etcdError.ErrorCode {
			case etcd.ErrorEventIndexCleared:
				// This is racy, but adding one to the last known index
				// will help get this watcher back into the range of
				// etcd's internal event history
				idx = idx + 1
			default:
				// Let's not slam the etcd server for errors we don't
				// handle specially.
				time.Sleep(time.Second)
			}
		}
	}
}
Example #12
func (eh *EventHandler) handleLossOfMachineEvents(ev event.Event) {
	machID := ev.Payload.(string)
	mutex := eh.engine.registry.LockMachine(machID, eh.engine.machine.State().ID)
	if mutex == nil {
		log.V(1).Infof("%s(%s): failed to lock Machine, ignoring event", ev.Type, machID)
		return
	}
	defer mutex.Unlock()

	jobs := getJobsScheduledToMachine(eh.engine.registry, machID)

	for _, j := range jobs {
		log.Infof("%s(%s): clearing UnitState(%s)", ev.Type, machID, j.Name)
		err := eh.engine.registry.RemoveUnitState(j.Name)
		if err != nil {
			log.Errorf("Failed removing UnitState(%s) from Registry: %v", j.Name, err)
		}

		log.Infof("%s(%s): unscheduling Job(%s)", ev.Type, machID, j.Name)
		eh.engine.registry.ClearJobTarget(j.Name, machID)
	}

	for _, j := range jobs {
		log.Infof("%s(%s): re-publishing JobOffer(%s)", ev.Type, machID, j.Name)
		eh.engine.OfferJob(j)
	}
	eh.engine.clust.machineRemoved(machID)
}
Example #13
// verifyJobSignature attempts to verify the integrity of the given Job by checking the
// signature against a SignatureSet stored in the Registry
func (ar *AgentReconciler) verifyJobSignature(j *job.Job) bool {
	if ar.verifier == nil {
		return true
	}
	ss, _ := ar.reg.JobSignatureSet(j.Name)
	ok, err := ar.verifier.VerifyJob(j, ss)
	if err != nil {
		log.V(1).Infof("Error verifying signature of Job(%s): %v", j.Name, err)
		return false
	} else if !ok {
		log.V(1).Infof("Job(%s) does not match signature", j.Name)
		return false
	}

	return true
}
Example #14
func findAddressInMachineList(lookup string) (string, bool) {
	states, err := cAPI.Machines()
	if err != nil {
		log.V(1).Infof("Unable to retrieve list of active machines from the Registry: %v", err)
		return "", false
	}

	var match *machine.MachineState

	for i := range states {
		machState := states[i]
		if !strings.HasPrefix(machState.ID, lookup) {
			continue
		} else if match != nil {
			fmt.Fprintln(os.Stderr, "Found more than one Machine, be more specific.")
			os.Exit(1)
		}
		match = &machState
	}

	if match == nil {
		return "", false
	}
	return match.PublicIP, true
}
Example #15
// check attempts to beat a Heart several times within a timeout, returning the
// log index at which the beat succeeded or an error
func (m *Monitor) check(hrt Heart) (idx uint64, err error) {
	// time out after a third of the machine presence TTL, attempting
	// the heartbeat up to four times
	timeout := m.TTL / 3
	interval := timeout / 4

	tchan := time.After(timeout)
	next := time.After(0)
	for idx == 0 {
		select {
		case <-tchan:
			err = errors.New("Monitor timed out before successful heartbeat")
			return
		case <-next:
			idx, err = hrt.Beat(m.TTL)
			if err != nil {
				log.V(1).Infof("Monitor heartbeat function returned err, retrying in %v: %v", interval, err)
			}

			next = time.After(interval)
		}
	}

	return
}
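
Reduced to its core, check is "retry an operation at a fixed interval until it succeeds or an overall deadline passes". A standalone sketch of that shape (tryWithin is illustrative, not fleet API):

package main

import (
	"errors"
	"fmt"
	"time"
)

// tryWithin retries op every interval until it succeeds or timeout elapses.
func tryWithin(timeout, interval time.Duration, op func() error) error {
	deadline := time.After(timeout)
	next := time.After(0)
	for {
		select {
		case <-deadline:
			return errors.New("timed out before a successful attempt")
		case <-next:
			if err := op(); err == nil {
				return nil
			}
			next = time.After(interval)
		}
	}
}

func main() {
	calls := 0
	err := tryWithin(time.Second, 50*time.Millisecond, func() error {
		calls++
		if calls < 3 {
			return errors.New("not yet")
		}
		return nil
	})
	fmt.Println(calls, err) // typically: 3 <nil>
}
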
Example #16
func globMatches(pattern, target string) bool {
	matched, err := path.Match(pattern, target)
	if err != nil {
		log.V(1).Infof("Received error while matching pattern '%s': %v", pattern, err)
	}
	return matched
}
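
globMatches is a thin wrapper over path.Match, which does shell-style matching on a single path segment and only errors on a malformed pattern (which globMatches reports as "no match"). For example:

package main

import (
	"fmt"
	"path"
)

func main() {
	fmt.Println(path.Match("foo.*", "foo.service")) // true <nil>
	fmt.Println(path.Match("foo.*", "bar.service")) // false <nil>
	// A malformed pattern yields ErrBadPattern; globMatches would return false.
	fmt.Println(path.Match("[oops", "foo.service"))
}
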
Example #17
func NewCoreOSMachine(static MachineState, um unit.UnitManager) *CoreOSMachine {
	log.V(1).Infof("Created CoreOSMachine with static state %s", static)
	m := &CoreOSMachine{
		staticState: static,
		um:          um,
	}
	return m
}
Example #18
// Run periodically attempts to reconcile the provided Agent until the stop
// channel is closed. Run will also reconcile in reaction to calls to Trigger.
// While a reconciliation is being attempted, calls to Trigger are ignored.
func (ar *AgentReconciler) Run(a *Agent, stop chan bool) {
	ticker := time.Tick(reconcileInterval)

	reconcile := func() {
		done := make(chan struct{})
		defer close(done)
		// While the reconciliation is running, flush the trigger channel in the background
		go func() {
			for {
				select {
				case <-done:
					return
				default:
					select {
					case <-ar.rTrigger:
					case <-done:
						return
					}
				}
			}
		}()

		start := time.Now()
		ar.Reconcile(a)
		elapsed := time.Since(start)

		msg := fmt.Sprintf("AgentReconciler completed reconciliation in %s", elapsed)
		if elapsed > reconcileInterval {
			log.Warning(msg)
		} else {
			log.V(1).Info(msg)
		}
	}

	for {
		select {
		case <-stop:
			log.V(1).Info("AgentReconciler exiting due to stop signal")
			return
		case <-ticker:
			reconcile()
		case <-ar.rTrigger:
			reconcile()
		}
	}
}
Example #19
func (ar *AgentReconciler) calculateTasksForOffer(dState *agentState, ms *machine.MachineState, j *job.Job, bids pkg.Set, taskchan chan *task) {
	if bids.Contains(ms.ID) {
		log.V(1).Infof("Bid already submitted for unresolved JobOffer(%s)", j.Name)
		return
	}

	if able, reason := ar.ableToRun(dState, ms, j); !able {
		log.V(1).Infof("Not bidding on Job(%s): %s", j.Name, reason)
		return
	}

	taskchan <- &task{
		Type:   taskTypeSubmitBid,
		Job:    j,
		Reason: taskReasonAbleToResolveOffer,
	}
}
Example #20
// ableToRun determines if the Agent can run the provided Job, and returns a boolean indicating
// whether this is the case. There are five criteria for an Agent to be eligible to run a Job:
//   - Job must pass signature verification
//   - agent must have all of the Job's required metadata (if any)
//   - agent must meet the Job's machine target requirement (if any)
//   - agent must have all required Peers of the Job scheduled locally (if any)
//   - Job must not conflict with any other Jobs scheduled to the agent
func (a *Agent) ableToRun(j *job.Job) bool {
	if !a.verifyJobSignature(j) {
		log.V(1).Infof("Failed to verify Job(%s)", j.Name)
		return false
	}

	log.Infof("Job(%s) has requirements: %s", j.Name, j.Requirements())

	metadata := j.RequiredTargetMetadata()
	if len(metadata) == 0 {
		log.V(1).Infof("Job(%s) has no required machine metadata", j.Name)
	} else {
		log.V(1).Infof("Job(%s) requires machine metadata: %v", j.Name, metadata)
		ms := a.Machine.State()
		if !machine.HasMetadata(&ms, metadata) {
			log.Infof("Unable to run Job(%s): local Machine metadata insufficient", j.Name)
			return false
		}
	}

	if tgt, ok := j.RequiredTarget(); ok && !a.Machine.State().MatchID(tgt) {
		log.Infof("Unable to run Job(%s): agent does not meet machine target requirement (%s)", j.Name, tgt)
		return false
	}

	peers := j.Peers()
	if len(peers) == 0 {
		log.V(1).Infof("Job(%s) has no required peers", j.Name)
	} else {
		log.V(1).Infof("Job(%s) requires peers: %v", j.Name, peers)
		for _, peer := range peers {
			if !a.peerScheduledHere(j.Name, peer) {
				log.Infof("Unable to run Job(%s): required Peer(%s) is not scheduled locally", j.Name, peer)
				return false
			}
		}
	}

	if conflicted, conflictedJobName := a.HasConflict(j.Name, j.Conflicts()); conflicted {
		log.Infof("Unable to run Job(%s): conflict with Job(%s)", j.Name, conflictedJobName)
		return false
	}

	return true
}
Example #21
// heartbeatAgent periodically reports to the Registry at an
// interval equal to half of the provided ttl. heartbeatAgent
// stops reporting when the provided channel is closed. Failed
// attempts to report state to the Registry are retried twice
// before moving on to the next reporting interval.
func (a *Agent) heartbeatAgent(ttl time.Duration, stop chan bool) {
	attempt := func(attempts int, f func() error) (err error) {
		if attempts < 1 {
			return fmt.Errorf("attempts argument must be 1 or greater, got %d", attempts)
		}

		// The amount of time the retry mechanism waits after a failed attempt
		// doubles following each failure. This is a simple exponential backoff.
		sleep := time.Second

		for i := 1; i <= attempts; i++ {
			err = f()
			if err == nil || i == attempts {
				break
			}

			sleep = sleep * 2
			log.V(1).Infof("function returned err, retrying in %v: %v", sleep, err)
			time.Sleep(sleep)
		}

		return err
	}

	heartbeat := func() error {
		_, err := a.registry.SetMachineState(a.Machine.State(), ttl)
		return err
	}

	interval := ttl / refreshInterval
	ticker := time.Tick(interval)
	for {
		select {
		case <-stop:
			log.V(1).Info("Heartbeat exiting due to stop signal")
			return
		case <-ticker:
			log.V(1).Info("Heartbeat tick")
			if err := attempt(3, heartbeat); err != nil {
				log.Errorf("Failed heartbeat after 3 attempts: %v", err)
			}
		}
	}
}
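
The attempt closure above is a bounded retry with doubling backoff. An equivalent standalone sketch (retry is an illustrative helper, using a shorter base delay so the demo runs quickly):

package main

import (
	"errors"
	"fmt"
	"time"
)

// retry runs f up to attempts times, doubling the pause after each failure.
func retry(attempts int, f func() error) (err error) {
	sleep := 100 * time.Millisecond
	for i := 1; i <= attempts; i++ {
		if err = f(); err == nil || i == attempts {
			return err
		}
		time.Sleep(sleep)
		sleep *= 2
	}
	return err
}

func main() {
	calls := 0
	err := retry(3, func() error {
		calls++
		if calls < 2 {
			return errors.New("transient failure")
		}
		return nil
	})
	fmt.Println(calls, err) // 2 <nil>: succeeded after one retry
}
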
Example #22
// lockResource will attempt to lock a mutex on a resource defined by the
// provided class and id. The context will be persisted to the Registry to
// track by whom the mutex is currently locked.
func (r *EtcdRegistry) lockResource(class, id, context string) *TimedResourceMutex {
	mutexName := fmt.Sprintf("%s-%s", class, id)
	log.V(1).Infof("Attempting to acquire mutex on %s", mutexName)

	req := etcd.Create{
		Key:   path.Join(r.keyPrefix, mutexPrefix, mutexName),
		Value: context,
		TTL:   ResourceMutexTTL,
	}

	resp, err := r.etcd.Do(&req)
	if err != nil {
		log.V(1).Infof("Failed to acquire mutex on %s", mutexName)
		return nil
	}

	log.V(1).Infof("Successfully acquired mutex on %s", mutexName)
	return &TimedResourceMutex{r.etcd, *resp.Node}
}
Example #23
// peerScheduledHere determines whether the given peer of a Job is scheduled to this Agent
func (a *Agent) peerScheduledHere(jobName, peerName string) bool {
	log.V(1).Infof("Looking for target of Peer(%s)", peerName)

	j, err := a.registry.Job(peerName)
	if err != nil {
		log.Errorf("Failed retrieving Job(%s) from Registry: %v", peerName, err)
		return false
	} else if j == nil {
		return false
	}

	if j.TargetMachineID == "" || j.TargetMachineID != a.Machine.State().ID {
		log.V(1).Infof("Peer(%s) of Job(%s) not scheduled here", peerName, jobName)
		return false
	}

	log.V(1).Infof("Peer(%s) of Job(%s) scheduled here", peerName, jobName)
	return true
}
Example #24
func pipe(etcdchan chan *etcd.Result, filters []func(res *etcd.Result) *event.Event, sendFunc func(*event.Event), stop chan bool) {
	for {
		select {
		case <-stop:
			return
		case res := <-etcdchan:
			log.V(1).Infof("Received %v from etcd watch", res)
			for _, f := range filters {
				ev := f(res)
				if ev == nil {
					continue
				}

				log.V(1).Infof("Translated %v to Event(Type=%s)", res, ev.Type)
				sendFunc(ev)
			}
		}
	}
}
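
The filter chain in pipe applies every translator to each incoming result and forwards each non-nil event it produces. A standalone sketch of that fan-through, with strings standing in for etcd results and events:

package main

import (
	"fmt"
	"strings"
)

func main() {
	in := make(chan string, 3)
	in <- "job: created"
	in <- "noise"
	in <- "job: removed"
	close(in)

	// Each filter either translates an input into an "event" or drops it by
	// returning the empty string.
	filters := []func(string) string{
		func(s string) string {
			if strings.HasPrefix(s, "job:") {
				return "Event(" + strings.TrimSpace(strings.TrimPrefix(s, "job:")) + ")"
			}
			return ""
		},
	}

	for s := range in {
		for _, f := range filters {
			if ev := f(s); ev != "" {
				fmt.Println("send:", ev)
			}
		}
	}
}
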
Example #25
func createJob(jobName string, unit *unit.Unit) (*job.Job, error) {
	j := job.NewJob(jobName, *unit)

	if err := cAPI.CreateJob(j); err != nil {
		return nil, fmt.Errorf("failed creating job %s: %v", j.Name, err)
	}

	log.V(1).Infof("Created Job(%s) in Registry", j.Name)

	return j, nil
}
Example #26
func (eh *EventHandler) HandleEventJobOffered(ev event.Event) {
	jo := ev.Payload.(job.JobOffer)

	if !jo.OfferedTo(eh.agent.Machine.State().ID) {
		log.V(1).Infof("EventJobOffered(%s): not offered to this machine, ignoring", jo.Job.Name)
		return
	}

	log.Infof("EventJobOffered(%s): deciding whether to bid or not", jo.Job.Name)
	eh.agent.MaybeBid(jo)
}
Example #27
func findAddressInRunningUnits(jobName string) (string, bool) {
	name := unitNameMangle(jobName)
	j, err := cAPI.Job(name)
	if err != nil {
		log.V(1).Infof("Unable to retrieve Job(%s) from Repository: %v", name, err)
	}
	if j == nil || j.UnitState == nil {
		return "", false
	}
	return j.UnitState.MachineState.PublicIP, true
}
Example #28
// getUnitFromFile attempts to load a Unit from the given filename.
// It returns the Unit, or nil, along with any error encountered.
func getUnitFromFile(file string) (*unit.Unit, error) {
	out, err := ioutil.ReadFile(file)
	if err != nil {
		return nil, err
	}

	unitName := path.Base(file)
	log.V(1).Infof("Unit(%s) found in local filesystem", unitName)

	return unit.NewUnit(string(out))
}
Example #29
func (eh *EventHandler) HandleEventUnitStateUpdated(ev event.Event) {
	jobName := ev.Context.(string)
	state := ev.Payload.(*unit.UnitState)

	if state == nil {
		log.V(1).Infof("EventUnitStateUpdated(%s): received nil UnitState object, ignoring", jobName)
		return
	}

	log.Infof("EventUnitStateUpdated(%s): reporting state to Registry", jobName)
	eh.agent.ReportUnitState(jobName, state)
}
Example #30
func (eh *EventHandler) HandleCommandStopJob(ev event.Event) {
	jobName := ev.Payload.(string)
	target := ev.Context.(string)

	if target != eh.agent.Machine.State().ID {
		log.V(1).Infof("CommandStopJob(%s): scheduled elsewhere, ignoring", jobName)
		return
	}

	log.Infof("CommandStopJob(%s): instructing Agent to stop Job", jobName)
	eh.agent.StopJob(jobName)
}