func runUnloadUnit(args []string) (exit int) {
	jobs, err := findJobs(args)
	if err != nil {
		fmt.Fprintf(os.Stderr, "%v\n", err)
		return 1
	}

	wait := make([]string, 0)
	for _, j := range jobs {
		if j.State == nil {
			// Report the job by name; j.State is nil here and must not be dereferenced.
			fmt.Fprintf(os.Stderr, "Unable to determine state of %q\n", j.Name)
			return 1
		}

		if *(j.State) == job.JobStateInactive {
			log.V(1).Infof("Job(%s) already %s, skipping.", j.Name, job.JobStateInactive)
			continue
		}

		log.V(1).Infof("Unloading Job(%s)", j.Name)
		cAPI.SetJobTargetState(j.Name, job.JobStateInactive)
		wait = append(wait, j.Name)
	}

	if !sharedFlags.NoBlock {
		errchan := waitForJobStates(wait, job.JobStateInactive, sharedFlags.BlockAttempts, os.Stdout)
		for err := range errchan {
			fmt.Fprintf(os.Stderr, "%v\n", err)
			exit = 1
		}
	}

	return
}
// addrToHostPort takes the given address and parses it into a string suitable
// for use in the 'hostnames' field in a known_hosts file. For more details,
// see the `SSH_KNOWN_HOSTS FILE FORMAT` section of `man 8 sshd`.
func (kc *HostKeyChecker) addrToHostPort(a string) (string, error) {
	if !strings.Contains(a, ":") {
		// No port, so return unadulterated
		return a, nil
	}
	host, p, err := net.SplitHostPort(a)
	if err != nil {
		log.V(1).Infof("Unable to parse addr %s: %v", a, err)
		return "", err
	}

	port, err := strconv.Atoi(p)
	if err != nil {
		log.V(1).Infof("Error parsing port %s: %v", p, err)
		return "", err
	}

	// Default port should be omitted from the entry.
	// (see `put_host_port` in openssh/misc.c)
	if port == 0 || port == sshDefaultPort {
		// IPv6 addresses must be enclosed in square brackets
		if strings.Contains(host, ":") {
			host = fmt.Sprintf("[%s]", host)
		}
		return host, nil
	}

	return fmt.Sprintf("[%s]:%d", host, port), nil
}
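// The translations below are an illustrative sketch (not taken from the fleet
// test suite) of what addrToHostPort is expected to produce, assuming
// sshDefaultPort is 22 and kc is an initialized *HostKeyChecker:
//
//	"example.com"        -> "example.com"          (no port present)
//	"example.com:22"     -> "example.com"          (default port omitted)
//	"example.com:2222"   -> "[example.com]:2222"   (non-default port kept)
//	"[2001:db8::1]:2222" -> "[2001:db8::1]:2222"   (IPv6 host stays bracketed)
func exampleAddrToHostPort(kc *HostKeyChecker) {
	addrs := []string{"example.com", "example.com:22", "example.com:2222", "[2001:db8::1]:2222"}
	for _, addr := range addrs {
		hp, err := kc.addrToHostPort(addr)
		if err != nil {
			log.V(1).Infof("addrToHostPort(%s) failed: %v", addr, err)
			continue
		}
		fmt.Println(addr, "->", hp)
	}
}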
// bidForPossiblePeers submits bids for all known peers of the provided job that can
// be run locally
func (a *Agent) bidForPossiblePeers(jobName string) {
	peers := a.state.GetJobsByPeer(jobName)

	for _, peer := range peers {
		log.V(1).Infof("Found unresolved offer for Peer(%s) of Job(%s)", peer, jobName)

		peerJob, err := a.registry.Job(peer)
		if err != nil {
			log.Errorf("Failed fetching Job(%s) from Registry: %v", peer, err)
			return
		}

		if peerJob == nil {
			log.V(1).Infof("Unable to find Peer(%s) of Job(%s) in Registry", peer, jobName)
			return
		}

		if !a.ableToRun(peerJob) {
			log.V(1).Infof("Unable to run Peer(%s) of Job(%s), not bidding", peer, jobName)
			return
		}

		a.bid(peer)
	}
}
// HasMetadata determines whether the Metadata of a given MachineState
// matches the indicated values.
func HasMetadata(state *MachineState, metadata map[string][]string) bool {
	for key, values := range metadata {
		local, ok := state.Metadata[key]
		if !ok {
			log.V(1).Infof("No local values found for Metadata(%s)", key)
			return false
		}

		log.V(1).Infof("Asserting local Metadata(%s) meets requirements", key)

		var localMatch bool
		for _, val := range values {
			if local == val {
				log.V(1).Infof("Local Metadata(%s) meets requirement", key)
				localMatch = true
			}
		}

		if !localMatch {
			log.V(1).Infof("Local Metadata(%s) does not match requirement", key)
			return false
		}
	}

	return true
}
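// A minimal usage sketch for HasMetadata; the MachineState literal and the
// metadata keys below are hypothetical and only illustrate the matching
// semantics: every required key must exist locally, and the local value must
// appear in that key's list of acceptable values.
func exampleHasMetadata() bool {
	state := &MachineState{Metadata: map[string]string{"region": "us-east-1", "disk": "ssd"}}
	required := map[string][]string{
		"region": {"us-east-1", "us-west-1"}, // local "us-east-1" is acceptable
		"disk":   {"ssd"},                    // local "ssd" is acceptable
	}
	return HasMetadata(state, required) // true under these assumptions
}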
func (e *Engine) ResolveJobOffer(jobName string, machID string) error {
	log.V(1).Infof("Attempting to lock JobOffer(%s)", jobName)
	mutex := e.registry.LockJobOffer(jobName, e.machine.State().ID)
	if mutex == nil {
		log.V(1).Infof("Could not lock JobOffer(%s)", jobName)
		return errors.New("could not lock JobOffer")
	}
	defer mutex.Unlock()

	log.V(1).Infof("Claimed JobOffer(%s)", jobName)

	err := e.registry.ResolveJobOffer(jobName)
	if err != nil {
		log.Errorf("Failed resolving JobOffer(%s): %v", jobName, err)
		return err
	}

	err = e.registry.ScheduleJob(jobName, machID)
	if err != nil {
		log.Errorf("Failed scheduling Job(%s): %v", jobName, err)
		return err
	}

	log.Infof("Scheduled Job(%s) to Machine(%s)", jobName, machID)
	return nil
}
func (e *Engine) OfferJob(j job.Job) error {
	log.V(1).Infof("Attempting to lock Job(%s)", j.Name)
	mutex := e.registry.LockJob(j.Name, e.machine.State().ID)
	if mutex == nil {
		log.V(1).Infof("Could not lock Job(%s)", j.Name)
		return errors.New("could not lock Job")
	}
	defer mutex.Unlock()

	log.V(1).Infof("Claimed Job(%s)", j.Name)

	machineIDs, err := e.partitionCluster(&j)
	if err != nil {
		log.Errorf("failed partitioning cluster for Job(%s): %v", j.Name, err)
		return err
	}

	offer := job.NewOfferFromJob(j, machineIDs)

	err = e.registry.CreateJobOffer(offer)
	if err == nil {
		log.Infof("Published JobOffer(%s)", offer.Job.Name)
	}

	return err
}
// ParseFilepath expands ~ and ~user constructions.
// If user or $HOME is unknown, do nothing.
func ParseFilepath(path string) string {
	if !strings.HasPrefix(path, "~") {
		return path
	}
	i := strings.Index(path, "/")
	if i < 0 {
		i = len(path)
	}
	var home string
	if i == 1 {
		if home = os.Getenv("HOME"); home == "" {
			usr, err := user.Current()
			if err != nil {
				log.V(1).Infof("Failed to get current home directory: %v", err)
				return path
			}
			home = usr.HomeDir
		}
	} else {
		usr, err := user.Lookup(path[1:i])
		if err != nil {
			log.V(1).Infof("Failed to get %v's home directory: %v", path[1:i], err)
			return path
		}
		home = usr.HomeDir
	}
	path = filepath.Join(home, path[i:])
	return path
}
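// An illustrative sketch (the paths and the $HOME value are assumptions, not
// taken from the fleet source) of the expansions ParseFilepath performs when
// $HOME=/home/core and a local user "core" exists:
func exampleParseFilepath() {
	fmt.Println(ParseFilepath("~/units/foo.service"))     // /home/core/units/foo.service
	fmt.Println(ParseFilepath("~core/units/foo.service")) // expanded via user.Lookup("core")
	fmt.Println(ParseFilepath("~nosuchuser/foo.service")) // lookup fails, returned unchanged
	fmt.Println(ParseFilepath("/etc/fleet/foo.service"))  // no ~ prefix, returned unchanged
}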
func (ar *actionResolver) one(req *http.Request, cancel <-chan bool) (resp *http.Response, body []byte, err error) {
	log.V(1).Infof("etcd: sending HTTP request %s %s", req.Method, req.URL)
	resp, body, err = ar.requestFunc(req, cancel)
	if err != nil {
		log.V(1).Infof("etcd: recv error response from %s %s: %v", req.Method, req.URL, err)
		return
	}

	log.V(1).Infof("etcd: recv response from %s %s: %s", req.Method, req.URL, resp.Status)
	return
}
func (e *Engine) Run(stop chan bool) {
	ticker := time.Tick(reconcileInterval)
	machID := e.machine.State().ID

	reconcile := func() {
		done := make(chan struct{})
		defer func() { close(done) }()
		// While the reconciliation is running, flush the trigger channel in the background
		go func() {
			for {
				select {
				case <-done:
					return
				default:
					select {
					case <-e.trigger:
					case <-done:
						return
					}
				}
			}
		}()

		e.lease = ensureLeader(e.lease, e.registry, machID)
		if e.lease == nil {
			return
		}

		start := time.Now()
		e.rec.Reconcile(e)
		elapsed := time.Now().Sub(start)

		msg := fmt.Sprintf("Engine completed reconciliation in %s", elapsed)
		if elapsed > reconcileInterval {
			log.Warning(msg)
		} else {
			log.V(1).Info(msg)
		}
	}

	for {
		select {
		case <-stop:
			log.V(1).Info("Engine exiting due to stop signal")
			return
		case <-ticker:
			log.V(1).Info("Engine tick")
			reconcile()
		case <-e.trigger:
			log.V(1).Info("Engine reconciliation triggered by job state change")
			reconcile()
		}
	}
}
// bidForPossibleJobs submits bids for all unresolved offers whose Jobs
// can be run locally
func (a *Agent) bidForPossibleJobs() {
	offers := a.state.GetOffersWithoutBids()

	log.V(1).Infof("Checking %d unbade offers", len(offers))

	for i := range offers {
		offer := offers[i]

		log.V(1).Infof("Checking ability to run Job(%s)", offer.Job.Name)
		if a.ableToRun(&offer.Job) {
			log.V(1).Infof("Able to run Job(%s), submitting bid", offer.Job.Name)
			a.bid(offer.Job.Name)
		} else {
			log.V(1).Infof("Still unable to run Job(%s)", offer.Job.Name)
		}
	}
}
func watch(client etcd.Client, idx uint64, etcdchan chan *etcd.Result, key string, stop chan bool) {
	for {
		select {
		case <-stop:
			log.V(1).Infof("Gracefully closing etcd watch loop: key=%s", key)
			return
		default:
			req := &etcd.Watch{
				Key:       key,
				WaitIndex: idx,
				Recursive: true,
			}

			log.V(1).Infof("Creating etcd watcher: %v", req)

			resp, err := client.Wait(req, stop)
			if err == nil {
				if resp.Node != nil {
					idx = resp.Node.ModifiedIndex + 1
				}
				etcdchan <- resp
				continue
			}

			log.Errorf("etcd watcher %v returned error: %v", req, err)

			etcdError, ok := err.(etcd.Error)
			if !ok {
				// Let's not slam the etcd server in the event that we know
				// an unexpected error occurred.
				time.Sleep(time.Second)
				continue
			}

			switch etcdError.ErrorCode {
			case etcd.ErrorEventIndexCleared:
				// This is racy, but adding one to the last known index
				// will help get this watcher back into the range of
				// etcd's internal event history
				idx = idx + 1
			default:
				// Let's not slam the etcd server in the event that we know
				// an unexpected error occurred.
				time.Sleep(time.Second)
			}
		}
	}
}
func (eh *EventHandler) handleLossOfMachineEvents(ev event.Event) {
	machID := ev.Payload.(string)
	mutex := eh.engine.registry.LockMachine(machID, eh.engine.machine.State().ID)
	if mutex == nil {
		log.V(1).Infof("%s(%s): failed to lock Machine, ignoring event", ev.Type, machID)
		return
	}
	defer mutex.Unlock()

	jobs := getJobsScheduledToMachine(eh.engine.registry, machID)

	for _, j := range jobs {
		log.Infof("%s(%s): clearing UnitState(%s)", ev.Type, machID, j.Name)
		err := eh.engine.registry.RemoveUnitState(j.Name)
		if err != nil {
			log.Errorf("Failed removing UnitState(%s) from Registry: %v", j.Name, err)
		}

		log.Infof("%s(%s): unscheduling Job(%s)", ev.Type, machID, j.Name)
		eh.engine.registry.ClearJobTarget(j.Name, machID)
	}

	for _, j := range jobs {
		log.Infof("%s(%s): re-publishing JobOffer(%s)", ev.Type, machID, j.Name)
		eh.engine.OfferJob(j)
	}

	eh.engine.clust.machineRemoved(machID)
}
// verifyJobSignature attempts to verify the integrity of the given Job by checking the
// signature against a SignatureSet stored in the Registry
func (ar *AgentReconciler) verifyJobSignature(j *job.Job) bool {
	if ar.verifier == nil {
		return true
	}
	ss, _ := ar.reg.JobSignatureSet(j.Name)
	ok, err := ar.verifier.VerifyJob(j, ss)
	if err != nil {
		log.V(1).Infof("Error verifying signature of Job(%s): %v", j.Name, err)
		return false
	} else if !ok {
		log.V(1).Infof("Job(%s) does not match signature", j.Name)
		return false
	}

	return true
}
func findAddressInMachineList(lookup string) (string, bool) {
	states, err := cAPI.Machines()
	if err != nil {
		log.V(1).Infof("Unable to retrieve list of active machines from the Registry: %v", err)
		return "", false
	}

	var match *machine.MachineState
	for i := range states {
		machState := states[i]
		if !strings.HasPrefix(machState.ID, lookup) {
			continue
		} else if match != nil {
			fmt.Fprintln(os.Stderr, "Found more than one Machine, be more specific.")
			os.Exit(1)
		}
		match = &machState
	}

	if match == nil {
		return "", false
	}

	return match.PublicIP, true
}
// check attempts to beat a Heart several times within a timeout, returning the
// log index at which the beat succeeded or an error
func (m *Monitor) check(hrt Heart) (idx uint64, err error) {
	// time out after a third of the machine presence TTL, attempting
	// the heartbeat up to four times
	timeout := m.TTL / 3
	interval := timeout / 4

	tchan := time.After(timeout)
	next := time.After(0)
	for idx == 0 {
		select {
		case <-tchan:
			err = errors.New("Monitor timed out before successful heartbeat")
			return
		case <-next:
			idx, err = hrt.Beat(m.TTL)
			if err != nil {
				log.V(1).Infof("Monitor heartbeat function returned err, retrying in %v: %v", interval, err)
			}

			next = time.After(interval)
		}
	}

	return
}
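// A worked timing sketch for check (the TTL value is an assumption, used only
// for illustration): with a machine presence TTL of 30s the deadline is
// TTL/3 = 10s and the retry interval is 10s/4 = 2.5s, so roughly four beats
// can be attempted before the monitor gives up.
func exampleMonitorTiming() {
	ttl := 30 * time.Second
	timeout := ttl / 3      // 10s before check returns an error
	interval := timeout / 4 // 2.5s between heartbeat attempts
	fmt.Println(timeout, interval)
}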
func globMatches(pattern, target string) bool {
	matched, err := path.Match(pattern, target)
	if err != nil {
		log.V(1).Infof("Received error while matching pattern '%s': %v", pattern, err)
	}
	return matched
}
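// A brief sketch of the glob semantics globMatches inherits from path.Match:
// '*' matches any run of non-separator characters and '?' matches exactly
// one. The unit names below are hypothetical.
func exampleGlobMatches() {
	fmt.Println(globMatches("foo.*", "foo.service"))           // true
	fmt.Println(globMatches("foo@?.service", "foo@1.service")) // true
	fmt.Println(globMatches("bar.*", "foo.service"))           // false
}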
func NewCoreOSMachine(static MachineState, um unit.UnitManager) *CoreOSMachine {
	log.V(1).Infof("Created CoreOSMachine with static state %s", static)

	m := &CoreOSMachine{
		staticState: static,
		um:          um,
	}

	return m
}
// Run periodically attempts to reconcile the provided Agent until the stop
// channel is closed. Run will also reconcile in reaction to calls to Trigger.
// While a reconciliation is being attempted, calls to Trigger are ignored.
func (ar *AgentReconciler) Run(a *Agent, stop chan bool) {
	ticker := time.Tick(reconcileInterval)

	reconcile := func() {
		done := make(chan struct{})
		defer close(done)
		// While the reconciliation is running, flush the trigger channel in the background
		go func() {
			for {
				select {
				case <-done:
					return
				default:
					select {
					case <-ar.rTrigger:
					case <-done:
						return
					}
				}
			}
		}()

		start := time.Now()
		ar.Reconcile(a)
		elapsed := time.Now().Sub(start)

		msg := fmt.Sprintf("AgentReconciler completed reconciliation in %s", elapsed)
		if elapsed > reconcileInterval {
			log.Warning(msg)
		} else {
			log.V(1).Info(msg)
		}
	}

	for {
		select {
		case <-stop:
			log.V(1).Info("AgentReconciler exiting due to stop signal")
			return
		case <-ticker:
			reconcile()
		case <-ar.rTrigger:
			reconcile()
		}
	}
}
func (ar *AgentReconciler) calculateTasksForOffer(dState *agentState, ms *machine.MachineState, j *job.Job, bids pkg.Set, taskchan chan *task) {
	if bids.Contains(ms.ID) {
		log.V(1).Infof("Bid already submitted for unresolved JobOffer(%s)", j.Name)
		return
	}

	if able, reason := ar.ableToRun(dState, ms, j); !able {
		log.V(1).Infof("Not bidding on Job(%s): %s", j.Name, reason)
		return
	}

	taskchan <- &task{
		Type:   taskTypeSubmitBid,
		Job:    j,
		Reason: taskReasonAbleToResolveOffer,
	}
}
// ableToRun determines if the Agent can run the provided Job, and returns a boolean indicating
// whether this is the case. There are five criteria for an Agent to be eligible to run a Job:
// - Job must pass signature verification
// - agent must have all of the Job's required metadata (if any)
// - agent must meet the Job's machine target requirement (if any)
// - agent must have all required Peers of the Job scheduled locally (if any)
// - Job must not conflict with any other Jobs scheduled to the agent
func (a *Agent) ableToRun(j *job.Job) bool {
	if !a.verifyJobSignature(j) {
		log.V(1).Infof("Failed to verify Job(%s)", j.Name)
		return false
	}

	log.Infof("Job(%s) has requirements: %s", j.Name, j.Requirements())

	metadata := j.RequiredTargetMetadata()
	if len(metadata) == 0 {
		log.V(1).Infof("Job(%s) has no required machine metadata", j.Name)
	} else {
		log.V(1).Infof("Job(%s) requires machine metadata: %v", j.Name, metadata)
		ms := a.Machine.State()
		if !machine.HasMetadata(&ms, metadata) {
			log.Infof("Unable to run Job(%s): local Machine metadata insufficient", j.Name)
			return false
		}
	}

	if tgt, ok := j.RequiredTarget(); ok && !a.Machine.State().MatchID(tgt) {
		log.Infof("Unable to run Job(%s): agent does not meet machine target requirement (%s)", j.Name, tgt)
		return false
	}

	peers := j.Peers()
	if len(peers) == 0 {
		log.V(1).Infof("Job(%s) has no required peers", j.Name)
	} else {
		log.V(1).Infof("Job(%s) requires peers: %v", j.Name, peers)
		for _, peer := range peers {
			if !a.peerScheduledHere(j.Name, peer) {
				log.Infof("Unable to run Job(%s): required Peer(%s) is not scheduled locally", j.Name, peer)
				return false
			}
		}
	}

	if conflicted, conflictedJobName := a.HasConflict(j.Name, j.Conflicts()); conflicted {
		log.Infof("Unable to run Job(%s): conflict with Job(%s)", j.Name, conflictedJobName)
		return false
	}

	return true
}
// heartbeatAgent periodically reports to the Registry at an
// interval equal to half of the provided ttl. heartbeatAgent
// stops reporting when the provided channel is closed. Failed
// attempts to report state to the Registry are retried twice
// before moving on to the next reporting interval.
func (a *Agent) heartbeatAgent(ttl time.Duration, stop chan bool) {
	attempt := func(attempts int, f func() error) (err error) {
		if attempts < 1 {
			return fmt.Errorf("attempts argument must be 1 or greater, got %d", attempts)
		}

		// The amount of time the retry mechanism waits after a failed attempt
		// doubles following each failure. This is a simple exponential backoff.
		sleep := time.Second

		for i := 1; i <= attempts; i++ {
			err = f()
			if err == nil || i == attempts {
				break
			}

			sleep = sleep * 2
			log.V(1).Infof("function returned err, retrying in %v: %v", sleep, err)
			time.Sleep(sleep)
		}

		return err
	}

	heartbeat := func() error {
		_, err := a.registry.SetMachineState(a.Machine.State(), ttl)
		return err
	}

	interval := ttl / refreshInterval
	ticker := time.Tick(interval)
	for {
		select {
		case <-stop:
			log.V(1).Info("Heartbeat exiting due to stop signal")
			return
		case <-ticker:
			log.V(1).Info("Heartbeat tick")
			if err := attempt(3, heartbeat); err != nil {
				log.Errorf("Failed heartbeat after 3 attempts: %v", err)
			}
		}
	}
}
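// The same retry pattern as the attempt closure above, pulled out as a
// standalone sketch so the timing is easy to see in isolation; the name
// retryWithBackoff is hypothetical and not part of fleet. Assuming every call
// fails, retryWithBackoff(3, f) calls f, waits 2s, calls f again, waits 4s,
// then makes a final call and returns its error.
func retryWithBackoff(attempts int, f func() error) (err error) {
	sleep := time.Second
	for i := 1; i <= attempts; i++ {
		if err = f(); err == nil || i == attempts {
			break
		}
		sleep *= 2 // 2s after the first failure, 4s after the second, ...
		time.Sleep(sleep)
	}
	return err
}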
// lockResource will attempt to lock a mutex on a resource defined by the
// provided class and id. The context will be persisted to the Registry to
// track by whom the mutex is currently locked.
func (r *EtcdRegistry) lockResource(class, id, context string) *TimedResourceMutex {
	mutexName := fmt.Sprintf("%s-%s", class, id)
	log.V(1).Infof("Attempting to acquire mutex on %s", mutexName)

	req := etcd.Create{
		Key:   path.Join(r.keyPrefix, mutexPrefix, mutexName),
		Value: context,
		TTL:   ResourceMutexTTL,
	}

	resp, err := r.etcd.Do(&req)
	if err != nil {
		log.V(1).Infof("Failed to acquire mutex on %s", mutexName)
		return nil
	}

	log.V(1).Infof("Successfully acquired mutex on %s", mutexName)
	return &TimedResourceMutex{r.etcd, *resp.Node}
}
// Determine if all necessary peers of a Job are scheduled to this Agent
func (a *Agent) peerScheduledHere(jobName, peerName string) bool {
	log.V(1).Infof("Looking for target of Peer(%s)", peerName)

	j, err := a.registry.Job(peerName)
	if err != nil {
		log.Errorf("Failed retrieving Job(%s) from Registry: %v", peerName, err)
		return false
	} else if j == nil {
		return false
	}

	if j.TargetMachineID == "" || j.TargetMachineID != a.Machine.State().ID {
		log.V(1).Infof("Peer(%s) of Job(%s) not scheduled here", peerName, jobName)
		return false
	}

	log.V(1).Infof("Peer(%s) of Job(%s) scheduled here", peerName, jobName)
	return true
}
func pipe(etcdchan chan *etcd.Result, filters []func(res *etcd.Result) *event.Event, sendFunc func(*event.Event), stop chan bool) {
	for {
		select {
		case <-stop:
			return
		case res := <-etcdchan:
			log.V(1).Infof("Received %v from etcd watch", res)
			for _, f := range filters {
				ev := f(res)
				if ev == nil {
					continue
				}
				log.V(1).Infof("Translated %v to Event(Type=%s)", res, ev.Type)
				sendFunc(ev)
			}
		}
	}
}
func createJob(jobName string, unit *unit.Unit) (*job.Job, error) {
	j := job.NewJob(jobName, *unit)

	if err := cAPI.CreateJob(j); err != nil {
		return nil, fmt.Errorf("failed creating job %s: %v", j.Name, err)
	}

	log.V(1).Infof("Created Job(%s) in Registry", j.Name)

	return j, nil
}
func (eh *EventHandler) HandleEventJobOffered(ev event.Event) {
	jo := ev.Payload.(job.JobOffer)

	if !jo.OfferedTo(eh.agent.Machine.State().ID) {
		log.V(1).Infof("EventJobOffered(%s): not offered to this machine, ignoring", jo.Job.Name)
		return
	}

	log.Infof("EventJobOffered(%s): deciding whether to bid or not", jo.Job.Name)
	eh.agent.MaybeBid(jo)
}
func findAddressInRunningUnits(jobName string) (string, bool) {
	name := unitNameMangle(jobName)
	j, err := cAPI.Job(name)
	if err != nil {
		log.V(1).Infof("Unable to retrieve Job(%s) from Registry: %v", name, err)
	}
	if j == nil || j.UnitState == nil {
		return "", false
	}
	return j.UnitState.MachineState.PublicIP, true
}
// getUnitFromFile attempts to load a Unit from a given filename.
// It returns the Unit or nil, and any error encountered.
func getUnitFromFile(file string) (*unit.Unit, error) {
	out, err := ioutil.ReadFile(file)
	if err != nil {
		return nil, err
	}

	unitName := path.Base(file)
	log.V(1).Infof("Unit(%s) found in local filesystem", unitName)

	return unit.NewUnit(string(out))
}
func (eh *EventHandler) HandleEventUnitStateUpdated(ev event.Event) {
	jobName := ev.Context.(string)
	state := ev.Payload.(*unit.UnitState)

	if state == nil {
		log.V(1).Infof("EventUnitStateUpdated(%s): received nil UnitState object, ignoring", jobName)
		return
	}

	log.Infof("EventUnitStateUpdated(%s): reporting state to Registry", jobName)
	eh.agent.ReportUnitState(jobName, state)
}
func (eh *EventHandler) HandleCommandStopJob(ev event.Event) {
	jobName := ev.Payload.(string)
	target := ev.Context.(string)

	if target != eh.agent.Machine.State().ID {
		log.V(1).Infof("CommandStopJob(%s): scheduled elsewhere, ignoring", jobName)
		return
	}

	log.Infof("CommandStopJob(%s): instructing Agent to stop Job", jobName)
	eh.agent.StopJob(jobName)
}