func (self *Engine) ResolveJobOffer(jobName string, machBootId string) error {
	log.V(2).Infof("Attempting to lock JobOffer(%s)", jobName)
	mutex := self.lockJobOffer(jobName)

	if mutex == nil {
		log.V(2).Infof("Could not lock JobOffer(%s)", jobName)
		return errors.New("Could not lock JobOffer")
	}
	defer mutex.Unlock()

	log.V(2).Infof("Claimed JobOffer(%s)", jobName)

	log.V(2).Infof("Resolving JobOffer(%s), scheduling to Machine(%s)", jobName, machBootId)
	err := self.registry.ResolveJobOffer(jobName)
	if err != nil {
		log.Errorf("Failed resolving JobOffer(%s): %v", jobName, err)
		return err
	}

	err = self.registry.ScheduleJob(jobName, machBootId)
	if err != nil {
		log.Errorf("Failed scheduling Job(%s): %v", jobName, err)
		return err
	}

	log.Infof("Scheduled Job(%s) to Machine(%s)", jobName, machBootId)
	return nil
}
func (e *Engine) ResolveJobOffer(jobName string, machID string) error {
	log.V(1).Infof("Attempting to lock JobOffer(%s)", jobName)
	mutex := e.registry.LockJobOffer(jobName, e.machine.State().ID)

	if mutex == nil {
		log.V(1).Infof("Could not lock JobOffer(%s)", jobName)
		return errors.New("Could not lock JobOffer")
	}
	defer mutex.Unlock()

	log.V(1).Infof("Claimed JobOffer(%s)", jobName)

	err := e.registry.ResolveJobOffer(jobName)
	if err != nil {
		log.Errorf("Failed resolving JobOffer(%s): %v", jobName, err)
		return err
	}

	err = e.registry.ScheduleJob(jobName, machID)
	if err != nil {
		log.Errorf("Failed scheduling Job(%s): %v", jobName, err)
		return err
	}

	log.Infof("Scheduled Job(%s) to Machine(%s)", jobName, machID)
	return nil
}
func UpdateLoggingFlagsFromConfig(conf *Config) {
	err := flag.Lookup("v").Value.Set(strconv.Itoa(conf.Verbosity))
	if err != nil {
		glog.Errorf("Failed to apply config.Verbosity to flag.v: %v", err)
	}

	err = flag.Lookup("logtostderr").Value.Set("true")
	if err != nil {
		glog.Errorf("Failed to set flag.logtostderr to true: %v", err)
	}
}
func (eh *EventHandler) HandleEventJobScheduled(ev event.Event) {
	jobName := ev.Payload.(string)
	target := ev.Context.(string)

	eh.agent.state.Lock()
	defer eh.agent.state.Unlock()

	log.V(1).Infof("EventJobScheduled(%s): Dropping outstanding offers and bids", jobName)
	eh.agent.state.PurgeOffer(jobName)

	if target != eh.agent.Machine().State().ID {
		log.Infof("EventJobScheduled(%s): Job not scheduled to this Agent, purging related data from cache", jobName)
		eh.agent.state.PurgeJob(jobName)

		log.Infof("EventJobScheduled(%s): Checking outstanding job offers", jobName)
		eh.agent.BidForPossibleJobs()
		return
	}

	log.Infof("EventJobScheduled(%s): Job scheduled to this Agent", jobName)

	j := eh.agent.FetchJob(jobName)
	if j == nil {
		log.Errorf("EventJobScheduled(%s): Failed to fetch Job", jobName)
		return
	}

	if !eh.agent.VerifyJob(j) {
		log.Errorf("EventJobScheduled(%s): Failed to verify Job", j.Name)
		return
	}

	if !eh.agent.AbleToRun(j) {
		log.Infof("EventJobScheduled(%s): Unable to run scheduled Job, unscheduling.", jobName)
		eh.agent.registry.ClearJobTarget(jobName, target)
		eh.agent.state.PurgeJob(jobName)
		return
	}

	log.Infof("EventJobScheduled(%s): Loading Job", j.Name)
	eh.agent.LoadJob(j)

	log.Infof("EventJobScheduled(%s): Bidding for all possible peers of Job", j.Name)
	eh.agent.BidForPossiblePeers(j.Name)

	ts := eh.agent.registry.GetJobTargetState(j.Name)
	if ts == nil || *ts != job.JobStateLaunched {
		return
	}

	log.Infof("EventJobScheduled(%s): Starting Job", j.Name)
	eh.agent.StartJob(j.Name)
}
// Periodically report to the Registry at an interval equal to
// half of the provided ttl. Stop reporting when the provided
// channel is closed. Failed attempts to report state to the
// Registry are retried twice before moving on to the next
// reporting interval.
func (a *Agent) Heartbeat(ttl time.Duration, stop chan bool) {
	attempt := func(attempts int, f func() error) (err error) {
		if attempts < 1 {
			return fmt.Errorf("attempts argument must be 1 or greater, got %d", attempts)
		}

		// The amount of time the retry mechanism waits after a failed attempt
		// doubles following each failure. This is a simple exponential backoff.
		sleep := time.Second

		for i := 1; i <= attempts; i++ {
			err = f()
			if err == nil || i == attempts {
				break
			}

			sleep = sleep * 2
			log.V(2).Infof("function returned err, retrying in %v: %v", sleep, err)
			time.Sleep(sleep)
		}

		return err
	}

	heartbeat := func() error {
		return a.registry.SetMachineState(a.machine.State(), ttl)
	}

	// Explicitly heartbeat immediately to push state to the
	// Registry as quickly as possible
	a.machine.RefreshState()
	if err := attempt(3, heartbeat); err != nil {
		log.Errorf("Failed heartbeat after 3 attempts: %v", err)
	}

	interval := ttl / refreshInterval
	ticker := time.Tick(interval)
	for {
		select {
		case <-stop:
			log.V(2).Info("MachineHeartbeat exiting due to stop signal")
			return
		case <-ticker:
			log.V(2).Info("MachineHeartbeat tick")
			a.machine.RefreshState()
			if err := attempt(3, heartbeat); err != nil {
				log.Errorf("Failed heartbeat after 3 attempts: %v", err)
			}
		}
	}
}
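// A hypothetical call-site sketch for Heartbeat (a minimal illustration, not
// part of fleet itself): run the loop in a goroutine and close the channel
// during shutdown to make it return. The runHeartbeat name and agentTTL
// parameter are assumptions for the sake of the example.
func runHeartbeat(a *Agent, agentTTL time.Duration) (stop chan bool) {
	stop = make(chan bool)
	go a.Heartbeat(agentTTL, stop)
	// The caller closes stop when shutting down, which causes Heartbeat to
	// exit via its stop-channel select case.
	return stop
}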
func (eh *EventHandler) HandleEventJobScheduled(ev event.Event) {
	jobName := ev.Payload.(string)

	log.V(1).Infof("EventJobScheduled(%s): Dropping outstanding offers and bids", jobName)
	eh.agent.OfferResolved(jobName)

	if ev.Context.(machine.MachineState).BootId != eh.agent.Machine().State().BootId {
		log.V(1).Infof("EventJobScheduled(%s): Job not scheduled to this Agent, checking unbade offers", jobName)
		eh.agent.BidForPossibleJobs()
		return
	}

	log.V(1).Infof("EventJobScheduled(%s): Job scheduled to this Agent", jobName)

	j := eh.agent.FetchJob(jobName)
	if j == nil {
		log.Errorf("EventJobScheduled(%s): Failed to fetch Job", jobName)
		return
	}

	if !eh.agent.AbleToRun(j) {
		log.V(1).Infof("EventJobScheduled(%s): Unable to run scheduled Job, rescheduling.", jobName)
		eh.agent.RescheduleJob(j)
		return
	}

	log.V(1).Infof("EventJobScheduled(%s): Starting Job", j.Name)
	eh.agent.StartJob(j)

	log.V(1).Infof("EventJobScheduled(%s): Bidding for all possible peers of Job", j.Name)
	eh.agent.BidForPossiblePeers(j.Name)
}
func (e *Engine) OfferJob(j job.Job) error {
	log.V(1).Infof("Attempting to lock Job(%s)", j.Name)

	mutex := e.registry.LockJob(j.Name, e.machine.State().ID)
	if mutex == nil {
		log.V(1).Infof("Could not lock Job(%s)", j.Name)
		return errors.New("Could not lock Job")
	}
	defer mutex.Unlock()

	log.V(1).Infof("Claimed Job(%s)", j.Name)

	machineIDs, err := e.partitionCluster(&j)
	if err != nil {
		log.Errorf("Failed partitioning cluster for Job(%s): %v", j.Name, err)
		return err
	}

	offer := job.NewOfferFromJob(j, machineIDs)

	err = e.registry.CreateJobOffer(offer)
	if err == nil {
		log.Infof("Published JobOffer(%s)", offer.Job.Name)
	}

	return err
}
// Publish is a long-running function that streams dbus events through
// a translation layer and on to the EventBus
func (m *SystemdManager) Publish(bus *event.EventBus, stopchan chan bool) {
	m.Systemd.Subscribe()

	changechan, errchan := m.subscriptions.Subscribe()

	stream := NewEventStream()
	stream.Stream(changechan, bus.Channel)

	// A bare break inside the select would only exit the select statement,
	// so label the loop to ensure the cleanup below actually runs.
loop:
	for {
		select {
		case <-stopchan:
			break loop
		case err := <-errchan:
			var errString string
			if err != nil {
				errString = err.Error()
			} else {
				errString = "N/A"
			}
			log.Errorf("Received error from dbus: err=%s", errString)
		}
	}

	stream.Close()
	m.Systemd.Unsubscribe()
}
// JobScheduledLocally clears all state related to the indicated
// job's offers/bids before attempting to load and possibly start
// the job. The ability to run the job will be revalidated before
// loading, and unscheduled if such validation fails.
func (a *Agent) JobScheduledLocally(jobName string) {
	a.state.Lock()
	defer a.state.Unlock()

	log.Infof("Dropping offer and bid for Job(%s) from cache", jobName)
	a.state.PurgeOffer(jobName)

	j := a.fetchJob(jobName)
	if j == nil {
		log.Errorf("Failed to fetch Job(%s)", jobName)
		return
	}

	if !a.ableToRun(j) {
		log.Infof("Unable to run locally-scheduled Job(%s), unscheduling", jobName)
		a.registry.ClearJobTarget(jobName, a.Machine.State().ID)
		a.state.PurgeJob(jobName)
		return
	}

	a.loadJob(j)

	log.Infof("Bidding for all possible peers of Job(%s)", j.Name)
	a.bidForPossiblePeers(j.Name)

	ts, _ := a.registry.GetJobTargetState(j.Name)
	if ts == nil || *ts != job.JobStateLaunched {
		return
	}

	log.Infof("Job(%s) loaded, now starting it", j.Name)
	a.startJobUnlocked(j.Name)
}
func (m *SystemdManager) stopUnit(name string) {
	if stat, err := m.Systemd.StopUnit(name, "replace"); err != nil {
		log.Errorf("Failed to stop systemd unit %s: %v", name, err)
	} else {
		log.Infof("Stopped systemd unit %s(%s)", name, stat)
	}
}
func (r *Registry) GetAllPayloads() []job.JobPayload {
	var payloads []job.JobPayload

	key := path.Join(keyPrefix, payloadPrefix)
	resp, err := r.etcd.Get(key, true, true)
	if err != nil {
		return payloads
	}

	for _, node := range resp.Node.Nodes {
		var jp job.JobPayload
		// Check the error returned by unmarshal itself, rather than
		// re-checking the (nil) error from the etcd Get above.
		if err := unmarshal(node.Value, &jp); err != nil {
			log.Errorf(err.Error())
			continue
		}

		payloads = append(payloads, jp)
	}

	return payloads
}
func (self *Engine) OfferJob(j job.Job) error {
	log.V(2).Infof("Attempting to lock Job(%s)", j.Name)
	mutex := self.lockJob(j.Name)

	if mutex == nil {
		log.V(1).Infof("Could not lock Job(%s)", j.Name)
		return errors.New("Could not lock Job")
	}
	defer mutex.Unlock()

	log.V(1).Infof("Claimed Job(%s)", j.Name)

	machineBootIds, err := self.partitionCluster(&j)
	if err != nil {
		log.Errorf("Failed partitioning cluster for Job(%s): %v", j.Name, err)
		return err
	}

	offer := job.NewOfferFromJob(j, machineBootIds)

	log.V(2).Infof("Publishing JobOffer(%s)", offer.Job.Name)
	self.registry.CreateJobOffer(offer)
	log.Infof("Published JobOffer(%s)", offer.Job.Name)

	return nil
}
// Purge removes the Agent's state from the Registry
func (a *Agent) Purge() {
	// Continue heartbeating the agent's machine state while attempting to
	// stop all the locally-running jobs
	purged := make(chan bool)
	go a.heartbeatAgent(a.ttl, purged)

	a.state.Lock()
	scheduled := a.state.ScheduledJobs()
	a.state.Unlock()

	machID := a.Machine.State().ID

	for _, jobName := range scheduled {
		log.Infof("Unloading Job(%s) from local machine", jobName)
		a.unloadJob(jobName)
		log.Infof("Unscheduling Job(%s) from local machine", jobName)
		a.registry.ClearJobTarget(jobName, machID)
	}

	// Jobs have been stopped, the heartbeat can stop
	close(purged)

	log.Info("Removing Agent from Registry")
	if err := a.registry.RemoveMachineState(machID); err != nil {
		log.Errorf("Failed to remove Machine %s from Registry: %s", machID, err.Error())
	}
}
func (r *Registry) UnresolvedJobOffers() []job.JobOffer {
	var offers []job.JobOffer

	key := path.Join(keyPrefix, offerPrefix)
	resp, err := r.etcd.Get(key, true, true)
	if err != nil {
		return offers
	}

	for _, node := range resp.Node.Nodes {
		key := path.Join(node.Key, "object")
		resp, err := r.etcd.Get(key, true, true)

		// The object was probably handled between when we attempted to
		// start resolving offers and when we actually tried to get it
		if err != nil {
			continue
		}

		var jo job.JobOffer
		err = unmarshal(resp.Node.Value, &jo)
		if err != nil {
			log.Errorf(err.Error())
			continue
		}

		offers = append(offers, jo)
	}

	return offers
}
func (eh *EventHandler) HandleEventMachineRemoved(ev event.Event) {
	machID := ev.Payload.(string)
	mutex := eh.engine.registry.LockMachine(machID, eh.engine.machine.State().ID)
	if mutex == nil {
		log.V(1).Infof("EventMachineRemoved(%s): failed to lock Machine, ignoring event", machID)
		return
	}
	defer mutex.Unlock()

	jobs := getJobsScheduledToMachine(eh.engine.registry, machID)

	for _, j := range jobs {
		log.Infof("EventMachineRemoved(%s): clearing UnitState(%s)", machID, j.Name)
		err := eh.engine.registry.RemoveUnitState(j.Name)
		if err != nil {
			log.Errorf("Failed removing UnitState(%s) from Registry: %v", j.Name, err)
		}

		log.Infof("EventMachineRemoved(%s): unscheduling Job(%s)", machID, j.Name)
		eh.engine.registry.ClearJobTarget(j.Name, machID)
	}

	for _, j := range jobs {
		log.Infof("EventMachineRemoved(%s): re-publishing JobOffer(%s)", machID, j.Name)
		eh.engine.OfferJob(j)
	}

	eh.engine.clust.machineRemoved(machID)
}
func (m *SystemdManager) startUnit(name string) {
	log.V(1).Infof("Starting systemd unit %s", name)

	files := []string{name}
	if ok, _, err := m.Systemd.EnableUnitFiles(files, true, false); !ok {
		log.Errorf("Failed to enable systemd unit %s: %v", name, err)
		return
	} else {
		log.V(1).Infof("Enabled systemd unit %s", name)
	}

	if stat, err := m.Systemd.StartUnit(name, "replace"); err != nil {
		log.Errorf("Failed to start systemd unit %s: %v", name, err)
	} else {
		log.Infof("Started systemd unit %s(%s)", name, stat)
	}
}
// UpdateLoggingFlagsFromConfig extracts the logging-related options from
// the provided config and sets flags in the given flagset
func UpdateLoggingFlagsFromConfig(flagset *flag.FlagSet, conf *Config) {
	err := flagset.Lookup("v").Value.Set(strconv.Itoa(conf.Verbosity))
	if err != nil {
		glog.Errorf("Failed to apply config.Verbosity to flag.v: %v", err)
	}

	err = flagset.Lookup("logtostderr").Value.Set("true")
	if err != nil {
		glog.Errorf("Failed to set flag.logtostderr to true: %v", err)
	}

	if conf.Verbosity > 2 {
		etcd.EnableDebugLogging()
	} else {
		etcd.DisableDebugLogging()
	}
}
// UpdateLoggingFlagsFromConfig extracts the logging-related options from
// the provided config and sets flags in the given flagset
func UpdateLoggingFlagsFromConfig(flagset *flag.FlagSet, conf *Config) {
	err := flagset.Lookup("v").Value.Set(strconv.Itoa(conf.Verbosity))
	if err != nil {
		glog.Errorf("Failed to apply config.Verbosity to flag.v: %v", err)
	}

	err = flagset.Lookup("logtostderr").Value.Set("true")
	if err != nil {
		glog.Errorf("Failed to set flag.logtostderr to true: %v", err)
	}

	if conf.Verbosity > 2 {
		etcd.SetLogger(log.New(os.Stdout, "go-etcd", log.LstdFlags))
	} else {
		etcd.SetLogger(log.New(ioutil.Discard, "go-etcd", log.LstdFlags))
	}
}
func (r *Registry) CreateJobOffer(jo *job.JobOffer) {
	key := path.Join(keyPrefix, offerPrefix, jo.Job.Name, "object")
	json, err := marshal(jo)
	if err != nil {
		log.Errorf(err.Error())
		return
	}

	r.etcd.Set(key, json, 0)
}
// checkVersion makes a best-effort attempt to verify that fleetctl is at least as new as the
// latest fleet version found registered in the cluster. If any errors are encountered or fleetctl
// is >= the latest version found, it returns true. If it is < the latest found version, it returns
// false and a scary warning to the user.
func checkVersion() (string, bool) {
	fv := version.SemVersion
	lv, err := registryCtl.GetLatestVersion()
	if err != nil {
		log.Errorf("error attempting to check latest fleet version in Registry: %v", err)
	} else if lv != nil && fv.LessThan(*lv) {
		return fmt.Sprintf(oldVersionWarning, fv.String(), lv.String()), false
	}
	return "", true
}
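// A hypothetical call-site sketch for checkVersion: surface the warning on
// stderr before dispatching a command. The warnIfObsolete name is an
// illustrative assumption and not part of fleetctl itself.
func warnIfObsolete() {
	if msg, ok := checkVersion(); !ok {
		// msg already contains the fully formatted oldVersionWarning text.
		fmt.Fprint(os.Stderr, msg)
	}
}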
// loadJob hands the given Job to systemd without acquiring the
// state mutex. The caller is responsible for acquiring it.
func (a *Agent) loadJob(j *job.Job) {
	log.Infof("Loading Job(%s)", j.Name)
	a.state.SetTargetState(j.Name, job.JobStateLoaded)

	err := a.um.Load(j.Name, j.Unit)
	if err != nil {
		log.Errorf("Failed loading Job(%s): %v", j.Name, err)
		return
	}

	// We must explicitly refresh the payload state, as the dbus
	// event listener does not send an event when we write a unit
	// file to disk.
	us, err := a.um.GetUnitState(j.Name)
	if err != nil {
		log.Errorf("Failed fetching state of Unit(%s): %v", j.Name, err)
		return
	}
	a.ReportUnitState(j.Name, us)
}
// Persist the state of the given Job into the Registry
func (a *Agent) ReportUnitState(jobName string, us *unit.UnitState) {
	if us == nil {
		err := a.registry.RemoveUnitState(jobName)
		if err != nil {
			log.Errorf("Failed to remove UnitState for job %s from Registry: %s", jobName, err.Error())
		}
	} else {
		a.registry.SaveUnitState(jobName, us)
	}
}
func (r *EtcdRegistry) getJobFromModel(jm jobModel) *job.Job {
	var err error
	var unit *unit.Unit

	// New-style Jobs should have a populated UnitHash, and the contents
	// of the Unit are stored separately in the Registry
	if !jm.UnitHash.Empty() {
		unit = r.getUnitByHash(jm.UnitHash)
		if unit == nil {
			log.Warningf("No Unit found in Registry for Job(%s)", jm.Name)
			return nil
		}
		if unit.Hash() != jm.UnitHash {
			log.Errorf("Unit Hash %s does not match expected %s for Job(%s)!", unit.Hash(), jm.UnitHash, jm.Name)
			return nil
		}
		log.V(2).Infof("Got Unit for Job(%s) from registry", jm.Name)
	} else {
		// Old-style Jobs had "Payloads" instead of Units, also stored
		// separately in the Registry
		log.V(2).Infof("Legacy Job(%s) has no PayloadHash - looking for associated Payload", jm.Name)
		unit, err = r.getUnitFromLegacyPayload(jm.Name)
		if err != nil {
			log.Errorf("Error retrieving legacy payload for Job(%s)", jm.Name)
			return nil
		} else if unit == nil {
			log.Warningf("No Payload found in Registry for Job(%s)", jm.Name)
			return nil
		}

		log.Infof("Migrating legacy Payload(%s)", jm.Name)
		if err := r.storeOrGetUnit(*unit); err != nil {
			log.Warningf("Unable to migrate legacy Payload: %v", err)
		}
	}

	j := job.NewJob(jm.Name, *unit)
	j.UnitState = r.getUnitState(jm.Name)
	j.State = r.determineJobState(jm.Name)

	return j
}
func (r *EtcdRegistry) GetJobTargetState(jobName string) (*job.JobState, error) {
	key := r.jobTargetStatePath(jobName)
	resp, err := r.etcd.Get(key, false, false)
	if err != nil {
		// A missing key simply means no target state has been set; only
		// log unexpected errors.
		if err.(*goetcd.EtcdError).ErrorCode != etcd.EcodeKeyNotFound {
			log.Errorf("Unable to determine target-state of Job(%s): %v", jobName, err)
		}
		return nil, err
	}
	return job.ParseJobState(resp.Node.Value), nil
}
func (j *Job) resourceFromKey(resKey string) int {
	valStr, ok := j.Requirements()[resKey]
	if ok && len(valStr) > 0 {
		val, err := strconv.Atoi(valStr[0])
		if err != nil {
			log.Errorf("failed to parse resource requirement %s from %s: %v", resKey, j.Name, err)
			return 0
		}
		return val
	}
	return 0
}
// getUnitByHash retrieves from the Registry the Unit associated with the given Hash
func (r *EtcdRegistry) getUnitByHash(hash unit.Hash) *unit.Unit {
	key := r.hashedUnitPath(hash)
	resp, err := r.etcd.Get(key, false, true)
	if err != nil {
		return nil
	}

	var u unit.Unit
	if err := unmarshal(resp.Node.Value, &u); err != nil {
		log.Errorf("Error unmarshaling Unit(%s): %v", hash, err)
		return nil
	}
	return &u
}
func (m *SystemdManager) stopUnit(name string) {
	log.V(1).Infof("Stopping systemd unit %s", name)

	if stat, err := m.Systemd.StopUnit(name, "replace"); err != nil {
		log.Errorf("Failed to stop systemd unit %s: %v", name, err)
	} else {
		log.Infof("Stopped systemd unit %s(%s)", name, stat)
	}

	// go-systemd does not yet have this implemented
	//files := []string{name}
	//Systemd.DisableUnitFiles(files, true, false)
}
// Check is called during the handshake to check the server's public key for
// unexpected changes. The key argument is in SSH wire format. It can be parsed
// using ssh.ParsePublicKey. The address before DNS resolution is passed in the
// addr argument, so the key can also be checked against the hostname.
// It returns any error encountered while checking the public key. A nil return
// value indicates that the key was either successfully verified (against an
// existing known_hosts entry), or accepted by the user as a new key.
func (kc *HostKeyChecker) Check(addr string, remote net.Addr, key gossh.PublicKey) error {
	remoteAddr, err := kc.addrToHostPort(remote.String())
	if err != nil {
		return err
	}

	algoStr := algoString(key.Type())
	keyFingerprintStr := md5String(md5.Sum(key.Marshal()))

	hostKeys, err := kc.m.GetHostKeys()
	_, ok := err.(*os.PathError)
	if err != nil && !ok {
		log.Errorf("Failed to read known_hosts file %v: %v", kc.m.String(), err)
	}

	mismatched := false
	for pattern, keys := range hostKeys {
		if !matchHost(remoteAddr, pattern) {
			continue
		}
		for _, hostKey := range keys {
			// Any matching key is considered a success, irrespective of previous failures
			if hostKey.Type() == key.Type() && bytes.Equal(hostKey.Marshal(), key.Marshal()) {
				return nil
			}
			// TODO(jonboulle): could be super friendly like the OpenSSH client
			// and note exactly which key failed (file + line number)
			mismatched = true
		}
	}

	if mismatched {
		fmt.Fprintf(os.Stderr, warningRemoteHostChanged, algoStr, keyFingerprintStr, kc.m.String())
		return ErrUnmatchKey
	}

	// If we get this far, we haven't matched on any of the hostname patterns,
	// so it's considered a new key. Prompt the user to trust it.
	if !kc.trustHost(remoteAddr, algoStr, keyFingerprintStr) {
		fmt.Fprintln(os.Stderr, "Host key verification failed.")
		return ErrUntrustHost
	}

	if err := kc.m.PutHostKey(remoteAddr, key); err != nil {
		fmt.Fprintf(os.Stderr, "Failed to add the host to the list of known hosts (%v).\n", kc.m)
		return nil
	}

	fmt.Fprintf(os.Stderr, "Warning: Permanently added '%v' (%v) to the list of known hosts.\n", remoteAddr, algoStr)
	return nil
}
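// A hypothetical wiring sketch: Check has the same shape as the host key
// callback used by recent versions of golang.org/x/crypto/ssh (hostname,
// remote address, public key -> error), so it can plausibly be plugged into a
// ClientConfig directly. This is an illustrative assumption, not fleet's own
// setup code; newSSHConfig is not part of fleet.
func newSSHConfig(kc *HostKeyChecker, user string) *gossh.ClientConfig {
	return &gossh.ClientConfig{
		User: user,
		// The method value kc.Check verifies the server's key against the
		// known_hosts data managed by the HostKeyChecker above.
		HostKeyCallback: kc.Check,
	}
}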
// ReportUnitState attaches the current state of the Agent's Machine to the given
// unit.UnitState object, then persists that state in the Registry
func (a *Agent) ReportUnitState(jobName string, us *unit.UnitState) {
	if us == nil {
		log.V(1).Infof("Job(%s): purging UnitState from Registry", jobName)
		err := a.registry.RemoveUnitState(jobName)
		if err != nil {
			log.Errorf("Failed to remove UnitState for job %s from Registry: %s", jobName, err.Error())
		}
	} else {
		ms := a.Machine.State()
		us.MachineState = &ms
		log.V(1).Infof("Job(%s): pushing UnitState (loadState=%s, activeState=%s, subState=%s) to Registry", jobName, us.LoadState, us.ActiveState, us.SubState)
		a.registry.SaveUnitState(jobName, us)
	}
}
// stopJobUnlocked stops the indicated Job without acquiring the state
// mutex. The caller is responsible for acquiring it.
func (a *Agent) stopJobUnlocked(jobName string) {
	a.state.SetTargetState(jobName, job.JobStateLoaded)
	a.registry.ClearJobHeartbeat(jobName)

	go func() {
		a.um.Stop(jobName)

		// We must explicitly refresh the payload state, as the dbus
		// event listener sends a nil event when a unit deactivates.
		us, err := a.um.GetUnitState(jobName)
		if err != nil {
			log.Errorf("Failed fetching state of Unit(%s): %v", jobName, err)
			return
		}
		a.ReportUnitState(jobName, us)
	}()
}