func (r *EtcdRegistry) getJobFromModel(jm jobModel) *job.Job {
	var err error
	var unit *unit.Unit

	// New-style Jobs have a populated UnitHash, and the contents of the
	// Unit are stored separately in the Registry.
	if !jm.UnitHash.Empty() {
		unit = r.getUnitByHash(jm.UnitHash)
		if unit == nil {
			log.Warningf("No Unit found in Registry for Job(%s)", jm.Name)
			return nil
		}
		if unit.Hash() != jm.UnitHash {
			log.Errorf("Unit Hash %s does not match expected %s for Job(%s)!", unit.Hash(), jm.UnitHash, jm.Name)
			return nil
		}
	} else {
		// Old-style Jobs had "Payloads" instead of Units, also stored
		// separately in the Registry.
		unit, err = r.getUnitFromLegacyPayload(jm.Name)
		if err != nil {
			log.Errorf("Error retrieving legacy payload for Job(%s)", jm.Name)
			return nil
		} else if unit == nil {
			log.Warningf("No Payload found in Registry for Job(%s)", jm.Name)
			return nil
		}

		log.Infof("Migrating legacy Payload(%s)", jm.Name)
		if err := r.storeOrGetUnit(*unit); err != nil {
			log.Warningf("Unable to migrate legacy Payload: %v", err)
		}
	}

	return job.NewJob(jm.Name, *unit)
}
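// Illustrative sketch, not fleet code: the hash comparison above guards
// fleet's content-addressed Unit store. Assuming the hash is a SHA1 digest
// of the raw unit file contents, a hypothetical standalone verifier (using
// only stdlib "crypto/sha1" and "fmt") would look like:
func verifyContents(contents, expectedHex string) error {
	actual := fmt.Sprintf("%x", sha1.Sum([]byte(contents)))
	if actual != expectedHex {
		return fmt.Errorf("content hash %s does not match expected %s", actual, expectedHex)
	}
	return nil
}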
// Reconcile drives the local Agent's state towards the desired state
// stored in the Registry.
func (ar *AgentReconciler) Reconcile(a *Agent) {
	ms := a.Machine.State()

	jobs, err := ar.reg.Jobs()
	if err != nil {
		log.Errorf("Failed fetching Jobs from Registry: %v", err)
		return
	}

	dAgentState, err := ar.desiredAgentState(jobs, &ms)
	if err != nil {
		log.Errorf("Unable to determine agent's desired state: %v", err)
		return
	}

	cAgentState, err := ar.currentAgentState(a)
	if err != nil {
		log.Errorf("Unable to determine agent's current state: %v", err)
		return
	}

	for t := range ar.calculateTasksForJobs(dAgentState, cAgentState) {
		err := ar.doTask(a, t)
		if err != nil {
			log.Errorf("Failed resolving task, halting reconciliation: task=%s err=%q", t, err)
			return
		}
	}
}
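// Illustrative sketch, not fleet code: Reconcile above follows the generic
// desired-vs-current reconciliation pattern. A minimal version of that diff
// over hypothetical maps of job name to state (stdlib "fmt" only):
func diffStates(desired, current map[string]string) []string {
	var tasks []string
	// Anything desired but missing or in the wrong state needs driving.
	for name, want := range desired {
		if have, ok := current[name]; !ok || have != want {
			tasks = append(tasks, fmt.Sprintf("drive %s to %s", name, want))
		}
	}
	// Anything present but no longer desired needs unloading.
	for name := range current {
		if _, ok := desired[name]; !ok {
			tasks = append(tasks, fmt.Sprintf("unload %s", name))
		}
	}
	return tasks
}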
// getUnitByHash retrieves from the Registry the Unit associated with the given Hash
func (r *EtcdRegistry) getUnitByHash(hash unit.Hash) *unit.Unit {
	req := etcd.Get{
		Key:       r.hashedUnitPath(hash),
		Recursive: true,
	}
	resp, err := r.etcd.Do(&req)
	if err != nil {
		// A missing key simply means no Unit is stored under this hash;
		// any other error is unexpected and worth surfacing.
		if !isKeyNotFound(err) {
			log.Errorf("error retrieving Unit(%s) from Registry: %v", hash, err)
		}
		return nil
	}
	var um unitModel
	if err := unmarshal(resp.Node.Value, &um); err != nil {
		log.Errorf("error unmarshaling Unit(%s): %v", hash, err)
		return nil
	}
	u, err := unit.NewUnit(um.Raw)
	if err != nil {
		log.Errorf("error parsing Unit(%s): %v", hash, err)
		return nil
	}
	return u
}
// SaveUnitState persists the given UnitState to the Registry
func (r *EtcdRegistry) SaveUnitState(jobName string, unitState *unit.UnitState) {
	usm := unitStateToModel(unitState)
	if usm == nil {
		log.Errorf("Unable to save nil UnitState model")
		return
	}

	json, err := marshal(usm)
	if err != nil {
		log.Errorf("Error marshalling UnitState: %v", err)
		return
	}

	// The state is written under both the legacy key (indexed by job name
	// only) and the new key (indexed by machine ID and job name).
	legacyKey := r.legacyUnitStatePath(jobName)
	req := etcd.Set{
		Key:   legacyKey,
		Value: json,
	}
	r.etcd.Do(&req)

	newKey := r.unitStatePath(unitState.MachineID, jobName)
	req = etcd.Set{
		Key:   newKey,
		Value: json,
	}
	r.etcd.Do(&req)
}
// currentState generates a MachineState object with the values read from
// the local system
func (m *CoreOSMachine) currentState() *MachineState {
	id, err := readLocalMachineID("/")
	if err != nil {
		log.Errorf("Error retrieving machineID: %v", err)
		return nil
	}
	publicIP := getLocalIP()
	totalResources, err := readLocalResources()
	if err != nil {
		log.Errorf("Error retrieving local resources: %v", err)
		return nil
	}
	units, err := m.um.Units()
	if err != nil {
		log.Errorf("Error retrieving local units: %v", err)
		return nil
	}
	return &MachineState{
		ID:             id,
		PublicIP:       publicIP,
		Metadata:       make(map[string]string),
		TotalResources: totalResources,
		LoadedUnits:    len(units),
	}
}
func (e *Engine) ResolveJobOffer(jobName string, machID string) error {
	log.V(1).Infof("Attempting to lock JobOffer(%s)", jobName)
	mutex := e.registry.LockJobOffer(jobName, e.machine.State().ID)
	if mutex == nil {
		log.V(1).Infof("Could not lock JobOffer(%s)", jobName)
		return errors.New("could not lock JobOffer")
	}
	defer mutex.Unlock()
	log.V(1).Infof("Claimed JobOffer(%s)", jobName)

	err := e.registry.ResolveJobOffer(jobName)
	if err != nil {
		log.Errorf("Failed resolving JobOffer(%s): %v", jobName, err)
		return err
	}

	err = e.registry.ScheduleJob(jobName, machID)
	if err != nil {
		log.Errorf("Failed scheduling Job(%s): %v", jobName, err)
		return err
	}

	log.Infof("Scheduled Job(%s) to Machine(%s)", jobName, machID)
	return nil
}
// UpdateLoggingFlagsFromConfig extracts the logging-related options from
// the provided config and sets the corresponding flags in the given flagset
func UpdateLoggingFlagsFromConfig(flagset *flag.FlagSet, conf *Config) {
	err := flagset.Lookup("v").Value.Set(strconv.Itoa(conf.Verbosity))
	if err != nil {
		glog.Errorf("Failed to apply config.Verbosity to flag.v: %v", err)
	}

	err = flagset.Lookup("logtostderr").Value.Set("true")
	if err != nil {
		glog.Errorf("Failed to set flag.logtostderr to true: %v", err)
	}
}
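// Usage sketch, not fleet code: flag.FlagSet exposes registered flags through
// Lookup, and each flag.Value can be set from a string, which is exactly what
// UpdateLoggingFlagsFromConfig relies on. With a throwaway FlagSet (stdlib
// "flag" and "fmt"):
func exampleSetFlag() {
	fs := flag.NewFlagSet("example", flag.ContinueOnError)
	verbosity := fs.Int("v", 0, "log verbosity")
	if err := fs.Lookup("v").Value.Set("2"); err != nil {
		fmt.Println("set failed:", err)
		return
	}
	fmt.Println(*verbosity) // prints 2
}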
func (ur *unitsResource) destroy(rw http.ResponseWriter, req *http.Request, item string) {
	if validateContentType(req) != nil {
		sendError(rw, http.StatusNotAcceptable, errors.New("application/json is the only supported Content-Type"))
		return
	}

	var du schema.DeletableUnit
	dec := json.NewDecoder(req.Body)
	err := dec.Decode(&du)
	if err != nil {
		sendError(rw, http.StatusBadRequest, fmt.Errorf("unable to decode body: %v", err))
		return
	}

	var u *unit.Unit
	if len(du.FileContents) > 0 {
		u, err = decodeUnitContents(du.FileContents)
		if err != nil {
			sendError(rw, http.StatusBadRequest, fmt.Errorf("invalid fileContents: %v", err))
			return
		}
	}

	j, err := ur.reg.Job(item)
	if err != nil {
		log.Errorf("Failed fetching Job(%s): %v", item, err)
		sendError(rw, http.StatusInternalServerError, nil)
		return
	}
	if j == nil {
		sendError(rw, http.StatusNotFound, errors.New("unit does not exist"))
		return
	}

	// If the request includes unit contents, refuse to destroy the unit
	// unless those contents match what is actually stored.
	if u != nil && u.Hash() != j.Unit.Hash() {
		sendError(rw, http.StatusConflict, errors.New("hash of provided fileContents does not match that of existing unit"))
		return
	}

	err = ur.reg.DestroyJob(item)
	if err != nil {
		log.Errorf("Failed destroying Job(%s): %v", item, err)
		sendError(rw, http.StatusInternalServerError, nil)
		return
	}

	rw.WriteHeader(http.StatusNoContent)
}
func (eh *EventHandler) handleLossOfMachineEvents(ev event.Event) {
	machID := ev.Payload.(string)
	mutex := eh.engine.registry.LockMachine(machID, eh.engine.machine.State().ID)
	if mutex == nil {
		log.V(1).Infof("%s(%s): failed to lock Machine, ignoring event", ev.Type, machID)
		return
	}
	defer mutex.Unlock()

	jobs := getJobsScheduledToMachine(eh.engine.registry, machID)

	// First pass: clear state and unschedule every Job that was running
	// on the lost Machine.
	for _, j := range jobs {
		log.Infof("%s(%s): clearing UnitState(%s)", ev.Type, machID, j.Name)
		err := eh.engine.registry.RemoveUnitState(j.Name)
		if err != nil {
			log.Errorf("Failed removing UnitState(%s) from Registry: %v", j.Name, err)
		}

		log.Infof("%s(%s): unscheduling Job(%s)", ev.Type, machID, j.Name)
		eh.engine.registry.ClearJobTarget(j.Name, machID)
	}

	// Second pass: re-offer those Jobs so they can be scheduled elsewhere.
	for _, j := range jobs {
		log.Infof("%s(%s): re-publishing JobOffer(%s)", ev.Type, machID, j.Name)
		eh.engine.OfferJob(j)
	}

	eh.engine.clust.machineRemoved(machID)
}
func (mr *machinesResource) ServeHTTP(rw http.ResponseWriter, req *http.Request) {
	if req.Method != "GET" {
		sendError(rw, http.StatusBadRequest, fmt.Errorf("only HTTP GET supported against this resource"))
		return
	}

	token, err := findNextPageToken(req.URL)
	if err != nil {
		sendError(rw, http.StatusBadRequest, err)
		return
	}
	if token == nil {
		def := DefaultPageToken()
		token = &def
	}

	page, err := getMachinePage(mr.reg, *token)
	if err != nil {
		log.Errorf("Failed fetching page of Machines: %v", err)
		sendError(rw, http.StatusInternalServerError, nil)
		return
	}

	sendResponse(rw, http.StatusOK, page)
}
func (m *SystemdUnitManager) stopUnit(name string) {
	if stat, err := m.systemd.StopUnit(name, "replace"); err != nil {
		log.Errorf("Failed to stop systemd unit %s: %v", name, err)
	} else {
		log.Infof("Stopped systemd unit %s(%s)", name, stat)
	}
}
// bidForPossiblePeers submits bids for all known peers of the provided job
// that can be run locally
func (a *Agent) bidForPossiblePeers(jobName string) {
	peers := a.state.GetJobsByPeer(jobName)
	for _, peer := range peers {
		log.V(1).Infof("Found unresolved offer for Peer(%s) of Job(%s)", peer, jobName)

		peerJob, err := a.registry.Job(peer)
		if err != nil {
			log.Errorf("Failed fetching Job(%s) from Registry: %v", peer, err)
			return
		}
		if peerJob == nil {
			log.V(1).Infof("Unable to find Peer(%s) of Job(%s) in Registry", peer, jobName)
			return
		}

		if !a.ableToRun(peerJob) {
			log.V(1).Infof("Unable to run Peer(%s) of Job(%s), not bidding", peer, jobName)
			return
		}

		a.bid(peer)
	}
}
// Purge removes the Agent's state from the Registry
func (a *Agent) Purge() {
	// Continue heartbeating the agent's machine state while attempting to
	// stop all the locally-running jobs
	purged := make(chan bool)
	go a.heartbeatAgent(a.ttl, purged)

	a.state.Lock()
	scheduled := a.state.ScheduledJobs()
	a.state.Unlock()

	machID := a.Machine.State().ID
	for _, jobName := range scheduled {
		log.Infof("Unloading Job(%s) from local machine", jobName)
		a.unloadJob(jobName)
		log.Infof("Unscheduling Job(%s) from local machine", jobName)
		a.registry.ClearJobTarget(jobName, machID)
	}

	// Jobs have been stopped, the heartbeat can stop
	close(purged)

	log.Info("Removing Agent from Registry")
	if err := a.registry.RemoveMachineState(machID); err != nil {
		log.Errorf("Failed to remove Machine %s from Registry: %v", machID, err)
	}
}
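// Illustrative sketch, not fleet code: the purged channel above uses the
// common close-to-signal idiom. A hypothetical heartbeat loop that beats on
// an interval until its done channel is closed (stdlib "time"):
func heartbeatUntil(done <-chan bool, interval time.Duration, beat func()) {
	ticker := time.NewTicker(interval)
	defer ticker.Stop()
	for {
		select {
		case <-done:
			// Closing done unblocks this receive and ends the loop.
			return
		case <-ticker.C:
			beat()
		}
	}
}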
func (r *EtcdRegistry) determineJobState(jobName string) *job.JobState {
	state := job.JobStateInactive

	tgt, err := r.jobTargetMachine(jobName)
	if err != nil {
		log.Errorf("Unable to determine target of Job(%s): %v", jobName, err)
		return nil
	}
	if tgt == "" {
		return &state
	}

	if r.getUnitState(jobName) == nil {
		return &state
	}
	state = job.JobStateLoaded

	agent, pulse := r.CheckJobPulse(jobName)
	if !pulse || agent != tgt {
		return &state
	}
	state = job.JobStateLaunched

	return &state
}
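// Illustrative sketch, not fleet code: the decision ladder above in
// isolation. A Job stays inactive until it has both a target machine and a
// stored unit state, becomes loaded once it does, and counts as launched
// only when a live heartbeat from the target machine exists:
func jobState(target, heartbeatAgent string, hasUnitState, pulse bool) string {
	if target == "" || !hasUnitState {
		return "inactive"
	}
	if !pulse || heartbeatAgent != target {
		return "loaded"
	}
	return "launched"
}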
// sendResponse attempts to marshal an arbitrary thing to JSON then write
// it to the http.ResponseWriter
func sendResponse(rw http.ResponseWriter, code int, resp interface{}) {
	enc, err := json.Marshal(resp)
	if err != nil {
		log.Errorf("Failed JSON-encoding HTTP response: %v", err)
		rw.WriteHeader(http.StatusInternalServerError)
		return
	}

	rw.Header().Set("Content-Type", "application/json")
	rw.WriteHeader(code)
	_, err = rw.Write(enc)
	if err != nil {
		log.Errorf("Failed sending HTTP response body: %v", err)
	}
}
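// Usage sketch, not fleet code: sendResponse can be exercised against the
// stdlib httptest recorder, which satisfies http.ResponseWriter (imports
// "fmt", "net/http", "net/http/httptest"):
func exampleSendResponse() {
	rec := httptest.NewRecorder()
	sendResponse(rec, http.StatusOK, map[string]string{"hello": "world"})
	fmt.Println(rec.Code, rec.Header().Get("Content-Type"), rec.Body.String())
	// 200 application/json {"hello":"world"}
}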
func (r *Reconciler) Reconcile(e *Engine, stop chan struct{}) {
	log.V(1).Infof("Polling Registry for actionable work")

	clust, err := e.clusterState()
	if err != nil {
		log.Errorf("Failed getting current cluster state: %v", err)
		return
	}

	for t := range r.calculateClusterTasks(clust, stop) {
		err = doTask(t, e)
		if err != nil {
			log.Errorf("Failed resolving task: task=%s err=%v", t, err)
		}
	}
}
func (e *Engine) OfferJob(j job.Job) error {
	log.V(1).Infof("Attempting to lock Job(%s)", j.Name)
	mutex := e.registry.LockJob(j.Name, e.machine.State().ID)
	if mutex == nil {
		log.V(1).Infof("Could not lock Job(%s)", j.Name)
		return errors.New("could not lock Job")
	}
	defer mutex.Unlock()
	log.V(1).Infof("Claimed Job(%s)", j.Name)

	machineIDs, err := e.partitionCluster(&j)
	if err != nil {
		log.Errorf("failed partitioning cluster for Job(%s): %v", j.Name, err)
		return err
	}

	offer := job.NewOfferFromJob(j, machineIDs)

	err = e.registry.CreateJobOffer(offer)
	if err == nil {
		log.Infof("Published JobOffer(%s)", offer.Job.Name)
	}
	return err
}
func (ur *unitsResource) create(rw http.ResponseWriter, item string, ds job.JobState, u *unit.Unit) {
	j := job.NewJob(item, *u)

	if err := ur.reg.CreateJob(j); err != nil {
		log.Errorf("Failed creating Job(%s) in Registry: %v", j.Name, err)
		sendError(rw, http.StatusInternalServerError, nil)
		return
	}

	if err := ur.reg.SetJobTargetState(j.Name, ds); err != nil {
		log.Errorf("Failed setting target state of Job(%s): %v", j.Name, err)
		sendError(rw, http.StatusInternalServerError, nil)
		return
	}

	rw.WriteHeader(http.StatusNoContent)
}
func (r *EtcdRegistry) getJobFromObjectNode(node *etcd.Node) (*job.Job, error) {
	var err error
	var jm jobModel
	if err = unmarshal(node.Value, &jm); err != nil {
		return nil, err
	}

	var unit *unit.Unit

	// New-style Jobs have a populated UnitHash, and the contents of the
	// Unit are stored separately in the Registry.
	if !jm.UnitHash.Empty() {
		unit = r.getUnitByHash(jm.UnitHash)
		if unit == nil {
			log.Warningf("No Unit found in Registry for Job(%s)", jm.Name)
			return nil, nil
		}
		if unit.Hash() != jm.UnitHash {
			log.Errorf("Unit Hash %s does not match expected %s for Job(%s)!", unit.Hash(), jm.UnitHash, jm.Name)
			return nil, nil
		}
	} else {
		// Old-style Jobs had "Payloads" instead of Units, also stored
		// separately in the Registry.
		unit, err = r.getUnitFromLegacyPayload(jm.Name)
		if err != nil {
			log.Errorf("Error retrieving legacy payload for Job(%s)", jm.Name)
			return nil, nil
		} else if unit == nil {
			log.Warningf("No Payload found in Registry for Job(%s)", jm.Name)
			return nil, nil
		}

		log.Infof("Migrating legacy Payload(%s)", jm.Name)
		if err := r.storeOrGetUnit(*unit); err != nil {
			log.Warningf("Unable to migrate legacy Payload: %v", err)
		}

		jm.UnitHash = unit.Hash()
		log.Infof("Updating Job(%s) with legacy payload Hash(%s)", jm.Name, jm.UnitHash)
		if err := r.updateJobObjectNode(&jm, node.ModifiedIndex); err != nil {
			log.Warningf("Unable to update Job(%s) with legacy payload Hash(%s): %v", jm.Name, jm.UnitHash, err)
		}
	}

	return job.NewJob(jm.Name, *unit), nil
}
func (e *Engine) unscheduleJob(jName, machID string) (err error) {
	err = e.registry.ClearJobTarget(jName, machID)
	if err != nil {
		log.Errorf("Failed clearing target Machine(%s) of Job(%s): %v", machID, jName, err)
	} else {
		log.Infof("Unscheduled Job(%s) from Machine(%s)", jName, machID)
	}
	return
}
func (e *Engine) resolveJobOffer(jName string) (err error) {
	err = e.registry.ResolveJobOffer(jName)
	if err != nil {
		log.Errorf("Failed resolving JobOffer(%s): %v", jName, err)
	} else {
		log.Infof("Resolved JobOffer(%s)", jName)
	}
	return
}
func (e *Engine) Purge() {
	if e.lease == nil {
		return
	}
	err := e.lease.Release()
	if err != nil {
		log.Errorf("Failed to release lease: %v", err)
	}
}
// Monitor tracks the health of the Server. If the Server is ever deemed
// unhealthy, the Server is restarted.
func (s *Server) Monitor() {
	err := s.mon.Monitor(s.hrt, s.stop)
	if err != nil {
		log.Errorf("Server monitor triggered: %v", err)
		s.Stop()
		s.Run()
	}
}
func (e *Engine) offerJob(j *job.Job) (err error) {
	offer := job.NewOfferFromJob(*j)
	err = e.registry.CreateJobOffer(offer)
	if err != nil {
		log.Errorf("Failed publishing JobOffer(%s): %v", j.Name, err)
	} else {
		log.Infof("Published JobOffer(%s)", j.Name)
	}
	return
}
func (r *dumbReconciler) Reconcile(e *Engine) {
	log.V(1).Infof("Polling Registry for actionable work")

	clust, err := e.clusterState()
	if err != nil {
		log.Errorf("Failed getting current cluster state: %v", err)
		return
	}

	taskchan := make(chan *task)
	go calculateClusterTasks(taskchan, clust)

	for t := range taskchan {
		err = doTask(t, e)
		if err != nil {
			log.Errorf("Failed resolving task: task=%s err=%v", t, err)
		}
	}
}
// loadJob hands the given Job to systemd without acquiring the
// state mutex. The caller is responsible for acquiring it.
func (a *Agent) loadJob(j *job.Job) {
	log.Infof("Loading Job(%s)", j.Name)
	a.state.SetTargetState(j.Name, job.JobStateLoaded)
	err := a.um.Load(j.Name, j.Unit)
	if err != nil {
		log.Errorf("Failed loading Job(%s): %v", j.Name, err)
		return
	}

	// We must explicitly refresh the payload state, as the dbus
	// event listener does not send an event when we write a unit
	// file to disk.
	us, err := a.um.GetUnitState(j.Name)
	if err != nil {
		log.Errorf("Failed fetching state of Unit(%s): %v", j.Name, err)
		return
	}
	a.ReportUnitState(j.Name, us)
}
// checkVersion makes a best-effort attempt to verify that fleetctl is at
// least as new as the latest fleet version found registered in the cluster.
// If any error is encountered, or fleetctl is at least as new as the latest
// version found, it returns an empty string and true. Otherwise it returns a
// warning message for the user and false.
func checkVersion() (string, bool) {
	fv := version.SemVersion
	lv, err := cAPI.LatestVersion()
	if err != nil {
		log.Errorf("error attempting to check latest fleet version in Registry: %v", err)
	} else if lv != nil && fv.LessThan(*lv) {
		return fmt.Sprintf(oldVersionWarning, fv.String(), lv.String()), false
	}
	return "", true
}
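// Illustrative sketch, not fleet code: fleet delegates the LessThan above to
// a semver library. A minimal numeric-only comparison of well-formed
// three-component "X.Y.Z" strings (stdlib "strconv" and "strings"; ignores
// pre-release tags and assumes both inputs are valid):
func lessThan(a, b string) bool {
	as, bs := strings.SplitN(a, ".", 3), strings.SplitN(b, ".", 3)
	for i := 0; i < 3; i++ {
		ai, _ := strconv.Atoi(as[i])
		bi, _ := strconv.Atoi(bs[i])
		if ai != bi {
			return ai < bi
		}
	}
	return false
}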
func (s *Server) Serve() {
	for i := range s.listeners {
		// Capture a per-iteration copy of the listener so each goroutine
		// serves the listener it was started for.
		l := s.listeners[i]
		go func() {
			err := http.Serve(l, s)
			if err != nil {
				log.Errorf("Failed serving HTTP on listener: %s", l.Addr())
			}
		}()
	}
}
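// Illustrative sketch, not fleet code: the per-iteration copy in Serve above
// avoids the classic Go pitfall of goroutines sharing a loop variable. The
// same pattern in miniature (stdlib "fmt" and "sync"):
func exampleCapture() {
	items := []string{"a", "b", "c"}
	var wg sync.WaitGroup
	for i := range items {
		item := items[i] // capture a per-iteration copy for the goroutine
		wg.Add(1)
		go func() {
			defer wg.Done()
			fmt.Println(item)
		}()
	}
	wg.Wait()
}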
// Purge attempts to unload all Jobs that have been loaded locally
func (ar *AgentReconciler) Purge(a *Agent) {
	cAgentState, err := currentAgentState(a)
	if err != nil {
		log.Errorf("Unable to determine agent's current state: %v", err)
		return
	}

	for _, cJob := range cAgentState.jobs {
		t := task{
			Type:   taskTypeUnloadJob,
			Job:    cJob,
			Reason: taskReasonPurgingAgent,
		}
		err := ar.doTask(a, &t)
		if err != nil {
			log.Errorf("Failed resolving task: task=%s err=%q", t, err)
		}
	}
}
func (j *Job) resourceFromKey(resKey string) int {
	valStr, ok := j.Requirements()[resKey]
	if ok && len(valStr) > 0 {
		val, err := strconv.Atoi(valStr[0])
		if err != nil {
			log.Errorf("failed to parse resource requirement %s from %s: %v", resKey, j.Name, err)
			return 0
		}
		return val
	}
	return 0
}
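// Usage sketch, not fleet code: Requirements appears to map requirement names
// to slices of string values, so parsing follows the shape below. The
// "MemoryMB" key is purely hypothetical (stdlib "fmt" and "strconv"):
func exampleParseRequirement() {
	reqs := map[string][]string{"MemoryMB": {"512"}}
	if vals, ok := reqs["MemoryMB"]; ok && len(vals) > 0 {
		if val, err := strconv.Atoi(vals[0]); err == nil {
			fmt.Println(val) // prints 512
		}
	}
}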