Beispiel #1
0
func (r *EtcdRegistry) getJobFromModel(jm jobModel) *job.Job {
	var err error
	var unit *unit.Unit

	// New-style Jobs should have a populated UnitHash, and the contents of the Unit are stored separately in the Registry
	if !jm.UnitHash.Empty() {
		unit = r.getUnitByHash(jm.UnitHash)
		if unit == nil {
			log.Warningf("No Unit found in Registry for Job(%s)", jm.Name)
			return nil
		}
		if unit.Hash() != jm.UnitHash {
			log.Errorf("Unit Hash %s does not match expected %s for Job(%s)!", unit.Hash(), jm.UnitHash, jm.Name)
			return nil
		}
	} else {
		// Old-style Jobs had "Payloads" instead of Units, also stored separately in the Registry
		unit, err = r.getUnitFromLegacyPayload(jm.Name)
		if err != nil {
			log.Errorf("Error retrieving legacy payload for Job(%s)", jm.Name)
			return nil
		} else if unit == nil {
			log.Warningf("No Payload found in Registry for Job(%s)", jm.Name)
			return nil
		}

		log.Infof("Migrating legacy Payload(%s)", jm.Name)
		if err := r.storeOrGetUnit(*unit); err != nil {
			log.Warningf("Unable to migrate legacy Payload: %v", err)
		}
	}

	return job.NewJob(jm.Name, *unit)
}
Beispiel #2
0
// Reconcile drives the local Agent's state towards the desired state
// stored in the Registry.
func (ar *AgentReconciler) Reconcile(a *Agent) {
	ms := a.Machine.State()

	jobs, err := ar.reg.Jobs()
	if err != nil {
		log.Errorf("Failed fetching Jobs from Registry: %v", err)
		return
	}

	dAgentState, err := ar.desiredAgentState(jobs, &ms)
	if err != nil {
		log.Errorf("Unable to determine agent's desired state: %v", err)
		return
	}

	cAgentState, err := ar.currentAgentState(a)
	if err != nil {
		log.Errorf("Unable to determine agent's current state: %v", err)
		return
	}

	for t := range ar.calculateTasksForJobs(dAgentState, cAgentState) {
		err := ar.doTask(a, t)
		if err != nil {
			log.Errorf("Failed resolving task, halting reconciliation: task=%s err=%q", t, err)
			return
		}
	}
}
Beispiel #3
0
// getUnitByHash retrieves from the Registry the Unit associated with the given Hash
func (r *EtcdRegistry) getUnitByHash(hash unit.Hash) *unit.Unit {
	req := etcd.Get{
		Key:       r.hashedUnitPath(hash),
		Recursive: true,
	}
	resp, err := r.etcd.Do(&req)
	if err != nil {
		if isKeyNotFound(err) {
			err = nil
		}
		return nil
	}
	var um unitModel
	if err := unmarshal(resp.Node.Value, &um); err != nil {
		log.Errorf("error unmarshaling Unit(%s): %v", hash, err)
		return nil
	}

	u, err := unit.NewUnit(um.Raw)
	if err != nil {
		log.Errorf("error parsing Unit(%s): %v", hash, err)
		return nil
	}

	return u
}
Beispiel #4
0
// SaveUnitState persists the given UnitState to the Registry
func (r *EtcdRegistry) SaveUnitState(jobName string, unitState *unit.UnitState) {
	usm := unitStateToModel(unitState)
	if usm == nil {
		log.Errorf("Unable to save nil UnitState model")
		return
	}

	json, err := marshal(usm)
	if err != nil {
		log.Errorf("Error marshalling UnitState: %v", err)
		return
	}

	legacyKey := r.legacyUnitStatePath(jobName)
	req := etcd.Set{
		Key:   legacyKey,
		Value: json,
	}
	r.etcd.Do(&req)

	newKey := r.unitStatePath(unitState.MachineID, jobName)
	req = etcd.Set{
		Key:   newKey,
		Value: json,
	}
	r.etcd.Do(&req)
}
Beispiel #5
0
// currentState generates a MachineState object with the values read from
// the local system
func (m *CoreOSMachine) currentState() *MachineState {
	id, err := readLocalMachineID("/")
	if err != nil {
		log.Errorf("Error retrieving machineID: %v\n", err)
		return nil
	}
	publicIP := getLocalIP()
	totalResources, err := readLocalResources()
	if err != nil {
		log.Errorf("Error retrieving local resources: %v\n", err)
		return nil
	}
	units, err := m.um.Units()
	if err != nil {
		log.Errorf("Error retrieving local units: %v\n", err)
		return nil
	}
	return &MachineState{
		ID:             id,
		PublicIP:       publicIP,
		Metadata:       make(map[string]string, 0),
		TotalResources: totalResources,
		LoadedUnits:    len(units),
	}
}
Beispiel #6
0
func (e *Engine) ResolveJobOffer(jobName string, machID string) error {
	log.V(1).Infof("Attempting to lock JobOffer(%s)", jobName)
	mutex := e.registry.LockJobOffer(jobName, e.machine.State().ID)

	if mutex == nil {
		log.V(1).Infof("Could not lock JobOffer(%s)", jobName)
		return errors.New("could not lock JobOffer")
	}
	defer mutex.Unlock()

	log.V(1).Infof("Claimed JobOffer(%s)", jobName)

	err := e.registry.ResolveJobOffer(jobName)
	if err != nil {
		log.Errorf("Failed resolving JobOffer(%s): %v", jobName, err)
		return err
	}

	err = e.registry.ScheduleJob(jobName, machID)
	if err != nil {
		log.Errorf("Failed scheduling Job(%s): %v", jobName, err)
		return err
	}

	log.Infof("Scheduled Job(%s) to Machine(%s)", jobName, machID)
	return nil
}
Beispiel #7
0
// UpdateLoggingFlagsFromConfig extracts the logging-related options from
// the provided config and sets flags in the given flagset
func UpdateLoggingFlagsFromConfig(flagset *flag.FlagSet, conf *Config) {
	err := flagset.Lookup("v").Value.Set(strconv.Itoa(conf.Verbosity))
	if err != nil {
		glog.Errorf("Failed to apply config.Verbosity to flag.v: %v", err)
	}

	err = flagset.Lookup("logtostderr").Value.Set("true")
	if err != nil {
		glog.Errorf("Failed to set flag.logtostderr to true: %v", err)
	}
}
Beispiel #8
0
func (ur *unitsResource) destroy(rw http.ResponseWriter, req *http.Request, item string) {
	if validateContentType(req) != nil {
		sendError(rw, http.StatusNotAcceptable, errors.New("application/json is only supported Content-Type"))
		return
	}

	var du schema.DeletableUnit
	dec := json.NewDecoder(req.Body)
	err := dec.Decode(&du)
	if err != nil {
		sendError(rw, http.StatusBadRequest, fmt.Errorf("unable to decode body: %v", err))
		return
	}

	var u *unit.Unit
	if len(du.FileContents) > 0 {
		u, err = decodeUnitContents(du.FileContents)
		if err != nil {
			sendError(rw, http.StatusBadRequest, fmt.Errorf("invalid fileContents: %v", err))
			return
		}
	}

	j, err := ur.reg.Job(item)
	if err != nil {
		log.Errorf("Failed fetching Job(%s): %v", item, err)
		sendError(rw, http.StatusInternalServerError, nil)
		return
	}

	if j == nil {
		sendError(rw, http.StatusNotFound, errors.New("unit does not exist"))
		return
	}

	if u != nil && u.Hash() != j.Unit.Hash() {
		sendError(rw, http.StatusConflict, errors.New("hash of provided fileContents does not match that of existing unit"))
		return
	}

	err = ur.reg.DestroyJob(item)
	if err != nil {
		log.Errorf("Failed destroying Job(%s): %v", item, err)
		sendError(rw, http.StatusInternalServerError, nil)
		return
	}

	rw.WriteHeader(http.StatusNoContent)
}
Beispiel #9
0
func (eh *EventHandler) handleLossOfMachineEvents(ev event.Event) {
	machID := ev.Payload.(string)
	mutex := eh.engine.registry.LockMachine(machID, eh.engine.machine.State().ID)
	if mutex == nil {
		log.V(1).Infof("%s(%s): failed to lock Machine, ignoring event", ev.Type, machID)
		return
	}
	defer mutex.Unlock()

	jobs := getJobsScheduledToMachine(eh.engine.registry, machID)

	for _, j := range jobs {
		log.Infof("%s(%s): clearing UnitState(%s)", ev.Type, machID, j.Name)
		err := eh.engine.registry.RemoveUnitState(j.Name)
		if err != nil {
			log.Errorf("Failed removing UnitState(%s) from Registry: %v", j.Name, err)
		}

		log.Infof("%s(%s): unscheduling Job(%s)", ev.Type, machID, j.Name)
		eh.engine.registry.ClearJobTarget(j.Name, machID)
	}

	for _, j := range jobs {
		log.Infof("%s(%s): re-publishing JobOffer(%s)", ev.Type, machID, j.Name)
		eh.engine.OfferJob(j)
	}
	eh.engine.clust.machineRemoved(machID)
}
Beispiel #10
0
func (mr *machinesResource) ServeHTTP(rw http.ResponseWriter, req *http.Request) {
	if req.Method != "GET" {
		sendError(rw, http.StatusBadRequest, fmt.Errorf("only HTTP GET supported against this resource"))
		return
	}

	token, err := findNextPageToken(req.URL)
	if err != nil {
		sendError(rw, http.StatusBadRequest, err)
		return
	}

	if token == nil {
		def := DefaultPageToken()
		token = &def
	}

	page, err := getMachinePage(mr.reg, *token)
	if err != nil {
		log.Errorf("Failed fetching page of Machines: %v", err)
		sendError(rw, http.StatusInternalServerError, nil)
		return
	}

	sendResponse(rw, http.StatusOK, page)
}
Beispiel #11
0
func (m *SystemdUnitManager) stopUnit(name string) {
	if stat, err := m.systemd.StopUnit(name, "replace"); err != nil {
		log.Errorf("Failed to stop systemd unit %s: %v", name, err)
	} else {
		log.Infof("Stopped systemd unit %s(%s)", name, stat)
	}
}
Beispiel #12
0
// bidForPossiblePeers submits bids for all known peers of the provided job that can
// be run locally
func (a *Agent) bidForPossiblePeers(jobName string) {
	peers := a.state.GetJobsByPeer(jobName)

	for _, peer := range peers {
		log.V(1).Infof("Found unresolved offer for Peer(%s) of Job(%s)", peer, jobName)

		peerJob, err := a.registry.Job(peer)
		if err != nil {
			log.Errorf("Failed fetching Job(%s) from Registry: %v", peer, err)
			return
		}

		if peerJob == nil {
			log.V(1).Infof("Unable to find Peer(%s) of Job(%s) in Registry", peer, jobName)
			return
		}

		if !a.ableToRun(peerJob) {
			log.V(1).Infof("Unable to run Peer(%s) of Job(%s), not bidding", peer, jobName)
			return
		}

		a.bid(peer)
	}
}
Beispiel #13
0
// Purge removes the Agent's state from the Registry
func (a *Agent) Purge() {
	// Continue heartbeating the agent's machine state while attempting to
	// stop all the locally-running jobs
	purged := make(chan bool)
	go a.heartbeatAgent(a.ttl, purged)

	a.state.Lock()
	scheduled := a.state.ScheduledJobs()
	a.state.Unlock()

	machID := a.Machine.State().ID
	for _, jobName := range scheduled {
		log.Infof("Unloading Job(%s) from local machine", jobName)
		a.unloadJob(jobName)
		log.Infof("Unscheduling Job(%s) from local machine", jobName)
		a.registry.ClearJobTarget(jobName, machID)
	}

	// Jobs have been stopped, the heartbeat can stop
	close(purged)

	log.Info("Removing Agent from Registry")
	if err := a.registry.RemoveMachineState(machID); err != nil {
		log.Errorf("Failed to remove Machine %s from Registry: %s", machID, err.Error())
	}
}
Beispiel #14
0
func (r *EtcdRegistry) determineJobState(jobName string) *job.JobState {
	state := job.JobStateInactive

	tgt, err := r.jobTargetMachine(jobName)
	if err != nil {
		log.Errorf("Unable to determine target of Job(%s): %v", jobName, err)
		return nil
	}

	if tgt == "" {
		return &state
	}

	if r.getUnitState(jobName) == nil {
		return &state
	}

	state = job.JobStateLoaded

	agent, pulse := r.CheckJobPulse(jobName)
	if !pulse || agent != tgt {
		return &state
	}

	state = job.JobStateLaunched
	return &state
}
Beispiel #15
0
// sendResponse attempts to marshal an arbitrary thing to JSON then write
// it to the http.ResponseWriter
func sendResponse(rw http.ResponseWriter, code int, resp interface{}) {
	enc, err := json.Marshal(resp)
	if err != nil {
		log.Errorf("Failed JSON-encoding HTTP response: %v", err)
		rw.WriteHeader(http.StatusInternalServerError)
		return
	}

	rw.Header().Set("Content-Type", "application/json")
	rw.WriteHeader(code)

	_, err = rw.Write(enc)
	if err != nil {
		log.Errorf("Failed sending HTTP response body: %v", err)
	}
}
Beispiel #16
0
func (r *Reconciler) Reconcile(e *Engine, stop chan struct{}) {
	log.V(1).Infof("Polling Registry for actionable work")

	clust, err := e.clusterState()
	if err != nil {
		log.Errorf("Failed getting current cluster state: %v", err)
		return
	}

	for t := range r.calculateClusterTasks(clust, stop) {
		err = doTask(t, e)
		if err != nil {
			log.Errorf("Failed resolving task: task=%s err=%v", t, err)
		}
	}
}
Beispiel #17
0
func (e *Engine) OfferJob(j job.Job) error {
	log.V(1).Infof("Attempting to lock Job(%s)", j.Name)

	mutex := e.registry.LockJob(j.Name, e.machine.State().ID)
	if mutex == nil {
		log.V(1).Infof("Could not lock Job(%s)", j.Name)
		return errors.New("could not lock Job")
	}
	defer mutex.Unlock()

	log.V(1).Infof("Claimed Job(%s)", j.Name)

	machineIDs, err := e.partitionCluster(&j)
	if err != nil {
		log.Errorf("failed partitioning cluster for Job(%s): %v", j.Name, err)
		return err
	}

	offer := job.NewOfferFromJob(j, machineIDs)

	err = e.registry.CreateJobOffer(offer)
	if err == nil {
		log.Infof("Published JobOffer(%s)", offer.Job.Name)
	}

	return err
}
Beispiel #18
0
func (ur *unitsResource) create(rw http.ResponseWriter, item string, ds job.JobState, u *unit.Unit) {
	j := job.NewJob(item, *u)

	if err := ur.reg.CreateJob(j); err != nil {
		log.Errorf("Failed creating Job(%s) in Registry: %v", j.Name, err)
		sendError(rw, http.StatusInternalServerError, nil)
		return
	}

	if err := ur.reg.SetJobTargetState(j.Name, ds); err != nil {
		log.Errorf("Failed setting target state of Job(%s): %v", j.Name, err)
		sendError(rw, http.StatusInternalServerError, nil)
		return
	}

	rw.WriteHeader(http.StatusNoContent)
}
Beispiel #19
0
func (r *EtcdRegistry) getJobFromObjectNode(node *etcd.Node) (*job.Job, error) {
	var err error
	var jm jobModel
	if err = unmarshal(node.Value, &jm); err != nil {
		return nil, err
	}

	var unit *unit.Unit

	// New-style Jobs should have a populated UnitHash, and the contents of the Unit are stored separately in the Registry
	if !jm.UnitHash.Empty() {
		unit = r.getUnitByHash(jm.UnitHash)
		if unit == nil {
			log.Warningf("No Unit found in Registry for Job(%s)", jm.Name)
			return nil, nil
		}
		if unit.Hash() != jm.UnitHash {
			log.Errorf("Unit Hash %s does not match expected %s for Job(%s)!", unit.Hash(), jm.UnitHash, jm.Name)
			return nil, nil
		}
	} else {
		// Old-style Jobs had "Payloads" instead of Units, also stored separately in the Registry
		unit, err = r.getUnitFromLegacyPayload(jm.Name)
		if err != nil {
			log.Errorf("Error retrieving legacy payload for Job(%s)", jm.Name)
			return nil, nil
		} else if unit == nil {
			log.Warningf("No Payload found in Registry for Job(%s)", jm.Name)
			return nil, nil
		}

		log.Infof("Migrating legacy Payload(%s)", jm.Name)
		if err := r.storeOrGetUnit(*unit); err != nil {
			log.Warningf("Unable to migrate legacy Payload: %v", err)
		}

		jm.UnitHash = unit.Hash()
		log.Infof("Updating Job(%s) with legacy payload Hash(%s)", jm.Name, jm.UnitHash)
		if err := r.updateJobObjectNode(&jm, node.ModifiedIndex); err != nil {
			log.Warningf("Unable to update Job(%s) with legacy payload Hash(%s): %v", jm.Name, jm.UnitHash, err)
		}
	}

	return job.NewJob(jm.Name, *unit), nil
}
Beispiel #20
0
func (e *Engine) unscheduleJob(jName, machID string) (err error) {
	err = e.registry.ClearJobTarget(jName, machID)
	if err != nil {
		log.Errorf("Failed clearing target Machine(%s) of Job(%s): %v", machID, jName, err)
	} else {
		log.Infof("Unscheduled Job(%s) from Machine(%s)", jName, machID)
	}
	return
}
Beispiel #21
0
func (e *Engine) resolveJobOffer(jName string) (err error) {
	err = e.registry.ResolveJobOffer(jName)
	if err != nil {
		log.Errorf("Failed resolving JobOffer(%s): %v", jName, err)
	} else {
		log.Infof("Resolved JobOffer(%s)", jName)
	}
	return
}
Beispiel #22
0
func (e *Engine) Purge() {
	if e.lease == nil {
		return
	}
	err := e.lease.Release()
	if err != nil {
		log.Errorf("Failed to release lease: %v", err)
	}
}
Beispiel #23
0
// Monitor tracks the health of the Server. If the Server is ever deemed
// unhealthy, the Server is restarted.
func (s *Server) Monitor() {
	err := s.mon.Monitor(s.hrt, s.stop)
	if err != nil {
		log.Errorf("Server monitor triggered: %v", err)

		s.Stop()
		s.Run()
	}
}
Beispiel #24
0
func (e *Engine) offerJob(j *job.Job) (err error) {
	offer := job.NewOfferFromJob(*j)
	err = e.registry.CreateJobOffer(offer)
	if err != nil {
		log.Errorf("Failed publishing JobOffer(%s): %v", j.Name, err)
	} else {
		log.Infof("Published JobOffer(%s)", j.Name)
	}
	return
}
Beispiel #25
0
func (r *dumbReconciler) Reconcile(e *Engine) {
	log.V(1).Infof("Polling Registry for actionable work")

	clust, err := e.clusterState()
	if err != nil {
		log.Errorf("Failed getting current cluster state: %v", err)
		return
	}

	taskchan := make(chan *task)
	go calculateClusterTasks(taskchan, clust)

	for t := range taskchan {
		err = doTask(t, e)
		if err != nil {
			log.Errorf("Failed resolving task: task=%s err=%v", t, err)
		}
	}
}
Beispiel #26
0
// loadJob hands the given Job to systemd without acquiring the
// state mutex. The caller is responsible for acquiring it.
func (a *Agent) loadJob(j *job.Job) {
	log.Infof("Loading Job(%s)", j.Name)
	a.state.SetTargetState(j.Name, job.JobStateLoaded)
	err := a.um.Load(j.Name, j.Unit)
	if err != nil {
		log.Errorf("Failed loading Job(%s): %v", j.Name, err)
		return
	}

	// We must explicitly refresh the payload state, as the dbus
	// event listener does not send an event when we write a unit
	// file to disk.
	us, err := a.um.GetUnitState(j.Name)
	if err != nil {
		log.Errorf("Failed fetching state of Unit(%s): %v", j.Name, err)
		return
	}
	a.ReportUnitState(j.Name, us)
}
Beispiel #27
0
// checkVersion makes a best-effort attempt to verify that fleetctl is at least as new as the
// latest fleet version found registered in the cluster. If any errors are encountered or fleetctl
// is >= the latest version found, it returns true. If it is < the latest found version, it returns
// false and a scary warning to the user.
func checkVersion() (string, bool) {
	fv := version.SemVersion
	lv, err := cAPI.LatestVersion()
	if err != nil {
		log.Errorf("error attempting to check latest fleet version in Registry: %v", err)
	} else if lv != nil && fv.LessThan(*lv) {
		return fmt.Sprintf(oldVersionWarning, fv.String(), lv.String()), false
	}
	return "", true
}
Beispiel #28
0
func (s *Server) Serve() {
	for i, _ := range s.listeners {
		l := s.listeners[i]
		go func() {
			err := http.Serve(l, s)
			if err != nil {
				log.Errorf("Failed serving HTTP on listener: %s", l.Addr)
			}
		}()
	}
}
Beispiel #29
0
// Purge attempts to unload all Jobs that have been loaded locally
func (ar *AgentReconciler) Purge(a *Agent) {
	cAgentState, err := currentAgentState(a)
	if err != nil {
		log.Errorf("Unable to determine agent's current state: %v", err)
		return
	}

	for _, cJob := range cAgentState.jobs {
		t := task{
			Type:   taskTypeUnloadJob,
			Job:    cJob,
			Reason: taskReasonPurgingAgent,
		}

		err := ar.doTask(a, &t)
		if err != nil {
			log.Errorf("Failed resolving task: task=%s err=%q", t, err)
		}
	}
}
Beispiel #30
0
func (j *Job) resourceFromKey(resKey string) int {
	valStr, ok := j.Requirements()[resKey]
	if ok && len(valStr) > 0 {
		val, err := strconv.Atoi(valStr[0])
		if err != nil {
			log.Errorf("failed to parse resource requirement %s from %s: %v", resKey, j.Name, err)
			return 0
		}
		return val
	}
	return 0
}