Example #1
// Supervise monitors the life of the Server and coordinates its shutdown.
// A shutdown occurs when the monitor returns, either because a health check
// fails or a user triggers a shutdown. If the shutdown is due to a health
// check failure, the Server is restarted. Supervise blocks until all
// components have finished shutting down or a timeout occurs; if the timeout
// is reached, the process panics instead of restarting.
func (s *Server) Supervise() {
	sd, err := s.mon.Monitor(s.hrt, s.killc)
	if sd {
		log.Infof("Server monitor triggered: told to shut down")
	} else {
		log.Errorf("Server monitor triggered: %v", err)
	}
	close(s.stopc)
	done := make(chan struct{})
	go func() {
		s.wg.Wait()
		close(done)
	}()
	select {
	case <-done:
	case <-time.After(shutdownTimeout):
		log.Errorf("Timed out waiting for server to shut down. Panicking the server without cleanup.")
		panic("Failed server shutdown. Panic")
	}
	if !sd {
		log.Infof("Restarting server")
		s.SetRestartServer(true)
		s.Run()
		s.SetRestartServer(false)
	}
}
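For context, a minimal sketch of the monitor contract Supervise relies on: Monitor blocks until either the kill channel fires (user-requested shutdown, returns true) or a health-check beat fails (returns false plus the error). The names and polling details below are assumptions for illustration, not fleet's actual implementation.

type heart interface {
	Beat(ttl time.Duration) (uint64, error)
}

// monitor is a hypothetical stand-in for s.mon.Monitor: it polls the heart
// until a beat fails or a shutdown is requested via killc.
func monitor(hrt heart, ttl time.Duration, killc <-chan struct{}) (bool, error) {
	ticker := time.NewTicker(ttl / 2)
	defer ticker.Stop()
	for {
		select {
		case <-killc:
			return true, nil // user-triggered shutdown
		case <-ticker.C:
			if _, err := hrt.Beat(ttl); err != nil {
				return false, fmt.Errorf("health check failed: %v", err)
			}
		}
	}
}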
Example #2
// ConnectToRegistry allows fleet agents running with disable_engine to adapt
// their Registry to fleet leader changes, regardless of whether the leader is
// etcd- or gRPC-based.
func (r *RegistryMux) ConnectToRegistry(e *engine.Engine) {
	for {
		// Check whether the current leader is gRPC-based; otherwise fall back to etcd
		isGrpc, err := e.IsGrpcLeader()
		// If there is no error we know the leader's state and can act on it;
		// otherwise wait for the next poll
		if err == nil {
			if isGrpc {
				if r.rpcRegistry != nil && r.rpcRegistry.IsRegistryReady() {
					log.Infof("Reusing gRPC engine, connection is READY\n")
					r.currentRegistry = r.rpcRegistry
				} else {
					if r.rpcRegistry != nil {
						r.rpcRegistry.Close()
					}
					log.Infof("New engine supports gRPC, connecting\n")
					r.rpcRegistry = NewRPCRegistry(r.rpcDialerNoEngine)
					// connect to rpc registry
					r.rpcRegistry.Connect()
					r.currentRegistry = r.rpcRegistry
				}
			} else {
				if r.rpcRegistry != nil {
					r.rpcRegistry.Close()
				}
				// new leader is etcd-based
				r.currentRegistry = r.etcdRegistry
			}
		}
		time.Sleep(5 * time.Second)
	}
}
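ConnectToRegistry polls e.IsGrpcLeader() every five seconds. The probe itself is not shown; a hedged sketch of what it might look like, reconstructed from how the other snippets use the lease manager and machine state (the body is an assumption):

func (e *Engine) IsGrpcLeader() (bool, error) {
	// Look up the current engine leader in etcd.
	lease, err := e.lManager.GetLease(engineLeaderKeyPath)
	if err != nil {
		return false, err
	}
	if lease == nil {
		return false, errors.New("no engine leader elected")
	}
	// Report whether the leading machine advertises gRPC capability.
	leader, err := e.getMachineState(lease.MachineID())
	if err != nil {
		return false, err
	}
	if leader == nil {
		return false, errors.New("leader machine state not found")
	}
	return leader.Capabilities.Has(machine.CapGRPC), nil
}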
Example #3
func (e *Engine) rpcLeadership(leaseTTL time.Duration, machID string) lease.Lease {
	var previousEngine string
	if e.lease != nil {
		previousEngine = e.lease.MachineID()
	}

	var l lease.Lease
	if isLeader(e.lease, machID) {
		l = rpcRenewLeadership(e.lManager, e.lease, engineVersion, leaseTTL)
	} else {
		l = rpcAcquireLeadership(e.registry, e.lManager, machID, engineVersion, leaseTTL)
	}

	// log all leadership changes
	if l != nil && e.lease == nil && l.MachineID() != machID {
		log.Infof("Engine leader is %s", l.MachineID())
	} else if l != nil && e.lease != nil && l.MachineID() != e.lease.MachineID() {
		log.Infof("Engine leadership changed from %s to %s", e.lease.MachineID(), l.MachineID())
	}

	e.lease = l
	if e.lease != nil && previousEngine != e.lease.MachineID() {
		engineState, err := e.getMachineState(e.lease.MachineID())
		if err != nil {
			log.Errorf("Failed to get machine state for machine %s %v", e.lease.MachineID(), err)
		}
		if engineState != nil {
			log.Infof("Updating engine state... engineState: %v previous: %s lease: %v", engineState, previousEngine, e.lease)
			go e.updateEngineState(*engineState)
		}
	}

	return e.lease
}
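rpcLeadership leans on an isLeader helper whose behavior is implied by the usage above: the engine is leader when it already holds a lease naming its own machine ID. A minimal sketch consistent with that usage:

// isLeader reports whether the given lease exists and is held by machID.
func isLeader(l lease.Lease, machID string) bool {
	if l == nil {
		return false
	}
	return l.MachineID() == machID
}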
Example #4
func getConfig(flagset *flag.FlagSet, userCfgFile string) (*config.Config, error) {
	opts := globalconf.Options{EnvPrefix: "FLEET_"}

	if userCfgFile != "" {
		// Fail hard if a user-provided config is not usable
		fi, err := os.Stat(userCfgFile)
		if err != nil {
			log.Fatalf("Unable to use config file %s: %v", userCfgFile, err)
		}
		if fi.IsDir() {
			log.Fatalf("Provided config %s is a directory, not a file", userCfgFile)
		}

		log.Infof("Using provided config file %s", userCfgFile)
		opts.Filename = userCfgFile

	} else if _, err := os.Stat(DefaultConfigFile); err == nil {
		log.Infof("Using default config file %s", DefaultConfigFile)
		opts.Filename = DefaultConfigFile
	} else {
		log.Infof("No provided or default config file found - proceeding without")
	}

	gconf, err := globalconf.NewWithOptions(&opts)
	if err != nil {
		return nil, err
	}

	if err := gconf.ParseSet("", flagset); err != nil {
		return nil, err
	}

	cfg := config.Config{
		Verbosity:               flagset.Lookup("verbosity").Value.(flag.Getter).Get().(int),
		EtcdServers:             flagset.Lookup("etcd_servers").Value.(flag.Getter).Get().(pkg.StringSlice),
		EtcdKeyPrefix:           flagset.Lookup("etcd_key_prefix").Value.(flag.Getter).Get().(string),
		EtcdKeyFile:             flagset.Lookup("etcd_keyfile").Value.(flag.Getter).Get().(string),
		EtcdCertFile:            flagset.Lookup("etcd_certfile").Value.(flag.Getter).Get().(string),
		EtcdCAFile:              flagset.Lookup("etcd_cafile").Value.(flag.Getter).Get().(string),
		EtcdRequestTimeout:      flagset.Lookup("etcd_request_timeout").Value.(flag.Getter).Get().(float64),
		EngineReconcileInterval: flagset.Lookup("engine_reconcile_interval").Value.(flag.Getter).Get().(float64),
		PublicIP:                flagset.Lookup("public_ip").Value.(flag.Getter).Get().(string),
		RawMetadata:             flagset.Lookup("metadata").Value.(flag.Getter).Get().(string),
		AgentTTL:                flagset.Lookup("agent_ttl").Value.(flag.Getter).Get().(string),
		VerifyUnits:             flagset.Lookup("verify_units").Value.(flag.Getter).Get().(bool),
		TokenLimit:              flagset.Lookup("token_limit").Value.(flag.Getter).Get().(int),
		AuthorizedKeysFile:      flagset.Lookup("authorized_keys_file").Value.(flag.Getter).Get().(string),
	}

	if cfg.VerifyUnits {
		log.Error("Config option verify_units is no longer supported - ignoring")
	}
	if len(cfg.AuthorizedKeysFile) > 0 {
		log.Error("Config option authorized_keys_file is no longer supported - ignoring")
	}

	if cfg.Verbosity > 0 {
		log.EnableDebug()
	}

	return &cfg, nil
}
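The chain of type assertions in getConfig only holds if every flag was registered on the FlagSet with a matching type, since the standard library's flag values implement flag.Getter. A sketch of the registration this code presupposes (defaults and help strings are illustrative, not fleet's):

func defineFlags(flagset *flag.FlagSet) {
	flagset.Int("verbosity", 0, "Logging verbosity")
	flagset.String("etcd_key_prefix", "/_coreos.com/fleet/", "etcd keyspace for fleet data")
	flagset.Float64("etcd_request_timeout", 1.0, "etcd request timeout, in seconds")
	flagset.Bool("verify_units", false, "(deprecated) verify unit signatures")
	flagset.Int("token_limit", 100, "Maximum API results per page")
	// etcd_servers is a custom pkg.StringSlice; a custom flag.Value must also
	// implement Get() for the flag.Getter assertion above to succeed.
}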
Example #5
func (s *Server) Run() {
	log.Infof("Establishing etcd connectivity")

	var err error
	for sleep := time.Second; ; sleep = pkg.ExpBackoff(sleep, time.Minute) {
		_, err = s.hrt.Beat(s.mon.TTL)
		if err == nil {
			break
		}
		time.Sleep(sleep)
	}

	log.Infof("Starting server components")

	s.stop = make(chan bool)

	go s.Monitor()
	go s.api.Available(s.stop)
	go s.mach.PeriodicRefresh(machineStateRefreshInterval, s.stop)
	go s.agent.Heartbeat(s.stop)
	go s.aReconciler.Run(s.agent, s.stop)
	if s.disableEngine {
		log.Info("Not starting engine; disable-engine is set")
	} else {
		go s.engine.Run(s.engineReconcileInterval, s.stop)
	}

	beatchan := make(chan *unit.UnitStateHeartbeat)
	go s.usGen.Run(beatchan, s.stop)
	go s.usPub.Run(beatchan, s.stop)
}
Example #6
func (ar *AgentReconciler) launchTasks(tasks []task, a *Agent) {
	log.Debugf("AgentReconciler attempting tasks %s", tasks)
	results := ar.tManager.Do(tasks, a)
	for _, res := range results {
		if res.err == nil {
			log.Infof("AgentReconciler completed task: type=%s job=%s reason=%q", res.task.typ, res.task.unit.Name, res.task.reason)
		} else {
			log.Infof("AgentReconciler task failed: type=%s job=%s reason=%q err=%v", res.task.typ, res.task.unit.Name, res.task.reason, res.err)
		}
	}
}
Example #7
func runRestartUnit(cCmd *cobra.Command, args []string) (exit int) {
	if len(args) == 0 {
		stderr("No units given")
		return 0
	}
	units, err := findUnits(args)
	if err != nil {
		stderr("%v", err)
		return 1
	}

	if err := lazyCreateUnits(cCmd, args); err != nil {
		stderr("Error creating units: %v", err)
		return 1
	}

	globalUnits := make([]schema.Unit, 0)
	for _, unit := range units {
		if suToGlobal(unit) {
			globalUnits = append(globalUnits, unit)
			continue
		}
		if job.JobState(unit.CurrentState) == job.JobStateInactive {
			stderr("Unable to restart unit %s in state %s", unit.Name, job.JobStateInactive)
			continue
		} else if job.JobState(unit.CurrentState) == job.JobStateLoaded {
			log.Infof("Unit(%s) already %s, starting.", unit.Name, job.JobStateLoaded)

			exit = setUnitStateAndWait(unit, job.JobStateLaunched, getBlockAttempts(cCmd))
			if exit == 1 {
				return exit
			}
			continue
		} else {
			// Stop, then start the unit
			exit = setUnitStateAndWait(unit, job.JobStateLoaded, getBlockAttempts(cCmd))
			if exit == 1 {
				return exit
			}
			exit = setUnitStateAndWait(unit, job.JobStateLaunched, getBlockAttempts(cCmd))
			if exit == 1 {
				return exit
			}
		}
		log.Infof("Unit(%s) was restarted.", unit.Name)
	}

	if err := cmdGlobalMachineState(cCmd, globalUnits); err != nil {
		stderr("Error restarting global units %v err:%v", globalUnits, err)
		return 1
	}

	return
}
Example #8
func (r *RegistryMux) rpcDialerNoEngine(_ string, timeout time.Duration) (net.Conn, error) {
	ticker := time.Tick(dialRegistryReconnectTimeout)
	// Redefine the timeout so we re-query etcd for the current leader every 5 seconds
	timeout = 5 * time.Second
	check := time.After(timeout)

	for {
		select {
		case <-check:
			log.Errorf("Unable to connect to engine %s\n", r.currentEngine.PublicIP)
			// Get the new engine leader of the cluster out of etcd
			lease, err := r.leaseManager.GetLease(engineLeaderKeyPath)
			// Key found
			if err == nil && lease != nil {
				machines, err := r.etcdRegistry.Machines()
				if err != nil {
					log.Errorf("Unable to get the machines of the cluster %v\n", err)
					return nil, errors.New("Unable to get the machines of the cluster")
				}
				for _, s := range machines {
					// Update the currentEngine with the new one... otherwise wait until
					// there is one
					if s.ID == lease.MachineID() {
						// The new leader does not have gRPC capabilities enabled.
						if !s.Capabilities.Has(machine.CapGRPC) {
							log.Error("New leader engine does not have gRPC enabled!")
							return nil, errors.New("New leader engine does not have gRPC enabled")
						}
						}
						r.currentEngine = s
						log.Infof("Found a new engine to connect to: %s\n", r.currentEngine.PublicIP)
						// Restore initial check configuration
						timeout = 5 * time.Second
						check = time.After(timeout)
					}
				}
			} else {
				timeout = 2 * time.Second
				log.Errorf("Unable to get the leader engine, retrying in %v...", timeout)
				check = time.After(timeout)
			}
		case <-ticker:
			addr := fmt.Sprintf("%s:%d", r.currentEngine.PublicIP, rpcServerPort)
			conn, err := net.Dial("tcp", addr)
			if err == nil {
				log.Infof("Connected to engine on %s\n", r.currentEngine.PublicIP)
				return conn, nil
			}
			log.Errorf("Retry to connect to new engine: %+v", err)
		}
	}
}
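A dialer with this signature (addr string, timeout time.Duration) is what gRPC's WithDialer option expects, which is presumably how NewRPCRegistry uses it. A hedged sketch of that wiring (field names, the target string, and the Connect body are assumptions):

func (r *RPCRegistry) Connect() {
	// The target string is ignored because the custom dialer chooses the
	// engine address itself; WithBlock makes Dial wait until the dialer
	// actually finds a reachable leader.
	conn, err := grpc.Dial("fleet-engine", grpc.WithInsecure(),
		grpc.WithDialer(r.dialer), grpc.WithBlock())
	if err != nil {
		log.Fatalf("Unable to dial engine: %v", err)
	}
	r.registryClient = pb.NewRegistryClient(conn)
}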
Example #9
func (s *Server) Run() {
	log.Infof("Establishing etcd connectivity")

	var err error
	for sleep := time.Second; ; sleep = pkg.ExpBackoff(sleep, time.Minute) {
		if s.restartServer {
			_, err = s.hrt.Beat(s.mon.TTL)
			if err == nil {
				log.Infof("hrt.Beat() success")
				break
			}
		} else {
			_, err = s.hrt.Register(s.mon.TTL)
			if err == nil {
				log.Infof("hrt.Register() success")
				break
			}
		}
		log.Warningf("Server register machine failed: %v, retrying in %v", err, sleep)
		time.Sleep(sleep)
	}

	go s.Supervise()

	log.Infof("Starting server components")
	s.stopc = make(chan struct{})
	s.wg = sync.WaitGroup{}
	beatc := make(chan *unit.UnitStateHeartbeat)

	components := []func(){
		func() { s.api.Available(s.stopc) },
		func() { s.mach.PeriodicRefresh(machineStateRefreshInterval, s.stopc) },
		func() { s.agent.Heartbeat(s.stopc) },
		func() { s.aReconciler.Run(s.agent, s.stopc) },
		func() { s.usGen.Run(beatc, s.stopc) },
		func() { s.usPub.Run(beatc, s.stopc) },
	}
	if s.disableEngine {
		log.Info("Not starting engine; disable-engine is set")
	} else {
		components = append(components, func() { s.engine.Run(s.engineReconcileInterval, s.stopc) })
	}
	for _, f := range components {
		f := f // capture the loop variable for the goroutine below
		s.wg.Add(1)
		go func() {
			f()
			s.wg.Done()
		}()
	}
}
Example #10
// Resolve attempts to yield a result from the configured action and endpoint. If a usable
// Result or error was not attained, nil values are returned.
func (ar *actionResolver) Resolve(cancel <-chan struct{}) (*Result, error) {
	resp, body, err := ar.exhaust(cancel)
	if err != nil {
		log.Infof("Failed getting response from %v: %v", ar.endpoint, err)
		return nil, nil
	}

	hdlr, ok := handlers[resp.StatusCode]
	if !ok {
		log.Infof("Response %s from %v unusable", resp.Status, ar.endpoint)
		return nil, nil
	}

	return hdlr(resp, body)
}
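The handlers table consulted here maps HTTP status codes to functions with the shape implied by the call hdlr(resp, body). A sketch of what an entry might look like (status code and decoding are illustrative assumptions):

type handlerFunc func(resp *http.Response, body []byte) (*Result, error)

var handlers = map[int]handlerFunc{
	http.StatusOK: func(resp *http.Response, body []byte) (*Result, error) {
		// Decode a usable Result from the response body.
		var res Result
		if err := json.Unmarshal(body, &res); err != nil {
			return nil, err
		}
		return &res, nil
	},
}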
Example #11
func (m *systemdUnitManager) removeUnit(name string) (err error) {
	log.Infof("Removing systemd unit %s", name)

	// Both DisableUnitFiles() and ResetFailedUnit() must be followed by
	// removing the unit file; otherwise "systemctl stop fleet" could end up
	// hanging forever. Accumulate their errors rather than returning early.
	if _, errf := m.systemd.DisableUnitFiles([]string{name}, true); errf != nil {
		err = errf
	}
	if errf := m.systemd.ResetFailedUnit(name); errf != nil {
		if err == nil {
			err = errf
		} else {
			err = fmt.Errorf("%v, %v", err, errf)
		}
	}

	ufPath := m.getUnitFilePath(name)
	os.Remove(ufPath)

	return err
}
Example #12
func (m *systemdUnitManager) stopUnit(name string) {
	if stat, err := m.systemd.StopUnit(name, "replace"); err != nil {
		log.Errorf("Failed to stop systemd unit %s: %v", name, err)
	} else {
		log.Infof("Stopped systemd unit %s(%s)", name, stat)
	}
}
Example #13
func acquireLeadership(lManager lease.Manager, machID string, ver int, ttl time.Duration) lease.Lease {
	existing, err := lManager.GetLease(engineLeaseName)
	if err != nil {
		log.Errorf("Unable to determine current lease: %v", err)
		return nil
	}

	var l lease.Lease
	if existing == nil {
		l, err = lManager.AcquireLease(engineLeaseName, machID, ver, ttl)
		if err != nil {
			log.Errorf("Engine leadership acquisition failed: %v", err)
			return nil
		} else if l == nil {
			log.Debugf("Unable to acquire engine leadership")
			return nil
		}
		log.Infof("Engine leadership acquired")
		metrics.ReportEngineLeader()
		return l
	}

	if existing.Version() >= ver {
		log.Debugf("Lease already held by Machine(%s) operating at acceptable version %d", existing.MachineID(), existing.Version())
		return existing
	}

	rem := existing.TimeRemaining()
	l, err = lManager.StealLease(engineLeaseName, machID, ver, ttl+rem, existing.Index())
	if err != nil {
		log.Errorf("Engine leadership steal failed: %v", err)
		return nil
	} else if l == nil {
		log.Debugf("Unable to steal engine leadership")
		return nil
	}

	log.Infof("Stole engine leadership from Machine(%s)", existing.MachineID())
	metrics.ReportEngineLeader()

	if rem > 0 {
		log.Infof("Waiting %v for previous lease to expire before continuing reconciliation", rem)
		<-time.After(rem)
	}

	return l
}
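The lease.Lease surface these leadership functions rely on can be read off their usage; a reconstruction (fleet's actual interface may carry additional methods):

type Lease interface {
	Renew(ttl time.Duration) error
	MachineID() string
	Version() int
	Index() uint64
	TimeRemaining() time.Duration
}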
Example #14
func (m *systemdUnitManager) enableUnit(name string) (bool, error) {
	log.Infof("Enabling systemd unit %s", name)

	ufPath := m.getUnitFilePath(name)

	ok, _, err := m.systemd.EnableUnitFiles([]string{ufPath}, true, true)
	return ok, err
}
Example #15
// getUnitFromObjectNode takes an *etcd.Node containing a Unit's jobModel, and
// instantiates and returns a representative *job.Unit, transitively fetching
// the associated UnitFile as necessary.
func (r *EtcdRegistry) getUnitFromObjectNode(node *etcd.Node) (*job.Unit, error) {
	var err error
	var jm jobModel
	if err = unmarshal(node.Value, &jm); err != nil {
		return nil, err
	}

	var unit *unit.UnitFile

	// New-style Jobs should have a populated UnitHash, and the contents of the Unit are stored separately in the Registry
	if !jm.UnitHash.Empty() {
		unit = r.getUnitByHash(jm.UnitHash)
		if unit == nil {
			log.Warningf("No Unit found in Registry for Job(%s)", jm.Name)
			return nil, nil
		}
	} else {
		// Old-style Jobs had "Payloads" instead of Units, also stored separately in the Registry
		unit, err = r.getUnitFromLegacyPayload(jm.Name)
		if err != nil {
			log.Errorf("Error retrieving legacy payload for Job(%s)", jm.Name)
			return nil, nil
		} else if unit == nil {
			log.Warningf("No Payload found in Registry for Job(%s)", jm.Name)
			return nil, nil
		}

		log.Infof("Migrating legacy Payload(%s)", jm.Name)
		if err := r.storeOrGetUnitFile(*unit); err != nil {
			log.Warningf("Unable to migrate legacy Payload: %v", err)
		}

		jm.UnitHash = unit.Hash()
		log.Infof("Updating Job(%s) with legacy payload Hash(%s)", jm.Name, jm.UnitHash)
		if err := r.updateJobObjectNode(&jm, node.ModifiedIndex); err != nil {
			log.Warningf("Unable to update Job(%s) with legacy payload Hash(%s): %v", jm.Name, jm.UnitHash, err)
		}
	}

	ju := &job.Unit{
		Name: jm.Name,
		Unit: *unit,
	}
	return ju, nil
}
Example #16
// TriggerStop asynchronously stops the unit identified by the given name.
// This function does not block until the underlying unit actually stops.
func (m *systemdUnitManager) TriggerStop(name string) {
	jobID, err := m.systemd.StopUnit(name, "replace", nil)
	if err == nil {
		log.Infof("Triggered systemd unit %s stop: job=%d", name, jobID)
	} else {
		log.Errorf("Failed to trigger systemd unit %s stop: %v", name, err)
	}
}
Example #17
// TriggerStart asynchronously starts the unit identified by the given name.
// This function does not block until the underlying unit actually starts.
func (m *systemdUnitManager) TriggerStart(name string) error {
	jobID, err := m.systemd.StartUnit(name, "replace", nil)
	if err != nil {
		log.Errorf("Failed to trigger systemd unit %s start: %v", name, err)
		return err
	}
	log.Infof("Triggered systemd unit %s start: job=%d", name, jobID)
	return nil
}
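Both trigger functions pass a nil completion channel, so they return as soon as systemd queues the job. go-systemd's StartUnit also accepts a channel that receives the job result; a hedged sketch of a blocking variant built on the same call (this helper is hypothetical, not part of the snippet's manager):

// startUnitAndWait blocks until systemd reports the job result
// ("done", "failed", "timeout", ...).
func (m *systemdUnitManager) startUnitAndWait(name string) (string, error) {
	ch := make(chan string, 1)
	if _, err := m.systemd.StartUnit(name, "replace", ch); err != nil {
		return "", err
	}
	return <-ch, nil
}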
Example #18
func (m *systemdUnitManager) removeUnit(name string) {
	log.Infof("Removing systemd unit %s", name)

	m.systemd.DisableUnitFiles([]string{name}, true)
	m.systemd.ResetFailedUnit(name)

	ufPath := m.getUnitFilePath(name)
	os.Remove(ufPath)
}
Example #19
func (ar *AgentReconciler) launchTaskChain(tc taskChain, a *Agent) {
	log.V(1).Infof("AgentReconciler attempting task chain: %s", tc)
	reschan, err := ar.tManager.Do(tc, a)
	if err != nil {
		log.Infof("AgentReconciler task chain failed: chain=%s err=%v", tc, err)
		return
	}

	go func() {
		for res := range reschan {
			if res.err == nil {
				log.Infof("AgentReconciler completed task: type=%s job=%s reason=%q", res.task.typ, tc.job.Name, res.task.reason)
			} else {
				log.Infof("AgentReconciler task failed: type=%s job=%s reason=%q err=%v", res.task.typ, tc.job.Name, res.task.reason, res.err)
			}
		}
	}()
}
Example #20
func (e *Engine) unscheduleUnit(name, machID string) (err error) {
	err = e.registry.UnscheduleUnit(name, machID)
	if err != nil {
		log.Errorf("Failed unscheduling Unit(%s) from Machine(%s): %v", name, machID, err)
	} else {
		log.Infof("Unscheduled Job(%s) from Machine(%s)", name, machID)
	}
	return
}
Example #21
// attemptScheduleUnit tries to persist a scheduling decision in the
// Registry, returning true on success. If any communication with the
// Registry fails, false is returned.
func (e *Engine) attemptScheduleUnit(name, machID string) bool {
	err := e.registry.ScheduleUnit(name, machID)
	if err != nil {
		log.Errorf("Failed scheduling Unit(%s) to Machine(%s): %v", name, machID, err)
		return false
	}

	log.Infof("Scheduled Unit(%s) to Machine(%s)", name, machID)
	return true
}
Example #22
func (r *RegistryMux) EngineChanged(newEngine machine.MachineState) {
	r.handlingEngineChange.Lock()
	defer r.handlingEngineChange.Unlock()

	stopServer := false
	if r.currentEngine.ID != newEngine.ID {
		stopServer = true
	}
	r.currentEngine = newEngine
	log.Infof("Engine changed, checking capabilities %+v", newEngine)
	if r.localMachine.State().Capabilities.Has(machine.CapGRPC) {
		if r.rpcserver != nil && ((r.rpcRegistry != nil && !r.rpcRegistry.IsRegistryReady()) || stopServer) {
			// If the engine changed, we need to stop the rpc server
			r.rpcserver.Stop()
			r.rpcserver = nil
		}
		if newEngine.ID == r.localMachine.State().ID {
			if r.rpcserver == nil {
				// start rpc server
				log.Infof("Starting rpc server...\n")
				var err error
				r.rpcserver, err = NewRPCServer(r.etcdRegistry, newEngine.PublicIP)
				if err != nil {
					log.Fatalf("Unable to create rpc server %+v", err)
				}

				go func() {
					if err := r.rpcserver.Start(); err != nil {
						log.Fatalf("Failed to serve gRPC requests on listener: %v", err)
					}
				}()
			}

		}
		if newEngine.Capabilities.Has(machine.CapGRPC) {
			if r.rpcRegistry != nil && r.rpcRegistry.IsRegistryReady() {
				log.Infof("Reusing gRPC engine, connection is READY\n")
				r.currentRegistry = r.rpcRegistry
			} else {
				log.Infof("New engine supports gRPC, connecting\n")
				r.rpcRegistry = NewRPCRegistry(r.rpcDialer)
				// connect to rpc registry
				r.rpcRegistry.Connect()
				r.currentRegistry = r.rpcRegistry
			}
		} else {
			log.Infof("Falling back to etcd registry\n")
			if r.rpcserver != nil {
				// If the engine changed to a non-gRPC leader, we need to stop the server
				r.rpcserver.Stop()
			}
			r.currentRegistry = r.etcdRegistry
		}

	} else {
		log.Infof("Falling back to etcd registry\n")
		r.currentRegistry = r.etcdRegistry
	}
}
Example #23
func (m *systemdUnitManager) writeUnit(name string, contents string) error {
	bContents := []byte(contents)
	log.Infof("Writing systemd unit %s (%db)", name, len(bContents))

	ufPath := m.getUnitFilePath(name)
	err := ioutil.WriteFile(ufPath, bContents, os.FileMode(0644))
	if err != nil {
		return err
	}

	_, err = m.systemd.LinkUnitFiles([]string{ufPath}, true, true)
	return err
}
Example #24
// Supervise monitors the life of the Server and coordinates its shutdown.
// A shutdown occurs when the monitor returns, either because a health check
// fails or a user triggers a shutdown. If the shutdown is due to a health
// check failure, the Server is restarted. Supervise will block shutdown until
// all components have finished shutting down or a timeout occurs; if this
// happens, the Server will not automatically be restarted.
func (s *Server) Supervise() {
	sd, err := s.mon.Monitor(s.hrt, s.killc)
	if sd {
		log.Infof("Server monitor triggered: told to shut down")
	} else {
		log.Errorf("Server monitor triggered: %v", err)
	}
	close(s.stopc)
	done := make(chan struct{})
	go func() {
		s.wg.Wait()
		close(done)
	}()
	select {
	case <-done:
	case <-time.After(shutdownTimeout):
		log.Errorf("Timed out waiting for server to shut down")
		sd = true
	}
	if !sd {
		log.Infof("Restarting server")
		s.Run()
	}
}
Example #25
func rpcRenewLeadership(lManager lease.Manager, l lease.Lease, ver int, ttl time.Duration) lease.Lease {
	err := l.Renew(ttl)
	if err != nil {
		if eerr, ok := err.(*etcdErr.Error); ok && eerr.ErrorCode == etcdErr.EcodeKeyNotFound {
			log.Errorf("Retry renew etcd operation that failed due to %v", err)
			l, err = lManager.AcquireLease(engineLeaseName, l.MachineID(), ver, ttl)
			if err != nil {
				log.Errorf("Engine leadership re-acquisition failed: %v", err)
				return nil
			} else if l == nil {
				log.Infof("Unable to re-acquire engine leadership")
				return nil
			}
			log.Infof("Engine leadership re-acquired")
			return l
		} else {
			log.Errorf("Engine leadership lost, renewal failed: %v", err)
			return nil
		}
	}

	log.Debugf("Engine leadership renewed")
	return l
}
Example #26
func doTask(t *task, e *Engine) (err error) {
	switch t.Type {
	case taskTypeUnscheduleUnit:
		err = e.unscheduleUnit(t.JobName, t.MachineID)
	case taskTypeAttemptScheduleUnit:
		if !e.attemptScheduleUnit(t.JobName, t.MachineID) {
			err = fmt.Errorf("failed scheduling Unit(%s) to Machine(%s)", t.JobName, t.MachineID)
		}
	default:
		err = fmt.Errorf("unrecognized task type %q", t.Type)
	}

	if err == nil {
		log.Infof("EngineReconciler completed task: %s", t)
	}

	return
}
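The task value consumed by doTask is not defined in these snippets; its fields can be read off the switch above, and the %s formatting implies a String method. A reconstruction (field set and formatting are assumptions):

type task struct {
	Type      string
	JobName   string
	MachineID string
}

func (t *task) String() string {
	return fmt.Sprintf("{Type: %s, JobName: %s, MachineID: %s}",
		t.Type, t.JobName, t.MachineID)
}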
Example #27
func rpcAcquireLeadership(reg registry.Registry, lManager lease.Manager, machID string, ver int, ttl time.Duration) lease.Lease {
	existing, err := lManager.GetLease(engineLeaseName)
	if err != nil {
		log.Errorf("Unable to determine current lease: %v", err)
		return nil
	}

	var l lease.Lease
	if existing == nil && (reg.UseEtcdRegistry() || !reg.IsRegistryReady()) {
		l, err = lManager.AcquireLease(engineLeaseName, machID, ver, ttl)
		if err != nil {
			log.Errorf("Engine leadership acquisition failed: %v", err)
			return nil
		} else if l == nil {
			log.Infof("Unable to acquire engine leadership")
			return nil
		}
		log.Infof("Engine leadership acquired")
		return l
	}

	if existing != nil && existing.Version() >= ver {
		log.Debugf("Lease already held by Machine(%s) operating at acceptable version %d", existing.MachineID(), existing.Version())
		return existing
	}

	// TODO(hector): Here we could add a possible SLA to determine when the leader
	// is too busy. In such a case, we can trigger a new leader election
	if existing != nil && (reg.UseEtcdRegistry() || !reg.IsRegistryReady()) {
		rem := existing.TimeRemaining()
		l, err = lManager.StealLease(engineLeaseName, machID, ver, ttl+rem, existing.Index())
		if err != nil {
			log.Errorf("Engine leadership steal failed: %v", err)
			return nil
		} else if l == nil {
			log.Infof("Unable to steal engine leadership")
			return nil
		}

		log.Infof("Stole engine leadership from Machine(%s)", existing.MachineID())

		if rem > 0 {
			log.Infof("Waiting %v for previous lease to expire before continuing reconciliation", rem)
			<-time.After(rem)
		}

		return l
	}

	log.Infof("Engine leader is BUSY!")

	return existing
}
Example #28
func NewRPCServer(reg registry.Registry, addr string) (*rpcserver, error) {
	s := &rpcserver{
		etcdRegistry:  reg,
		mu:            new(sync.Mutex),
		localRegistry: newInmemoryRegistry(),
		stop:          make(chan struct{}),
	}
	var err error
	tcpAddr, err := net.ResolveTCPAddr("tcp", fmt.Sprintf("%s:%d", addr, rpcServerPort))
	if err != nil {
		return nil, err
	}
	for it := 0; it < bindAddrMaxRetry; it++ {
		s.listener, err = net.ListenTCP("tcp", tcpAddr)
		if err == nil {
			break
		}
		log.Infof("Retrying %d to bind %s address... %v", it, tcpAddr, err)
		time.Sleep(bindRetryTimeout)
	}
	if err != nil {
		return nil, err
	}

	s.grpcserver = grpc.NewServer()
	s.localRegistry.LoadFrom(s.etcdRegistry)
	pb.RegisterRegistryServer(s.grpcserver, s)

	s.SetServingStatus(pb.HealthCheckResponse_NOT_SERVING)

	machineStates, err := s.etcdRegistry.Machines()
	if err != nil {
		return nil, err
	}
	s.hasNonGRPCAgents = false
	for _, state := range machineStates {
		if !state.Capabilities.Has(machine.CapGRPC) {
			log.Info("Fleet cluster has non gRPC agents!. Enabled unit state storage into etcd!")
			s.hasNonGRPCAgents = true
			break
		}
	}
	return s, nil
}
Example #29
func (r *RegistryMux) rpcDialer(_ string, timeout time.Duration) (net.Conn, error) {
	ticker := time.Tick(dialRegistryReconnectTimeout)
	alert := time.After(timeout)

	for {
		select {
		case <-alert:
			log.Errorf("Unable to connect to engine %s\n", r.currentEngine.PublicIP)
			return nil, errors.New("Unable to connect to new engine, the client connection is closing")
		case <-ticker:
			addr := fmt.Sprintf("%s:%d", r.currentEngine.PublicIP, rpcServerPort)
			conn, err := net.Dial("tcp", addr)
			if err == nil {
				log.Infof("Connected to engine on %s\n", r.currentEngine.PublicIP)
				return conn, nil
			}
			log.Errorf("Retry to connect to new engine: %+v", err)
		}
	}
}
Example #30
func ensureEngineVersionMatch(cReg registry.ClusterRegistry, expect int) bool {
	v, err := cReg.EngineVersion()
	if err != nil {
		log.Errorf("Unable to determine cluster engine version")
		return false
	}

	if v < expect {
		err = cReg.UpdateEngineVersion(v, expect)
		if err != nil {
			log.Errorf("Failed updating cluster engine version from %d to %d: %v", v, expect, err)
			return false
		}
		log.Infof("Updated cluster engine version from %d to %d", v, expect)
	} else if v > expect {
		log.Debugf("Cluster engine version higher than local engine version (%d > %d), unable to participate", v, expect)
		return false
	}

	return true
}
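UpdateEngineVersion(v, expect) is used as a compare-and-swap: it must fail if another engine already bumped the version past v, so two engines cannot both report an upgrade. A sketch of an etcd v2-backed implementation under that assumption (key path and helper names are hypothetical):

func (r *EtcdRegistry) UpdateEngineVersion(from, to int) error {
	key := r.prefixed("/engine-version")
	// PrevValue turns the write into a CAS: it fails if the stored
	// version no longer equals `from`.
	opts := &etcd.SetOptions{PrevValue: strconv.Itoa(from)}
	_, err := r.kAPI.Set(context.Background(), key, strconv.Itoa(to), opts)
	return err
}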