// Supervise monitors the life of the Server and coordinates its shutdown. // A shutdown occurs when the monitor returns, either because a health check // fails or a user triggers a shutdown. If the shutdown is due to a health // check failure, the Server is restarted. Supervise will block shutdown until // all components have finished shutting down or a timeout occurs; if this // happens, the Server will not automatically be restarted. func (s *Server) Supervise() { sd, err := s.mon.Monitor(s.hrt, s.killc) if sd { log.Infof("Server monitor triggered: told to shut down") } else { log.Errorf("Server monitor triggered: %v", err) } close(s.stopc) done := make(chan struct{}) go func() { s.wg.Wait() close(done) }() select { case <-done: case <-time.After(shutdownTimeout): log.Errorf("Timed out waiting for server to shut down. Panicking the server without cleanup.") panic("Failed server shutdown. Panic") } if !sd { log.Infof("Restarting server") s.SetRestartServer(true) s.Run() s.SetRestartServer(false) } }
// ConnectToRegistry allows to disable_engine fleet agents to adapt its Registry // to fleet leader changes regardless of whether is etcd or gRPC based. func (r *RegistryMux) ConnectToRegistry(e *engine.Engine) { for { // We have to check if the leader has changed to etcd otherwise keep grpc connection isGrpc, err := e.IsGrpcLeader() // If there is not error then we are able to get the leader state and continue // otherwise we have to wait if err == nil { if isGrpc { if r.rpcRegistry != nil && r.rpcRegistry.IsRegistryReady() { log.Infof("Reusing gRPC engine, connection is READY\n") r.currentRegistry = r.rpcRegistry } else { if r.rpcRegistry != nil { r.rpcRegistry.Close() } log.Infof("New engine supports gRPC, connecting\n") r.rpcRegistry = NewRPCRegistry(r.rpcDialerNoEngine) // connect to rpc registry r.rpcRegistry.Connect() r.currentRegistry = r.rpcRegistry } } else { if r.rpcRegistry != nil { r.rpcRegistry.Close() } // new leader is etcd-based r.currentRegistry = r.etcdRegistry } } time.Sleep(5 * time.Second) } }
func (e *Engine) rpcLeadership(leaseTTL time.Duration, machID string) lease.Lease { var previousEngine string if e.lease != nil { previousEngine = e.lease.MachineID() } var l lease.Lease if isLeader(e.lease, machID) { l = rpcRenewLeadership(e.lManager, e.lease, engineVersion, leaseTTL) } else { l = rpcAcquireLeadership(e.registry, e.lManager, machID, engineVersion, leaseTTL) } // log all leadership changes if l != nil && e.lease == nil && l.MachineID() != machID { log.Infof("Engine leader is %s", l.MachineID()) } else if l != nil && e.lease != nil && l.MachineID() != e.lease.MachineID() { log.Infof("Engine leadership changed from %s to %s", e.lease.MachineID(), l.MachineID()) } e.lease = l if e.lease != nil && previousEngine != e.lease.MachineID() { engineState, err := e.getMachineState(e.lease.MachineID()) if err != nil { log.Errorf("Failed to get machine state for machine %s %v", e.lease.MachineID(), err) } if engineState != nil { log.Infof("Updating engine state... engineState: %v previous: %s lease: %v", engineState, previousEngine, e.lease) go e.updateEngineState(*engineState) } } return e.lease }
func getConfig(flagset *flag.FlagSet, userCfgFile string) (*config.Config, error) { opts := globalconf.Options{EnvPrefix: "FLEET_"} if userCfgFile != "" { // Fail hard if a user-provided config is not usable fi, err := os.Stat(userCfgFile) if err != nil { log.Fatalf("Unable to use config file %s: %v", userCfgFile, err) } if fi.IsDir() { log.Fatalf("Provided config %s is a directory, not a file", userCfgFile) } log.Infof("Using provided config file %s", userCfgFile) opts.Filename = userCfgFile } else if _, err := os.Stat(DefaultConfigFile); err == nil { log.Infof("Using default config file %s", DefaultConfigFile) opts.Filename = DefaultConfigFile } else { log.Infof("No provided or default config file found - proceeding without") } gconf, err := globalconf.NewWithOptions(&opts) if err != nil { return nil, err } gconf.ParseSet("", flagset) cfg := config.Config{ Verbosity: (*flagset.Lookup("verbosity")).Value.(flag.Getter).Get().(int), EtcdServers: (*flagset.Lookup("etcd_servers")).Value.(flag.Getter).Get().(pkg.StringSlice), EtcdKeyPrefix: (*flagset.Lookup("etcd_key_prefix")).Value.(flag.Getter).Get().(string), EtcdKeyFile: (*flagset.Lookup("etcd_keyfile")).Value.(flag.Getter).Get().(string), EtcdCertFile: (*flagset.Lookup("etcd_certfile")).Value.(flag.Getter).Get().(string), EtcdCAFile: (*flagset.Lookup("etcd_cafile")).Value.(flag.Getter).Get().(string), EtcdRequestTimeout: (*flagset.Lookup("etcd_request_timeout")).Value.(flag.Getter).Get().(float64), EngineReconcileInterval: (*flagset.Lookup("engine_reconcile_interval")).Value.(flag.Getter).Get().(float64), PublicIP: (*flagset.Lookup("public_ip")).Value.(flag.Getter).Get().(string), RawMetadata: (*flagset.Lookup("metadata")).Value.(flag.Getter).Get().(string), AgentTTL: (*flagset.Lookup("agent_ttl")).Value.(flag.Getter).Get().(string), VerifyUnits: (*flagset.Lookup("verify_units")).Value.(flag.Getter).Get().(bool), TokenLimit: (*flagset.Lookup("token_limit")).Value.(flag.Getter).Get().(int), AuthorizedKeysFile: 
(*flagset.Lookup("authorized_keys_file")).Value.(flag.Getter).Get().(string), } if cfg.VerifyUnits { log.Error("Config option verify_units is no longer supported - ignoring") } if len(cfg.AuthorizedKeysFile) > 0 { log.Error("Config option authorized_keys_file is no longer supported - ignoring") } if cfg.Verbosity > 0 { log.EnableDebug() } return &cfg, nil }
func (s *Server) Run() { log.Infof("Establishing etcd connectivity") var err error for sleep := time.Second; ; sleep = pkg.ExpBackoff(sleep, time.Minute) { _, err = s.hrt.Beat(s.mon.TTL) if err == nil { break } time.Sleep(sleep) } log.Infof("Starting server components") s.stop = make(chan bool) go s.Monitor() go s.api.Available(s.stop) go s.mach.PeriodicRefresh(machineStateRefreshInterval, s.stop) go s.agent.Heartbeat(s.stop) go s.aReconciler.Run(s.agent, s.stop) if s.disableEngine { log.Info("Not starting engine; disable-engine is set") } else { go s.engine.Run(s.engineReconcileInterval, s.stop) } beatchan := make(chan *unit.UnitStateHeartbeat) go s.usGen.Run(beatchan, s.stop) go s.usPub.Run(beatchan, s.stop) }
func (ar *AgentReconciler) launchTasks(tasks []task, a *Agent) { log.Debugf("AgentReconciler attempting tasks %s", tasks) results := ar.tManager.Do(tasks, a) for _, res := range results { if res.err == nil { log.Infof("AgentReconciler completed task: type=%s job=%s reason=%q", res.task.typ, res.task.unit.Name, res.task.reason) } else { log.Infof("AgentReconciler task failed: type=%s job=%s reason=%q err=%v", res.task.typ, res.task.unit.Name, res.task.reason, res.err) } } }
func runRestartUnit(cCmd *cobra.Command, args []string) (exit int) { if len(args) == 0 { stderr("No units given") return 0 } units, err := findUnits(args) if err != nil { stderr("%v", err) return 1 } if err := lazyCreateUnits(cCmd, args); err != nil { stderr("Error creating units: %v", err) return 1 } globalUnits := make([]schema.Unit, 0) for _, unit := range units { if suToGlobal(unit) { globalUnits = append(globalUnits, unit) continue } if job.JobState(unit.CurrentState) == job.JobStateInactive { stderr("Unable to restart unit %s in state %s", unit.Name, job.JobStateInactive) continue } else if job.JobState(unit.CurrentState) == job.JobStateLoaded { log.Infof("Unit(%s) already %s, starting.", unit.Name, job.JobStateLoaded) exit = setUnitStateAndWait(unit, job.JobStateLaunched, getBlockAttempts(cCmd)) if exit == 1 { return exit } continue } else { //stop and start it exit = setUnitStateAndWait(unit, job.JobStateLoaded, getBlockAttempts(cCmd)) if exit == 1 { return exit } exit = setUnitStateAndWait(unit, job.JobStateLaunched, getBlockAttempts(cCmd)) if exit == 1 { return exit } } log.Infof("Unit(%s) was restarted.", unit.Name) } if err := cmdGlobalMachineState(cCmd, globalUnits); err != nil { stderr("Error restarting global units %v err:%v", globalUnits, err) return 1 } return }
func (r *RegistryMux) rpcDialerNoEngine(_ string, timeout time.Duration) (net.Conn, error) { ticker := time.Tick(dialRegistryReconnectTimeout) // Timeout re-defined to call etcd every 5secs to get the leader timeout = 5 * time.Second check := time.After(timeout) for { select { case <-check: log.Errorf("Unable to connect to engine %s\n", r.currentEngine.PublicIP) // Get the new engine leader of the cluster out of etcd lease, err := r.leaseManager.GetLease(engineLeaderKeyPath) // Key found if err == nil && lease != nil { var err error machines, err := r.etcdRegistry.Machines() if err != nil { log.Errorf("Unable to get the machines of the cluster %v\n", err) return nil, errors.New("Unable to get the machines of the cluster") } for _, s := range machines { // Update the currentEngine with the new one... otherwise wait until // there is one if s.ID == lease.MachineID() { // New leader has not gRPC capabilities enabled. if !s.Capabilities.Has(machine.CapGRPC) { log.Error("New leader engine has not gRPC enabled!") return nil, errors.New("New leader engine has not gRPC enabled!") } r.currentEngine = s log.Infof("Found a new engine to connect to: %s\n", r.currentEngine.PublicIP) // Restore initial check configuration timeout = 5 * time.Second check = time.After(timeout) } } } else { timeout = 2 * time.Second log.Errorf("Unable to get the leader engine, retrying in %v...", timeout) check = time.After(timeout) } case <-ticker: addr := fmt.Sprintf("%s:%d", r.currentEngine.PublicIP, rpcServerPort) conn, err := net.Dial("tcp", addr) if err == nil { log.Infof("Connected to engine on %s\n", r.currentEngine.PublicIP) return conn, nil } log.Errorf("Retry to connect to new engine: %+v", err) } } }
func (s *Server) Run() { log.Infof("Establishing etcd connectivity") var err error for sleep := time.Second; ; sleep = pkg.ExpBackoff(sleep, time.Minute) { if s.restartServer { _, err = s.hrt.Beat(s.mon.TTL) if err == nil { log.Infof("hrt.Beat() success") break } } else { _, err = s.hrt.Register(s.mon.TTL) if err == nil { log.Infof("hrt.Register() success") break } } log.Warningf("Server register machine failed: %v, retrying in %d sec.", err, sleep) time.Sleep(sleep) } go s.Supervise() log.Infof("Starting server components") s.stopc = make(chan struct{}) s.wg = sync.WaitGroup{} beatc := make(chan *unit.UnitStateHeartbeat) components := []func(){ func() { s.api.Available(s.stopc) }, func() { s.mach.PeriodicRefresh(machineStateRefreshInterval, s.stopc) }, func() { s.agent.Heartbeat(s.stopc) }, func() { s.aReconciler.Run(s.agent, s.stopc) }, func() { s.usGen.Run(beatc, s.stopc) }, func() { s.usPub.Run(beatc, s.stopc) }, } if s.disableEngine { log.Info("Not starting engine; disable-engine is set") } else { components = append(components, func() { s.engine.Run(s.engineReconcileInterval, s.stopc) }) } for _, f := range components { f := f s.wg.Add(1) go func() { f() s.wg.Done() }() } }
// Resolve attempts to yield a result from the configured action and endpoint. If a usable // Result or error was not attained, nil values are returned. func (ar *actionResolver) Resolve(cancel <-chan struct{}) (*Result, error) { resp, body, err := ar.exhaust(cancel) if err != nil { log.Infof("Failed getting response from %v: %v", ar.endpoint, err) return nil, nil } hdlr, ok := handlers[resp.StatusCode] if !ok { log.Infof("Response %s from %v unusable", resp.Status, ar.endpoint) return nil, nil } return hdlr(resp, body) }
func (m *systemdUnitManager) removeUnit(name string) (err error) { log.Infof("Removing systemd unit %s", name) // both DisableUnitFiles() and ResetFailedUnit() must be followed by // removing the unit file. Otherwise "systemctl stop fleet" could end up // hanging forever. var errf error func(name string) { _, errf = m.systemd.DisableUnitFiles([]string{name}, true) if errf != nil { err = fmt.Errorf("%v, %v", err, errf) } }(name) func(name string) { errf = m.systemd.ResetFailedUnit(name) if errf != nil { err = fmt.Errorf("%v, %v", err, errf) } }(name) ufPath := m.getUnitFilePath(name) os.Remove(ufPath) return err }
// stopUnit asks systemd to stop the named unit in "replace" mode and logs
// either the failure or the status value returned by systemd.
func (m *systemdUnitManager) stopUnit(name string) {
	stat, err := m.systemd.StopUnit(name, "replace")
	if err != nil {
		log.Errorf("Failed to stop systemd unit %s: %v", name, err)
		return
	}
	log.Infof("Stopped systemd unit %s(%s)", name, stat)
}
func acquireLeadership(lManager lease.Manager, machID string, ver int, ttl time.Duration) lease.Lease { existing, err := lManager.GetLease(engineLeaseName) if err != nil { log.Errorf("Unable to determine current lease: %v", err) return nil } var l lease.Lease if existing == nil { l, err = lManager.AcquireLease(engineLeaseName, machID, ver, ttl) if err != nil { log.Errorf("Engine leadership acquisition failed: %v", err) return nil } else if l == nil { log.Debugf("Unable to acquire engine leadership") return nil } log.Infof("Engine leadership acquired") metrics.ReportEngineLeader() return l } if existing.Version() >= ver { log.Debugf("Lease already held by Machine(%s) operating at acceptable version %d", existing.MachineID(), existing.Version()) return existing } rem := existing.TimeRemaining() l, err = lManager.StealLease(engineLeaseName, machID, ver, ttl+rem, existing.Index()) if err != nil { log.Errorf("Engine leadership steal failed: %v", err) return nil } else if l == nil { log.Debugf("Unable to steal engine leadership") return nil } log.Infof("Stole engine leadership from Machine(%s)", existing.MachineID()) metrics.ReportEngineLeader() if rem > 0 { log.Infof("Waiting %v for previous lease to expire before continuing reconciliation", rem) <-time.After(rem) } return l }
func (m *systemdUnitManager) enableUnit(name string) (bool, error) { log.Infof("Enabling systemd unit %s", name) ufPath := m.getUnitFilePath(name) ok, _, err := m.systemd.EnableUnitFiles([]string{ufPath}, true, true) return ok, err }
// getUnitFromObject takes a *etcd.Node containing a Unit's jobModel, and // instantiates and returns a representative *job.Unit, transitively fetching the // associated UnitFile as necessary func (r *EtcdRegistry) getUnitFromObjectNode(node *etcd.Node) (*job.Unit, error) { var err error var jm jobModel if err = unmarshal(node.Value, &jm); err != nil { return nil, err } var unit *unit.UnitFile // New-style Jobs should have a populated UnitHash, and the contents of the Unit are stored separately in the Registry if !jm.UnitHash.Empty() { unit = r.getUnitByHash(jm.UnitHash) if unit == nil { log.Warningf("No Unit found in Registry for Job(%s)", jm.Name) return nil, nil } } else { // Old-style Jobs had "Payloads" instead of Units, also stored separately in the Registry unit, err = r.getUnitFromLegacyPayload(jm.Name) if err != nil { log.Errorf("Error retrieving legacy payload for Job(%s)", jm.Name) return nil, nil } else if unit == nil { log.Warningf("No Payload found in Registry for Job(%s)", jm.Name) return nil, nil } log.Infof("Migrating legacy Payload(%s)", jm.Name) if err := r.storeOrGetUnitFile(*unit); err != nil { log.Warningf("Unable to migrate legacy Payload: %v", err) } jm.UnitHash = unit.Hash() log.Infof("Updating Job(%s) with legacy payload Hash(%s)", jm.Name, jm.UnitHash) if err := r.updateJobObjectNode(&jm, node.ModifiedIndex); err != nil { log.Warningf("Unable to update Job(%s) with legacy payload Hash(%s): %v", jm.Name, jm.UnitHash, err) } } ju := &job.Unit{ Name: jm.Name, Unit: *unit, } return ju, nil }
// TriggerStop asynchronously starts the unit identified by the given name. // This function does not block for the underlying unit to actually stop. func (m *systemdUnitManager) TriggerStop(name string) { jobID, err := m.systemd.StopUnit(name, "replace", nil) if err == nil { log.Infof("Triggered systemd unit %s stop: job=%d", name, jobID) } else { log.Errorf("Failed to trigger systemd unit %s stop: %v", name, err) } }
// TriggerStart asynchronously starts the unit identified by the given name. // This function does not block for the underlying unit to actually start. func (m *systemdUnitManager) TriggerStart(name string) error { jobID, err := m.systemd.StartUnit(name, "replace", nil) if err != nil { log.Errorf("Failed to trigger systemd unit %s start: %v", name, err) return err } log.Infof("Triggered systemd unit %s start: job=%d", name, jobID) return nil }
func (m *systemdUnitManager) removeUnit(name string) { log.Infof("Removing systemd unit %s", name) m.systemd.DisableUnitFiles([]string{name}, true) m.systemd.ResetFailedUnit(name) ufPath := m.getUnitFilePath(name) os.Remove(ufPath) }
func (ar *AgentReconciler) launchTaskChain(tc taskChain, a *Agent) { log.V(1).Infof("AgentReconciler attempting task chain: %s", tc) reschan, err := ar.tManager.Do(tc, a) if err != nil { log.Infof("AgentReconciler task chain failed: chain=%s err=%v", tc, err) return } go func() { for res := range reschan { if res.err == nil { log.Infof("AgentReconciler completed task: type=%s job=%s reason=%q", res.task.typ, tc.job.Name, res.task.reason) } else { log.Infof("AgentReconciler task failed: type=%s job=%s reason=%q err=%v", res.task.typ, tc.job.Name, res.task.reason, res.err) } } }() }
func (e *Engine) unscheduleUnit(name, machID string) (err error) { err = e.registry.UnscheduleUnit(name, machID) if err != nil { log.Errorf("Failed unscheduling Unit(%s) from Machine(%s): %v", name, machID, err) } else { log.Infof("Unscheduled Job(%s) from Machine(%s)", name, machID) } return }
// attemptScheduleUnit tries to persist a scheduling decision in the // Registry, returning true on success. If any communication with the // Registry fails, false is returned. func (e *Engine) attemptScheduleUnit(name, machID string) bool { err := e.registry.ScheduleUnit(name, machID) if err != nil { log.Errorf("Failed scheduling Unit(%s) to Machine(%s): %v", name, machID, err) return false } log.Infof("Scheduled Unit(%s) to Machine(%s)", name, machID) return true }
func (r *RegistryMux) EngineChanged(newEngine machine.MachineState) { r.handlingEngineChange.Lock() defer r.handlingEngineChange.Unlock() stopServer := false if r.currentEngine.ID != newEngine.ID { stopServer = true } r.currentEngine = newEngine log.Infof("Engine changed, checking capabilities %+v", newEngine) if r.localMachine.State().Capabilities.Has(machine.CapGRPC) { if r.rpcserver != nil && ((r.rpcRegistry != nil && !r.rpcRegistry.IsRegistryReady()) || stopServer) { // If the engine changed, we need to stop the rpc server r.rpcserver.Stop() r.rpcserver = nil } if newEngine.ID == r.localMachine.State().ID { if r.rpcserver == nil { // start rpc server log.Infof("Starting rpc server...\n") var err error r.rpcserver, err = NewRPCServer(r.etcdRegistry, newEngine.PublicIP) if err != nil { log.Fatalf("Unable to create rpc server %+v", err) } go func() { errc := make(chan error, 1) if errc <- r.rpcserver.Start(); <-errc != nil { log.Fatalf("Failed to serve gRPC requests on listener: %v", <-errc) } }() } } if newEngine.Capabilities.Has(machine.CapGRPC) { if r.rpcRegistry != nil && r.rpcRegistry.IsRegistryReady() { log.Infof("Reusing gRPC engine, connection is READY\n") r.currentRegistry = r.rpcRegistry } else { log.Infof("New engine supports gRPC, connecting\n") r.rpcRegistry = NewRPCRegistry(r.rpcDialer) // connect to rpc registry r.rpcRegistry.Connect() r.currentRegistry = r.rpcRegistry } } else { log.Infof("Falling back to etcd registry\n") if r.rpcserver != nil { // If the engine changed to a non gRPC leader, we need to stop the server r.rpcserver.Stop() } r.currentRegistry = r.etcdRegistry } } else { log.Infof("Falling back to etcd registry\n") r.currentRegistry = r.etcdRegistry } }
func (m *systemdUnitManager) writeUnit(name string, contents string) error { bContents := []byte(contents) log.Infof("Writing systemd unit %s (%db)", name, len(bContents)) ufPath := m.getUnitFilePath(name) err := ioutil.WriteFile(ufPath, bContents, os.FileMode(0644)) if err != nil { return err } _, err = m.systemd.LinkUnitFiles([]string{ufPath}, true, true) return err }
// Supervise monitors the life of the Server and coordinates its shutdown. // A shutdown occurs when the monitor returns, either because a health check // fails or a user triggers a shutdown. If the shutdown is due to a health // check failure, the Server is restarted. Supervise will block shutdown until // all components have finished shutting down or a timeout occurs; if this // happens, the Server will not automatically be restarted. func (s *Server) Supervise() { sd, err := s.mon.Monitor(s.hrt, s.killc) if sd { log.Infof("Server monitor triggered: told to shut down") } else { log.Errorf("Server monitor triggered: %v", err) } close(s.stopc) done := make(chan struct{}) go func() { s.wg.Wait() close(done) }() select { case <-done: case <-time.After(shutdownTimeout): log.Errorf("Timed out waiting for server to shut down") sd = true } if !sd { log.Infof("Restarting server") s.Run() } }
func rpcRenewLeadership(lManager lease.Manager, l lease.Lease, ver int, ttl time.Duration) lease.Lease { err := l.Renew(ttl) if err != nil { if eerr, ok := err.(*etcdErr.Error); ok && eerr.ErrorCode == etcdErr.EcodeKeyNotFound { log.Errorf("Retry renew etcd operation that failed due to %v", err) l, err = lManager.AcquireLease(engineLeaseName, l.MachineID(), ver, ttl) if err != nil { log.Errorf("Engine leadership re-acquisition failed: %v", err) return nil } else if l == nil { log.Infof("Unable to re-acquire engine leadership") return nil } log.Infof("Engine leadership re-acquired") return l } else { log.Errorf("Engine leadership lost, renewal failed: %v", err) return nil } } log.Debugf("Engine leadership renewed") return l }
func doTask(t *task, e *Engine) (err error) { switch t.Type { case taskTypeUnscheduleUnit: err = e.unscheduleUnit(t.JobName, t.MachineID) case taskTypeAttemptScheduleUnit: e.attemptScheduleUnit(t.JobName, t.MachineID) default: err = fmt.Errorf("unrecognized task type %q", t.Type) } if err == nil { log.Infof("EngineReconciler completed task: %s", t) } return }
func rpcAcquireLeadership(reg registry.Registry, lManager lease.Manager, machID string, ver int, ttl time.Duration) lease.Lease { existing, err := lManager.GetLease(engineLeaseName) if err != nil { log.Errorf("Unable to determine current lease: %v", err) return nil } var l lease.Lease if (existing == nil && reg.UseEtcdRegistry()) || (existing == nil && !reg.IsRegistryReady()) { l, err = lManager.AcquireLease(engineLeaseName, machID, ver, ttl) if err != nil { log.Errorf("Engine leadership acquisition failed: %v", err) return nil } else if l == nil { log.Infof("Unable to acquire engine leadership") return nil } log.Infof("Engine leadership acquired") return l } if existing != nil && existing.Version() >= ver { log.Debugf("Lease already held by Machine(%s) operating at acceptable version %d", existing.MachineID(), existing.Version()) return existing } // TODO(hector): Here we could add a possible SLA to determine when the leader // is too busy. In such a case, we can trigger a new leader election if (existing != nil && reg.UseEtcdRegistry()) || (existing != nil && !reg.IsRegistryReady()) { rem := existing.TimeRemaining() l, err = lManager.StealLease(engineLeaseName, machID, ver, ttl+rem, existing.Index()) if err != nil { log.Errorf("Engine leadership steal failed: %v", err) return nil } else if l == nil { log.Infof("Unable to steal engine leadership") return nil } log.Infof("Stole engine leadership from Machine(%s)", existing.MachineID()) if rem > 0 { log.Infof("Waiting %v for previous lease to expire before continuing reconciliation", rem) <-time.After(rem) } return l } log.Infof("Engine leader is BUSY!") return existing }
func NewRPCServer(reg registry.Registry, addr string) (*rpcserver, error) { s := &rpcserver{ etcdRegistry: reg, mu: new(sync.Mutex), localRegistry: newInmemoryRegistry(), stop: make(chan struct{}), } var err error tcpAddr, err := net.ResolveTCPAddr("tcp", fmt.Sprintf("%s:%d", addr, rpcServerPort)) if err != nil { return nil, err } for it := 0; it < bindAddrMaxRetry; it++ { s.listener, err = net.ListenTCP("tcp", tcpAddr) if err == nil { break } log.Infof("Retrying %d to bind %s address... %v", it, tcpAddr, err) time.Sleep(bindRetryTimeout) } if err != nil { return nil, err } s.grpcserver = grpc.NewServer() s.localRegistry.LoadFrom(s.etcdRegistry) pb.RegisterRegistryServer(s.grpcserver, s) s.SetServingStatus(pb.HealthCheckResponse_NOT_SERVING) machineStates, err := s.etcdRegistry.Machines() if err != nil { return nil, err } s.hasNonGRPCAgents = false for _, state := range machineStates { if !state.Capabilities.Has(machine.CapGRPC) { log.Info("Fleet cluster has non gRPC agents!. Enabled unit state storage into etcd!") s.hasNonGRPCAgents = true break } } return s, nil }
func (r *RegistryMux) rpcDialer(_ string, timeout time.Duration) (net.Conn, error) { ticker := time.Tick(dialRegistryReconnectTimeout) alert := time.After(timeout) for { select { case <-alert: log.Errorf("Unable to connect to engine %s\n", r.currentEngine.PublicIP) return nil, errors.New("Unable to connect to new engine, the client connection is closing") case <-ticker: addr := fmt.Sprintf("%s:%d", r.currentEngine.PublicIP, rpcServerPort) conn, err := net.Dial("tcp", addr) if err == nil { log.Infof("Connected to engine on %s\n", r.currentEngine.PublicIP) return conn, nil } log.Errorf("Retry to connect to new engine: %+v", err) } } }
func ensureEngineVersionMatch(cReg registry.ClusterRegistry, expect int) bool { v, err := cReg.EngineVersion() if err != nil { log.Errorf("Unable to determine cluster engine version") return false } if v < expect { err = cReg.UpdateEngineVersion(v, expect) if err != nil { log.Errorf("Failed updating cluster engine version from %d to %d: %v", v, expect, err) return false } log.Infof("Updated cluster engine version from %d to %d", v, expect) } else if v > expect { log.Debugf("Cluster engine version higher than local engine version (%d > %d), unable to participate", v, expect) return false } return true }