// Prune deconfigures any etcd cluster member that the scheduler does not
// track as either running or pending. At most one orphaned member is removed
// per invocation, and nothing is done unless the scheduler state is Mutable.
func (s *EtcdScheduler) Prune() error {
	s.mut.RLock()
	defer s.mut.RUnlock()
	if s.state == Mutable {
		configuredMembers, err := rpc.MemberList(s.running)
		if err != nil {
			log.Errorf("Prune could not retrieve current member list: %s", err)
			return err
		}
		for k := range configuredMembers {
			if _, present := s.running[k]; !present {
				if _, pending := s.pending[k]; !pending {
					log.Warningf("Prune attempting to deconfigure unknown etcd "+
						"instance: %s", k)
					if err := rpc.RemoveInstance(s.running, k); err != nil {
						log.Errorf("Failed to remove instance: %s", err)
					} else {
						// Remove at most one orphaned member per Prune call.
						return nil
					}
				}
			}
		}
	} else {
		log.Info("Prune skipping due to Immutable scheduler state.")
	}
	return nil
}
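// shouldLaunch reports whether the scheduler should attempt to launch a new
// etcd task right now: the scheduler must be mutable, not reseeding, have no
// pending tasks, and be below the desired instance count, and the existing
// cluster must be reachable in ZK and passing its health check.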
func (s *EtcdScheduler) shouldLaunch(driver scheduler.SchedulerDriver) bool {
	s.mut.RLock()
	defer s.mut.RUnlock()

	if s.state != Mutable {
		log.Infoln("Scheduler is not mutable. Not launching a task.")
		return false
	}

	if atomic.LoadInt32(&s.reseeding) == reseedUnderway {
		log.Infoln("Currently reseeding, not launching a task.")
		return false
	}

	if len(s.pending) != 0 {
		log.Infoln("Waiting on pending task to fail or submit status. " +
			"Not launching until we hear back.")
		return false
	}

	log.V(2).Infof("running: %+v", s.running)
	if len(s.running) >= s.desiredInstanceCount {
		log.V(2).Infoln("Already running enough tasks.")
		return false
	}

	members, err := rpc.MemberList(s.running)
	if err != nil {
		log.Errorf("Failed to retrieve running member list, "+
			"rescheduling launch attempt for later: %s", err)
		return false
	}

	if len(members) == s.desiredInstanceCount {
		log.Errorf("Cluster is already configured for desired number of nodes. " +
			"Must deconfigure any dead nodes first or we may risk livelock.")
		return false
	}

	// Ensure we can reach ZK. This is already being done implicitly in
	// the mesos-go driver, but it's not a bad thing to be pessimistic here.
	_, err = s.reconciliationInfoFunc(
		s.ZkServers,
		s.ZkChroot,
		s.FrameworkName,
	)
	if err != nil {
		log.Errorf("Could not read reconciliation info from ZK: %#+v. "+
			"Skipping task launch.", err)
		return false
	}

	err = s.healthCheck(s.running)
	if err != nil {
		atomic.StoreUint32(&s.Stats.IsHealthy, 0)
		atomic.AddUint32(&s.Stats.ClusterLivelocks, 1)
		// If we have been unhealthy for reseedTimeout seconds, it's time to reseed.
		if s.livelockWindow != nil {
			if time.Since(*s.livelockWindow) > s.reseedTimeout {
				log.Errorf("Cluster has been livelocked for longer than %d seconds!",
					s.reseedTimeout/time.Second)
				if s.autoReseedEnabled {
					log.Warningf("Initiating reseed...")
					// Set scheduler to immutable so that shouldLaunch bails out almost
					// instantly, preventing multiple reseed events from occurring
					// concurrently.
					go s.reseedCluster(driver)
				} else {
					log.Warning("Automatic reseed disabled (--auto-reseed=false). " +
						"Doing nothing.")
				}
				return false
			}
		} else {
			now := time.Now()
			s.livelockWindow = &now
		}

		log.Errorf("Failed health check, rescheduling "+
			"launch attempt for later: %s", err)
		return false
	}
	atomic.StoreUint32(&s.Stats.IsHealthy, 1)

	// Reset the livelock window because we're healthy.
	s.livelockWindow = nil
	return true
}
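// The sketch below is not part of the original scheduler; it illustrates one
// way a caller might drive Prune on a schedule. The function name, the
// one-minute ticker interval, and the stop channel are assumptions for
// illustration only. Since Prune is a no-op unless the scheduler state is
// Mutable, invoking it on a fixed interval is harmless.
func (s *EtcdScheduler) prunePeriodicallySketch(stop <-chan struct{}) {
	ticker := time.NewTicker(time.Minute)
	defer ticker.Stop()
	for {
		select {
		case <-ticker.C:
			if err := s.Prune(); err != nil {
				log.Errorf("Periodic prune failed: %s", err)
			}
		case <-stop:
			return
		}
	}
}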