Example #1
// Prune deconfigures any etcd member that the cluster still lists but that
// the scheduler no longer tracks as running or pending.
func (s *EtcdScheduler) Prune() error {
	s.mut.RLock()
	defer s.mut.RUnlock()

	if s.state != Mutable {
		log.Info("Prune skipping due to Immutable scheduler state.")
		return nil
	}

	configuredMembers, err := rpc.MemberList(s.running)
	if err != nil {
		log.Errorf("Prune could not retrieve current member list: %s", err)
		return err
	}

	for k := range configuredMembers {
		if _, present := s.running[k]; present {
			continue
		}
		if _, pending := s.pending[k]; pending {
			continue
		}
		log.Warningf("Prune attempting to deconfigure unknown etcd "+
			"instance: %s", k)
		if err := rpc.RemoveInstance(s.running, k); err != nil {
			log.Errorf("Failed to remove instance: %s", err)
			continue
		}
		// Deconfigure at most one orphaned member per invocation.
		return nil
	}
	return nil
}
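Prune is most useful when driven on a timer so that orphaned cluster members are cleaned up continuously. The sketch below is not part of etcd-mesos: the Pruner interface, the fakePruner, and the intervals are assumptions made only to show one way such a maintenance loop could be wired up around the method above.

package main

import (
	"errors"
	"log"
	"time"
)

// Pruner is a hypothetical narrow interface that *EtcdScheduler would satisfy
// via its Prune method; it exists only to keep this sketch self-contained.
type Pruner interface {
	Prune() error
}

// runPruneLoop calls Prune once per tick until stop is closed, logging but
// otherwise ignoring errors so that a transient failure does not end the loop.
func runPruneLoop(p Pruner, interval time.Duration, stop <-chan struct{}) {
	ticker := time.NewTicker(interval)
	defer ticker.Stop()
	for {
		select {
		case <-ticker.C:
			if err := p.Prune(); err != nil {
				log.Printf("prune failed: %v", err)
			}
		case <-stop:
			return
		}
	}
}

// fakePruner stands in for the real scheduler in this demo and fails every
// other call to exercise the error path.
type fakePruner struct{ calls int }

func (f *fakePruner) Prune() error {
	f.calls++
	if f.calls%2 == 0 {
		return errors.New("member list unavailable")
	}
	return nil
}

func main() {
	stop := make(chan struct{})
	go runPruneLoop(&fakePruner{}, 10*time.Millisecond, stop)
	time.Sleep(50 * time.Millisecond)
	close(stop)
}
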
Example #2
// shouldLaunch reports whether the scheduler is currently in a state where
// launching another etcd task is both safe and necessary.
func (s *EtcdScheduler) shouldLaunch(driver scheduler.SchedulerDriver) bool {
	s.mut.RLock()
	defer s.mut.RUnlock()

	if s.state != Mutable {
		log.Infoln("Scheduler is not mutable.  Not launching a task.")
		return false
	}

	if atomic.LoadInt32(&s.reseeding) == reseedUnderway {
		log.Infoln("Currently reseeding, not launching a task.")
		return false
	}

	if len(s.pending) != 0 {
		log.Infoln("Waiting on pending task to fail or submit status. " +
			"Not launching until we hear back.")
		return false
	}

	log.V(2).Infof("running: %+v", s.running)
	if len(s.running) >= s.desiredInstanceCount {
		log.V(2).Infoln("Already running enough tasks.")
		return false
	}

	members, err := rpc.MemberList(s.running)
	if err != nil {
		log.Errorf("Failed to retrieve running member list, "+
			"rescheduling launch attempt for later: %s", err)
		return false
	}
	if len(members) == s.desiredInstanceCount {
		log.Errorf("Cluster is already configured for desired number of nodes.  " +
			"Must deconfigure any dead nodes first or we may risk livelock.")
		return false
	}

	// Ensure we can reach ZK.  This is already being done implicitly in
	// the mesos-go driver, but it's not a bad thing to be pessimistic here.
	_, err = s.reconciliationInfoFunc(
		s.ZkServers,
		s.ZkChroot,
		s.FrameworkName,
	)
	if err != nil {
		log.Errorf("Could not read reconciliation info from ZK: %#+v. "+
			"Skipping task launch.", err)
		return false
	}

	err = s.healthCheck(s.running)
	if err != nil {
		atomic.StoreUint32(&s.Stats.IsHealthy, 0)
		atomic.AddUint32(&s.Stats.ClusterLivelocks, 1)
		// If we have been unhealthy for longer than reseedTimeout, it's time to reseed.
		if s.livelockWindow != nil {
			if time.Since(*s.livelockWindow) > s.reseedTimeout {
				log.Errorf("Cluster has been livelocked for longer than %d seconds!",
					s.reseedTimeout/time.Second)
				if s.autoReseedEnabled {
					log.Warningf("Initiating reseed...")
					// reseedCluster sets the scheduler state to Immutable so that
					// shouldLaunch bails out almost instantly, preventing multiple
					// reseed events from running concurrently.
					go s.reseedCluster(driver)
				} else {
					log.Warning("Automatic reseed disabled (--auto-reseed=false). " +
						"Doing nothing.")
				}
				return false
			}
		} else {
			now := time.Now()
			s.livelockWindow = &now
		}

		log.Errorf("Failed health check, rescheduling "+
			"launch attempt for later: %s", err)
		return false
	}
	atomic.StoreUint32(&s.Stats.IsHealthy, 1)

	// reset livelock window because we're healthy
	s.livelockWindow = nil
	return true
}
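The livelock handling near the end of shouldLaunch follows a small, reusable pattern: remember when an unhealthy streak began (a *time.Time window), reset it on the first healthy check, and act once the streak outlives a timeout. The self-contained sketch below restates just that pattern; the livelockTracker type and its observe method are hypothetical names, not part of etcd-mesos.

package main

import (
	"fmt"
	"time"
)

// livelockTracker mirrors the s.livelockWindow / s.reseedTimeout bookkeeping
// above, with illustrative names.
type livelockTracker struct {
	window  *time.Time    // start of the current unhealthy streak; nil when healthy
	timeout time.Duration // how long a streak may last before we call it a livelock
}

// observe records one health-check result and reports whether the unhealthy
// streak has now exceeded the timeout.
func (t *livelockTracker) observe(healthy bool) bool {
	if healthy {
		t.window = nil // mirrors s.livelockWindow = nil on a passing health check
		return false
	}
	if t.window == nil {
		now := time.Now()
		t.window = &now // the streak starts on the first failing check
		return false
	}
	return time.Since(*t.window) > t.timeout
}

func main() {
	t := &livelockTracker{timeout: 50 * time.Millisecond}
	fmt.Println(t.observe(false)) // false: the streak has only just started
	time.Sleep(60 * time.Millisecond)
	fmt.Println(t.observe(false)) // true: unhealthy for longer than the timeout
	fmt.Println(t.observe(true))  // false: a healthy check resets the window
}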