Beispiel #1
0
func main() {
	glog.V(0).Info("Kubernetes-Resque Worker Cleanup Service")

	kubeClient, err := getKubernetesClient()
	if err != nil {
		glog.Fatal(errors.Wrap(err, "Unable to create kubernetes client").Error())
	}
	podLister := kube_util.NewScheduledPodLister(kubeClient)
	redisClient := getRedisClient()

	ticker := time.NewTicker(time.Minute * 5)
	defer ticker.Stop()

	for _ = range ticker.C {
		glog.V(1).Infof("starting cleanup at %v...", time.Now())

		kubernetesWorkers, err := getWorkersFromKubernetes(podLister)
		if err != nil {
			glog.Warning(errors.Wrap(err, "Unable to get workers from kubernetes").Error())
			continue
		}

		redisWorkers, err := getWorkersFromRedis(redisClient)
		if err != nil {
			glog.Warning(errors.Wrap(err, "Unable to get workers from redis").Error())
			continue
		}

		deadRedisWorkers := redisWorkersNotInKubernetes(redisWorkers, kubernetesWorkers)

		glog.V(2).Infof("Found %d kubernetes workers", len(kubernetesWorkers))
		glog.V(2).Info(kubernetesWorkers)

		glog.V(2).Infof("Found %d redis workers", len(redisWorkers))
		glog.V(2).Info(redisWorkers)

		glog.V(2).Infof("Found %d dead redis workers", len(deadRedisWorkers))
		glog.V(2).Info(deadRedisWorkers)

		for _, redisWorker := range deadRedisWorkers {
			if err := removeDeadRedisWorker(redisClient, redisWorker); err != nil {
				glog.Warning(errors.Wrap(err, fmt.Sprintf("Unable to delete %s", redisWorker)).Error())
			}
		}

		failedResqueJobs, err := getDirtyExitFailedJobsFromRedis(redisClient)
		if err != nil {
			glog.Warning(errors.Wrap(err, "Unable to get failed jobs from redis").Error())
			continue
		}

		glog.V(2).Infof("Found %d failed resque jobs", len(failedResqueJobs))
		glog.V(2).Info(failedResqueJobs)

		for _, failedResqueJob := range failedResqueJobs {
			if err := removeFailedResqueJob(redisClient, failedResqueJob); err != nil {
				glog.Warning(errors.Wrap(err, "Unable to retry failed resque job").Error())
			}
		}

		glog.V(1).Infof("finished cleanup at %v...", time.Now())
	}
}
Beispiel #2
0
// In order to meet interface criteria for LeaderElectionConfig we need to
// take stop channell as an argument. However, since we are committing a suicide
// after loosing mastership we can safely ignore it.
func run(_ <-chan struct{}) {
	kubeClient := createKubeClient()

	predicateChecker, err := simulator.NewPredicateChecker(kubeClient)
	if err != nil {
		glog.Fatalf("Failed to create predicate checker: %v", err)
	}
	unschedulablePodLister := kube_util.NewUnschedulablePodLister(kubeClient, kube_api.NamespaceAll)
	scheduledPodLister := kube_util.NewScheduledPodLister(kubeClient)
	nodeLister := kube_util.NewNodeLister(kubeClient)

	lastScaleUpTime := time.Now()
	lastScaleDownFailedTrial := time.Now()
	unneededNodes := make(map[string]time.Time)
	podLocationHints := make(map[string]string)
	usageTracker := simulator.NewUsageTracker()

	recorder := createEventRecorder(kubeClient)

	var cloudProvider cloudprovider.CloudProvider

	if *cloudProviderFlag == "gce" {
		// GCE Manager
		var gceManager *gce.GceManager
		var gceError error
		if *cloudConfig != "" {
			config, fileErr := os.Open(*cloudConfig)
			if fileErr != nil {
				glog.Fatalf("Couldn't open cloud provider configuration %s: %#v", *cloudConfig, err)
			}
			defer config.Close()
			gceManager, gceError = gce.CreateGceManager(config)
		} else {
			gceManager, gceError = gce.CreateGceManager(nil)
		}
		if gceError != nil {
			glog.Fatalf("Failed to create GCE Manager: %v", err)
		}
		cloudProvider, err = gce.BuildGceCloudProvider(gceManager, nodeGroupsFlag)
		if err != nil {
			glog.Fatalf("Failed to create GCE cloud provider: %v", err)
		}
	}

	if *cloudProviderFlag == "aws" {
		var awsManager *aws.AwsManager
		var awsError error
		if *cloudConfig != "" {
			config, fileErr := os.Open(*cloudConfig)
			if fileErr != nil {
				glog.Fatalf("Couldn't open cloud provider configuration %s: %#v", *cloudConfig, err)
			}
			defer config.Close()
			awsManager, awsError = aws.CreateAwsManager(config)
		} else {
			awsManager, awsError = aws.CreateAwsManager(nil)
		}
		if awsError != nil {
			glog.Fatalf("Failed to create AWS Manager: %v", err)
		}
		cloudProvider, err = aws.BuildAwsCloudProvider(awsManager, nodeGroupsFlag)
		if err != nil {
			glog.Fatalf("Failed to create AWS cloud provider: %v", err)
		}
	}

	for {
		select {
		case <-time.After(*scanInterval):
			{
				loopStart := time.Now()
				updateLastTime("main")

				nodes, err := nodeLister.List()
				if err != nil {
					glog.Errorf("Failed to list nodes: %v", err)
					continue
				}
				if len(nodes) == 0 {
					glog.Errorf("No nodes in the cluster")
					continue
				}

				if err := CheckGroupsAndNodes(nodes, cloudProvider); err != nil {
					glog.Warningf("Cluster is not ready for autoscaling: %v", err)
					continue
				}

				allUnschedulablePods, err := unschedulablePodLister.List()
				if err != nil {
					glog.Errorf("Failed to list unscheduled pods: %v", err)
					continue
				}

				allScheduled, err := scheduledPodLister.List()
				if err != nil {
					glog.Errorf("Failed to list scheduled pods: %v", err)
					continue
				}

				// We need to reset all pods that have been marked as unschedulable not after
				// the newest node became available for the scheduler.
				allNodesAvailableTime := GetAllNodesAvailableTime(nodes)
				podsToReset, unschedulablePodsToHelp := SlicePodsByPodScheduledTime(allUnschedulablePods, allNodesAvailableTime)
				ResetPodScheduledCondition(kubeClient, podsToReset)

				// We need to check whether pods marked as unschedulable are actually unschedulable.
				// This should prevent from adding unnecessary nodes. Example of such situation:
				// - CA and Scheduler has slightly different configuration
				// - Scheduler can't schedule a pod and marks it as unschedulable
				// - CA added a node which should help the pod
				// - Scheduler doesn't schedule the pod on the new node
				//   because according to it logic it doesn't fit there
				// - CA see the pod is still unschedulable, so it adds another node to help it
				//
				// With the check enabled the last point won't happen because CA will ignore a pod
				// which is supposed to schedule on an existing node.
				//
				// Without below check cluster might be unnecessary scaled up to the max allowed size
				// in the describe situation.
				schedulablePodsPresent := false
				if *verifyUnschedulablePods {
					newUnschedulablePodsToHelp := FilterOutSchedulable(unschedulablePodsToHelp, nodes, allScheduled, predicateChecker)

					if len(newUnschedulablePodsToHelp) != len(unschedulablePodsToHelp) {
						glog.V(2).Info("Schedulable pods present")
						schedulablePodsPresent = true
					}
					unschedulablePodsToHelp = newUnschedulablePodsToHelp
				}

				if len(unschedulablePodsToHelp) == 0 {
					glog.V(1).Info("No unschedulable pods")
				} else if *maxNodesTotal > 0 && len(nodes) >= *maxNodesTotal {
					glog.V(1).Info("Max total nodes in cluster reached")
				} else {
					scaleUpStart := time.Now()
					updateLastTime("scaleup")
					scaledUp, err := ScaleUp(unschedulablePodsToHelp, nodes, cloudProvider, kubeClient, predicateChecker, recorder,
						*maxNodesTotal)

					updateDuration("scaleup", scaleUpStart)

					if err != nil {
						glog.Errorf("Failed to scale up: %v", err)
						continue
					} else {
						if scaledUp {
							lastScaleUpTime = time.Now()
							// No scale down in this iteration.
							continue
						}
					}
				}

				if *scaleDownEnabled {
					unneededStart := time.Now()

					// In dry run only utilization is updated
					calculateUnneededOnly := lastScaleUpTime.Add(*scaleDownDelay).After(time.Now()) ||
						lastScaleDownFailedTrial.Add(*scaleDownTrialInterval).After(time.Now()) ||
						schedulablePodsPresent

					glog.V(4).Infof("Scale down status: unneededOnly=%v lastScaleUpTime=%s "+
						"lastScaleDownFailedTrail=%s schedulablePodsPresent=%v", calculateUnneededOnly,
						lastScaleUpTime, lastScaleDownFailedTrial, schedulablePodsPresent)

					updateLastTime("findUnneeded")
					glog.V(4).Infof("Calculating unneded nodes")

					usageTracker.CleanUp(time.Now().Add(-(*scaleDownUnneededTime)))
					unneededNodes, podLocationHints = FindUnneededNodes(
						nodes,
						unneededNodes,
						*scaleDownUtilizationThreshold,
						allScheduled,
						predicateChecker,
						podLocationHints,
						usageTracker, time.Now())

					updateDuration("findUnneeded", unneededStart)

					for key, val := range unneededNodes {
						if glog.V(4) {
							glog.V(4).Infof("%s is unneeded since %s duration %s", key, val.String(), time.Now().Sub(val).String())
						}
					}

					if !calculateUnneededOnly {
						glog.V(4).Infof("Starting scale down")

						scaleDownStart := time.Now()
						updateLastTime("scaledown")

						result, err := ScaleDown(
							nodes,
							unneededNodes,
							*scaleDownUnneededTime,
							allScheduled,
							cloudProvider,
							kubeClient,
							predicateChecker,
							podLocationHints,
							usageTracker)

						updateDuration("scaledown", scaleDownStart)

						// TODO: revisit result handling
						if err != nil {
							glog.Errorf("Failed to scale down: %v", err)
						} else {
							if result == ScaleDownError || result == ScaleDownNoNodeDeleted {
								lastScaleDownFailedTrial = time.Now()
							}
						}
					}
				}
				updateDuration("main", loopStart)
			}
		}
	}
}