func filterInvalidPods(pods []*api.Pod, source string, recorder record.EventRecorder) (filtered []*api.Pod) {
	names := sets.String{}
	for i, pod := range pods {
		var errlist field.ErrorList
		if errs := validation.ValidatePod(pod); len(errs) != 0 {
			errlist = append(errlist, errs...)
			// If validation fails, don't trust it any further -
			// even Name could be bad.
		} else {
			name := kubecontainer.GetPodFullName(pod)
			if names.Has(name) {
				// TODO: when validation becomes versioned, this gets a bit
				// more complicated.
				errlist = append(errlist, field.Duplicate(field.NewPath("metadata", "name"), pod.Name))
			} else {
				names.Insert(name)
			}
		}
		if len(errlist) > 0 {
			name := bestPodIdentString(pod)
			err := errlist.ToAggregate()
			glog.Warningf("Pod[%d] (%s) from %s failed validation, ignoring: %v", i+1, name, source, err)
			recorder.Eventf(pod, api.EventTypeWarning, kubecontainer.FailedValidation, "Error validating pod %s from %s, ignoring: %v", name, source, err)
			continue
		}
		filtered = append(filtered, pod)
	}
	return
}
func filterInvalidPods(pods []*api.Pod, source string, recorder record.EventRecorder) (filtered []*api.Pod) {
	names := sets.String{}
	for i, pod := range pods {
		var errlist []error
		if errs := validation.ValidatePod(pod); len(errs) != 0 {
			errlist = append(errlist, errs...)
			// If validation fails, don't trust it any further -
			// even Name could be bad.
		} else {
			name := kubecontainer.GetPodFullName(pod)
			if names.Has(name) {
				errlist = append(errlist, fielderrors.NewFieldDuplicate("name", pod.Name))
			} else {
				names.Insert(name)
			}
		}
		if len(errlist) > 0 {
			name := bestPodIdentString(pod)
			err := utilerrors.NewAggregate(errlist)
			glog.Warningf("Pod[%d] (%s) from %s failed validation, ignoring: %v", i+1, name, source, err)
			recorder.Eventf(pod, "FailedValidation", "Error validating pod %s from %s, ignoring: %v", name, source, err)
			continue
		}
		filtered = append(filtered, pod)
	}
	return
}
func recordNodeEvent(recorder record.EventRecorder, nodeName, nodeUID, eventtype, reason, event string) {
	ref := &api.ObjectReference{
		Kind:      "Node",
		Name:      nodeName,
		UID:       types.UID(nodeUID),
		Namespace: "",
	}
	glog.V(2).Infof("Recording %s event message for node %s", event, nodeName)
	recorder.Eventf(ref, eventtype, reason, "Node %s event: %s", nodeName, event)
}
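A minimal wiring sketch, not taken from the snippets above: the helpers in this collection expect an EventRecorder built from an event broadcaster. The "node-controller" component name, the logging hookup, and the sample node values below are illustrative assumptions.

// Hedged sketch: build an EventRecorder and exercise recordNodeEvent with it.
// Component name and node identifiers here are made up for illustration.
eventBroadcaster := record.NewBroadcaster()
eventBroadcaster.StartLogging(glog.Infof)
recorder := eventBroadcaster.NewRecorder(api.EventSource{Component: "node-controller"})

recordNodeEvent(recorder, "node-1", "1234-uid", api.EventTypeNormal, "ExampleReason", "example event text")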
func recordNodeStatusChange(recorder record.EventRecorder, node *api.Node, new_status string) {
	ref := &api.ObjectReference{
		Kind:      "Node",
		Name:      node.Name,
		UID:       node.UID,
		Namespace: "",
	}
	glog.V(2).Infof("Recording status change %s event message for node %s", new_status, node.Name)
	// TODO: This requires a transaction, either both node status is updated
	// and event is recorded or neither should happen, see issue #6055.
	recorder.Eventf(ref, api.EventTypeNormal, new_status, "Node %s status is now: %s", node.Name, new_status)
}
// deletePods will delete all pods from master running on given node, and return true
// if any pods were deleted, or were found pending deletion.
func deletePods(kubeClient clientset.Interface, recorder record.EventRecorder, nodeName, nodeUID string, daemonStore cache.StoreToDaemonSetLister) (bool, error) {
	remaining := false
	selector := fields.OneTermEqualSelector(api.PodHostField, nodeName)
	options := api.ListOptions{FieldSelector: selector}
	pods, err := kubeClient.Core().Pods(api.NamespaceAll).List(options)
	var updateErrList []error

	if err != nil {
		return remaining, err
	}

	if len(pods.Items) > 0 {
		recordNodeEvent(recorder, nodeName, nodeUID, api.EventTypeNormal, "DeletingAllPods", fmt.Sprintf("Deleting all Pods from Node %v.", nodeName))
	}

	for _, pod := range pods.Items {
		// Defensive check, also needed for tests.
		if pod.Spec.NodeName != nodeName {
			continue
		}

		// Set reason and message in the pod object.
		if _, err = setPodTerminationReason(kubeClient, &pod, nodeName); err != nil {
			if errors.IsConflict(err) {
				updateErrList = append(updateErrList, fmt.Errorf("update status failed for pod %q: %v", format.Pod(&pod), err))
				continue
			}
		}
		// if the pod has already been marked for deletion, we still return true that there are remaining pods.
		if pod.DeletionGracePeriodSeconds != nil {
			remaining = true
			continue
		}
		// if the pod is managed by a daemonset, ignore it
		_, err := daemonStore.GetPodDaemonSets(&pod)
		if err == nil { // No error means at least one daemonset was found
			continue
		}

		glog.V(2).Infof("Starting deletion of pod %v", pod.Name)
		recorder.Eventf(&pod, api.EventTypeNormal, "NodeControllerEviction", "Marking for deletion Pod %s from Node %s", pod.Name, nodeName)
		if err := kubeClient.Core().Pods(pod.Namespace).Delete(pod.Name, nil); err != nil {
			return false, err
		}
		remaining = true
	}

	if len(updateErrList) > 0 {
		return false, utilerrors.NewAggregate(updateErrList)
	}
	return remaining, nil
}
// Removes the given node from cloud provider. No extra pre-deletion actions are executed on
// the Kubernetes side.
func deleteNodeFromCloudProvider(node *apiv1.Node, cloudProvider cloudprovider.CloudProvider, recorder kube_record.EventRecorder) error {
	nodeGroup, err := cloudProvider.NodeGroupForNode(node)
	if err != nil {
		return fmt.Errorf("failed to get node group for %s: %v", node.Name, err)
	}
	if nodeGroup == nil || reflect.ValueOf(nodeGroup).IsNil() {
		return fmt.Errorf("picked node that doesn't belong to a node group: %s", node.Name)
	}
	if err = nodeGroup.DeleteNodes([]*apiv1.Node{node}); err != nil {
		return fmt.Errorf("failed to delete %s: %v", node.Name, err)
	}
	recorder.Eventf(node, apiv1.EventTypeNormal, "ScaleDown", "node removed by cluster autoscaler")
	return nil
}
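The reflect.ValueOf(nodeGroup).IsNil() check above guards against an interface holding a typed nil pointer, which does not compare equal to nil. A small self-contained illustration of that Go behavior (the error interface stands in for cloudprovider.NodeGroup):

package main

import (
	"fmt"
	"reflect"
)

type myErr struct{}

func (*myErr) Error() string { return "boom" }

func main() {
	var p *myErr    // typed nil pointer
	var e error = p // interface now holds (type=*myErr, value=nil)
	fmt.Println(e == nil)                   // false: the interface itself is non-nil
	fmt.Println(reflect.ValueOf(e).IsNil()) // true: the pointer it wraps is nil
}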
// killPodNow returns a KillPodFunc that can be used to kill a pod.
// It is intended to be injected into other modules that need to kill a pod.
func killPodNow(podWorkers PodWorkers, recorder record.EventRecorder) eviction.KillPodFunc {
	return func(pod *v1.Pod, status v1.PodStatus, gracePeriodOverride *int64) error {
		// determine the grace period to use when killing the pod
		gracePeriod := int64(0)
		if gracePeriodOverride != nil {
			gracePeriod = *gracePeriodOverride
		} else if pod.Spec.TerminationGracePeriodSeconds != nil {
			gracePeriod = *pod.Spec.TerminationGracePeriodSeconds
		}

		// we timeout and return an error if we don't get a callback within a reasonable time.
		// the default timeout is relative to the grace period (we settle on 2s to wait for kubelet->runtime traffic to complete in sigkill)
		timeout := int64(gracePeriod + (gracePeriod / 2))
		minTimeout := int64(2)
		if timeout < minTimeout {
			timeout = minTimeout
		}
		timeoutDuration := time.Duration(timeout) * time.Second

		// open a channel we block against until we get a result
		type response struct {
			err error
		}
		ch := make(chan response)
		podWorkers.UpdatePod(&UpdatePodOptions{
			Pod:        pod,
			UpdateType: kubetypes.SyncPodKill,
			OnCompleteFunc: func(err error) {
				ch <- response{err: err}
			},
			KillPodOptions: &KillPodOptions{
				PodStatusFunc: func(p *v1.Pod, podStatus *kubecontainer.PodStatus) v1.PodStatus {
					return status
				},
				PodTerminationGracePeriodSecondsOverride: gracePeriodOverride,
			},
		})

		// wait for either a response, or a timeout
		select {
		case r := <-ch:
			return r.err
		case <-time.After(timeoutDuration):
			recorder.Eventf(pod, v1.EventTypeWarning, events.ExceededGracePeriod, "Container runtime did not kill the pod within specified grace period.")
			return fmt.Errorf("timeout waiting to kill pod")
		}
	}
}
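The timeout arithmetic above can be restated in isolation; a minimal sketch (the helper name killTimeout is hypothetical, not part of the kubelet) with a worked example: a 30s grace period waits up to 45s, while a 0s grace period still waits the 2s floor.

// killTimeout mirrors the rule used in killPodNow: wait 1.5x the grace period,
// but never less than 2 seconds, for the runtime to report the kill result.
func killTimeout(gracePeriodSeconds int64) time.Duration {
	timeout := gracePeriodSeconds + gracePeriodSeconds/2
	if timeout < 2 {
		timeout = 2
	}
	return time.Duration(timeout) * time.Second
}

// killTimeout(30) == 45 * time.Second
// killTimeout(0)  == 2 * time.Second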
// The caller of this function must remove the taint if this function returns an error.
func prepareNodeForPod(client *kube_client.Client, recorder kube_record.EventRecorder, predicateChecker *ca_simulator.PredicateChecker, originalNode *kube_api.Node, criticalPod *kube_api.Pod) error {
	// Operate on a copy of the node to ensure pods running on the node will pass CheckPredicates below.
	node, err := copyNode(originalNode)
	if err != nil {
		return fmt.Errorf("Error while copying node: %v", err)
	}
	err = addTaint(client, originalNode, podId(criticalPod))
	if err != nil {
		return fmt.Errorf("Error while adding taint: %v", err)
	}

	requiredPods, otherPods, err := groupPods(client, node)
	if err != nil {
		return err
	}

	nodeInfo := schedulercache.NewNodeInfo(requiredPods...)
	nodeInfo.SetNode(node)

	// check whether the critical pod still fits
	if err := predicateChecker.CheckPredicates(criticalPod, nodeInfo); err != nil {
		return fmt.Errorf("Pod %s doesn't fit to node %v: %v", podId(criticalPod), node.Name, err)
	}
	requiredPods = append(requiredPods, criticalPod)
	nodeInfo = schedulercache.NewNodeInfo(requiredPods...)
	nodeInfo.SetNode(node)

	for _, p := range otherPods {
		if err := predicateChecker.CheckPredicates(p, nodeInfo); err != nil {
			glog.Infof("Pod %s will be deleted in order to schedule critical pod %s.", podId(p), podId(criticalPod))
			recorder.Eventf(p, kube_api.EventTypeNormal, "DeletedByRescheduler",
				"Deleted by rescheduler in order to schedule critical pod %s.", podId(criticalPod))
			// TODO(piosz): add better support of graceful deletion
			delErr := client.Pods(p.Namespace).Delete(p.Name, kube_api.NewDeleteOptions(10))
			if delErr != nil {
				return fmt.Errorf("Failed to delete pod %s: %v", podId(p), delErr)
			}
		} else {
			newPods := append(nodeInfo.Pods(), p)
			nodeInfo = schedulercache.NewNodeInfo(newPods...)
			nodeInfo.SetNode(node)
		}
	}

	// TODO(piosz): how to reset scheduler backoff?
	return nil
}
// Performs drain logic on the node. Marks the node as unschedulable and later removes all pods, giving
// them up to MaxGracefulTerminationTime to finish.
func drainNode(node *apiv1.Node, pods []*apiv1.Pod, client kube_client.Interface, recorder kube_record.EventRecorder, maxGracefulTerminationSec int) error {
	if err := markToBeDeleted(node, client, recorder); err != nil {
		return err
	}

	maxGraceful64 := int64(maxGracefulTerminationSec)
	for _, pod := range pods {
		recorder.Eventf(pod, apiv1.EventTypeNormal, "ScaleDown", "deleting pod for node scale down")
		err := client.Core().Pods(pod.Namespace).Delete(pod.Name, &apiv1.DeleteOptions{
			GracePeriodSeconds: &maxGraceful64,
		})
		if err != nil {
			glog.Errorf("Failed to delete %s/%s: %v", pod.Namespace, pod.Name, err)
		}
	}
	allGone := true

	// Wait up to MaxGracefulTerminationTime.
	for start := time.Now(); time.Now().Sub(start) < time.Duration(maxGracefulTerminationSec)*time.Second; time.Sleep(5 * time.Second) {
		allGone = true
		for _, pod := range pods {
			podreturned, err := client.Core().Pods(pod.Namespace).Get(pod.Name)
			if err == nil {
				glog.Errorf("Not deleted yet %v", podreturned)
				allGone = false
				break
			}
			if !errors.IsNotFound(err) {
				glog.Errorf("Failed to check pod %s/%s: %v", pod.Namespace, pod.Name, err)
				allGone = false
			}
		}
		if allGone {
			glog.V(1).Infof("All pods removed from %s", node.Name)
			break
		}
	}
	if !allGone {
		glog.Warningf("Not all pods were removed from %s, proceeding anyway", node.Name)
	}
	return nil
}
// Sets unschedulable=true and adds an annotation.
func markToBeDeleted(node *apiv1.Node, client kube_client.Interface, recorder kube_record.EventRecorder) error {
	// Get the newest version of the node.
	freshNode, err := client.Core().Nodes().Get(node.Name)
	if err != nil || freshNode == nil {
		return fmt.Errorf("failed to get node %v: %v", node.Name, err)
	}

	added, err := addToBeDeletedTaint(freshNode)
	if !added {
		return err
	}
	_, err = client.Core().Nodes().Update(freshNode)
	if err != nil {
		glog.Warningf("Error while adding taints on node %v: %v", node.Name, err)
		return err
	}
	glog.V(1).Infof("Successfully added toBeDeletedTaint on node %v", node.Name)
	recorder.Eventf(node, apiv1.EventTypeNormal, "ScaleDown", "marking the node as unschedulable")
	return nil
}
// cleanToBeDeleted cleans ToBeDeleted taints.
func cleanToBeDeleted(nodes []*apiv1.Node, client kube_client.Interface, recorder kube_record.EventRecorder) error {
	for _, node := range nodes {
		taints, err := apiv1.GetTaintsFromNodeAnnotations(node.Annotations)
		if err != nil {
			glog.Warningf("Error while getting Taints for node %v: %v", node.Name, err)
			continue
		}

		newTaints := make([]apiv1.Taint, 0)
		for _, taint := range taints {
			if taint.Key == ToBeDeletedTaint {
				glog.Infof("Releasing taint %+v on node %v", taint, node.Name)
			} else {
				newTaints = append(newTaints, taint)
			}
		}

		if len(newTaints) != len(taints) {
			taintsJson, err := json.Marshal(newTaints)
			if err != nil {
				glog.Warningf("Error while releasing taints on node %v: %v", node.Name, err)
				continue
			}
			if node.Annotations == nil {
				node.Annotations = make(map[string]string)
			}
			node.Annotations[apiv1.TaintsAnnotationKey] = string(taintsJson)
			_, err = client.Core().Nodes().Update(node)
			if err != nil {
				glog.Warningf("Error while releasing taints on node %v: %v", node.Name, err)
			} else {
				glog.V(1).Infof("Successfully released toBeDeletedTaint on node %v", node.Name)
				recorder.Eventf(node, apiv1.EventTypeNormal, "ClusterAutoscalerCleanup", "marking the node as schedulable")
			}
		}
	}
	return nil
}
// deletePods will delete all pods from master running on given node, and return true
// if any pods were deleted.
func deletePods(kubeClient clientset.Interface, recorder record.EventRecorder, nodeName, nodeUID string, daemonStore cache.StoreToDaemonSetLister) (bool, error) {
	remaining := false
	selector := fields.OneTermEqualSelector(api.PodHostField, nodeName)
	options := api.ListOptions{FieldSelector: selector}
	pods, err := kubeClient.Core().Pods(api.NamespaceAll).List(options)
	if err != nil {
		return remaining, err
	}

	if len(pods.Items) > 0 {
		recordNodeEvent(recorder, nodeName, nodeUID, api.EventTypeNormal, "DeletingAllPods", fmt.Sprintf("Deleting all Pods from Node %v.", nodeName))
	}

	for _, pod := range pods.Items {
		// Defensive check, also needed for tests.
		if pod.Spec.NodeName != nodeName {
			continue
		}
		// if the pod has already been deleted, ignore it
		if pod.DeletionGracePeriodSeconds != nil {
			continue
		}
		// if the pod is managed by a daemonset, ignore it
		_, err := daemonStore.GetPodDaemonSets(&pod)
		if err == nil { // No error means at least one daemonset was found
			continue
		}

		glog.V(2).Infof("Starting deletion of pod %v", pod.Name)
		recorder.Eventf(&pod, api.EventTypeNormal, "NodeControllerEviction", "Marking for deletion Pod %s from Node %s", pod.Name, nodeName)
		if err := kubeClient.Core().Pods(pod.Namespace).Delete(pod.Name, nil); err != nil {
			return false, err
		}
		remaining = true
	}
	return remaining, nil
}
// SyncOne reconciles a CronJob with a list of any Jobs that it created.
// All known jobs created by "sj" should be included in "js".
// The current time is passed in to facilitate testing.
// It has no receiver, to facilitate testing.
func SyncOne(sj batch.CronJob, js []batch.Job, now time.Time, jc jobControlInterface, sjc sjControlInterface, pc podControlInterface, recorder record.EventRecorder) {
	nameForLog := fmt.Sprintf("%s/%s", sj.Namespace, sj.Name)

	for i := range js {
		j := js[i]
		found := inActiveList(sj, j.ObjectMeta.UID)
		if !found && !job.IsJobFinished(&j) {
			recorder.Eventf(&sj, api.EventTypeWarning, "UnexpectedJob", "Saw a job that the controller did not create or forgot: %v", j.Name)
			// We found an unfinished job that has us as the parent, but it is not in our Active list.
			// This could happen if we crashed right after creating the Job and before updating the status,
			// or if our jobs list is newer than our sj status after a relist, or if someone intentionally created
			// a job that they wanted us to adopt.

			// TODO: maybe handle the adoption case? Concurrency/suspend rules will not apply in that case, obviously, since we can't
			// stop users from creating jobs if they have permission. It is assumed that if a
			// user has permission to create a job within a namespace, then they have permission to make any scheduledJob
			// in the same namespace "adopt" that job. ReplicaSets and their Pods work the same way.
			// TBS: how to update sj.Status.LastScheduleTime if the adopted job is newer than any we knew about?
		} else if found && job.IsJobFinished(&j) {
			deleteFromActiveList(&sj, j.ObjectMeta.UID)
			// TODO: event to call out failure vs success.
			recorder.Eventf(&sj, api.EventTypeNormal, "SawCompletedJob", "Saw completed job: %v", j.Name)
		}
	}
	updatedSJ, err := sjc.UpdateStatus(&sj)
	if err != nil {
		glog.Errorf("Unable to update status for %s (rv = %s): %v", nameForLog, sj.ResourceVersion, err)
	}
	sj = *updatedSJ

	if sj.Spec.Suspend != nil && *sj.Spec.Suspend {
		glog.V(4).Infof("Not starting job for %s because it is suspended", nameForLog)
		return
	}
	times, err := getRecentUnmetScheduleTimes(sj, now)
	if err != nil {
		glog.Errorf("Cannot determine if %s needs to be started: %v", nameForLog, err)
	}
	// TODO: handle multiple unmet start times, from oldest to newest, updating status as needed.
	if len(times) == 0 {
		glog.V(4).Infof("No unmet start times for %s", nameForLog)
		return
	}
	if len(times) > 1 {
		glog.V(4).Infof("Multiple unmet start times for %s so only starting last one", nameForLog)
	}
	scheduledTime := times[len(times)-1]
	tooLate := false
	if sj.Spec.StartingDeadlineSeconds != nil {
		tooLate = scheduledTime.Add(time.Second * time.Duration(*sj.Spec.StartingDeadlineSeconds)).Before(now)
	}
	if tooLate {
		glog.V(4).Infof("Missed starting window for %s", nameForLog)
		// TODO: generate an event for a miss. Use a warning level event because it indicates a
		// problem with the controller (restart or long queue), and is not expected by the user either.
		// Since we don't set LastScheduleTime when not scheduling, we are going to keep noticing
		// the miss every cycle. In order to avoid sending multiple events, and to avoid processing
		// the sj again and again, we could set a Status.LastMissedTime when we notice a miss.
		// Then, when we call getRecentUnmetScheduleTimes, we can take max(creationTimestamp,
		// Status.LastScheduleTime, Status.LastMissedTime), so that we won't generate
		// an event the next time we process it, and also so the user looking at the status
		// can easily see that there was a missed execution.
		return
	}
	if sj.Spec.ConcurrencyPolicy == batch.ForbidConcurrent && len(sj.Status.Active) > 0 {
		// Regardless of which source of information we use for the set of active jobs,
		// there is some risk that we won't see an active job when there is one
		// (because we haven't seen the status update to the SJ or the created pod).
		// So it is theoretically possible to have concurrency with Forbid.
		// As long as the invocations are "far enough apart in time", this usually won't happen.
		//
		// TODO: for Forbid, we could use the same name for every execution, as a lock.
		// With replace, we could use a name that is deterministic per execution time.
		// But that would mean that you could not inspect prior successes or failures of Forbid jobs.
		glog.V(4).Infof("Not starting job for %s because a prior execution is still running and concurrency policy is Forbid", nameForLog)
		return
	}
	if sj.Spec.ConcurrencyPolicy == batch.ReplaceConcurrent {
		for _, j := range sj.Status.Active {
			// TODO: this should be replaced with server side job deletion
			// currently this mimics JobReaper from pkg/kubectl/stop.go
			glog.V(4).Infof("Deleting job %s of %s that was still running at next scheduled start time", j.Name, nameForLog)

			job, err := jc.GetJob(j.Namespace, j.Name)
			if err != nil {
				recorder.Eventf(&sj, api.EventTypeWarning, "FailedGet", "Get job: %v", err)
				return
			}
			// scale job down to 0
			if *job.Spec.Parallelism != 0 {
				zero := int32(0)
				job.Spec.Parallelism = &zero
				job, err = jc.UpdateJob(job.Namespace, job)
				if err != nil {
					recorder.Eventf(&sj, api.EventTypeWarning, "FailedUpdate", "Update job: %v", err)
					return
				}
			}
			// remove all pods...
			selector, _ := unversioned.LabelSelectorAsSelector(job.Spec.Selector)
			options := api.ListOptions{LabelSelector: selector}
			podList, err := pc.ListPods(job.Namespace, options)
			if err != nil {
				recorder.Eventf(&sj, api.EventTypeWarning, "FailedList", "List job-pods: %v", err)
			}
			errList := []error{}
			for _, pod := range podList.Items {
				glog.V(2).Infof("CronJob controller is deleting Pod %v/%v", pod.Namespace, pod.Name)
				if err := pc.DeletePod(pod.Namespace, pod.Name); err != nil {
					// ignores the error when the pod isn't found
					if !errors.IsNotFound(err) {
						errList = append(errList, err)
					}
				}
			}
			if len(errList) != 0 {
				recorder.Eventf(&sj, api.EventTypeWarning, "FailedDelete", "Deleted job-pods: %v", utilerrors.NewAggregate(errList))
				return
			}
			// ... the job itself...
			if err := jc.DeleteJob(job.Namespace, job.Name); err != nil {
				recorder.Eventf(&sj, api.EventTypeWarning, "FailedDelete", "Deleted job: %v", err)
				glog.Errorf("Error deleting job %s from %s: %v", job.Name, nameForLog, err)
				return
			}
			// ... and its reference from active list
			deleteFromActiveList(&sj, job.ObjectMeta.UID)
			recorder.Eventf(&sj, api.EventTypeNormal, "SuccessfulDelete", "Deleted job %v", j.Name)
		}
	}

	jobReq, err := getJobFromTemplate(&sj, scheduledTime)
	if err != nil {
		glog.Errorf("Unable to make Job from template in %s: %v", nameForLog, err)
		return
	}
	jobResp, err := jc.CreateJob(sj.Namespace, jobReq)
	if err != nil {
		recorder.Eventf(&sj, api.EventTypeWarning, "FailedCreate", "Error creating job: %v", err)
		return
	}
	glog.V(4).Infof("Created Job %s for %s", jobResp.Name, nameForLog)
	recorder.Eventf(&sj, api.EventTypeNormal, "SuccessfulCreate", "Created job %v", jobResp.Name)

	// ------------------------------------------------------------------ //

	// If this process restarts at this point (after posting a job, but
	// before updating the status), then we might try to start the job on
	// the next time. Actually, if we relist the SJs and Jobs on the next
	// iteration of SyncAll, we might not see our own status update, and
	// then post one again. So, we need to use the job name as a lock to
	// prevent us from making the job twice (name the job with hash of its
	// scheduled time).

	// Add the just-started job to the status list.
	ref, err := getRef(jobResp)
	if err != nil {
		glog.V(2).Infof("Unable to make object reference for job for %s", nameForLog)
	} else {
		sj.Status.Active = append(sj.Status.Active, *ref)
	}
	sj.Status.LastScheduleTime = &unversioned.Time{Time: scheduledTime}
	if _, err := sjc.UpdateStatus(&sj); err != nil {
		glog.Infof("Unable to update status for %s (rv = %s): %v", nameForLog, sj.ResourceVersion, err)
	}

	return
}
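The closing comment suggests using the job name as a lock by naming each run after a hash of its scheduled time. A hedged sketch of that idea (both helper names are hypothetical, not the controller's actual implementation):

// jobNameForRun derives a deterministic name from the CronJob name and the
// scheduled time, so a retried create after a crash collides with the job
// that was already posted instead of starting a duplicate.
func getTimeHash(scheduledTime time.Time) string {
	return strconv.FormatInt(scheduledTime.Unix(), 10)
}

func jobNameForRun(sjName string, scheduledTime time.Time) string {
	return fmt.Sprintf("%s-%s", sjName, getTimeHash(scheduledTime))
}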
// ScaleUp tries to scale the cluster up. Returns true if it found a way to increase the size,
// false if it didn't, and an error if one occurred. Assumes that all nodes in the cluster are
// ready and in sync with instance groups.
func ScaleUp(unschedulablePods []*kube_api.Pod, nodes []*kube_api.Node, cloudProvider cloudprovider.CloudProvider, kubeClient *kube_client.Client,
	predicateChecker *simulator.PredicateChecker, recorder kube_record.EventRecorder, maxNodesTotal int, estimatorName string) (bool, error) {

	// From now on we only care about unschedulable pods that were marked after the newest
	// node became available for the scheduler.
	if len(unschedulablePods) == 0 {
		glog.V(1).Info("No unschedulable pods")
		return false, nil
	}

	for _, pod := range unschedulablePods {
		glog.V(1).Infof("Pod %s/%s is unschedulable", pod.Namespace, pod.Name)
	}

	expansionOptions := make([]ExpansionOption, 0)
	nodeInfos, err := GetNodeInfosForGroups(nodes, cloudProvider, kubeClient)
	if err != nil {
		return false, fmt.Errorf("failed to build node infos for node groups: %v", err)
	}

	podsRemainUnschedulable := make(map[*kube_api.Pod]struct{})
	for _, nodeGroup := range cloudProvider.NodeGroups() {

		currentSize, err := nodeGroup.TargetSize()
		if err != nil {
			glog.Errorf("Failed to get node group size: %v", err)
			continue
		}
		if currentSize >= nodeGroup.MaxSize() {
			// skip this node group.
			glog.V(4).Infof("Skipping node group %s - max size reached", nodeGroup.Id())
			continue
		}

		option := ExpansionOption{
			nodeGroup: nodeGroup,
			pods:      make([]*kube_api.Pod, 0),
		}

		nodeInfo, found := nodeInfos[nodeGroup.Id()]
		if !found {
			glog.Errorf("No node info for: %s", nodeGroup.Id())
			continue
		}

		for _, pod := range unschedulablePods {
			err = predicateChecker.CheckPredicates(pod, nodeInfo)
			if err == nil {
				option.pods = append(option.pods, pod)
			} else {
				glog.V(2).Infof("Scale-up predicate failed: %v", err)
				podsRemainUnschedulable[pod] = struct{}{}
			}
		}
		if len(option.pods) > 0 {
			if estimatorName == BinpackingEstimatorName {
				binpackingEstimator := estimator.NewBinpackingNodeEstimator(predicateChecker)
				option.nodeCount = binpackingEstimator.Estimate(option.pods, nodeInfo)
			} else if estimatorName == BasicEstimatorName {
				basicEstimator := estimator.NewBasicNodeEstimator()
				for _, pod := range option.pods {
					basicEstimator.Add(pod)
				}
				option.nodeCount, option.debug = basicEstimator.Estimate(nodeInfo.Node())
			} else {
				glog.Fatalf("Unrecognized estimator: %s", estimatorName)
			}
			expansionOptions = append(expansionOptions, option)
		}
	}

	// Pick some expansion option.
	bestOption := BestExpansionOption(expansionOptions)
	if bestOption != nil && bestOption.nodeCount > 0 {
		glog.V(1).Infof("Best option to resize: %s", bestOption.nodeGroup.Id())
		if len(bestOption.debug) > 0 {
			glog.V(1).Info(bestOption.debug)
		}
		glog.V(1).Infof("Estimated %d nodes needed in %s", bestOption.nodeCount, bestOption.nodeGroup.Id())

		currentSize, err := bestOption.nodeGroup.TargetSize()
		if err != nil {
			return false, fmt.Errorf("failed to get node group size: %v", err)
		}
		newSize := currentSize + bestOption.nodeCount
		if newSize >= bestOption.nodeGroup.MaxSize() {
			glog.V(1).Infof("Capping size to MAX (%d)", bestOption.nodeGroup.MaxSize())
			newSize = bestOption.nodeGroup.MaxSize()
		}

		if maxNodesTotal > 0 && len(nodes)+(newSize-currentSize) > maxNodesTotal {
			glog.V(1).Infof("Capping size to max cluster total size (%d)", maxNodesTotal)
			newSize = maxNodesTotal - len(nodes) + currentSize
			if newSize < currentSize {
				return false, fmt.Errorf("max node total count already reached")
			}
		}

		glog.V(0).Infof("Scale-up: setting group %s size to %d", bestOption.nodeGroup.Id(), newSize)

		if err := bestOption.nodeGroup.IncreaseSize(newSize - currentSize); err != nil {
			return false, fmt.Errorf("failed to increase node group size: %v", err)
		}

		for _, pod := range bestOption.pods {
			recorder.Eventf(pod, kube_api.EventTypeNormal, "TriggeredScaleUp",
				"pod triggered scale-up, group: %s, sizes (current/new): %d/%d", bestOption.nodeGroup.Id(), currentSize, newSize)
		}

		return true, nil
	}
	for pod := range podsRemainUnschedulable {
		recorder.Event(pod, kube_api.EventTypeNormal, "NotTriggerScaleUp",
			"pod didn't trigger scale-up (it wouldn't fit if a new node is added)")
	}

	return false, nil
}
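The size-capping arithmetic above can be read as a small pure function; a sketch under the same rules (the name capGroupSize is illustrative, not part of the autoscaler):

// capGroupSize applies the two caps used in ScaleUp: first the node group's
// own maximum, then the cluster-wide node limit. It returns an error when the
// cluster limit has already been reached.
func capGroupSize(currentSize, estimatedDelta, groupMax, maxNodesTotal, clusterSize int) (int, error) {
	newSize := currentSize + estimatedDelta
	if newSize >= groupMax {
		newSize = groupMax
	}
	if maxNodesTotal > 0 && clusterSize+(newSize-currentSize) > maxNodesTotal {
		newSize = maxNodesTotal - clusterSize + currentSize
		if newSize < currentSize {
			return 0, fmt.Errorf("max node total count already reached")
		}
	}
	return newSize, nil
}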
// ScaleDown tries to scale down the cluster. It returns a ScaleDownResult indicating whether any node was
// removed, and an error if one occurred.
func ScaleDown(
	nodes []*kube_api.Node,
	unneededNodes map[string]time.Time,
	unneededTime time.Duration,
	pods []*kube_api.Pod,
	cloudProvider cloudprovider.CloudProvider,
	client *kube_client.Client,
	predicateChecker *simulator.PredicateChecker,
	oldHints map[string]string,
	usageTracker *simulator.UsageTracker,
	recorder kube_record.EventRecorder) (ScaleDownResult, error) {

	now := time.Now()
	candidates := make([]*kube_api.Node, 0)
	for _, node := range nodes {
		if val, found := unneededNodes[node.Name]; found {

			glog.V(2).Infof("%s was unneeded for %s", node.Name, now.Sub(val).String())

			// Check how long the node was underutilized.
			if !val.Add(unneededTime).Before(now) {
				continue
			}

			nodeGroup, err := cloudProvider.NodeGroupForNode(node)
			if err != nil {
				glog.Errorf("Error while checking node group for %s: %v", node.Name, err)
				continue
			}
			if nodeGroup == nil || reflect.ValueOf(nodeGroup).IsNil() {
				glog.V(4).Infof("Skipping %s - no node group config", node.Name)
				continue
			}

			size, err := nodeGroup.TargetSize()
			if err != nil {
				glog.Errorf("Error while checking node group size %s: %v", nodeGroup.Id(), err)
				continue
			}

			if size <= nodeGroup.MinSize() {
				glog.V(1).Infof("Skipping %s - node group min size reached", node.Name)
				continue
			}

			candidates = append(candidates, node)
		}
	}
	if len(candidates) == 0 {
		glog.Infof("No candidates for scale down")
		return ScaleDownNoUnneeded, nil
	}

	// We look for only 1 node so new hints may be incomplete.
	nodesToRemove, _, err := simulator.FindNodesToRemove(candidates, nodes, pods, client, predicateChecker, 1, false,
		oldHints, usageTracker, time.Now())

	if err != nil {
		return ScaleDownError, fmt.Errorf("Find node to remove failed: %v", err)
	}
	if len(nodesToRemove) == 0 {
		glog.V(1).Infof("No node to remove")
		return ScaleDownNoNodeDeleted, nil
	}
	nodeToRemove := nodesToRemove[0]
	glog.Infof("Removing %s", nodeToRemove.Name)

	nodeGroup, err := cloudProvider.NodeGroupForNode(nodeToRemove)
	if err != nil {
		return ScaleDownError, fmt.Errorf("failed to get node group for %s: %v", nodeToRemove.Name, err)
	}
	if nodeGroup == nil || reflect.ValueOf(nodeGroup).IsNil() {
		return ScaleDownError, fmt.Errorf("picked node that doesn't belong to a node group: %s", nodeToRemove.Name)
	}

	err = nodeGroup.DeleteNodes([]*kube_api.Node{nodeToRemove})
	simulator.RemoveNodeFromTracker(usageTracker, nodeToRemove.Name, unneededNodes)
	if err != nil {
		return ScaleDownError, fmt.Errorf("Failed to delete %s: %v", nodeToRemove.Name, err)
	}

	recorder.Eventf(nodeToRemove, kube_api.EventTypeNormal, "ScaleDown",
		"node removed by cluster autoscaler")

	return ScaleDownNodeDeleted, nil
}
// ScaleUp tries to scale the cluster up. Returns true if it found a way to increase the size,
// false if it didn't, and an error if one occurred.
func ScaleUp(unschedulablePods []*kube_api.Pod, nodes []*kube_api.Node, migConfigs []*config.MigConfig,
	gceManager *gce.GceManager, kubeClient *kube_client.Client,
	predicateChecker *simulator.PredicateChecker, recorder kube_record.EventRecorder) (bool, error) {

	// From now on we only care about unschedulable pods that were marked after the newest
	// node became available for the scheduler.
	if len(unschedulablePods) == 0 {
		glog.V(1).Info("No unschedulable pods")
		return false, nil
	}

	for _, pod := range unschedulablePods {
		glog.V(1).Infof("Pod %s/%s is unschedulable", pod.Namespace, pod.Name)
	}

	expansionOptions := make([]ExpansionOption, 0)
	nodeInfos, err := GetNodeInfosForMigs(nodes, gceManager, kubeClient)
	if err != nil {
		return false, fmt.Errorf("failed to build node infos for MIGs: %v", err)
	}

	podsRemainUnschedulable := make(map[*kube_api.Pod]struct{})
	for _, migConfig := range migConfigs {

		currentSize, err := gceManager.GetMigSize(migConfig)
		if err != nil {
			glog.Errorf("Failed to get MIG size: %v", err)
			continue
		}
		if currentSize >= int64(migConfig.MaxSize) {
			// skip this mig.
			glog.V(4).Infof("Skipping MIG %s - max size reached", migConfig.Url())
			continue
		}

		option := ExpansionOption{
			migConfig: migConfig,
			estimator: estimator.NewBasicNodeEstimator(),
		}
		migHelpsSomePods := false

		nodeInfo, found := nodeInfos[migConfig.Url()]
		if !found {
			glog.Errorf("No node info for: %s", migConfig.Url())
			continue
		}

		for _, pod := range unschedulablePods {
			err = predicateChecker.CheckPredicates(pod, nodeInfo)
			if err == nil {
				migHelpsSomePods = true
				option.estimator.Add(pod)
			} else {
				glog.V(2).Infof("Scale-up predicate failed: %v", err)
				podsRemainUnschedulable[pod] = struct{}{}
			}
		}
		if migHelpsSomePods {
			expansionOptions = append(expansionOptions, option)
		}
	}

	// Pick some expansion option.
	bestOption := BestExpansionOption(expansionOptions)
	if bestOption != nil && bestOption.estimator.GetCount() > 0 {
		glog.V(1).Infof("Best option to resize: %s", bestOption.migConfig.Url())

		nodeInfo, found := nodeInfos[bestOption.migConfig.Url()]
		if !found {
			return false, fmt.Errorf("no sample node for: %s", bestOption.migConfig.Url())
		}
		node := nodeInfo.Node()

		estimate, report := bestOption.estimator.Estimate(node)
		glog.V(1).Info(bestOption.estimator.GetDebug())
		glog.V(1).Info(report)
		glog.V(1).Infof("Estimated %d nodes needed in %s", estimate, bestOption.migConfig.Url())

		currentSize, err := gceManager.GetMigSize(bestOption.migConfig)
		if err != nil {
			return false, fmt.Errorf("failed to get MIG size: %v", err)
		}
		newSize := currentSize + int64(estimate)
		if newSize >= int64(bestOption.migConfig.MaxSize) {
			glog.V(1).Infof("Capping size to MAX (%d)", bestOption.migConfig.MaxSize)
			newSize = int64(bestOption.migConfig.MaxSize)
		}
		glog.V(1).Infof("Setting %s size to %d", bestOption.migConfig.Url(), newSize)

		if err := gceManager.SetMigSize(bestOption.migConfig, newSize); err != nil {
			return false, fmt.Errorf("failed to set MIG size: %v", err)
		}

		for pod := range bestOption.estimator.FittingPods {
			recorder.Eventf(pod, kube_api.EventTypeNormal, "TriggeredScaleUp",
				"pod triggered scale-up, mig: %s, sizes (current/new): %d/%d", bestOption.migConfig.Name, currentSize, newSize)
		}

		return true, nil
	}
	for pod := range podsRemainUnschedulable {
		recorder.Event(pod, kube_api.EventTypeNormal, "NotTriggerScaleUp",
			"pod didn't trigger scale-up (it wouldn't fit if a new node is added)")
	}

	return false, nil
}