// CheckGroupsAndNodes checks if all node groups have all required nodes.
func CheckGroupsAndNodes(nodes []*kube_api.Node, cloudProvider cloudprovider.CloudProvider) error {
	groupCount := make(map[string]int)
	for _, node := range nodes {
		group, err := cloudProvider.NodeGroupForNode(node)
		if err != nil {
			return err
		}
		if group == nil || reflect.ValueOf(group).IsNil() {
			continue
		}
		id := group.Id()
		groupCount[id]++
	}
	for _, nodeGroup := range cloudProvider.NodeGroups() {
		size, err := nodeGroup.TargetSize()
		if err != nil {
			return err
		}
		count := groupCount[nodeGroup.Id()]
		if size != count {
			return fmt.Errorf("wrong number of nodes for node group: %s expected: %d actual: %d", nodeGroup.Id(), size, count)
		}
	}
	return nil
}
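// Illustrative sketch (not part of the autoscaler): counting nodes per group id relies on the fact
// that reading a missing map key yields the zero value, so groupCount[id]++ works without an
// explicit existence check. The group ids below are made up for the example.
package main

import "fmt"

func main() {
	groupCount := make(map[string]int)
	for _, id := range []string{"ng-a", "ng-b", "ng-a"} {
		groupCount[id]++ // missing keys start at 0
	}
	fmt.Println(groupCount["ng-a"], groupCount["ng-b"], groupCount["ng-c"]) // 2 1 0
}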
// deleteNodeFromCloudProvider removes the given node from the cloud provider. No extra
// pre-deletion actions are executed on the Kubernetes side.
func deleteNodeFromCloudProvider(node *apiv1.Node, cloudProvider cloudprovider.CloudProvider, recorder kube_record.EventRecorder) error {
	nodeGroup, err := cloudProvider.NodeGroupForNode(node)
	if err != nil {
		return fmt.Errorf("failed to find node group for %s: %v", node.Name, err)
	}
	if nodeGroup == nil || reflect.ValueOf(nodeGroup).IsNil() {
		return fmt.Errorf("picked node that doesn't belong to a node group: %s", node.Name)
	}
	if err = nodeGroup.DeleteNodes([]*apiv1.Node{node}); err != nil {
		return fmt.Errorf("failed to delete %s: %v", node.Name, err)
	}
	recorder.Eventf(node, apiv1.EventTypeNormal, "ScaleDown", "node removed by cluster autoscaler")
	return nil
}
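// Illustrative sketch (not part of the autoscaler): why the nil check above is written as
// `nodeGroup == nil || reflect.ValueOf(nodeGroup).IsNil()`. An interface holding a typed nil
// pointer compares unequal to nil, so the reflect-based check is needed to catch that case.
// The nodeGrouper interface and fakeNodeGroup type below are hypothetical, for demonstration only.
package main

import (
	"fmt"
	"reflect"
)

type nodeGrouper interface{ Id() string }

type fakeNodeGroup struct{ id string }

func (f *fakeNodeGroup) Id() string { return f.id }

func isNilGroup(g nodeGrouper) bool {
	// The g == nil check short-circuits before reflect.ValueOf, which would panic on a nil interface.
	return g == nil || reflect.ValueOf(g).IsNil()
}

func main() {
	var typedNil *fakeNodeGroup
	var g nodeGrouper = typedNil

	fmt.Println(g == nil)      // false - the interface holds a typed nil pointer
	fmt.Println(isNilGroup(g)) // true - reflect sees the nil pointer inside the interface
}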
// GetNodeInfosForGroups finds NodeInfos for all node groups used to manage the given nodes.
// It also returns a node group to sample node mapping.
// TODO(mwielgus): This returns a map keyed by url, while most code (including the scheduler) uses node.Name as a key.
func GetNodeInfosForGroups(nodes []*kube_api.Node, cloudProvider cloudprovider.CloudProvider, kubeClient *kube_client.Client) (map[string]*schedulercache.NodeInfo, error) {
	result := make(map[string]*schedulercache.NodeInfo)
	for _, node := range nodes {
		nodeGroup, err := cloudProvider.NodeGroupForNode(node)
		if err != nil {
			return map[string]*schedulercache.NodeInfo{}, err
		}
		if nodeGroup == nil || reflect.ValueOf(nodeGroup).IsNil() {
			continue
		}
		id := nodeGroup.Id()
		if _, found := result[id]; !found {
			nodeInfo, err := simulator.BuildNodeInfoForNode(node, kubeClient)
			if err != nil {
				return map[string]*schedulercache.NodeInfo{}, err
			}
			result[id] = nodeInfo
		}
	}
	return result, nil
}
// getEmptyNodes finds empty nodes among the passed candidates and returns a list of empty nodes
// that can be deleted at the same time.
func getEmptyNodes(candidates []*apiv1.Node, pods []*apiv1.Pod, maxEmptyBulkDelete int, cloudProvider cloudprovider.CloudProvider) []*apiv1.Node {
	emptyNodes := simulator.FindEmptyNodesToRemove(candidates, pods)
	availabilityMap := make(map[string]int)
	result := make([]*apiv1.Node, 0)
	for _, node := range emptyNodes {
		nodeGroup, err := cloudProvider.NodeGroupForNode(node)
		if err != nil {
			glog.Errorf("Failed to get group for %s", node.Name)
			continue
		}
		if nodeGroup == nil || reflect.ValueOf(nodeGroup).IsNil() {
			continue
		}
		var available int
		var found bool
		if available, found = availabilityMap[nodeGroup.Id()]; !found {
			size, err := nodeGroup.TargetSize()
			if err != nil {
				glog.Errorf("Failed to get size for %s: %v", nodeGroup.Id(), err)
				continue
			}
			available = size - nodeGroup.MinSize()
			if available < 0 {
				available = 0
			}
			availabilityMap[nodeGroup.Id()] = available
		}
		if available > 0 {
			available--
			availabilityMap[nodeGroup.Id()] = available
			result = append(result, node)
		}
	}
	limit := maxEmptyBulkDelete
	if len(result) < limit {
		limit = len(result)
	}
	return result[:limit]
}
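// Illustrative sketch (not part of the autoscaler): the per-group budget used by getEmptyNodes.
// Each group may only give up targetSize - minSize nodes in one bulk deletion, and the overall
// result is truncated to maxEmptyBulkDelete. The group sizes and node names below are made up.
package main

import "fmt"

func main() {
	type group struct{ target, min int }
	groups := map[string]group{"ng-a": {target: 5, min: 3}, "ng-b": {target: 2, min: 2}}

	// Empty nodes found by the simulator, keyed to their group (hypothetical values).
	emptyNodes := []struct{ name, group string }{
		{"a-1", "ng-a"}, {"a-2", "ng-a"}, {"a-3", "ng-a"}, {"b-1", "ng-b"},
	}

	maxEmptyBulkDelete := 10
	availability := make(map[string]int)
	result := make([]string, 0)

	for _, n := range emptyNodes {
		avail, found := availability[n.group]
		if !found {
			avail = groups[n.group].target - groups[n.group].min
			if avail < 0 {
				avail = 0
			}
		}
		if avail > 0 {
			avail--
			result = append(result, n.name)
		}
		availability[n.group] = avail
	}
	if len(result) > maxEmptyBulkDelete {
		result = result[:maxEmptyBulkDelete]
	}
	fmt.Println(result) // [a-1 a-2] - ng-a can lose 2 nodes, ng-b is already at its minimum
}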
// ScaleUp tries to scale the cluster up. Returns true if it found a way to increase the size,
// false if it didn't, and an error if an error occurred. Assumes that all nodes in the cluster are
// ready and in sync with instance groups.
func ScaleUp(unschedulablePods []*kube_api.Pod, nodes []*kube_api.Node, cloudProvider cloudprovider.CloudProvider, kubeClient *kube_client.Client,
	predicateChecker *simulator.PredicateChecker, recorder kube_record.EventRecorder, maxNodesTotal int, estimatorName string) (bool, error) {

	// From now on we only care about unschedulable pods that were marked after the newest
	// node became available for the scheduler.
	if len(unschedulablePods) == 0 {
		glog.V(1).Info("No unschedulable pods")
		return false, nil
	}

	for _, pod := range unschedulablePods {
		glog.V(1).Infof("Pod %s/%s is unschedulable", pod.Namespace, pod.Name)
	}

	expansionOptions := make([]ExpansionOption, 0)
	nodeInfos, err := GetNodeInfosForGroups(nodes, cloudProvider, kubeClient)
	if err != nil {
		return false, fmt.Errorf("failed to build node infos for node groups: %v", err)
	}

	podsRemainUnschedulable := make(map[*kube_api.Pod]struct{})
	for _, nodeGroup := range cloudProvider.NodeGroups() {
		currentSize, err := nodeGroup.TargetSize()
		if err != nil {
			glog.Errorf("Failed to get node group size: %v", err)
			continue
		}
		if currentSize >= nodeGroup.MaxSize() {
			// Skip this node group.
			glog.V(4).Infof("Skipping node group %s - max size reached", nodeGroup.Id())
			continue
		}

		option := ExpansionOption{
			nodeGroup: nodeGroup,
			pods:      make([]*kube_api.Pod, 0),
		}

		nodeInfo, found := nodeInfos[nodeGroup.Id()]
		if !found {
			glog.Errorf("No node info for: %s", nodeGroup.Id())
			continue
		}

		for _, pod := range unschedulablePods {
			err = predicateChecker.CheckPredicates(pod, nodeInfo)
			if err == nil {
				option.pods = append(option.pods, pod)
			} else {
				glog.V(2).Infof("Scale-up predicate failed: %v", err)
				podsRemainUnschedulable[pod] = struct{}{}
			}
		}
		if len(option.pods) > 0 {
			if estimatorName == BinpackingEstimatorName {
				binpackingEstimator := estimator.NewBinpackingNodeEstimator(predicateChecker)
				option.nodeCount = binpackingEstimator.Estimate(option.pods, nodeInfo)
			} else if estimatorName == BasicEstimatorName {
				basicEstimator := estimator.NewBasicNodeEstimator()
				for _, pod := range option.pods {
					basicEstimator.Add(pod)
				}
				option.nodeCount, option.debug = basicEstimator.Estimate(nodeInfo.Node())
			} else {
				glog.Fatalf("Unrecognized estimator: %s", estimatorName)
			}
			expansionOptions = append(expansionOptions, option)
		}
	}

	// Pick some expansion option.
	bestOption := BestExpansionOption(expansionOptions)
	if bestOption != nil && bestOption.nodeCount > 0 {
		glog.V(1).Infof("Best option to resize: %s", bestOption.nodeGroup.Id())
		if len(bestOption.debug) > 0 {
			glog.V(1).Info(bestOption.debug)
		}
		glog.V(1).Infof("Estimated %d nodes needed in %s", bestOption.nodeCount, bestOption.nodeGroup.Id())

		currentSize, err := bestOption.nodeGroup.TargetSize()
		if err != nil {
			return false, fmt.Errorf("failed to get node group size: %v", err)
		}
		newSize := currentSize + bestOption.nodeCount
		if newSize >= bestOption.nodeGroup.MaxSize() {
			glog.V(1).Infof("Capping size to MAX (%d)", bestOption.nodeGroup.MaxSize())
			newSize = bestOption.nodeGroup.MaxSize()
		}

		if maxNodesTotal > 0 && len(nodes)+(newSize-currentSize) > maxNodesTotal {
			glog.V(1).Infof("Capping size to max cluster total size (%d)", maxNodesTotal)
			newSize = maxNodesTotal - len(nodes) + currentSize
			if newSize < currentSize {
				return false, fmt.Errorf("max node total count already reached")
			}
		}

		glog.V(0).Infof("Scale-up: setting group %s size to %d", bestOption.nodeGroup.Id(), newSize)
		if err := bestOption.nodeGroup.IncreaseSize(newSize - currentSize); err != nil {
			return false, fmt.Errorf("failed to increase node group size: %v", err)
		}

		for _, pod := range bestOption.pods {
			recorder.Eventf(pod, kube_api.EventTypeNormal, "TriggeredScaleUp",
				"pod triggered scale-up, group: %s, sizes (current/new): %d/%d", bestOption.nodeGroup.Id(), currentSize, newSize)
		}

		return true, nil
	}

	for pod := range podsRemainUnschedulable {
		recorder.Event(pod, kube_api.EventTypeNormal, "NotTriggerScaleUp",
			"pod didn't trigger scale-up (it wouldn't fit if a new node is added)")
	}

	return false, nil
}
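// Illustrative sketch (not part of the autoscaler): the two caps applied to the new size in ScaleUp.
// The estimate is first capped by the group's MaxSize, then by the cluster-wide maxNodesTotal budget.
// The helper cappedNewSize and all numbers below are hypothetical.
package main

import "fmt"

func cappedNewSize(currentSize, estimate, groupMaxSize, clusterNodes, maxNodesTotal int) int {
	newSize := currentSize + estimate
	if newSize >= groupMaxSize {
		newSize = groupMaxSize // never exceed the node group maximum
	}
	if maxNodesTotal > 0 && clusterNodes+(newSize-currentSize) > maxNodesTotal {
		newSize = maxNodesTotal - clusterNodes + currentSize // respect the cluster-wide budget
	}
	return newSize
}

func main() {
	// Group at 3 nodes, estimator asks for 5 more, group max is 10,
	// cluster has 8 nodes and may grow to 11 in total: only 3 extra nodes fit.
	fmt.Println(cappedNewSize(3, 5, 10, 8, 11)) // 6
}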
// ScaleDown tries to scale down the cluster. It returns a ScaleDownResult indicating whether any
// node was removed, and an error if one occurred.
func ScaleDown(
	nodes []*kube_api.Node,
	unneededNodes map[string]time.Time,
	unneededTime time.Duration,
	pods []*kube_api.Pod,
	cloudProvider cloudprovider.CloudProvider,
	client *kube_client.Client,
	predicateChecker *simulator.PredicateChecker,
	oldHints map[string]string,
	usageTracker *simulator.UsageTracker,
	recorder kube_record.EventRecorder) (ScaleDownResult, error) {

	now := time.Now()
	candidates := make([]*kube_api.Node, 0)
	for _, node := range nodes {
		if val, found := unneededNodes[node.Name]; found {
			glog.V(2).Infof("%s was unneeded for %s", node.Name, now.Sub(val).String())

			// Check how long the node was underutilized.
			if !val.Add(unneededTime).Before(now) {
				continue
			}

			nodeGroup, err := cloudProvider.NodeGroupForNode(node)
			if err != nil {
				glog.Errorf("Error while checking node group for %s: %v", node.Name, err)
				continue
			}
			if nodeGroup == nil || reflect.ValueOf(nodeGroup).IsNil() {
				glog.V(4).Infof("Skipping %s - no node group config", node.Name)
				continue
			}

			size, err := nodeGroup.TargetSize()
			if err != nil {
				glog.Errorf("Error while checking node group size %s: %v", nodeGroup.Id(), err)
				continue
			}

			if size <= nodeGroup.MinSize() {
				glog.V(1).Infof("Skipping %s - node group min size reached", node.Name)
				continue
			}

			candidates = append(candidates, node)
		}
	}
	if len(candidates) == 0 {
		glog.Infof("No candidates for scale down")
		return ScaleDownNoUnneeded, nil
	}

	// We look for only 1 node so new hints may be incomplete.
	nodesToRemove, _, err := simulator.FindNodesToRemove(candidates, nodes, pods, client, predicateChecker, 1, false,
		oldHints, usageTracker, time.Now())
	if err != nil {
		return ScaleDownError, fmt.Errorf("find node to remove failed: %v", err)
	}
	if len(nodesToRemove) == 0 {
		glog.V(1).Infof("No node to remove")
		return ScaleDownNoNodeDeleted, nil
	}
	nodeToRemove := nodesToRemove[0]
	glog.Infof("Removing %s", nodeToRemove.Name)

	nodeGroup, err := cloudProvider.NodeGroupForNode(nodeToRemove)
	if err != nil {
		return ScaleDownError, fmt.Errorf("failed to find node group for %s: %v", nodeToRemove.Name, err)
	}
	if nodeGroup == nil || reflect.ValueOf(nodeGroup).IsNil() {
		return ScaleDownError, fmt.Errorf("picked node that doesn't belong to a node group: %s", nodeToRemove.Name)
	}

	err = nodeGroup.DeleteNodes([]*kube_api.Node{nodeToRemove})
	simulator.RemoveNodeFromTracker(usageTracker, nodeToRemove.Name, unneededNodes)
	if err != nil {
		return ScaleDownError, fmt.Errorf("failed to delete %s: %v", nodeToRemove.Name, err)
	}

	recorder.Eventf(nodeToRemove, kube_api.EventTypeNormal, "ScaleDown", "node removed by cluster autoscaler")
	return ScaleDownNodeDeleted, nil
}
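// Illustrative sketch (not part of the autoscaler): the candidate filter above only accepts a node
// once it has been unneeded for at least unneededTime, i.e. when firstMarked.Add(unneededTime) is
// already before now. The helper unneededLongEnough and the timestamps below are made up.
package main

import (
	"fmt"
	"time"
)

func unneededLongEnough(firstMarked, now time.Time, unneededTime time.Duration) bool {
	return firstMarked.Add(unneededTime).Before(now)
}

func main() {
	now := time.Now()
	unneededTime := 10 * time.Minute

	fmt.Println(unneededLongEnough(now.Add(-15*time.Minute), now, unneededTime)) // true - becomes a candidate
	fmt.Println(unneededLongEnough(now.Add(-5*time.Minute), now, unneededTime))  // false - skipped for now
}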
// ScaleDown tries to scale down the cluster. It returns a ScaleDownResult indicating whether any
// node was removed, and an error if one occurred.
func ScaleDown(
	nodes []*kube_api.Node,
	unneededNodes map[string]time.Time,
	unneededTime time.Duration,
	pods []*kube_api.Pod,
	cloudProvider cloudprovider.CloudProvider,
	client *kube_client.Client,
	predicateChecker *simulator.PredicateChecker) (ScaleDownResult, error) {

	now := time.Now()
	candidates := make([]*kube_api.Node, 0)
	for _, node := range nodes {
		if val, found := unneededNodes[node.Name]; found {
			glog.V(2).Infof("%s was unneeded for %s", node.Name, now.Sub(val).String())

			// Check how long the node was underutilized.
			if !val.Add(unneededTime).Before(now) {
				continue
			}

			nodeGroup, err := cloudProvider.NodeGroupForNode(node)
			if err != nil {
				glog.Errorf("Error while checking node group for %s: %v", node.Name, err)
				continue
			}
			if nodeGroup == nil {
				glog.V(4).Infof("Skipping %s - no node group config", node.Name)
				continue
			}

			size, err := nodeGroup.TargetSize()
			if err != nil {
				glog.Errorf("Error while checking node group size %s: %v", nodeGroup.Id(), err)
				continue
			}

			if size <= nodeGroup.MinSize() {
				glog.V(1).Infof("Skipping %s - node group min size reached", node.Name)
				continue
			}

			candidates = append(candidates, node)
		}
	}
	if len(candidates) == 0 {
		glog.Infof("No candidates for scale down")
		return ScaleDownNoUnneeded, nil
	}

	nodesToRemove, err := simulator.FindNodesToRemove(candidates, nodes, pods, client, predicateChecker, 1, false)
	if err != nil {
		return ScaleDownError, fmt.Errorf("find node to remove failed: %v", err)
	}
	if len(nodesToRemove) == 0 {
		glog.V(1).Infof("No node to remove")
		return ScaleDownNoNodeDeleted, nil
	}
	nodeToRemove := nodesToRemove[0]
	glog.Infof("Removing %s", nodeToRemove.Name)

	nodeGroup, err := cloudProvider.NodeGroupForNode(nodeToRemove)
	if err != nil {
		return ScaleDownError, fmt.Errorf("failed to find node group for %s: %v", nodeToRemove.Name, err)
	}
	if nodeGroup == nil {
		return ScaleDownError, fmt.Errorf("picked node that doesn't belong to a node group: %s", nodeToRemove.Name)
	}

	err = nodeGroup.DeleteNodes([]*kube_api.Node{nodeToRemove})
	if err != nil {
		return ScaleDownError, fmt.Errorf("failed to delete %s: %v", nodeToRemove.Name, err)
	}

	return ScaleDownNodeDeleted, nil
}
// ScaleUp tries to scale the cluster up. Returns true if it found a way to increase the size,
// false if it didn't, and an error if an error occurred.
func ScaleUp(unschedulablePods []*kube_api.Pod, nodes []*kube_api.Node, cloudProvider cloudprovider.CloudProvider, kubeClient *kube_client.Client,
	predicateChecker *simulator.PredicateChecker, recorder kube_record.EventRecorder) (bool, error) {

	// From now on we only care about unschedulable pods that were marked after the newest
	// node became available for the scheduler.
	if len(unschedulablePods) == 0 {
		glog.V(1).Info("No unschedulable pods")
		return false, nil
	}

	for _, pod := range unschedulablePods {
		glog.V(1).Infof("Pod %s/%s is unschedulable", pod.Namespace, pod.Name)
	}

	expansionOptions := make([]ExpansionOption, 0)
	nodeInfos, err := GetNodeInfosForGroups(nodes, cloudProvider, kubeClient)
	if err != nil {
		return false, fmt.Errorf("failed to build node infos for node groups: %v", err)
	}

	podsRemainUnschedulable := make(map[*kube_api.Pod]struct{})
	for _, nodeGroup := range cloudProvider.NodeGroups() {
		currentSize, err := nodeGroup.TargetSize()
		if err != nil {
			glog.Errorf("Failed to get node group size: %v", err)
			continue
		}
		if currentSize >= nodeGroup.MaxSize() {
			// Skip this node group.
			glog.V(4).Infof("Skipping node group %s - max size reached", nodeGroup.Id())
			continue
		}

		option := ExpansionOption{
			nodeGroup: nodeGroup,
			estimator: estimator.NewBasicNodeEstimator(),
		}
		groupHelpsSomePods := false

		nodeInfo, found := nodeInfos[nodeGroup.Id()]
		if !found {
			glog.Errorf("No node info for: %s", nodeGroup.Id())
			continue
		}

		for _, pod := range unschedulablePods {
			err = predicateChecker.CheckPredicates(pod, nodeInfo)
			if err == nil {
				groupHelpsSomePods = true
				option.estimator.Add(pod)
			} else {
				glog.V(2).Infof("Scale-up predicate failed: %v", err)
				podsRemainUnschedulable[pod] = struct{}{}
			}
		}
		if groupHelpsSomePods {
			expansionOptions = append(expansionOptions, option)
		}
	}

	// Pick some expansion option.
	bestOption := BestExpansionOption(expansionOptions)
	if bestOption != nil && bestOption.estimator.GetCount() > 0 {
		glog.V(1).Infof("Best option to resize: %s", bestOption.nodeGroup.Id())

		nodeInfo, found := nodeInfos[bestOption.nodeGroup.Id()]
		if !found {
			return false, fmt.Errorf("no sample node for: %s", bestOption.nodeGroup.Id())
		}
		node := nodeInfo.Node()

		estimate, report := bestOption.estimator.Estimate(node)
		glog.V(1).Info(bestOption.estimator.GetDebug())
		glog.V(1).Info(report)
		glog.V(1).Infof("Estimated %d nodes needed in %s", estimate, bestOption.nodeGroup.Id())

		currentSize, err := bestOption.nodeGroup.TargetSize()
		if err != nil {
			return false, fmt.Errorf("failed to get node group size: %v", err)
		}
		newSize := currentSize + estimate
		if newSize >= bestOption.nodeGroup.MaxSize() {
			glog.V(1).Infof("Capping size to MAX (%d)", bestOption.nodeGroup.MaxSize())
			newSize = bestOption.nodeGroup.MaxSize()
		}
		glog.V(1).Infof("Setting %s size to %d", bestOption.nodeGroup.Id(), newSize)

		if err := bestOption.nodeGroup.IncreaseSize(newSize - currentSize); err != nil {
			return false, fmt.Errorf("failed to increase node group size: %v", err)
		}

		for pod := range bestOption.estimator.FittingPods {
			recorder.Eventf(pod, kube_api.EventTypeNormal, "TriggeredScaleUp",
				"pod triggered scale-up, group: %s, sizes (current/new): %d/%d", bestOption.nodeGroup.Id(), currentSize, newSize)
		}

		return true, nil
	}

	for pod := range podsRemainUnschedulable {
		recorder.Event(pod, kube_api.EventTypeNormal, "NotTriggerScaleUp",
			"pod didn't trigger scale-up (it wouldn't fit if a new node is added)")
	}

	return false, nil
}
// ScaleDown tries to scale down the cluster. It returns a ScaleDownResult indicating whether any
// node was removed, and an error if one occurred.
func ScaleDown(
	nodes []*kube_api.Node,
	lastUtilizationMap map[string]float64,
	unneededNodes map[string]time.Time,
	unneededTime time.Duration,
	pods []*kube_api.Pod,
	cloudProvider cloudprovider.CloudProvider,
	client *kube_client.Client,
	predicateChecker *simulator.PredicateChecker,
	oldHints map[string]string,
	usageTracker *simulator.UsageTracker,
	recorder kube_record.EventRecorder,
	maxEmptyBulkDelete int) (ScaleDownResult, error) {

	now := time.Now()
	candidates := make([]*kube_api.Node, 0)
	for _, node := range nodes {
		if val, found := unneededNodes[node.Name]; found {
			glog.V(2).Infof("%s was unneeded for %s", node.Name, now.Sub(val).String())

			// Check how long the node was underutilized.
			if !val.Add(unneededTime).Before(now) {
				continue
			}

			nodeGroup, err := cloudProvider.NodeGroupForNode(node)
			if err != nil {
				glog.Errorf("Error while checking node group for %s: %v", node.Name, err)
				continue
			}
			if nodeGroup == nil || reflect.ValueOf(nodeGroup).IsNil() {
				glog.V(4).Infof("Skipping %s - no node group config", node.Name)
				continue
			}

			size, err := nodeGroup.TargetSize()
			if err != nil {
				glog.Errorf("Error while checking node group size %s: %v", nodeGroup.Id(), err)
				continue
			}

			if size <= nodeGroup.MinSize() {
				glog.V(1).Infof("Skipping %s - node group min size reached", node.Name)
				continue
			}

			candidates = append(candidates, node)
		}
	}
	if len(candidates) == 0 {
		glog.Infof("No candidates for scale down")
		return ScaleDownNoUnneeded, nil
	}

	// Trying to delete empty nodes in bulk. If there are no empty nodes then CA will
	// try to delete not-so-empty nodes, possibly killing some pods and allowing them
	// to recreate on other nodes.
	emptyNodes := getEmptyNodes(candidates, pods, maxEmptyBulkDelete, cloudProvider)
	if len(emptyNodes) > 0 {
		confirmation := make(chan error, len(emptyNodes))
		for _, node := range emptyNodes {
			glog.V(0).Infof("Scale-down: removing empty node %s", node.Name)
			simulator.RemoveNodeFromTracker(usageTracker, node.Name, unneededNodes)
			go func(nodeToDelete *kube_api.Node) {
				confirmation <- deleteNodeFromCloudProvider(nodeToDelete, cloudProvider, recorder)
			}(node)
		}
		var finalError error
		for range emptyNodes {
			if err := <-confirmation; err != nil {
				glog.Errorf("Problem with empty node deletion: %v", err)
				finalError = err
			}
		}
		if finalError == nil {
			return ScaleDownNodeDeleted, nil
		}
		return ScaleDownError, fmt.Errorf("failed to delete at least one empty node: %v", finalError)
	}

	// We look for only 1 node so new hints may be incomplete.
	nodesToRemove, _, err := simulator.FindNodesToRemove(candidates, nodes, pods, client, predicateChecker, 1, false,
		oldHints, usageTracker, time.Now())
	if err != nil {
		return ScaleDownError, fmt.Errorf("find node to remove failed: %v", err)
	}
	if len(nodesToRemove) == 0 {
		glog.V(1).Infof("No node to remove")
		return ScaleDownNoNodeDeleted, nil
	}
	toRemove := nodesToRemove[0]
	utilization := lastUtilizationMap[toRemove.Node.Name]
	podNames := make([]string, 0, len(toRemove.PodsToReschedule))
	for _, pod := range toRemove.PodsToReschedule {
		podNames = append(podNames, pod.Namespace+"/"+pod.Name)
	}
	glog.V(0).Infof("Scale-down: removing node %s, utilization: %v, pods to reschedule: %s",
		toRemove.Node.Name, utilization, strings.Join(podNames, ","))

	simulator.RemoveNodeFromTracker(usageTracker, toRemove.Node.Name, unneededNodes)
	err = deleteNodeFromCloudProvider(toRemove.Node, cloudProvider, recorder)
	if err != nil {
		return ScaleDownError, fmt.Errorf("failed to delete %s: %v", toRemove.Node.Name, err)
	}

	return ScaleDownNodeDeleted, nil
}
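// Illustrative sketch (not part of the autoscaler): the fan-out/fan-in pattern used for the bulk
// empty-node deletion earlier in ScaleDown. Each deletion runs in its own goroutine and reports
// into a buffered error channel; the caller drains exactly one result per started goroutine.
// The deleteNode function below is a hypothetical stand-in for deleteNodeFromCloudProvider.
package main

import "fmt"

func deleteNode(name string) error {
	// Stand-in for the real cloud-provider call; always succeeds here.
	fmt.Println("deleting", name)
	return nil
}

func main() {
	emptyNodes := []string{"node-1", "node-2", "node-3"}

	// Buffered so that goroutines never block on send, even if the receiver stops early.
	confirmation := make(chan error, len(emptyNodes))
	for _, node := range emptyNodes {
		go func(nodeToDelete string) {
			confirmation <- deleteNode(nodeToDelete)
		}(node)
	}

	var finalError error
	for range emptyNodes {
		if err := <-confirmation; err != nil {
			finalError = err
		}
	}
	fmt.Println("final error:", finalError)
}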