// CalculateNodeLabelPriority checks whether a particular label exists on a node or not, regardless of its value. // If presence is true, prioritizes nodes that have the specified label, regardless of value. // If presence is false, prioritizes nodes that do not have the specified label. func (n *NodeLabelPrioritizer) CalculateNodeLabelPriority(pod *api.Pod, nodeNameToInfo map[string]*schedulercache.NodeInfo, nodeLister algorithm.NodeLister) (schedulerapi.HostPriorityList, error) { var score int nodes, err := nodeLister.List() if err != nil { return nil, err } labeledNodes := map[string]bool{} for _, node := range nodes.Items { exists := labels.Set(node.Labels).Has(n.label) labeledNodes[node.Name] = (exists && n.presence) || (!exists && !n.presence) } result := []schedulerapi.HostPriority{} //score int - scale of 0-10 // 0 being the lowest priority and 10 being the highest for nodeName, success := range labeledNodes { if success { score = 10 } else { score = 0 } result = append(result, schedulerapi.HostPriority{Host: nodeName, Score: score}) } return result, nil }
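As a quick illustration of the presence/absence rule above, here is a minimal, self-contained sketch of the scoring decision using a plain map instead of the scheduler's api.Node and labels.Set types (the helper name and types are illustrative assumptions, not the scheduler's API):

package main

import "fmt"

// labelPresenceScore mirrors the rule above: a node scores 10 when its label
// state matches the desired presence, and 0 otherwise.
func labelPresenceScore(nodeLabels map[string]string, label string, presence bool) int {
	_, exists := nodeLabels[label]
	if (exists && presence) || (!exists && !presence) {
		return 10
	}
	return 0
}

func main() {
	nodeLabels := map[string]string{"disktype": "ssd"}
	fmt.Println(labelPresenceScore(nodeLabels, "disktype", true))  // 10: label present, presence=true
	fmt.Println(labelPresenceScore(nodeLabels, "disktype", false)) // 0: label present, presence=false
	fmt.Println(labelPresenceScore(nodeLabels, "gpu", false))      // 10: label absent, presence=false
}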
// ImageLocalityPriority is a priority function that favors nodes that already have the pod's requested container images. // It will detect whether the requested images are present on a node, and then calculate a score ranging from 0 to 10 // based on the total size of those images. // - If none of the images are present, this node will be given the lowest priority. // - If some of the images are present on a node, the larger the sum of their sizes, the higher the node's priority. func ImageLocalityPriority(pod *api.Pod, nodeNameToInfo map[string]*schedulercache.NodeInfo, nodeLister algorithm.NodeLister) (schedulerapi.HostPriorityList, error) { sumSizeMap := make(map[string]int64) nodes, err := nodeLister.List() if err != nil { return nil, err } for _, container := range pod.Spec.Containers { for _, node := range nodes.Items { // Check if this container's image is present and get its size. imageSize := checkContainerImageOnNode(node, container) // Add this size to the total result of this node. sumSizeMap[node.Name] += imageSize } } result := []schedulerapi.HostPriority{} // score int - scale of 0-10 // 0 being the lowest priority and 10 being the highest. for nodeName, sumSize := range sumSizeMap { result = append(result, schedulerapi.HostPriority{Host: nodeName, Score: calculateScoreFromSize(sumSize)}) } return result, nil }
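calculateScoreFromSize is not shown in this snippet; the sketch below assumes a simple linear mapping between two hypothetical size thresholds purely to illustrate how a summed image size could become a 0-10 score (the thresholds and helper name are assumptions, not the real constants):

package main

import "fmt"

// Illustrative thresholds only; the real calculateScoreFromSize and its
// constants are not shown in the snippet above.
const (
	minImageSize int64 = 23 * 1024 * 1024   // below this, image locality is ignored
	maxImageSize int64 = 1000 * 1024 * 1024 // at or above this, the score is capped at 10
)

// scoreFromSize maps the summed size of already-present images to a 0-10 score.
func scoreFromSize(sumSize int64) int {
	switch {
	case sumSize < minImageSize:
		return 0 // nothing (or too little) present to matter
	case sumSize >= maxImageSize:
		return 10
	default:
		return int(10 * (sumSize - minImageSize) / (maxImageSize - minImageSize))
	}
}

func main() {
	fmt.Println(scoreFromSize(0))                  // 0: no images present
	fmt.Println(scoreFromSize(500 * 1024 * 1024))  // mid-range score
	fmt.Println(scoreFromSize(2000 * 1024 * 1024)) // 10: capped
}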
// Schedule tries to schedule the given pod to one of the nodes in the node list. // If it succeeds, it will return the name of the node. // If it fails, it will return a FitError with reasons. func (g *genericScheduler) Schedule(pod *api.Pod, nodeLister algorithm.NodeLister) (string, error) { nodes, err := nodeLister.List() if err != nil { return "", err } if len(nodes.Items) == 0 { return "", ErrNoNodesAvailable } // Used for all fit and priority funcs. nodeNameToInfo, err := g.cache.GetNodeNameToInfoMap() if err != nil { return "", err } filteredNodes, failedPredicateMap, err := findNodesThatFit(pod, nodeNameToInfo, g.predicates, nodes, g.extenders) if err != nil { return "", err } if len(filteredNodes.Items) == 0 { return "", &FitError{ Pod: pod, FailedPredicates: failedPredicateMap, } } priorityList, err := PrioritizeNodes(pod, nodeNameToInfo, g.prioritizers, algorithm.FakeNodeLister(filteredNodes), g.extenders) if err != nil { return "", err } return g.selectHost(priorityList) }
func (g *genericScheduler) Schedule(pod *api.Pod, nodeLister algorithm.NodeLister) (string, error) { nodes, err := nodeLister.List() if err != nil { return "", err } if len(nodes.Items) == 0 { return "", ErrNoNodesAvailable } filteredNodes, failedPredicateMap, err := findNodesThatFit(pod, g.pods, g.predicates, nodes) if err != nil { return "", err } priorityList, err := PrioritizeNodes(pod, g.pods, g.prioritizers, algorithm.FakeNodeLister(filteredNodes)) if err != nil { return "", err } if len(priorityList) == 0 { return "", &FitError{ Pod: pod, FailedPredicates: failedPredicateMap, } } return g.selectHost(priorityList) }
func (g *genericScheduler) Schedule(pod *api.Pod, nodeLister algorithm.NodeLister) (string, error) { nodes, err := nodeLister.List() if err != nil { return "", err } if len(nodes.Items) == 0 { return "", ErrNoNodesAvailable } // TODO: we should compute this once and dynamically update it using Watch, not constantly re-compute. // But at least we're now only doing it in one place machinesToPods, err := predicates.MapPodsToMachines(g.pods) if err != nil { return "", err } filteredNodes, failedPredicateMap, err := findNodesThatFit(pod, machinesToPods, g.predicates, nodes, g.extenders) if err != nil { return "", err } priorityList, err := PrioritizeNodes(pod, machinesToPods, g.pods, g.prioritizers, algorithm.FakeNodeLister(filteredNodes), g.extenders) if err != nil { return "", err } if len(priorityList) == 0 { return "", &FitError{ Pod: pod, FailedPredicates: failedPredicateMap, } } return g.selectHost(priorityList) }
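predicates.MapPodsToMachines is referenced above but not shown; a minimal sketch of the grouping it implies, using a simplified pod type in place of api.Pod and the pod lister (names and types here are illustrative assumptions):

package main

import "fmt"

// pod is a stand-in for api.Pod; only the field needed for grouping is kept.
type pod struct {
	Name     string
	NodeName string // api.Pod.Spec.NodeName in the real type
}

// mapPodsToMachines groups already-scheduled pods by the node they run on,
// which is the shape the fit and priority functions consume.
func mapPodsToMachines(pods []pod) map[string][]pod {
	machinesToPods := make(map[string][]pod)
	for _, p := range pods {
		if p.NodeName != "" { // unscheduled pods have no node yet
			machinesToPods[p.NodeName] = append(machinesToPods[p.NodeName], p)
		}
	}
	return machinesToPods
}

func main() {
	pods := []pod{{"a", "node1"}, {"b", "node1"}, {"c", "node2"}, {"d", ""}}
	fmt.Println(len(mapPodsToMachines(pods)["node1"])) // 2
}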
// CalculateNodeAffinityPriority prioritizes nodes according to the node affinity scheduling preferences // indicated in PreferredDuringSchedulingIgnoredDuringExecution. Each time a node matches a preferredSchedulingTerm, // preferredSchedulingTerm.Weight is added to its score. Thus, the more preferredSchedulingTerms // a node satisfies, and the higher the weights of those satisfied terms, the higher // the node's score. func CalculateNodeAffinityPriority(pod *api.Pod, nodeNameToInfo map[string]*schedulercache.NodeInfo, nodeLister algorithm.NodeLister) (schedulerapi.HostPriorityList, error) { nodes, err := nodeLister.List() if err != nil { return nil, err } var maxCount float64 counts := make(map[string]float64, len(nodes.Items)) affinity, err := api.GetAffinityFromPodAnnotations(pod.Annotations) if err != nil { return nil, err } // A nil element of PreferredDuringSchedulingIgnoredDuringExecution matches no objects. // An element of PreferredDuringSchedulingIgnoredDuringExecution that refers to an // empty PreferredSchedulingTerm matches all objects. if affinity.NodeAffinity != nil && affinity.NodeAffinity.PreferredDuringSchedulingIgnoredDuringExecution != nil { // Match PreferredDuringSchedulingIgnoredDuringExecution term by term. for _, preferredSchedulingTerm := range affinity.NodeAffinity.PreferredDuringSchedulingIgnoredDuringExecution { if preferredSchedulingTerm.Weight == 0 { continue } nodeSelector, err := api.NodeSelectorRequirementsAsSelector(preferredSchedulingTerm.Preference.MatchExpressions) if err != nil { return nil, err } for _, node := range nodes.Items { if nodeSelector.Matches(labels.Set(node.Labels)) { counts[node.Name] += float64(preferredSchedulingTerm.Weight) } if counts[node.Name] > maxCount { maxCount = counts[node.Name] } } } } result := make(schedulerapi.HostPriorityList, 0, len(nodes.Items)) for i := range nodes.Items { node := &nodes.Items[i] if maxCount > 0 { fScore := 10 * (counts[node.Name] / maxCount) result = append(result, schedulerapi.HostPriority{Host: node.Name, Score: int(fScore)}) if glog.V(10) { // We explicitly don't do glog.V(10).Infof() to avoid computing all the parameters if this is // not logged. There is visible performance gain from it. glog.Infof("%v -> %v: NodeAffinityPriority, Score: (%d)", pod.Name, node.Name, int(fScore)) } } else { result = append(result, schedulerapi.HostPriority{Host: node.Name, Score: 0}) } } return result, nil }
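The scoring step at the end reduces to score = 10 * count / maxCount; a self-contained sketch of just that normalization, with made-up weight sums (the helper name is illustrative):

package main

import "fmt"

// normalizeAffinityScores scales per-node weight sums to the 0-10 range used by
// every priority function: the best node gets 10, the rest proportionally less.
func normalizeAffinityScores(counts map[string]float64) map[string]int {
	var maxCount float64
	for _, c := range counts {
		if c > maxCount {
			maxCount = c
		}
	}
	scores := make(map[string]int, len(counts))
	for name, c := range counts {
		if maxCount > 0 {
			scores[name] = int(10 * (c / maxCount))
		} else {
			scores[name] = 0
		}
	}
	return scores
}

func main() {
	// node1 matched terms with weights 2+8, node2 only the weight-2 term, node3 none.
	fmt.Println(normalizeAffinityScores(map[string]float64{"node1": 10, "node2": 2, "node3": 0}))
	// map[node1:10 node2:2 node3:0]
}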
// ComputeTaintTolerationPriority prepares the priority list for all the nodes based on the number of intolerable taints on the node func ComputeTaintTolerationPriority(pod *api.Pod, nodeNameToInfo map[string]*schedulercache.NodeInfo, nodeLister algorithm.NodeLister) (schedulerapi.HostPriorityList, error) { nodes, err := nodeLister.List() if err != nil { return nil, err } // the max value of counts var maxCount float64 // counts hold the count of intolerable taints of a pod for a given node counts := make(map[string]float64, len(nodes)) tolerations, err := api.GetTolerationsFromPodAnnotations(pod.Annotations) if err != nil { return nil, err } // Fetch a list of all toleration with effect PreferNoSchedule tolerationList := getAllTolerationPreferNoSchedule(tolerations) // calculate the intolerable taints for all the nodes for _, node := range nodes { taints, err := api.GetTaintsFromNodeAnnotations(node.Annotations) if err != nil { return nil, err } count := countIntolerableTaintsPreferNoSchedule(taints, tolerationList) if count > 0 { // 0 is default value, so avoid unnecessary map operations. counts[node.Name] = count if count > maxCount { maxCount = count } } } // The maximum priority value to give to a node // Priority values range from 0 - maxPriority const maxPriority = float64(10) result := make(schedulerapi.HostPriorityList, 0, len(nodes)) for _, node := range nodes { fScore := maxPriority if maxCount > 0 { fScore = (1.0 - counts[node.Name]/maxCount) * 10 } if glog.V(10) { // We explicitly don't do glog.V(10).Infof() to avoid computing all the parameters if this is // not logged. There is visible performance gain from it. glog.Infof("%v -> %v: Taint Toleration Priority, Score: (%d)", pod.Name, node.Name, int(fScore)) } result = append(result, schedulerapi.HostPriority{Host: node.Name, Score: int(fScore)}) } return result, nil }
// CalculateNodeAffinityPriority prioritizes nodes according to the node affinity scheduling preferences // indicated in PreferredDuringSchedulingIgnoredDuringExecution. Each time a node matches a preferredSchedulingTerm, // preferredSchedulingTerm.Weight is added to its score. Thus, the more preferredSchedulingTerms // a node satisfies, and the higher the weights of those satisfied terms, the higher // the node's score. func (s *NodeAffinity) CalculateNodeAffinityPriority(pod *api.Pod, nodeNameToInfo map[string]*schedulercache.NodeInfo, nodeLister algorithm.NodeLister) (schedulerapi.HostPriorityList, error) { var maxCount int counts := map[string]int{} nodes, err := nodeLister.List() if err != nil { return nil, err } affinity, err := api.GetAffinityFromPodAnnotations(pod.Annotations) if err != nil { return nil, err } // A nil element of PreferredDuringSchedulingIgnoredDuringExecution matches no objects. // An element of PreferredDuringSchedulingIgnoredDuringExecution that refers to an // empty PreferredSchedulingTerm matches all objects. if affinity.NodeAffinity != nil && affinity.NodeAffinity.PreferredDuringSchedulingIgnoredDuringExecution != nil { // Match PreferredDuringSchedulingIgnoredDuringExecution term by term. for _, preferredSchedulingTerm := range affinity.NodeAffinity.PreferredDuringSchedulingIgnoredDuringExecution { if preferredSchedulingTerm.Weight == 0 { continue } nodeSelector, err := api.NodeSelectorRequirementsAsSelector(preferredSchedulingTerm.Preference.MatchExpressions) if err != nil { return nil, err } for _, node := range nodes.Items { if nodeSelector.Matches(labels.Set(node.Labels)) { counts[node.Name] += preferredSchedulingTerm.Weight } if counts[node.Name] > maxCount { maxCount = counts[node.Name] } } } } result := []schedulerapi.HostPriority{} for _, node := range nodes.Items { fScore := float64(0) if maxCount > 0 { fScore = 10 * (float64(counts[node.Name]) / float64(maxCount)) } result = append(result, schedulerapi.HostPriority{Host: node.Name, Score: int(fScore)}) glog.V(10).Infof("%v -> %v: NodeAffinityPriority, Score: (%d)", pod.Name, node.Name, int(fScore)) } return result, nil }
// BalancedResourceAllocation favors nodes with balanced resource usage rate. // BalancedResourceAllocation should **NOT** be used alone, and **MUST** be used together with LeastRequestedPriority. // It calculates the difference between the cpu and memory fraction of capacity, and prioritizes the host based on how // close the two metrics are to each other. // Detail: score = 10 - abs(cpuFraction-memoryFraction)*10. The algorithm is partly inspired by: // "Wei Huang et al. An Energy Efficient Virtual Machine Placement Algorithm with Balanced Resource Utilization" func BalancedResourceAllocation(pod *api.Pod, machinesToPods map[string][]*api.Pod, podLister algorithm.PodLister, nodeLister algorithm.NodeLister) (schedulerapi.HostPriorityList, error) { nodes, err := nodeLister.List() if err != nil { return schedulerapi.HostPriorityList{}, err } list := schedulerapi.HostPriorityList{} for _, node := range nodes.Items { list = append(list, calculateBalancedResourceAllocation(pod, node, machinesToPods[node.Name])) } return list, nil }
// BalancedResourceAllocation favors nodes with balanced resource usage rate. // BalancedResourceAllocation should **NOT** be used alone, and **MUST** be used together with LeastRequestedPriority. // It calculates the difference between the cpu and memory fraction of capacity, and prioritizes the host based on how // close the two metrics are to each other. // Detail: score = 10 - abs(cpuFraction-memoryFraction)*10. The algorithm is partly inspired by: // "Wei Huang et al. An Energy Efficient Virtual Machine Placement Algorithm with Balanced Resource Utilization" func BalancedResourceAllocation(pod *api.Pod, nodeNameToInfo map[string]*schedulercache.NodeInfo, nodeLister algorithm.NodeLister) (schedulerapi.HostPriorityList, error) { nodes, err := nodeLister.List() if err != nil { return schedulerapi.HostPriorityList{}, err } list := schedulerapi.HostPriorityList{} for _, node := range nodes.Items { list = append(list, calculateBalancedResourceAllocation(pod, node, nodeNameToInfo[node.Name].Pods())) } return list, nil }
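calculateBalancedResourceAllocation is not shown in these snippets; the sketch below works through the formula stated in the comment (score = 10 - abs(cpuFraction-memoryFraction)*10) on plain numbers. The guard for over-committed nodes is an assumption about the unshown helper:

package main

import (
	"fmt"
	"math"
)

// balancedScore implements the formula from the comment above:
// score = 10 - abs(cpuFraction - memoryFraction) * 10, where each fraction is
// requested/capacity. The zero score for over-committed nodes is assumed.
func balancedScore(cpuRequested, cpuCapacity, memRequested, memCapacity float64) int {
	cpuFraction := cpuRequested / cpuCapacity
	memFraction := memRequested / memCapacity
	if cpuFraction >= 1 || memFraction >= 1 {
		return 0 // assumed guard: requests at or over capacity get the lowest score
	}
	diff := math.Abs(cpuFraction - memFraction)
	return int(10 - diff*10)
}

func main() {
	fmt.Println(balancedScore(2, 4, 8, 16)) // 10: both resources at 50%, perfectly balanced
	fmt.Println(balancedScore(3, 4, 2, 16)) // 3: 75% cpu vs. 12.5% memory, heavily skewed
}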
// LeastRequestedPriority is a priority function that favors nodes with fewer requested resources. // It calculates the percentage of memory and CPU requested by pods scheduled on the node, and prioritizes // nodes based on the average of the two requested-to-capacity fractions: the lower the requested fraction, the higher the score. // Details: score = (cpu((capacity - sum(requested)) * 10 / capacity) + memory((capacity - sum(requested)) * 10 / capacity)) / 2 func LeastRequestedPriority(pod *api.Pod, nodeNameToInfo map[string]*schedulercache.NodeInfo, nodeLister algorithm.NodeLister) (schedulerapi.HostPriorityList, error) { nodes, err := nodeLister.List() if err != nil { return schedulerapi.HostPriorityList{}, err } list := schedulerapi.HostPriorityList{} for _, node := range nodes.Items { list = append(list, calculateResourceOccupancy(pod, node, nodeNameToInfo[node.Name])) } return list, nil }
// LeastRequestedPriority is a priority function that favors nodes with fewer requested resources. // It calculates the percentage of memory and CPU requested by pods scheduled on the node, and prioritizes // nodes based on the average of the two requested-to-capacity fractions: the lower the requested fraction, the higher the score. // Details: score = (cpu((capacity - sum(requested)) * 10 / capacity) + memory((capacity - sum(requested)) * 10 / capacity)) / 2 func LeastRequestedPriority(pod *api.Pod, podLister algorithm.PodLister, nodeLister algorithm.NodeLister) (schedulerapi.HostPriorityList, error) { nodes, err := nodeLister.List() if err != nil { return schedulerapi.HostPriorityList{}, err } podsToMachines, err := predicates.MapPodsToMachines(podLister) if err != nil { return schedulerapi.HostPriorityList{}, err } list := schedulerapi.HostPriorityList{} for _, node := range nodes.Items { list = append(list, calculateResourceOccupancy(pod, node, podsToMachines[node.Name])) } return list, nil }
// BalancedResourceAllocation favors nodes with balanced resource usage rate. // BalancedResourceAllocation should **NOT** be used alone, and **MUST** be used together with LeastRequestedPriority. // It calculates the difference between the cpu and memory fraction of capacity, and prioritizes the host based on how // close the two metrics are to each other. // Detail: score = 10 - abs(cpuFraction-memoryFraction)*10. The algorithm is partly inspired by: // "Wei Huang et al. An Energy Efficient Virtual Machine Placement Algorithm with Balanced Resource Utilization" func BalancedResourceAllocation(pod *api.Pod, podLister algorithm.PodLister, nodeLister algorithm.NodeLister) (algorithm.HostPriorityList, error) { nodes, err := nodeLister.List() if err != nil { return algorithm.HostPriorityList{}, err } podsToMachines, err := predicates.MapPodsToMachines(podLister) if err != nil { return algorithm.HostPriorityList{}, err } list := algorithm.HostPriorityList{} for _, node := range nodes.Items { list = append(list, calculateBalancedResourceAllocation(pod, node, podsToMachines[node.Name])) } return list, nil }
// LeastRequestedPriority is a priority function that favors nodes with fewer requested resources. // It calculates the percentage of memory and CPU requested by pods scheduled on the node, and prioritizes // nodes based on the average of the two requested-to-capacity fractions: the lower the requested fraction, the higher the score. // Details: score = (cpu((capacity - sum(requested)) * 10 / capacity) + memory((capacity - sum(requested)) * 10 / capacity)) / 2 func LeastRequestedPriority(pod *api.Pod, nodeNameToInfo map[string]*schedulercache.NodeInfo, nodeLister algorithm.NodeLister) (schedulerapi.HostPriorityList, error) { nodes, err := nodeLister.List() if err != nil { return schedulerapi.HostPriorityList{}, err } podResources := getNonZeroRequests(pod) list := make(schedulerapi.HostPriorityList, 0, len(nodes)) for _, node := range nodes { list = append(list, calculateResourceOccupancy(pod, podResources, node, nodeNameToInfo[node.Name])) } return list, nil }
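calculateResourceOccupancy is likewise not shown; a self-contained sketch of the per-resource term from the comment ((capacity - sum(requested)) * 10 / capacity) and how the two terms average into a node score (function and values here are illustrative only):

package main

import "fmt"

// leastRequestedScore is the per-resource term from the comment above:
// (capacity - sum(requested)) * 10 / capacity, clamped to 0 when requests
// exceed capacity.
func leastRequestedScore(requested, capacity int64) int64 {
	if capacity == 0 || requested > capacity {
		return 0
	}
	return (capacity - requested) * 10 / capacity
}

func main() {
	cpuScore := leastRequestedScore(1500, 4000)   // millicores already requested vs. allocatable
	memScore := leastRequestedScore(2<<30, 8<<30) // bytes
	fmt.Println((cpuScore + memScore) / 2)        // node score: the average of the two terms
}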
// ComputeTaintTolerationPriority prepares the priority list for all the nodes based on the number of intolerable taints on the node func (s *TaintToleration) ComputeTaintTolerationPriority(pod *api.Pod, nodeNameToInfo map[string]*schedulercache.NodeInfo, nodeLister algorithm.NodeLister) (schedulerapi.HostPriorityList, error) { // counts hold the count of intolerable taints of a pod for a given node counts := make(map[string]int) // the max value of counts var maxCount int nodes, err := nodeLister.List() if err != nil { return nil, err } tolerations, err := api.GetTolerationsFromPodAnnotations(pod.Annotations) if err != nil { return nil, err } // Fetch a list of all toleration with effect PreferNoSchedule tolerationList := getAllTolerationPreferNoSchedule(tolerations) // calculate the intolerable taints for all the nodes for i := range nodes.Items { node := &nodes.Items[i] taints, err := api.GetTaintsFromNodeAnnotations(node.Annotations) if err != nil { return nil, err } count := countIntolerableTaintsPreferNoSchedule(taints, tolerationList) counts[node.Name] = count if count > maxCount { maxCount = count } } // The maximum priority value to give to a node // Priority values range from 0 - maxPriority const maxPriority = 10 result := make(schedulerapi.HostPriorityList, 0, len(nodes.Items)) for _, node := range nodes.Items { fScore := float64(maxPriority) if maxCount > 0 { fScore = (1.0 - float64(counts[node.Name])/float64(maxCount)) * 10 } glog.V(10).Infof("%v -> %v: Taint Toleration Priority, Score: (%d)", pod.Name, node.Name, int(fScore)) result = append(result, schedulerapi.HostPriority{Host: node.Name, Score: int(fScore)}) } return result, nil }
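The final loop in both taint-toleration variants boils down to score = (1 - count/maxCount) * 10; a standalone sketch with hypothetical taint counts:

package main

import "fmt"

// taintTolerationScore converts an intolerable-taint count into a 0-10 score:
// the node with the most intolerable PreferNoSchedule taints gets 0, a node
// with none gets 10.
func taintTolerationScore(count, maxCount int) int {
	const maxPriority = 10.0
	if maxCount == 0 {
		return int(maxPriority)
	}
	return int((1.0 - float64(count)/float64(maxCount)) * maxPriority)
}

func main() {
	// Suppose three nodes carry 0, 1 and 2 intolerable taints respectively.
	for _, count := range []int{0, 1, 2} {
		fmt.Println(taintTolerationScore(count, 2)) // 10, 5, 0
	}
}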
// Prioritizes the nodes by running the individual priority functions sequentially. // Each priority function is expected to set a score of 0-10 // 0 is the lowest priority score (least preferred node) and 10 is the highest // Each priority function can also have its own weight // The node scores returned by the priority function are multiplied by the weights to get weighted scores // All scores are finally combined (added) to get the total weighted scores of all nodes func PrioritizeNodes(pod *api.Pod, machinesToPods map[string][]*api.Pod, podLister algorithm.PodLister, priorityConfigs []algorithm.PriorityConfig, nodeLister algorithm.NodeLister, extenders []algorithm.SchedulerExtender) (schedulerapi.HostPriorityList, error) { result := schedulerapi.HostPriorityList{} // If no priority configs are provided, then the EqualPriority function is applied // This is required to generate the priority list in the required format if len(priorityConfigs) == 0 && len(extenders) == 0 { return EqualPriority(pod, machinesToPods, podLister, nodeLister) } combinedScores := map[string]int{} for _, priorityConfig := range priorityConfigs { weight := priorityConfig.Weight // skip the priority function if the weight is specified as 0 if weight == 0 { continue } priorityFunc := priorityConfig.Function prioritizedList, err := priorityFunc(pod, machinesToPods, podLister, nodeLister) if err != nil { return schedulerapi.HostPriorityList{}, err } for _, hostEntry := range prioritizedList { combinedScores[hostEntry.Host] += hostEntry.Score * weight } } if len(extenders) != 0 && nodeLister != nil { nodes, err := nodeLister.List() if err != nil { return schedulerapi.HostPriorityList{}, err } for _, extender := range extenders { prioritizedList, weight, err := extender.Prioritize(pod, &nodes) if err != nil { // Prioritization errors from extender can be ignored, let k8s/other extenders determine the priorities continue } for _, hostEntry := range *prioritizedList { combinedScores[hostEntry.Host] += hostEntry.Score * weight } } } for host, score := range combinedScores { glog.V(10).Infof("Host %s Score %d", host, score) result = append(result, schedulerapi.HostPriority{Host: host, Score: score}) } return result, nil }
// EqualPriority is a prioritizer function that gives an equal weight of one to all nodes func EqualPriority(_ *api.Pod, podLister algorithm.PodLister, nodeLister algorithm.NodeLister) (algorithm.HostPriorityList, error) { nodes, err := nodeLister.List() if err != nil { glog.Errorf("failed to list nodes: %v", err) return []algorithm.HostPriority{}, err } result := []algorithm.HostPriority{} for _, node := range nodes.Items { result = append(result, algorithm.HostPriority{ Host: node.Name, Score: 1, }) } return result, nil }
func machine2Prioritizer(_ *api.Pod, nodeNameToInfo map[string]*schedulercache.NodeInfo, nodeLister algorithm.NodeLister) (schedulerapi.HostPriorityList, error) { nodes, err := nodeLister.List() if err != nil { return []schedulerapi.HostPriority{}, err } result := []schedulerapi.HostPriority{} for _, node := range nodes.Items { score := 1 if node.Name == "machine2" { score = 10 } result = append(result, schedulerapi.HostPriority{Host: node.Name, Score: score}) } return result, nil }
// EqualPriority is a prioritizer function that gives an equal weight of one to all nodes func EqualPriority(_ *api.Pod, nodeNameToInfo map[string]*schedulercache.NodeInfo, nodeLister algorithm.NodeLister) (schedulerapi.HostPriorityList, error) { nodes, err := nodeLister.List() if err != nil { glog.Errorf("Failed to list nodes: %v", err) return []schedulerapi.HostPriority{}, err } result := []schedulerapi.HostPriority{} for _, node := range nodes.Items { result = append(result, schedulerapi.HostPriority{ Host: node.Name, Score: 1, }) } return result, nil }
// Schedule tries to schedule the given pod to one of the nodes in the node list. // If it succeeds, it will return the name of the node. // If it fails, it will return a FitError with reasons. func (g *genericScheduler) Schedule(pod *api.Pod, nodeLister algorithm.NodeLister) (string, error) { var trace *util.Trace if pod != nil { trace = util.NewTrace(fmt.Sprintf("Scheduling %s/%s", pod.Namespace, pod.Name)) } else { trace = util.NewTrace("Scheduling <nil> pod") } defer trace.LogIfLong(20 * time.Millisecond) nodes, err := nodeLister.List() if err != nil { return "", err } if len(nodes) == 0 { return "", ErrNoNodesAvailable } // Used for all fit and priority funcs. err = g.cache.UpdateNodeNameToInfoMap(g.cachedNodeInfoMap) if err != nil { return "", err } trace.Step("Computing predicates") filteredNodes, failedPredicateMap, err := findNodesThatFit(pod, g.cachedNodeInfoMap, nodes, g.predicates, g.extenders) if err != nil { return "", err } if len(filteredNodes) == 0 { return "", &FitError{ Pod: pod, FailedPredicates: failedPredicateMap, } } trace.Step("Prioritizing") meta := g.priorityMetaProducer(pod) priorityList, err := PrioritizeNodes(pod, g.cachedNodeInfoMap, meta, g.prioritizers, filteredNodes, g.extenders) if err != nil { return "", err } trace.Step("Selecting host") return g.selectHost(priorityList) }
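selectHost is not shown in any of these snippets; the sketch below is one plausible implementation that picks randomly among the top-scoring hosts (the real tie-breaking strategy may differ, so treat this purely as an illustration of consuming a HostPriorityList):

package main

import (
	"fmt"
	"math/rand"
	"sort"
)

type hostPriority struct {
	Host  string
	Score int
}

// selectHost picks a best-scoring host, choosing randomly among ties so that
// equally good nodes share the load. This mirrors the intent of the scheduler's
// selectHost, though its actual tie-breaking may differ.
func selectHost(priorityList []hostPriority) (string, error) {
	if len(priorityList) == 0 {
		return "", fmt.Errorf("empty priority list")
	}
	sort.Slice(priorityList, func(i, j int) bool {
		return priorityList[i].Score > priorityList[j].Score
	})
	best := 1
	for best < len(priorityList) && priorityList[best].Score == priorityList[0].Score {
		best++
	}
	return priorityList[rand.Intn(best)].Host, nil
}

func main() {
	host, _ := selectHost([]hostPriority{{"node1", 7}, {"node2", 9}, {"node3", 9}})
	fmt.Println(host) // node2 or node3
}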
func numericPriority(pod *api.Pod, podLister algorithm.PodLister, nodeLister algorithm.NodeLister) (algorithm.HostPriorityList, error) { nodes, err := nodeLister.List() result := []algorithm.HostPriority{} if err != nil { return nil, fmt.Errorf("failed to list nodes: %v", err) } for _, node := range nodes.Items { score, err := strconv.Atoi(node.Name) if err != nil { return nil, err } result = append(result, algorithm.HostPriority{ Host: node.Name, Score: score, }) } return result, nil }
func numericPriority(pod *api.Pod, nodeNameToInfo map[string]*schedulercache.NodeInfo, nodeLister algorithm.NodeLister) (schedulerapi.HostPriorityList, error) { nodes, err := nodeLister.List() result := []schedulerapi.HostPriority{} if err != nil { return nil, fmt.Errorf("failed to list nodes: %v", err) } for _, node := range nodes.Items { score, err := strconv.Atoi(node.Name) if err != nil { return nil, err } result = append(result, schedulerapi.HostPriority{ Host: node.Name, Score: score, }) } return result, nil }
// compute a sum by iterating through the elements of weightedPodAffinityTerm and adding // "weight" to the sum if the corresponding PodAffinityTerm is satisfied for // that node; the node(s) with the highest sum are the most preferred. // Symmetry need to be considered for preferredDuringSchedulingIgnoredDuringExecution from podAffinity & podAntiAffinity, // symmetry need to be considered for hard requirements from podAffinity func (ipa *InterPodAffinity) CalculateInterPodAffinityPriority(pod *api.Pod, nodeNameToInfo map[string]*schedulercache.NodeInfo, nodeLister algorithm.NodeLister) (schedulerapi.HostPriorityList, error) { nodes, err := nodeLister.List() if err != nil { return nil, err } allPods, err := ipa.podLister.List(labels.Everything()) if err != nil { return nil, err } affinity, err := api.GetAffinityFromPodAnnotations(pod.Annotations) if err != nil { return nil, err } // convert the topology key based weights to the node name based weights var maxCount float64 var minCount float64 // counts store the mapping from node name to so-far computed score of // the node. counts := make(map[string]float64, len(nodes)) processTerm := func(term *api.PodAffinityTerm, affinityPod, podToCheck *api.Pod, fixedNode *api.Node, weight float64) error { match, err := podMatchesNamespaceAndSelector(podToCheck, affinityPod, term) if err != nil { return err } if match { for _, node := range nodes { if ipa.failureDomains.NodesHaveSameTopologyKey(node, fixedNode, term.TopologyKey) { counts[node.Name] += weight } } } return nil } processTerms := func(terms []api.WeightedPodAffinityTerm, affinityPod, podToCheck *api.Pod, fixedNode *api.Node, multiplier int) error { for _, weightedTerm := range terms { if err := processTerm(&weightedTerm.PodAffinityTerm, affinityPod, podToCheck, fixedNode, float64(weightedTerm.Weight*multiplier)); err != nil { return err } } return nil } for _, existingPod := range allPods { existingPodNode, err := ipa.info.GetNodeInfo(existingPod.Spec.NodeName) if err != nil { return nil, err } existingPodAffinity, err := api.GetAffinityFromPodAnnotations(existingPod.Annotations) if err != nil { return nil, err } if affinity.PodAffinity != nil { // For every soft pod affinity term of <pod>, if <existingPod> matches the term, // increment <counts> for every node in the cluster with the same <term.TopologyKey> // value as that of <existingPods>`s node by the term`s weight. terms := affinity.PodAffinity.PreferredDuringSchedulingIgnoredDuringExecution if err := processTerms(terms, pod, existingPod, existingPodNode, 1); err != nil { return nil, err } } if affinity.PodAntiAffinity != nil { // For every soft pod anti-affinity term of <pod>, if <existingPod> matches the term, // decrement <counts> for every node in the cluster with the same <term.TopologyKey> // value as that of <existingPod>`s node by the term`s weight. 
terms := affinity.PodAntiAffinity.PreferredDuringSchedulingIgnoredDuringExecution if err := processTerms(terms, pod, existingPod, existingPodNode, -1); err != nil { return nil, err } } if existingPodAffinity.PodAffinity != nil { // For every hard pod affinity term of <existingPod>, if <pod> matches the term, // increment <counts> for every node in the cluster with the same <term.TopologyKey> // value as that of <existingPod>'s node by the constant <ipa.hardPodAffinityWeight> if ipa.hardPodAffinityWeight > 0 { terms := existingPodAffinity.PodAffinity.RequiredDuringSchedulingIgnoredDuringExecution // TODO: Uncomment this block when implement RequiredDuringSchedulingRequiredDuringExecution. //if len(existingPodAffinity.PodAffinity.RequiredDuringSchedulingRequiredDuringExecution) != 0 { // terms = append(terms, existingPodAffinity.PodAffinity.RequiredDuringSchedulingRequiredDuringExecution...) //} for _, term := range terms { if err := processTerm(&term, existingPod, pod, existingPodNode, float64(ipa.hardPodAffinityWeight)); err != nil { return nil, err } } } // For every soft pod affinity term of <existingPod>, if <pod> matches the term, // increment <counts> for every node in the cluster with the same <term.TopologyKey> // value as that of <existingPod>'s node by the term's weight. terms := existingPodAffinity.PodAffinity.PreferredDuringSchedulingIgnoredDuringExecution if err := processTerms(terms, existingPod, pod, existingPodNode, 1); err != nil { return nil, err } } if existingPodAffinity.PodAntiAffinity != nil { // For every soft pod anti-affinity term of <existingPod>, if <pod> matches the term, // decrement <counts> for every node in the cluster with the same <term.TopologyKey> // value as that of <existingPod>'s node by the term's weight. terms := existingPodAffinity.PodAntiAffinity.PreferredDuringSchedulingIgnoredDuringExecution if err := processTerms(terms, existingPod, pod, existingPodNode, -1); err != nil { return nil, err } } } for _, node := range nodes { if counts[node.Name] > maxCount { maxCount = counts[node.Name] } if counts[node.Name] < minCount { minCount = counts[node.Name] } } // calculate final priority score for each node result := make(schedulerapi.HostPriorityList, 0, len(nodes)) for _, node := range nodes { fScore := float64(0) if (maxCount - minCount) > 0 { fScore = 10 * ((counts[node.Name] - minCount) / (maxCount - minCount)) } result = append(result, schedulerapi.HostPriority{Host: node.Name, Score: int(fScore)}) if glog.V(10) { // We explicitly don't do glog.V(10).Infof() to avoid computing all the parameters if this is // not logged. There is visible performance gain from it. glog.V(10).Infof("%v -> %v: InterPodAffinityPriority, Score: (%d)", pod.Name, node.Name, int(fScore)) } } return result, nil }
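Because anti-affinity terms subtract weight, the per-node sums can be negative, so the final step above is a min-max normalization: score = 10 * (count - min) / (max - min). A standalone sketch of just that step, with made-up sums (helper name is illustrative):

package main

import "fmt"

// normalizeAffinitySums rescales signed weight sums onto the 0-10 range:
// score = 10 * (count - min) / (max - min). Min and max start at zero, matching
// the loop above.
func normalizeAffinitySums(counts map[string]float64) map[string]int {
	var minCount, maxCount float64
	for _, c := range counts {
		if c > maxCount {
			maxCount = c
		}
		if c < minCount {
			minCount = c
		}
	}
	scores := make(map[string]int, len(counts))
	for name, c := range counts {
		if maxCount-minCount > 0 {
			scores[name] = int(10 * (c - minCount) / (maxCount - minCount))
		} else {
			scores[name] = 0
		}
	}
	return scores
}

func main() {
	// node1 attracted weight 20, node2 was neutral, node3 was repelled by -10.
	fmt.Println(normalizeAffinitySums(map[string]float64{"node1": 20, "node2": 0, "node3": -10}))
	// map[node1:10 node2:3 node3:0]
}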
// compute a sum by iterating through the elements of weightedPodAffinityTerm and adding // "weight" to the sum if the corresponding PodAffinityTerm is satisfied for // that node; the node(s) with the highest sum are the most preferred. // Symmetry need to be considered for preferredDuringSchedulingIgnoredDuringExecution from podAffinity & podAntiAffinity, // symmetry need to be considered for hard requirements from podAffinity func (ipa *InterPodAffinity) CalculateInterPodAffinityPriority(pod *api.Pod, nodeNameToInfo map[string]*schedulercache.NodeInfo, nodeLister algorithm.NodeLister) (schedulerapi.HostPriorityList, error) { nodes, err := nodeLister.List() if err != nil { return nil, err } allPods, err := ipa.podLister.List(labels.Everything()) if err != nil { return nil, err } affinity, err := api.GetAffinityFromPodAnnotations(pod.Annotations) if err != nil { return nil, err } // convert the topology key based weights to the node name based weights var maxCount int var minCount int counts := map[string]int{} for _, node := range nodes.Items { totalCount := 0 // count weights for the weighted pod affinity if affinity.PodAffinity != nil { for _, weightedTerm := range affinity.PodAffinity.PreferredDuringSchedulingIgnoredDuringExecution { weightedCount, err := ipa.CountWeightByPodMatchAffinityTerm(pod, allPods, weightedTerm.Weight, weightedTerm.PodAffinityTerm, &node) if err != nil { return nil, err } totalCount += weightedCount } } // count weights for the weighted pod anti-affinity if affinity.PodAntiAffinity != nil { for _, weightedTerm := range affinity.PodAntiAffinity.PreferredDuringSchedulingIgnoredDuringExecution { weightedCount, err := ipa.CountWeightByPodMatchAffinityTerm(pod, allPods, (0 - weightedTerm.Weight), weightedTerm.PodAffinityTerm, &node) if err != nil { return nil, err } totalCount += weightedCount } } // reverse direction checking: count weights for the inter-pod affinity/anti-affinity rules // that are indicated by existing pods on the node. for _, ep := range allPods { epAffinity, err := api.GetAffinityFromPodAnnotations(ep.Annotations) if err != nil { return nil, err } if epAffinity.PodAffinity != nil { // count the implicit weight for the hard pod affinity indicated by the existing pod. if ipa.hardPodAffinityWeight > 0 { var podAffinityTerms []api.PodAffinityTerm if len(epAffinity.PodAffinity.RequiredDuringSchedulingIgnoredDuringExecution) != 0 { podAffinityTerms = epAffinity.PodAffinity.RequiredDuringSchedulingIgnoredDuringExecution } // TODO: Uncomment this block when implement RequiredDuringSchedulingRequiredDuringExecution. //if len(affinity.PodAffinity.RequiredDuringSchedulingRequiredDuringExecution) != 0 { // podAffinityTerms = append(podAffinityTerms, affinity.PodAffinity.RequiredDuringSchedulingRequiredDuringExecution...) //} for _, epAffinityTerm := range podAffinityTerms { match, err := ipa.failureDomains.CheckIfPodMatchPodAffinityTerm(pod, ep, epAffinityTerm, func(pod *api.Pod) (*api.Node, error) { return &node, nil }, func(ep *api.Pod) (*api.Node, error) { return ipa.info.GetNodeInfo(ep.Spec.NodeName) }, ) if err != nil { return nil, err } if match { totalCount += ipa.hardPodAffinityWeight } } } // count weight for the weighted pod affinity indicated by the existing pod. 
for _, epWeightedTerm := range epAffinity.PodAffinity.PreferredDuringSchedulingIgnoredDuringExecution { match, err := ipa.failureDomains.CheckIfPodMatchPodAffinityTerm(pod, ep, epWeightedTerm.PodAffinityTerm, func(pod *api.Pod) (*api.Node, error) { return &node, nil }, func(ep *api.Pod) (*api.Node, error) { return ipa.info.GetNodeInfo(ep.Spec.NodeName) }, ) if err != nil { return nil, err } if match { totalCount += epWeightedTerm.Weight } } } // count weight for the weighted pod anti-affinity indicated by the existing pod. if epAffinity.PodAntiAffinity != nil { for _, epWeightedTerm := range epAffinity.PodAntiAffinity.PreferredDuringSchedulingIgnoredDuringExecution { match, err := ipa.failureDomains.CheckIfPodMatchPodAffinityTerm(pod, ep, epWeightedTerm.PodAffinityTerm, func(pod *api.Pod) (*api.Node, error) { return &node, nil }, func(ep *api.Pod) (*api.Node, error) { return ipa.info.GetNodeInfo(ep.Spec.NodeName) }, ) if err != nil { return nil, err } if match { totalCount -= epWeightedTerm.Weight } } } } counts[node.Name] = totalCount if counts[node.Name] > maxCount { maxCount = counts[node.Name] } if counts[node.Name] < minCount { minCount = counts[node.Name] } } // calculate final priority score for each node result := []schedulerapi.HostPriority{} for _, node := range nodes.Items { fScore := float64(0) if (maxCount - minCount) > 0 { fScore = 10 * (float64(counts[node.Name]-minCount) / float64(maxCount-minCount)) } result = append(result, schedulerapi.HostPriority{Host: node.Name, Score: int(fScore)}) glog.V(10).Infof( "%v -> %v: InterPodAffinityPriority, Score: (%d)", pod.Name, node.Name, int(fScore), ) } return result, nil }
// Prioritizes the nodes by running the individual priority functions in parallel. // Each priority function is expected to set a score of 0-10 // 0 is the lowest priority score (least preferred node) and 10 is the highest // Each priority function can also have its own weight // The node scores returned by the priority function are multiplied by the weights to get weighted scores // All scores are finally combined (added) to get the total weighted scores of all nodes func PrioritizeNodes( pod *api.Pod, nodeNameToInfo map[string]*schedulercache.NodeInfo, priorityConfigs []algorithm.PriorityConfig, nodeLister algorithm.NodeLister, extenders []algorithm.SchedulerExtender, ) (schedulerapi.HostPriorityList, error) { result := schedulerapi.HostPriorityList{} // If no priority configs are provided, then the EqualPriority function is applied // This is required to generate the priority list in the required format if len(priorityConfigs) == 0 && len(extenders) == 0 { return EqualPriority(pod, nodeNameToInfo, nodeLister) } var ( mu = sync.Mutex{} wg = sync.WaitGroup{} combinedScores = map[string]int{} errs []error ) for _, priorityConfig := range priorityConfigs { // skip the priority function if the weight is specified as 0 if priorityConfig.Weight == 0 { continue } wg.Add(1) go func(config algorithm.PriorityConfig) { defer wg.Done() weight := config.Weight priorityFunc := config.Function prioritizedList, err := priorityFunc(pod, nodeNameToInfo, nodeLister) mu.Lock() defer mu.Unlock() if err != nil { errs = append(errs, err) return } for i := range prioritizedList { host, score := prioritizedList[i].Host, prioritizedList[i].Score combinedScores[host] += score * weight } }(priorityConfig) } if len(errs) != 0 { return schedulerapi.HostPriorityList{}, errors.NewAggregate(errs) } // wait for all go routines to finish wg.Wait() if len(extenders) != 0 && nodeLister != nil { nodes, err := nodeLister.List() if err != nil { return schedulerapi.HostPriorityList{}, err } for _, extender := range extenders { wg.Add(1) go func(ext algorithm.SchedulerExtender) { defer wg.Done() prioritizedList, weight, err := ext.Prioritize(pod, &nodes) if err != nil { // Prioritization errors from extender can be ignored, let k8s/other extenders determine the priorities return } mu.Lock() for i := range *prioritizedList { host, score := (*prioritizedList)[i].Host, (*prioritizedList)[i].Score combinedScores[host] += score * weight } mu.Unlock() }(extender) } } // wait for all go routines to finish wg.Wait() for host, score := range combinedScores { glog.V(10).Infof("Host %s Score %d", host, score) result = append(result, schedulerapi.HostPriority{Host: host, Score: score}) } return result, nil }
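The aggregation step in both PrioritizeNodes variants is simply score * weight summed per host; a small sketch with two hypothetical priority results and weights:

package main

import "fmt"

// combineWeightedScores reproduces the aggregation step above: every priority
// function's 0-10 score is multiplied by that function's weight and summed per host.
func combineWeightedScores(results []map[string]int, weights []int) map[string]int {
	combined := map[string]int{}
	for i, result := range results {
		for host, score := range result {
			combined[host] += score * weights[i]
		}
	}
	return combined
}

func main() {
	leastRequested := map[string]int{"node1": 8, "node2": 3}
	nodeAffinity := map[string]int{"node1": 0, "node2": 10}
	// Hypothetical weights: 1 for LeastRequestedPriority, 2 for NodeAffinityPriority.
	fmt.Println(combineWeightedScores([]map[string]int{leastRequested, nodeAffinity}, []int{1, 2}))
	// map[node1:8 node2:23]
}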
// CalculateSpreadPriority spreads pods across hosts and zones, considering pods belonging to the same service or replication controller. // When a pod is scheduled, it looks for services or RCs that match the pod, then finds existing pods that match those selectors. // It favors nodes that have fewer existing matching pods. // i.e. it pushes the scheduler towards a node where there's the smallest number of // pods which match the same service selectors or RC selectors as the pod being scheduled. // Where zone information is included on the nodes, it favors nodes in zones with fewer existing matching pods. func (s *SelectorSpread) CalculateSpreadPriority(pod *api.Pod, nodeNameToInfo map[string]*schedulercache.NodeInfo, nodeLister algorithm.NodeLister) (schedulerapi.HostPriorityList, error) { selectors := make([]labels.Selector, 0) services, err := s.serviceLister.GetPodServices(pod) if err == nil { for _, service := range services { selectors = append(selectors, labels.SelectorFromSet(service.Spec.Selector)) } } rcs, err := s.controllerLister.GetPodControllers(pod) if err == nil { for _, rc := range rcs { selectors = append(selectors, labels.SelectorFromSet(rc.Spec.Selector)) } } rss, err := s.replicaSetLister.GetPodReplicaSets(pod) if err == nil { for _, rs := range rss { if selector, err := unversioned.LabelSelectorAsSelector(rs.Spec.Selector); err == nil { selectors = append(selectors, selector) } } } nodes, err := nodeLister.List() if err != nil { return nil, err } // Count similar pods by node countsByNodeName := map[string]int{} countsByNodeNameLock := sync.Mutex{} if len(selectors) > 0 { // Create a number of go-routines that will be computing number // of "similar" pods for given nodes. workers := 16 toProcess := make(chan string, len(nodes.Items)) for i := range nodes.Items { toProcess <- nodes.Items[i].Name } close(toProcess) wg := sync.WaitGroup{} wg.Add(workers) for i := 0; i < workers; i++ { go func() { defer utilruntime.HandleCrash() defer wg.Done() for { nodeName, ok := <-toProcess if !ok { return } count := 0 for _, nodePod := range nodeNameToInfo[nodeName].Pods() { if pod.Namespace != nodePod.Namespace { continue } // When we are replacing a failed pod, we often see the previous // deleted version while scheduling the replacement. // Ignore the previous deleted version for spreading purposes // (it can still be considered for resource restrictions etc.) 
if nodePod.DeletionTimestamp != nil { glog.V(4).Infof("skipping pending-deleted pod: %s/%s", nodePod.Namespace, nodePod.Name) continue } matches := false for _, selector := range selectors { if selector.Matches(labels.Set(nodePod.ObjectMeta.Labels)) { matches = true break } } if matches { count++ } } func() { countsByNodeNameLock.Lock() defer countsByNodeNameLock.Unlock() countsByNodeName[nodeName] = count }() } }() } wg.Wait() } // Aggregate by-node information // Compute the maximum number of pods hosted on any node maxCountByNodeName := 0 for _, count := range countsByNodeName { if count > maxCountByNodeName { maxCountByNodeName = count } } // Count similar pods by zone, if zone information is present countsByZone := map[string]int{} for i := range nodes.Items { node := &nodes.Items[i] count, found := countsByNodeName[node.Name] if !found { continue } zoneId := getZoneKey(node) if zoneId == "" { continue } countsByZone[zoneId] += count } // Aggregate by-zone information // Compute the maximum number of pods hosted in any zone haveZones := len(countsByZone) != 0 maxCountByZone := 0 for _, count := range countsByZone { if count > maxCountByZone { maxCountByZone = count } } result := []schedulerapi.HostPriority{} //score int - scale of 0-maxPriority // 0 being the lowest priority and maxPriority being the highest for i := range nodes.Items { node := &nodes.Items[i] // initializing to the default/max node score of maxPriority fScore := float32(maxPriority) if maxCountByNodeName > 0 { fScore = maxPriority * (float32(maxCountByNodeName-countsByNodeName[node.Name]) / float32(maxCountByNodeName)) } // If there is zone information present, incorporate it if haveZones { zoneId := getZoneKey(node) if zoneId != "" { zoneScore := maxPriority * (float32(maxCountByZone-countsByZone[zoneId]) / float32(maxCountByZone)) fScore = (fScore * (1.0 - zoneWeighting)) + (zoneWeighting * zoneScore) } } result = append(result, schedulerapi.HostPriority{Host: node.Name, Score: int(fScore)}) glog.V(10).Infof( "%v -> %v: SelectorSpreadPriority, Score: (%d)", pod.Name, node.Name, int(fScore), ) } return result, nil }
// CalculateSpreadPriority spreads pods by minimizing the number of pods belonging to the same service or replication controller. It counts the number of pods that run under // the same Services or RCs as the pod being scheduled and tries to minimize the number of conflicts, i.e. it pushes the scheduler towards a Node with the smallest number of // pods that match the same Service and RC selectors as the current pod. func (s *SelectorSpread) CalculateSpreadPriority(pod *api.Pod, podLister algorithm.PodLister, nodeLister algorithm.NodeLister) (algorithm.HostPriorityList, error) { var maxCount int var nsPods []*api.Pod selectors := make([]labels.Selector, 0) services, err := s.serviceLister.GetPodServices(pod) if err == nil { for _, service := range services { selectors = append(selectors, labels.SelectorFromSet(service.Spec.Selector)) } } controllers, err := s.controllerLister.GetPodControllers(pod) if err == nil { for _, controller := range controllers { selectors = append(selectors, labels.SelectorFromSet(controller.Spec.Selector)) } } if len(selectors) > 0 { pods, err := podLister.List(labels.Everything()) if err != nil { return nil, err } // consider only the pods that belong to the same namespace for _, nsPod := range pods { if nsPod.Namespace == pod.Namespace { nsPods = append(nsPods, nsPod) } } } nodes, err := nodeLister.List() if err != nil { return nil, err } counts := map[string]int{} if len(nsPods) > 0 { for _, pod := range nsPods { matches := false for _, selector := range selectors { if selector.Matches(labels.Set(pod.ObjectMeta.Labels)) { matches = true break } } if matches { counts[pod.Spec.NodeName]++ // Compute the maximum number of pods hosted on any node if counts[pod.Spec.NodeName] > maxCount { maxCount = counts[pod.Spec.NodeName] } } } } result := []algorithm.HostPriority{} //score int - scale of 0-10 // 0 being the lowest priority and 10 being the highest for _, node := range nodes.Items { // initializing to the default/max node score of 10 fScore := float32(10) if maxCount > 0 { fScore = 10 * (float32(maxCount-counts[node.Name]) / float32(maxCount)) } result = append(result, algorithm.HostPriority{Host: node.Name, Score: int(fScore)}) glog.V(10).Infof( "%v -> %v: SelectorSpreadPriority, Score: (%d)", pod.Name, node.Name, int(fScore), ) } return result, nil }
func (npa *NodePreferAvoidPod) CalculateNodePreferAvoidPodsPriority(pod *api.Pod, nodeNameToInfo map[string]*schedulercache.NodeInfo, nodeLister algorithm.NodeLister) (schedulerapi.HostPriorityList, error) { nodes, err := nodeLister.List() if err != nil { return nil, err } // TODO: Once we have ownerReference fully implemented, use it to find controller for the pod. rcs, err := npa.controllerLister.GetPodControllers(pod) rss, err := npa.replicaSetLister.GetPodReplicaSets(pod) if len(rcs) == 0 && len(rss) == 0 { result := make(schedulerapi.HostPriorityList, 0, len(nodes)) for _, node := range nodes { result = append(result, schedulerapi.HostPriority{Host: node.Name, Score: 10}) } return result, nil } avoidNodes := make(map[string]bool, len(nodes)) avoidNode := false for _, node := range nodes { avoids, err := api.GetAvoidPodsFromNodeAnnotations(node.Annotations) if err != nil { continue } avoidNode = false for i := range avoids.PreferAvoidPods { avoid := &avoids.PreferAvoidPods[i] // TODO: Once we have controllerRef implemented there will be at most one owner // of our pod. That said we won't even need loop theoretically. That said for // code simplicity, we can get rid of all breaks. // Also, we can simply compare fields from ownerRef with avoid. for _, rc := range rcs { if avoid.PodSignature.PodController.Kind == "ReplicationController" && avoid.PodSignature.PodController.UID == rc.UID { avoidNode = true } } for _, rs := range rss { if avoid.PodSignature.PodController.Kind == "ReplicaSet" && avoid.PodSignature.PodController.UID == rs.UID { avoidNode = true } } if avoidNode { // false is default value, so we don't even need to set it // to avoid unnecessary map operations. avoidNodes[node.Name] = true break } } } var score int result := make(schedulerapi.HostPriorityList, 0, len(nodes)) //score int - scale of 0-10 // 0 being the lowest priority and 10 being the highest for _, node := range nodes { if avoidNodes[node.Name] { score = 0 } else { score = 10 } result = append(result, schedulerapi.HostPriority{Host: node.Name, Score: score}) } return result, nil }
// CalculateSpreadPriority spreads pods across hosts and zones, considering pods belonging to the same service or replication controller. // When a pod is scheduled, it looks for services or RCs that match the pod, then finds existing pods that match those selectors. // It favors nodes that have fewer existing matching pods. // i.e. it pushes the scheduler towards a node where there's the smallest number of // pods which match the same service selectors or RC selectors as the pod being scheduled. // Where zone information is included on the nodes, it favors nodes in zones with fewer existing matching pods. func (s *SelectorSpread) CalculateSpreadPriority(pod *api.Pod, nodeNameToInfo map[string]*schedulercache.NodeInfo, nodeLister algorithm.NodeLister) (schedulerapi.HostPriorityList, error) { var nsPods []*api.Pod selectors := make([]labels.Selector, 0) services, err := s.serviceLister.GetPodServices(pod) if err == nil { for _, service := range services { selectors = append(selectors, labels.SelectorFromSet(service.Spec.Selector)) } } controllers, err := s.controllerLister.GetPodControllers(pod) if err == nil { for _, controller := range controllers { selectors = append(selectors, labels.SelectorFromSet(controller.Spec.Selector)) } } if len(selectors) > 0 { pods, err := s.podLister.List(labels.Everything()) if err != nil { return nil, err } // consider only the pods that belong to the same namespace for _, nsPod := range pods { if nsPod.Namespace == pod.Namespace { nsPods = append(nsPods, nsPod) } } } nodes, err := nodeLister.List() if err != nil { return nil, err } // Count similar pods by node countsByNodeName := map[string]int{} for _, pod := range nsPods { // When we are replacing a failed pod, we often see the previous deleted version // while scheduling the replacement. Ignore the previous deleted version for spreading // purposes (it can still be considered for resource restrictions etc.) 
if pod.DeletionTimestamp != nil { glog.V(2).Infof("skipping pending-deleted pod: %s/%s", pod.Namespace, pod.Name) continue } matches := false for _, selector := range selectors { if selector.Matches(labels.Set(pod.ObjectMeta.Labels)) { matches = true break } } if !matches { continue } countsByNodeName[pod.Spec.NodeName]++ } // Aggregate by-node information // Compute the maximum number of pods hosted on any node maxCountByNodeName := 0 for _, count := range countsByNodeName { if count > maxCountByNodeName { maxCountByNodeName = count } } // Count similar pods by zone, if zone information is present countsByZone := map[string]int{} for i := range nodes.Items { node := &nodes.Items[i] count, found := countsByNodeName[node.Name] if !found { continue } zoneId := getZoneKey(node) if zoneId == "" { continue } countsByZone[zoneId] += count } // Aggregate by-zone information // Compute the maximum number of pods hosted in any zone haveZones := len(countsByZone) != 0 maxCountByZone := 0 for _, count := range countsByZone { if count > maxCountByZone { maxCountByZone = count } } result := []schedulerapi.HostPriority{} //score int - scale of 0-maxPriority // 0 being the lowest priority and maxPriority being the highest for i := range nodes.Items { node := &nodes.Items[i] // initializing to the default/max node score of maxPriority fScore := float32(maxPriority) if maxCountByNodeName > 0 { fScore = maxPriority * (float32(maxCountByNodeName-countsByNodeName[node.Name]) / float32(maxCountByNodeName)) } // If there is zone information present, incorporate it if haveZones { zoneId := getZoneKey(node) if zoneId != "" { zoneScore := maxPriority * (float32(maxCountByZone-countsByZone[zoneId]) / float32(maxCountByZone)) fScore = (fScore * (1.0 - zoneWeighting)) + (zoneWeighting * zoneScore) } } result = append(result, schedulerapi.HostPriority{Host: node.Name, Score: int(fScore)}) glog.V(10).Infof( "%v -> %v: SelectorSpreadPriority, Score: (%d)", pod.Name, node.Name, int(fScore), ) } return result, nil }
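The zone blending at the end of the two zone-aware SelectorSpread variants combines the node-level and zone-level spread scores with a zoneWeighting constant that is not shown in these snippets; the sketch below assumes 2/3 purely for illustration:

package main

import "fmt"

// spreadScore gives 10 to the emptiest node/zone and 0 to the fullest, per the
// formula used above: maxPriority * (maxCount - count) / maxCount.
func spreadScore(count, maxCount int, maxPriority float32) float32 {
	if maxCount == 0 {
		return maxPriority
	}
	return maxPriority * float32(maxCount-count) / float32(maxCount)
}

func main() {
	const maxPriority = float32(10)
	// zoneWeighting is not defined in the snippet; 2/3 is an assumption for this example.
	const zoneWeighting = float32(2.0 / 3.0)

	nodeScore := spreadScore(1, 4, maxPriority) // 1 matching pod on this node, 4 on the busiest node
	zoneScore := spreadScore(3, 6, maxPriority) // 3 matching pods in this zone, 6 in the busiest zone
	final := nodeScore*(1.0-zoneWeighting) + zoneWeighting*zoneScore
	fmt.Printf("node=%.1f zone=%.1f final=%.1f\n", nodeScore, zoneScore, final) // node=7.5 zone=5.0 final=5.8
}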
// CalculateAntiAffinityPriority spreads pods by minimizing the number of pods belonging to the same service // on machines with the same value for a particular label. // The label to be considered is provided to the struct (ServiceAntiAffinity). func (s *ServiceAntiAffinity) CalculateAntiAffinityPriority(pod *api.Pod, nodeNameToInfo map[string]*schedulercache.NodeInfo, nodeLister algorithm.NodeLister) (schedulerapi.HostPriorityList, error) { var nsServicePods []*api.Pod services, err := s.serviceLister.GetPodServices(pod) if err == nil { // just use the first service and get the other pods within the service // TODO: a separate predicate can be created that tries to handle all services for the pod selector := labels.SelectorFromSet(services[0].Spec.Selector) pods, err := s.podLister.List(selector) if err != nil { return nil, err } // consider only the pods that belong to the same namespace for _, nsPod := range pods { if nsPod.Namespace == pod.Namespace { nsServicePods = append(nsServicePods, nsPod) } } } nodes, err := nodeLister.List() if err != nil { return nil, err } // separate out the nodes that have the label from the ones that don't otherNodes := []string{} labeledNodes := map[string]string{} for _, node := range nodes.Items { if labels.Set(node.Labels).Has(s.label) { label := labels.Set(node.Labels).Get(s.label) labeledNodes[node.Name] = label } else { otherNodes = append(otherNodes, node.Name) } } podCounts := map[string]int{} for _, pod := range nsServicePods { label, exists := labeledNodes[pod.Spec.NodeName] if !exists { continue } podCounts[label]++ } numServicePods := len(nsServicePods) result := []schedulerapi.HostPriority{} //score int - scale of 0-maxPriority // 0 being the lowest priority and maxPriority being the highest for node := range labeledNodes { // initializing to the default/max node score of maxPriority fScore := float32(maxPriority) if numServicePods > 0 { fScore = maxPriority * (float32(numServicePods-podCounts[labeledNodes[node]]) / float32(numServicePods)) } result = append(result, schedulerapi.HostPriority{Host: node, Score: int(fScore)}) } // add the open nodes with a score of 0 for _, node := range otherNodes { result = append(result, schedulerapi.HostPriority{Host: node, Score: 0}) } return result, nil }