func hasPodAffinityConstraints(pod *v1.Pod) bool {
	affinity, err := v1.GetAffinityFromPodAnnotations(pod.Annotations)
	if err != nil || affinity == nil {
		return false
	}
	return affinity.PodAffinity != nil || affinity.PodAntiAffinity != nil
}
func (c *PodAffinityChecker) getMatchingAntiAffinityTerms(pod *v1.Pod, allPods []*v1.Pod) ([]matchingPodAntiAffinityTerm, error) {
	var result []matchingPodAntiAffinityTerm
	for _, existingPod := range allPods {
		affinity, err := v1.GetAffinityFromPodAnnotations(existingPod.Annotations)
		if err != nil {
			return nil, err
		}
		if affinity != nil && affinity.PodAntiAffinity != nil {
			existingPodNode, err := c.info.GetNodeInfo(existingPod.Spec.NodeName)
			if err != nil {
				return nil, err
			}
			for _, term := range getPodAntiAffinityTerms(affinity.PodAntiAffinity) {
				// Copy the loop variable before taking its address; otherwise every
				// appended entry would point at the same (last) term.
				term := term
				match, err := priorityutil.PodMatchesTermsNamespaceAndSelector(pod, existingPod, &term)
				if err != nil {
					return nil, err
				}
				if match {
					result = append(result, matchingPodAntiAffinityTerm{term: &term, node: existingPodNode})
				}
			}
		}
	}
	return result, nil
}
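// The helpers used above are not shown in this section. A minimal sketch of what they
// are assumed to look like, inferred from how they are used here; the exact types and
// the restriction to hard terms are assumptions, not taken from the source:
type matchingPodAntiAffinityTerm struct {
	term *v1.PodAffinityTerm
	node *v1.Node
}

func getPodAntiAffinityTerms(podAntiAffinity *v1.PodAntiAffinity) []v1.PodAffinityTerm {
	if podAntiAffinity == nil {
		return nil
	}
	// Only the hard (RequiredDuringSchedulingIgnoredDuringExecution) anti-affinity terms
	// are relevant to the predicate path.
	return podAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution
}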
func (c *PodAffinityChecker) InterPodAffinityMatches(pod *v1.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) {
	node := nodeInfo.Node()
	if node == nil {
		return false, nil, fmt.Errorf("node not found")
	}
	if !c.satisfiesExistingPodsAntiAffinity(pod, meta, node) {
		return false, []algorithm.PredicateFailureReason{ErrPodAffinityNotMatch}, nil
	}

	// Now check if <pod> requirements will be satisfied on this node.
	affinity, err := v1.GetAffinityFromPodAnnotations(pod.Annotations)
	if err != nil {
		return false, nil, err
	}
	if affinity == nil || (affinity.PodAffinity == nil && affinity.PodAntiAffinity == nil) {
		return true, nil, nil
	}
	if !c.satisfiesPodsAffinityAntiAffinity(pod, node, affinity) {
		return false, []algorithm.PredicateFailureReason{ErrPodAffinityNotMatch}, nil
	}

	if glog.V(10) {
		// We explicitly don't do glog.V(10).Infof() to avoid computing all the parameters if this is
		// not logged. There is visible performance gain from it.
		glog.Infof("Schedule Pod %+v on Node %+v is allowed, pod (anti)affinity constraints satisfied",
			podName(pod), node.Name)
	}
	return true, nil, nil
}
func getMatchingAntiAffinityTerms(pod *v1.Pod, nodeInfoMap map[string]*schedulercache.NodeInfo) ([]matchingPodAntiAffinityTerm, error) {
	allNodeNames := make([]string, 0, len(nodeInfoMap))
	for name := range nodeInfoMap {
		allNodeNames = append(allNodeNames, name)
	}

	var lock sync.Mutex
	var result []matchingPodAntiAffinityTerm
	var firstError error
	appendResult := func(toAppend []matchingPodAntiAffinityTerm) {
		lock.Lock()
		defer lock.Unlock()
		result = append(result, toAppend...)
	}
	catchError := func(err error) {
		lock.Lock()
		defer lock.Unlock()
		if firstError == nil {
			firstError = err
		}
	}

	processNode := func(i int) {
		nodeInfo := nodeInfoMap[allNodeNames[i]]
		node := nodeInfo.Node()
		if node == nil {
			catchError(fmt.Errorf("node not found"))
			return
		}
		var nodeResult []matchingPodAntiAffinityTerm
		for _, existingPod := range nodeInfo.PodsWithAffinity() {
			affinity, err := v1.GetAffinityFromPodAnnotations(existingPod.Annotations)
			if err != nil {
				catchError(err)
				return
			}
			if affinity == nil {
				continue
			}
			for _, term := range getPodAntiAffinityTerms(affinity.PodAntiAffinity) {
				// Copy the loop variable before taking its address; otherwise every
				// appended entry would point at the same (last) term.
				term := term
				// The term belongs to <existingPod>, so an empty Namespaces list in the
				// term defaults to <existingPod>'s namespace, not the incoming pod's.
				namespaces := priorityutil.GetNamespacesFromPodAffinityTerm(existingPod, &term)
				selector, err := metav1.LabelSelectorAsSelector(term.LabelSelector)
				if err != nil {
					catchError(err)
					return
				}
				if priorityutil.PodMatchesTermsNamespaceAndSelector(pod, namespaces, selector) {
					nodeResult = append(nodeResult, matchingPodAntiAffinityTerm{term: &term, node: node})
				}
			}
		}
		if len(nodeResult) > 0 {
			appendResult(nodeResult)
		}
	}
	workqueue.Parallelize(16, len(allNodeNames), processNode)
	return result, firstError
}
// The pod can only schedule onto nodes that satisfy requirements in both NodeAffinity and nodeSelector.
func podMatchesNodeLabels(pod *v1.Pod, node *v1.Node) bool {
	// Check if node.Labels match pod.Spec.NodeSelector.
	if len(pod.Spec.NodeSelector) > 0 {
		selector := labels.SelectorFromSet(pod.Spec.NodeSelector)
		if !selector.Matches(labels.Set(node.Labels)) {
			return false
		}
	}

	// Parse required node affinity scheduling requirements
	// and check if the current node matches the requirements.
	affinity, err := v1.GetAffinityFromPodAnnotations(pod.Annotations)
	if err != nil {
		glog.V(10).Infof("Failed to get Affinity from Pod %+v, err: %+v", podName(pod), err)
		return false
	}

	// 1. nil NodeSelector matches all nodes (i.e. does not filter out any nodes)
	// 2. nil []NodeSelectorTerm (equivalent to non-nil empty NodeSelector) matches no nodes
	// 3. zero-length non-nil []NodeSelectorTerm matches no nodes also, just for simplicity
	// 4. nil []NodeSelectorRequirement (equivalent to non-nil empty NodeSelectorTerm) matches no nodes
	// 5. zero-length non-nil []NodeSelectorRequirement matches no nodes also, just for simplicity
	// 6. non-nil empty NodeSelectorRequirement is not allowed
	nodeAffinityMatches := true
	if affinity != nil && affinity.NodeAffinity != nil {
		nodeAffinity := affinity.NodeAffinity
		// If there are no required NodeAffinity requirements, this is a no-op, i.e. all nodes are selected.
		// TODO: Replace next line with subsequent commented-out line when implement RequiredDuringSchedulingRequiredDuringExecution.
		if nodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution == nil {
			// if nodeAffinity.RequiredDuringSchedulingRequiredDuringExecution == nil && nodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution == nil {
			return true
		}

		// Match node selector for requiredDuringSchedulingRequiredDuringExecution.
		// TODO: Uncomment this block when implement RequiredDuringSchedulingRequiredDuringExecution.
		// if nodeAffinity.RequiredDuringSchedulingRequiredDuringExecution != nil {
		//	nodeSelectorTerms := nodeAffinity.RequiredDuringSchedulingRequiredDuringExecution.NodeSelectorTerms
		//	glog.V(10).Infof("Match for RequiredDuringSchedulingRequiredDuringExecution node selector terms %+v", nodeSelectorTerms)
		//	nodeAffinityMatches = nodeMatchesNodeSelectorTerms(node, nodeSelectorTerms)
		// }

		// Match node selector for requiredDuringSchedulingIgnoredDuringExecution.
		if nodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution != nil {
			nodeSelectorTerms := nodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms
			glog.V(10).Infof("Match for RequiredDuringSchedulingIgnoredDuringExecution node selector terms %+v", nodeSelectorTerms)
			nodeAffinityMatches = nodeAffinityMatches && nodeMatchesNodeSelectorTerms(node, nodeSelectorTerms)
		}
	}
	return nodeAffinityMatches
}
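// nodeMatchesNodeSelectorTerms is referenced above but not shown in this section. A
// minimal sketch under the usual NodeSelector semantics (terms are ORed; the requirements
// within a term are ANDed by the parsed selector); the exact implementation is an
// assumption, not taken from the source:
func nodeMatchesNodeSelectorTerms(node *v1.Node, nodeSelectorTerms []v1.NodeSelectorTerm) bool {
	for _, term := range nodeSelectorTerms {
		nodeSelector, err := v1.NodeSelectorRequirementsAsSelector(term.MatchExpressions)
		if err != nil {
			// Regard an unparsable term as not matching.
			return false
		}
		if nodeSelector.Matches(labels.Set(node.Labels)) {
			return true
		}
	}
	return false
}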
// CalculateNodeAffinityPriorityMap prioritizes nodes according to the node affinity scheduling preferences
// indicated in PreferredDuringSchedulingIgnoredDuringExecution. Each time a node matches a preferredSchedulingTerm,
// its score is incremented by preferredSchedulingTerm.Weight. Thus, the more preferredSchedulingTerms
// the node satisfies, and the higher the weights of the satisfied terms, the higher
// the score the node gets.
func CalculateNodeAffinityPriorityMap(pod *v1.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (schedulerapi.HostPriority, error) {
	node := nodeInfo.Node()
	if node == nil {
		return schedulerapi.HostPriority{}, fmt.Errorf("node not found")
	}

	var affinity *v1.Affinity
	if priorityMeta, ok := meta.(*priorityMetadata); ok {
		affinity = priorityMeta.affinity
	} else {
		// We couldn't parse metadata - fall back to computing it.
		var err error
		affinity, err = v1.GetAffinityFromPodAnnotations(pod.Annotations)
		if err != nil {
			return schedulerapi.HostPriority{}, err
		}
	}

	var count int32
	// A nil element of PreferredDuringSchedulingIgnoredDuringExecution matches no objects.
	// An element of PreferredDuringSchedulingIgnoredDuringExecution that refers to an
	// empty PreferredSchedulingTerm matches all objects.
	if affinity != nil && affinity.NodeAffinity != nil && affinity.NodeAffinity.PreferredDuringSchedulingIgnoredDuringExecution != nil {
		// Match PreferredDuringSchedulingIgnoredDuringExecution term by term.
		for i := range affinity.NodeAffinity.PreferredDuringSchedulingIgnoredDuringExecution {
			preferredSchedulingTerm := &affinity.NodeAffinity.PreferredDuringSchedulingIgnoredDuringExecution[i]
			if preferredSchedulingTerm.Weight == 0 {
				continue
			}

			// TODO: Avoid computing it for all nodes if this becomes a performance problem.
			nodeSelector, err := v1.NodeSelectorRequirementsAsSelector(preferredSchedulingTerm.Preference.MatchExpressions)
			if err != nil {
				return schedulerapi.HostPriority{}, err
			}
			if nodeSelector.Matches(labels.Set(node.Labels)) {
				count += preferredSchedulingTerm.Weight
			}
		}
	}

	return schedulerapi.HostPriority{
		Host:  node.Name,
		Score: int(count),
	}, nil
}
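// As a hypothetical illustration of the scoring above (the labels and weights below are
// made up, not taken from the source): a pod preferring "disktype=ssd" with weight 80 and
// "zone=us-east-1a" with weight 20 would score 80 on a node labeled only {disktype: ssd},
// and 100 on a node carrying both labels.
var examplePreferredTerms = []v1.PreferredSchedulingTerm{
	{
		Weight: 80,
		Preference: v1.NodeSelectorTerm{
			MatchExpressions: []v1.NodeSelectorRequirement{
				{Key: "disktype", Operator: v1.NodeSelectorOpIn, Values: []string{"ssd"}},
			},
		},
	},
	{
		Weight: 20,
		Preference: v1.NodeSelectorTerm{
			MatchExpressions: []v1.NodeSelectorRequirement{
				{Key: "zone", Operator: v1.NodeSelectorOpIn, Values: []string{"us-east-1a"}},
			},
		},
	},
}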
// PriorityMetadata is a MetadataProducer. Node info can be nil.
func PriorityMetadata(pod *v1.Pod, nodeNameToInfo map[string]*schedulercache.NodeInfo) interface{} {
	// If we cannot compute metadata, just return nil.
	if pod == nil {
		return nil
	}
	tolerations, err := getTolerationListFromPod(pod)
	if err != nil {
		return nil
	}
	affinity, err := v1.GetAffinityFromPodAnnotations(pod.Annotations)
	if err != nil {
		return nil
	}
	return &priorityMetadata{
		nonZeroRequest: getNonZeroRequests(pod),
		podTolerations: tolerations,
		affinity:       affinity,
	}
}
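// The priorityMetadata struct populated above is not defined in this section. A sketch of
// its assumed shape; the field names match the composite literal above, but the field
// types are inferences from how the values are produced and consumed, not the source:
type priorityMetadata struct {
	nonZeroRequest *schedulercache.Resource
	podTolerations []v1.Toleration
	affinity       *v1.Affinity
}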
// CalculateInterPodAffinityPriority computes a sum by iterating through the elements of weightedPodAffinityTerm and adding
// "weight" to the sum if the corresponding PodAffinityTerm is satisfied for
// that node; the node(s) with the highest sum are the most preferred.
// Symmetry needs to be considered for preferredDuringSchedulingIgnoredDuringExecution from podAffinity & podAntiAffinity;
// symmetry also needs to be considered for hard requirements from podAffinity.
func (ipa *InterPodAffinity) CalculateInterPodAffinityPriority(pod *v1.Pod, nodeNameToInfo map[string]*schedulercache.NodeInfo, nodes []*v1.Node) (schedulerapi.HostPriorityList, error) {
	affinity, err := v1.GetAffinityFromPodAnnotations(pod.Annotations)
	if err != nil {
		return nil, err
	}
	hasAffinityConstraints := affinity != nil && affinity.PodAffinity != nil
	hasAntiAffinityConstraints := affinity != nil && affinity.PodAntiAffinity != nil

	allNodeNames := make([]string, 0, len(nodeNameToInfo))
	for name := range nodeNameToInfo {
		allNodeNames = append(allNodeNames, name)
	}

	// Convert the topology-key-based weights to node-name-based weights.
	var maxCount float64
	var minCount float64
	// priorityMap stores the mapping from node name to so-far computed score of
	// the node.
	pm := newPodAffinityPriorityMap(nodes, ipa.failureDomains)

	processPod := func(existingPod *v1.Pod) error {
		existingPodNode, err := ipa.info.GetNodeInfo(existingPod.Spec.NodeName)
		if err != nil {
			return err
		}
		existingPodAffinity, err := v1.GetAffinityFromPodAnnotations(existingPod.Annotations)
		if err != nil {
			return err
		}
		existingHasAffinityConstraints := existingPodAffinity != nil && existingPodAffinity.PodAffinity != nil
		existingHasAntiAffinityConstraints := existingPodAffinity != nil && existingPodAffinity.PodAntiAffinity != nil

		if hasAffinityConstraints {
			// For every soft pod affinity term of <pod>, if <existingPod> matches the term,
			// increment <pm.counts> for every node in the cluster with the same <term.TopologyKey>
			// value as that of <existingPod>'s node by the term's weight.
			terms := affinity.PodAffinity.PreferredDuringSchedulingIgnoredDuringExecution
			pm.processTerms(terms, pod, existingPod, existingPodNode, 1)
		}
		if hasAntiAffinityConstraints {
			// For every soft pod anti-affinity term of <pod>, if <existingPod> matches the term,
			// decrement <pm.counts> for every node in the cluster with the same <term.TopologyKey>
			// value as that of <existingPod>'s node by the term's weight.
			terms := affinity.PodAntiAffinity.PreferredDuringSchedulingIgnoredDuringExecution
			pm.processTerms(terms, pod, existingPod, existingPodNode, -1)
		}

		if existingHasAffinityConstraints {
			// For every hard pod affinity term of <existingPod>, if <pod> matches the term,
			// increment <pm.counts> for every node in the cluster with the same <term.TopologyKey>
			// value as that of <existingPod>'s node by the constant <ipa.hardPodAffinityWeight>.
			if ipa.hardPodAffinityWeight > 0 {
				terms := existingPodAffinity.PodAffinity.RequiredDuringSchedulingIgnoredDuringExecution
				// TODO: Uncomment this block when implement RequiredDuringSchedulingRequiredDuringExecution.
				//if len(existingPodAffinity.PodAffinity.RequiredDuringSchedulingRequiredDuringExecution) != 0 {
				//	terms = append(terms, existingPodAffinity.PodAffinity.RequiredDuringSchedulingRequiredDuringExecution...)
				//}
				for _, term := range terms {
					pm.processTerm(&term, existingPod, pod, existingPodNode, float64(ipa.hardPodAffinityWeight))
				}
			}
			// For every soft pod affinity term of <existingPod>, if <pod> matches the term,
			// increment <pm.counts> for every node in the cluster with the same <term.TopologyKey>
			// value as that of <existingPod>'s node by the term's weight.
			terms := existingPodAffinity.PodAffinity.PreferredDuringSchedulingIgnoredDuringExecution
			pm.processTerms(terms, existingPod, pod, existingPodNode, 1)
		}
		if existingHasAntiAffinityConstraints {
			// For every soft pod anti-affinity term of <existingPod>, if <pod> matches the term,
			// decrement <pm.counts> for every node in the cluster with the same <term.TopologyKey>
			// value as that of <existingPod>'s node by the term's weight.
			terms := existingPodAffinity.PodAntiAffinity.PreferredDuringSchedulingIgnoredDuringExecution
			pm.processTerms(terms, existingPod, pod, existingPodNode, -1)
		}
		return nil
	}

	processNode := func(i int) {
		nodeInfo := nodeNameToInfo[allNodeNames[i]]
		if hasAffinityConstraints || hasAntiAffinityConstraints {
			// We need to process all the pods on the node.
			for _, existingPod := range nodeInfo.Pods() {
				if err := processPod(existingPod); err != nil {
					pm.setError(err)
				}
			}
		} else {
			// The pod doesn't have any constraints - we need to check only existing
			// ones that have some.
			for _, existingPod := range nodeInfo.PodsWithAffinity() {
				if err := processPod(existingPod); err != nil {
					pm.setError(err)
				}
			}
		}
	}
	workqueue.Parallelize(16, len(allNodeNames), processNode)
	if pm.firstError != nil {
		return nil, pm.firstError
	}

	for _, node := range nodes {
		if pm.counts[node.Name] > maxCount {
			maxCount = pm.counts[node.Name]
		}
		if pm.counts[node.Name] < minCount {
			minCount = pm.counts[node.Name]
		}
	}

	// Calculate the final priority score for each node.
	result := make(schedulerapi.HostPriorityList, 0, len(nodes))
	for _, node := range nodes {
		fScore := float64(0)
		if (maxCount - minCount) > 0 {
			fScore = 10 * ((pm.counts[node.Name] - minCount) / (maxCount - minCount))
		}
		result = append(result, schedulerapi.HostPriority{Host: node.Name, Score: int(fScore)})
		if glog.V(10) {
			// We explicitly don't do glog.V(10).Infof() to avoid computing all the parameters if this is
			// not logged. There is visible performance gain from it.
			glog.Infof("%v -> %v: InterPodAffinityPriority, Score: (%d)", pod.Name, node.Name, int(fScore))
		}
	}
	return result, nil
}
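// A minimal standalone sketch of the final normalization above (not part of the source):
// raw per-node sums are mapped linearly onto the scheduler's 0-10 score range. For
// example, with counts {A: -5, B: 0, C: 5}, minCount is -5 and maxCount is 5, so A
// scores 0, B scores 5, and C scores 10.
func normalizeAffinityScore(count, minCount, maxCount float64) int {
	if maxCount-minCount <= 0 {
		// All nodes scored the same; there is no basis for differentiation.
		return 0
	}
	return int(10 * (count - minCount) / (maxCount - minCount))
}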